Theory Notebook | Math for LLMs

LLM Evaluation Observability and Guardrails

Production ML and MLOps / LLM Evaluation Observability and Guardrails

Notes: notes.md | Theory Notebook: theory.ipynb | Exercises Notebook: exercises.ipynb

Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

LLM Evaluation Observability and Guardrails

LLM observability connects prompts, retrieval, tool use, evaluations, guardrails, and incidents into a production reliability loop.

This notebook is the executable companion to notes.md. It uses synthetic production signals so every cell runs without external services or data files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's theme when it is installed; otherwise fall back to the
# closest style sheet that ships with matplotlib itself.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # Matplotlib 3.6 renamed its bundled seaborn style sheets to
    # "seaborn-v0_8-*"; older releases only know the unprefixed name.
    # Use whichever this installation provides instead of raising on an
    # unknown style name. If neither exists the default style is kept.
    for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
        if _style_name in plt.style.available:
            plt.style.use(_style_name)
            break
    HAS_SNS = False

# Figure defaults applied to every plot drawn after this cell.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Seed NumPy's global random generator so any random draws are repeatable.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


# Named hex colours shared by the notebook's plots. The plotting demos look up
# "primary" and "secondary" for the reference/current bar series.
# NOTE(review): the hex values look like Paul Tol's colour-blind-safe
# "vibrant" scheme plus a neutral grey — confirm before relying on that.
COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    """Print ``title`` framed by two rules of 72 ``=`` characters.

    A blank line is emitted before the top rule so consecutive sections
    are visually separated in the cell output.
    """
    rule = "=" * 72
    print("\n" + rule)
    print(title)
    print(rule)

def check_true(condition, name):
    """Report a boolean check and fail loudly when it does not hold.

    Args:
        condition: Any value; it is coerced with ``bool()``.
        name: Label printed next to the PASS/FAIL marker and used as the
            failure message.

    Raises:
        AssertionError: If ``condition`` is falsy.
    """
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        # Raise explicitly rather than using ``assert``: assert statements
        # are removed under ``python -O``, which would let a failed check
        # pass silently.
        raise AssertionError(name)

def check_close(value, target, tol=1e-8, name="value"):
    """Report whether ``value`` is within ``tol`` of ``target``.

    Both operands are converted to ``float`` and compared by absolute
    difference. A NaN on either side therefore fails the check.

    Args:
        value: Observed number.
        target: Expected number.
        tol: Maximum allowed absolute difference (inclusive).
        name: Label printed with the result and used as the failure message.

    Raises:
        AssertionError: If the absolute difference exceeds ``tol``.
    """
    got = float(value)
    expected = float(target)
    ok = abs(got - expected) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got:.6f}, expected {expected:.6f}")
    if not ok:
        # Raise explicitly rather than using ``assert`` so the check still
        # fails when Python runs with -O (which strips assert statements).
        raise AssertionError(name)

def softmax(z, axis=None):
    """Return the softmax of ``z``.

    Args:
        z: Array-like of scores; converted to a float array.
        axis: Axis along which to normalise. The default ``None`` keeps the
            original behaviour of normalising over every element, so the
            whole result sums to 1. Pass e.g. ``axis=-1`` to normalise each
            row of a batch independently.

    Returns:
        A float array with the same shape as ``z``.
    """
    z = np.asarray(z, dtype=float)
    # Subtracting the maximum leaves the result unchanged but keeps
    # ``np.exp`` from overflowing on large scores.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)

def psi(ref, cur, eps=1e-8):
    """Return the population stability index between two binned distributions.

    Each input is converted to a float array, offset by ``eps`` so that no
    bin is exactly zero, and rescaled to sum to 1. The result is
    ``sum((cur - ref) * log(cur / ref))`` as a built-in float.
    """
    expected = np.asarray(ref, dtype=float) + eps
    actual = np.asarray(cur, dtype=float) + eps
    expected = expected / expected.sum()
    actual = actual / actual.sum()
    shift = actual - expected
    log_ratio = np.log(actual / expected)
    return float(np.sum(shift * log_ratio))

def js_divergence(p, q, eps=1e-8):
    """Return the Jensen-Shannon divergence between ``p`` and ``q``.

    Both inputs are converted to float arrays, offset by ``eps`` and
    rescaled to sum to 1. The result averages the two KL terms measured
    against the midpoint distribution, using natural logarithms, and is
    returned as a built-in float.
    """
    left = np.asarray(p, dtype=float) + eps
    right = np.asarray(q, dtype=float) + eps
    left = left / left.sum()
    right = right / right.sum()
    midpoint = 0.5 * (left + right)
    kl_left = np.sum(left * np.log(left / midpoint))
    kl_right = np.sum(right * np.log(right / midpoint))
    return float(0.5 * kl_left + 0.5 * kl_right)

def percentile(values, q):
    """Return the ``q``-th percentile of ``values`` as a built-in float.

    ``values`` is converted to a float array and passed to
    ``np.percentile`` with its default settings.
    """
    samples = np.asarray(values, dtype=float)
    return float(np.percentile(samples, q))

print("Helper functions ready.")

Demo 1: LLM applications are pipelines not single models

This demo builds an adjacency list from five pipeline stages and four dependency edges, then checks the structure of the resulting graph.

Code cell 5

# Demo: model the pipeline stages as a directed graph and verify its shape.
header("Demo 1 - LLM applications are pipelines not single models: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
# Adjacency list: every node starts with no successors, then each edge adds one.
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
# Counting edges alone would also accept a branching tree, so compare
# against consecutive node pairs to verify a single forward chain.
check_true(edges == list(zip(nodes, nodes[1:])), "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 2: traces reveal hidden failures

This demo computes a position-weighted integer fingerprint of a small array and confirms that recomputing it from identical values gives the same result.

Code cell 7

# Demo: recompute a deterministic fingerprint and confirm it is reproducible.
header("Demo 2 - traces reveal hidden failures: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
# Position-weighted sum: each element is multiplied by its 1-based index.
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
# Fingerprints are integers, so compare them exactly. A float-tolerance
# check can report distinct large integers as equal after rounding.
check_true(fingerprint == same_fingerprint, "recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 3: offline evaluation versus online observability

This demo compares five paired baseline and candidate metric values and reports the mean improvement together with its standard error.

Code cell 9

# Demo: paired comparison of five baseline and candidate metric values.
header("Demo 3 - offline evaluation versus online observability: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = np.subtract(candidate, baseline)
mean_delta = float(np.mean(delta))
# Standard error of the mean: sample standard deviation (ddof=1) over sqrt(n).
stderr = float(np.std(delta, ddof=1) / np.sqrt(delta.size))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(0 < mean_delta, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 4: guardrails as runtime controls

This demo computes the population stability index (PSI) between a reference and a current four-bin distribution and plots the two side by side.

Code cell 11

# Demo: measure drift between two four-bin distributions and plot them.
header("Demo 4 - guardrails as runtime controls: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(0 <= score, "PSI is nonnegative for positive bins")
# Grouped bars: the two series are offset left and right of each bin index.
fig, ax = plt.subplots()
idx = np.arange(ref.size)
ax.bar(idx - 0.18, ref, 0.36, label="Reference", color=COLORS["primary"])
ax.bar(idx + 0.18, cur, 0.36, label="Current", color=COLORS["secondary"])
ax.set(
    title="Reference versus current production distribution",
    xlabel="Bin",
    ylabel="Probability",
)
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 5: reliability loop

This demo computes the median (p50) and 95th-percentile (p95) latency of ten synthetic request timings to show how far the tail sits from the typical case.

Code cell 13

# Demo: compare median and tail latency over ten synthetic timings.
header("Demo 5 - reliability loop: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50, p95 = (percentile(latency_ms, q) for q in (50, 95))
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p50 < p95, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 6: trace $\tau$

This demo applies a fixed threshold of 0.70 to five synthetic scores and maps each one to an allow or escalate action.

Code cell 15

# Demo: turn scores into allow/escalate actions with a fixed threshold.
header("Demo 6 - trace $\\tau$: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
# Scores below the threshold are allowed; everything else is escalated.
actions = np.where(scores < threshold, "allow", "escalate")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.count_nonzero(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 7: span

This demo builds an adjacency list from five pipeline stages and four dependency edges, then checks the structure of the resulting graph.

Code cell 17

# Demo: model the pipeline stages as a directed graph and verify its shape.
header("Demo 7 - span: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
# Adjacency list: every node starts with no successors, then each edge adds one.
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
# Counting edges alone would also accept a branching tree, so compare
# against consecutive node pairs to verify a single forward chain.
check_true(edges == list(zip(nodes, nodes[1:])), "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 8: prompt version

This demo computes a position-weighted integer fingerprint of a small array and confirms that recomputing it from identical values gives the same result.

Code cell 19

# Demo: recompute a deterministic fingerprint and confirm it is reproducible.
header("Demo 8 - prompt version: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
# Position-weighted sum: each element is multiplied by its 1-based index.
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
# Fingerprints are integers, so compare them exactly. A float-tolerance
# check can report distinct large integers as equal after rounding.
check_true(fingerprint == same_fingerprint, "recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 9: evaluation case

This demo compares five paired baseline and candidate metric values and reports the mean improvement together with its standard error.

Code cell 21

# Demo: paired comparison of five baseline and candidate metric values.
header("Demo 9 - evaluation case: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = np.subtract(candidate, baseline)
mean_delta = float(np.mean(delta))
# Standard error of the mean: sample standard deviation (ddof=1) over sqrt(n).
stderr = float(np.std(delta, ddof=1) / np.sqrt(delta.size))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(0 < mean_delta, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 10: guardrail action

This demo computes the population stability index (PSI) between a reference and a current four-bin distribution and plots the two side by side.

Code cell 23

# Demo: measure drift between two four-bin distributions and plot them.
header("Demo 10 - guardrail action: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(0 <= score, "PSI is nonnegative for positive bins")
# Grouped bars: the two series are offset left and right of each bin index.
fig, ax = plt.subplots()
idx = np.arange(ref.size)
ax.bar(idx - 0.18, ref, 0.36, label="Reference", color=COLORS["primary"])
ax.bar(idx + 0.18, cur, 0.36, label="Current", color=COLORS["secondary"])
ax.set(
    title="Reference versus current production distribution",
    xlabel="Bin",
    ylabel="Probability",
)
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 11: traces metrics and logs

This demo computes the median (p50) and 95th-percentile (p95) latency of ten synthetic request timings to show how far the tail sits from the typical case.

Code cell 25

# Demo: compare median and tail latency over ten synthetic timings.
header("Demo 11 - traces metrics and logs: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50, p95 = (percentile(latency_ms, q) for q in (50, 95))
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p50 < p95, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 12: token and cost tracking

This demo applies a fixed threshold of 0.70 to five synthetic scores and maps each one to an allow or escalate action.

Code cell 27

# Demo: turn scores into allow/escalate actions with a fixed threshold.
header("Demo 12 - token and cost tracking: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
# Scores below the threshold are allowed; everything else is escalated.
actions = np.where(scores < threshold, "allow", "escalate")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.count_nonzero(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 13: latency by component

This demo builds an adjacency list from five pipeline stages and four dependency edges, then checks the structure of the resulting graph.

Code cell 29

# Demo: model the pipeline stages as a directed graph and verify its shape.
header("Demo 13 - latency by component: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
# Adjacency list: every node starts with no successors, then each edge adds one.
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
# Counting edges alone would also accept a branching tree, so compare
# against consecutive node pairs to verify a single forward chain.
check_true(edges == list(zip(nodes, nodes[1:])), "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 14: tool-call traces

This demo computes a position-weighted integer fingerprint of a small array and confirms that recomputing it from identical values gives the same result.

Code cell 31

# Demo: recompute a deterministic fingerprint and confirm it is reproducible.
header("Demo 14 - tool-call traces: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
# Position-weighted sum: each element is multiplied by its 1-based index.
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
# Fingerprints are integers, so compare them exactly. A float-tolerance
# check can report distinct large integers as equal after rounding.
check_true(fingerprint == same_fingerprint, "recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 15: retrieval traces

This demo compares five paired baseline and candidate metric values and reports the mean improvement together with its standard error.

Code cell 33

# Demo: paired comparison of five baseline and candidate metric values.
header("Demo 15 - retrieval traces: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = np.subtract(candidate, baseline)
mean_delta = float(np.mean(delta))
# Standard error of the mean: sample standard deviation (ddof=1) over sqrt(n).
stderr = float(np.std(delta, ddof=1) / np.sqrt(delta.size))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(0 < mean_delta, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 16: golden sets

This demo computes the population stability index (PSI) between a reference and a current four-bin distribution and plots the two side by side.

Code cell 35

# Demo: measure drift between two four-bin distributions and plot them.
header("Demo 16 - golden sets: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(0 <= score, "PSI is nonnegative for positive bins")
# Grouped bars: the two series are offset left and right of each bin index.
fig, ax = plt.subplots()
idx = np.arange(ref.size)
ax.bar(idx - 0.18, ref, 0.36, label="Reference", color=COLORS["primary"])
ax.bar(idx + 0.18, cur, 0.36, label="Current", color=COLORS["secondary"])
ax.set(
    title="Reference versus current production distribution",
    xlabel="Bin",
    ylabel="Probability",
)
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 17: regression evaluations

This demo computes the median (p50) and 95th-percentile (p95) latency of ten synthetic request timings to show how far the tail sits from the typical case.

Code cell 37

# Demo: compare median and tail latency over ten synthetic timings.
header("Demo 17 - regression evaluations: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50, p95 = (percentile(latency_ms, q) for q in (50, 95))
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p50 < p95, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 18: LLM-as-judge caveats

This demo applies a fixed threshold of 0.70 to five synthetic scores and maps each one to an allow or escalate action.

Code cell 39

# Demo: turn scores into allow/escalate actions with a fixed threshold.
header("Demo 18 - LLM-as-judge caveats: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
# Scores below the threshold are allowed; everything else is escalated.
actions = np.where(scores < threshold, "allow", "escalate")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.count_nonzero(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 19: human review sampling

This demo builds an adjacency list from five pipeline stages and four dependency edges, then checks the structure of the resulting graph.

Code cell 41

# Demo: model the pipeline stages as a directed graph and verify its shape.
header("Demo 19 - human review sampling: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
# Adjacency list: every node starts with no successors, then each edge adds one.
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
# Counting edges alone would also accept a branching tree, so compare
# against consecutive node pairs to verify a single forward chain.
check_true(edges == list(zip(nodes, nodes[1:])), "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 20: evaluation versioning

This demo computes a position-weighted integer fingerprint of a small array and confirms that recomputing it from identical values gives the same result.

Code cell 43

# Demo: recompute a deterministic fingerprint and confirm it is reproducible.
header("Demo 20 - evaluation versioning: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
# Position-weighted sum: each element is multiplied by its 1-based index.
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
# Fingerprints are integers, so compare them exactly. A float-tolerance
# check can report distinct large integers as equal after rounding.
check_true(fingerprint == same_fingerprint, "recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 21: input filters

This demo compares five paired baseline and candidate metric values and reports the mean improvement together with its standard error.

Code cell 45

# Demo: paired comparison of five baseline and candidate metric values.
header("Demo 21 - input filters: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = np.subtract(candidate, baseline)
mean_delta = float(np.mean(delta))
# Standard error of the mean: sample standard deviation (ddof=1) over sqrt(n).
stderr = float(np.std(delta, ddof=1) / np.sqrt(delta.size))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(0 < mean_delta, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 22: output validators

This demo computes the population stability index (PSI) between a reference and a current four-bin distribution and plots the two side by side.

Code cell 47

# Demo: measure drift between two four-bin distributions and plot them.
header("Demo 22 - output validators: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(0 <= score, "PSI is nonnegative for positive bins")
# Grouped bars: the two series are offset left and right of each bin index.
fig, ax = plt.subplots()
idx = np.arange(ref.size)
ax.bar(idx - 0.18, ref, 0.36, label="Reference", color=COLORS["primary"])
ax.bar(idx + 0.18, cur, 0.36, label="Current", color=COLORS["secondary"])
ax.set(
    title="Reference versus current production distribution",
    xlabel="Bin",
    ylabel="Probability",
)
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 23: tool guards

This demo computes the median (p50) and 95th-percentile (p95) latency of ten synthetic request timings to show how far the tail sits from the typical case.

Code cell 49

# Demo: compare median and tail latency over ten synthetic timings.
header("Demo 23 - tool guards: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50, p95 = (percentile(latency_ms, q) for q in (50, 95))
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p50 < p95, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 24: retrieval guards

This demo applies a fixed threshold of 0.70 to five synthetic scores and maps each one to an allow or escalate action.

Code cell 51

# Demo: turn scores into allow/escalate actions with a fixed threshold.
header("Demo 24 - retrieval guards: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
# Scores below the threshold are allowed; everything else is escalated.
actions = np.where(scores < threshold, "allow", "escalate")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.count_nonzero(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 25: fallback and escalation

This demo builds an adjacency list from five pipeline stages and four dependency edges, then checks the structure of the resulting graph.

Code cell 53

# Demo: model the pipeline stages as a directed graph and verify its shape.
header("Demo 25 - fallback and escalation: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
# Adjacency list: every node starts with no successors, then each edge adds one.
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
# Counting edges alone would also accept a branching tree, so compare
# against consecutive node pairs to verify a single forward chain.
check_true(edges == list(zip(nodes, nodes[1:])), "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Previous lesson Next lesson