Theory NotebookMath for LLMs

Model Serving and Inference Optimization

Production ML and MLOps / Model Serving and Inference Optimization

Notesnotes.md Theory Notebooktheory.ipynb Exercises Notebookexercises.ipynb

Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Model Serving and Inference Optimization

Serving mathematics connects model quality to latency, throughput, utilization, and rollback-safe release decisions.

This notebook is the executable companion to notes.md. It uses synthetic production signals so every cell runs without external services or data files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    ok = abs(float(value) - float(target)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
    assert ok, name

def softmax(z):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z)
    e = np.exp(z)
    return e / e.sum()

def psi(ref, cur, eps=1e-8):
    ref = np.asarray(ref, dtype=float) + eps
    cur = np.asarray(cur, dtype=float) + eps
    ref = ref / ref.sum()
    cur = cur / cur.sum()
    return float(np.sum((cur - ref) * np.log(cur / ref)))

def js_divergence(p, q, eps=1e-8):
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    return float(0.5 * np.sum(p * np.log(p / m)) + 0.5 * np.sum(q * np.log(q / m)))

def percentile(values, q):
    return float(np.percentile(np.asarray(values, dtype=float), q))

print("Helper functions ready.")

Demo 1: training optimizes loss while serving optimizes systems constraints

This demo makes the production idea concrete with a small numerical object.

Code cell 5

header("Demo 1 - training optimizes loss while serving optimizes systems constraints: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 2: latency throughput cost triangle

This demo makes the production idea concrete with a small numerical object.

Code cell 7

header("Demo 2 - latency throughput cost triangle: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 3: batch versus online inference

This demo makes the production idea concrete with a small numerical object.

Code cell 9

header("Demo 3 - batch versus online inference: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 4: model endpoints as contracts

This demo makes the production idea concrete with a small numerical object.

Code cell 11

header("Demo 4 - model endpoints as contracts: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 5: failure budgets

This demo makes the production idea concrete with a small numerical object.

Code cell 13

header("Demo 5 - failure budgets: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 6: service $s(\mathbf{x})$

This demo makes the production idea concrete with a small numerical object.

Code cell 15

header("Demo 6 - service $s(\\mathbf{x})$: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 7: latency distribution

This demo makes the production idea concrete with a small numerical object.

Code cell 17

header("Demo 7 - latency distribution: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 8: throughput $q$

This demo makes the production idea concrete with a small numerical object.

Code cell 19

header("Demo 8 - throughput $q$: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 9: utilization $\rho$

This demo makes the production idea concrete with a small numerical object.

Code cell 21

header("Demo 9 - utilization $\\rho$: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 10: service-level objectives and service-level agreements

This demo makes the production idea concrete with a small numerical object.

Code cell 23

header("Demo 10 - service-level objectives and service-level agreements: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 11: REST and gRPC endpoints

This demo makes the production idea concrete with a small numerical object.

Code cell 25

header("Demo 11 - REST and gRPC endpoints: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 12: batch serving

This demo makes the production idea concrete with a small numerical object.

Code cell 27

header("Demo 12 - batch serving: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 13: streaming inference

This demo makes the production idea concrete with a small numerical object.

Code cell 29

header("Demo 13 - streaming inference: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 14: model routers

This demo makes the production idea concrete with a small numerical object.

Code cell 31

header("Demo 14 - model routers: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 15: shadow and canary deployments

This demo makes the production idea concrete with a small numerical object.

Code cell 33

header("Demo 15 - shadow and canary deployments: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 16: batching

This demo makes the production idea concrete with a small numerical object.

Code cell 35

header("Demo 16 - batching: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 17: caching

This demo makes the production idea concrete with a small numerical object.

Code cell 37

header("Demo 17 - caching: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 18: quantization preview

This demo makes the production idea concrete with a small numerical object.

Code cell 39

header("Demo 18 - quantization preview: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 19: distillation preview

This demo makes the production idea concrete with a small numerical object.

Code cell 41

header("Demo 19 - distillation preview: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 20: hardware-aware scheduling

This demo makes the production idea concrete with a small numerical object.

Code cell 43

header("Demo 20 - hardware-aware scheduling: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 21: arrival rates

This demo makes the production idea concrete with a small numerical object.

Code cell 45

header("Demo 21 - arrival rates: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 22: Little's law

This demo makes the production idea concrete with a small numerical object.

Code cell 47

header("Demo 22 - Little's law: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 23: tail latency

This demo makes the production idea concrete with a small numerical object.

Code cell 49

header("Demo 23 - tail latency: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 24: autoscaling

This demo makes the production idea concrete with a small numerical object.

Code cell 51

header("Demo 24 - autoscaling: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 25: overload control

This demo makes the production idea concrete with a small numerical object.

Code cell 53

header("Demo 25 - overload control: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Previous lesson Next lesson