Exercises Notebook — Math for LLMs

Model Serving and Inference Optimization

Production ML and MLOps / Model Serving and Inference Optimization

Run notebook
Exercises Notebook

Exercises Notebook

Converted from exercises.ipynb for web reading.

Exercises: Model Serving and Inference Optimization

There are 10 exercises. Exercises 1-3 are mechanics, 4-6 are theory, and 7-10 are production AI applications.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's theme when the package is installed; otherwise fall back to
# the seaborn-like style sheet referenced by name through matplotlib.
# HAS_SNS records which of the two paths was taken.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Applied after the theme selection above, so these settings are layered on
# top of whichever theme was chosen.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Seed NumPy's global random generator with a fixed value.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


# Named hex colours shared by the notebook; look them up by role name.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    """Print ``title`` between two 72-character ``=`` rules, after a blank line."""
    rule = "=" * 72
    print(f"\n{rule}")
    print(title)
    print(rule)

def check_true(condition, name):
    """Report a boolean check as PASS/FAIL and fail loudly when it is false.

    Args:
        condition: Any value; it is coerced with ``bool()``.
        name: Label printed next to the verdict and used as the error message.

    Raises:
        AssertionError: If ``condition`` is falsy.
    """
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    # Raise explicitly instead of using ``assert``: assert statements are
    # removed under ``python -O``, which would let a failed check pass silently.
    if not ok:
        raise AssertionError(name)

def check_close(value, target, tol=1e-8, name="value"):
    """Report whether ``value`` is within ``tol`` of ``target`` and fail if not.

    Args:
        value: Observed number; converted with ``float()``.
        target: Expected number; converted with ``float()``.
        tol: Maximum allowed absolute difference (inclusive).
        name: Label printed next to the verdict and used as the error message.

    Raises:
        AssertionError: If the absolute difference exceeds ``tol`` (a NaN on
            either side also fails, because the comparison is then false).
    """
    got = float(value)
    expected = float(target)
    ok = abs(got - expected) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got:.6f}, expected {expected:.6f}")
    # Raise explicitly instead of using ``assert``: assert statements are
    # removed under ``python -O``, which would let a failed check pass silently.
    if not ok:
        raise AssertionError(name)

def softmax(z, axis=None):
    """Return the softmax of ``z``.

    Args:
        z: Array-like of logits; converted to a float array.
        axis: Axis along which to normalise. The default ``None`` normalises
            over all elements together, which is what the single-argument
            call has always done. Pass e.g. ``axis=-1`` to normalise each row
            of a batch independently.

    Returns:
        Array with the same shape as ``z`` whose entries along ``axis`` sum to 1.
    """
    z = np.asarray(z, dtype=float)
    # Subtract the maximum before exponentiating so large logits do not overflow.
    # keepdims=True keeps the reduced axis so the result broadcasts against z.
    shifted = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(shifted)
    return e / np.sum(e, axis=axis, keepdims=True)

def psi(ref, cur, eps=1e-8):
    """Return ``sum((cur - ref) * log(cur / ref))`` over smoothed, normalised inputs.

    Both inputs are converted to float arrays, shifted by ``eps`` (so zero
    entries do not reach the logarithm), and rescaled to sum to 1 before the
    formula is applied. The result is a plain Python float.
    """
    expected = np.asarray(ref, dtype=float) + eps
    actual = np.asarray(cur, dtype=float) + eps
    expected = expected / expected.sum()
    actual = actual / actual.sum()
    log_ratio = np.log(actual / expected)
    return float(np.sum((actual - expected) * log_ratio))

def js_divergence(p, q, eps=1e-8):
    """Return the Jensen-Shannon divergence (natural log) between ``p`` and ``q``.

    Both inputs are converted to float arrays, shifted by ``eps``, and rescaled
    to sum to 1. The result is the average of the two KL terms measured
    against the midpoint ``0.5 * (p + q)``, returned as a Python float.
    """
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p = p / p.sum()
    q = q / q.sum()
    midpoint = 0.5 * (p + q)
    kl_p_mid = np.sum(p * np.log(p / midpoint))
    kl_q_mid = np.sum(q * np.log(q / midpoint))
    return float(0.5 * kl_p_mid + 0.5 * kl_q_mid)

def percentile(values, q):
    """Return the ``q``-th percentile of ``values`` as a Python float.

    ``values`` is converted to a float array and passed to ``np.percentile``
    with its default settings.
    """
    data = np.asarray(values, dtype=float)
    result = np.percentile(data, q)
    return float(result)

# Confirm that the helper definitions in this cell ran to completion.
print("Helper functions ready.")

Exercise 1: training optimizes loss while serving optimizes systems constraints (*)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 5

# Your Solution - Exercise 1
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 6

# Solution
header("Exercise 1: Model Serving and Inference Optimization")
# Position-weighted sum: the i-th entry (1-based) is scaled by i, so the
# fingerprint depends on where each value sits as well as on its size.
values = np.array([10.0, 12.0, 12.0, 15.0])
weights = np.arange(1, values.size + 1)
fingerprint = float((values * weights).sum())
# 10*1 + 12*2 + 12*3 + 15*4 = 130
check_close(fingerprint, 130.0, name="weighted fingerprint")
print("Artifact values:", values.tolist())
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 2: latency–throughput–cost triangle (*)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 8

# Your Solution - Exercise 2
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 9

# Solution
header("Exercise 2: Model Serving and Inference Optimization")
# Four paired readings; delta is the mean of the element-wise differences.
baseline = np.array([0.8, 0.81, 0.79, 0.82])
candidate = np.array([0.82, 0.83, 0.81, 0.84])
delta = float((candidate - baseline).mean())
# A strictly positive mean difference is the acceptance condition here.
check_true(0.0 < delta, "candidate mean is better")
print("Mean improvement:", round(delta, 4))
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 3: batch versus online inference (*)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 11

# Your Solution - Exercise 3
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 12

# Solution
header("Exercise 3: Model Serving and Inference Optimization")
# Uniform reference histogram versus a skewed current histogram (4 bins each).
ref = np.array([0.25] * 4)
cur = np.array([0.1, 0.2, 0.3, 0.4])
score = js_divergence(ref, cur)
# Sanity check only: the divergence must never come out negative.
check_true(0.0 <= score, "JS divergence is nonnegative")
print("JS divergence:", round(score, 6))
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 4: model endpoints as contracts (**)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 14

# Your Solution - Exercise 4
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 15

# Solution
header("Exercise 4: Model Serving and Inference Optimization")
# Utilization is the ratio of the arrival figure to the service figure.
arrivals, service = 30.0, 50.0
rho = arrivals / service
# The check passes only when rho lies in the half-open interval [0, 1).
check_true(rho >= 0.0 and rho < 1.0, "queue utilization is stable")
print("Utilization:", round(rho, 3))
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 5: failure budgets (**)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 17

# Your Solution - Exercise 5
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 18

# Solution
header("Exercise 5: Model Serving and Inference Optimization")
scores = np.array([0.15, 0.66, 0.71, 0.88])
threshold = 0.7
# Boolean mask: True where a score reaches the threshold (inclusive).
blocked = threshold <= scores
# Only 0.71 and 0.88 reach 0.70, so exactly two entries should be flagged.
check_true(int(np.count_nonzero(blocked)) == 2, "two events exceed threshold")
print("Blocked mask:", blocked.tolist())
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 6: service $s(\mathbf{x})$ (**)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 20

# Your Solution - Exercise 6
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 21

# Solution
header("Exercise 6: Model Serving and Inference Optimization")
# Position-weighted sum: the i-th entry (1-based) is scaled by i, so the
# fingerprint depends on where each value sits as well as on its size.
values = np.array([10.0, 12.0, 12.0, 15.0])
weights = np.arange(1, values.size + 1)
fingerprint = float((values * weights).sum())
# 10*1 + 12*2 + 12*3 + 15*4 = 130
check_close(fingerprint, 130.0, name="weighted fingerprint")
print("Artifact values:", values.tolist())
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 7: latency distribution (***)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 23

# Your Solution - Exercise 7
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 24

# Solution
header("Exercise 7: Model Serving and Inference Optimization")
# Four paired readings; delta is the mean of the element-wise differences.
baseline = np.array([0.8, 0.81, 0.79, 0.82])
candidate = np.array([0.82, 0.83, 0.81, 0.84])
delta = float((candidate - baseline).mean())
# A strictly positive mean difference is the acceptance condition here.
check_true(0.0 < delta, "candidate mean is better")
print("Mean improvement:", round(delta, 4))
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 8: throughput $q$ (***)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 26

# Your Solution - Exercise 8
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 27

# Solution
header("Exercise 8: Model Serving and Inference Optimization")
# Uniform reference histogram versus a skewed current histogram (4 bins each).
ref = np.array([0.25] * 4)
cur = np.array([0.1, 0.2, 0.3, 0.4])
score = js_divergence(ref, cur)
# Sanity check only: the divergence must never come out negative.
check_true(0.0 <= score, "JS divergence is nonnegative")
print("JS divergence:", round(score, 6))
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 9: utilization $\rho$ (***)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 29

# Your Solution - Exercise 9
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 30

# Solution
header("Exercise 9: Model Serving and Inference Optimization")
# Utilization is the ratio of the arrival figure to the service figure.
arrivals, service = 30.0, 50.0
rho = arrivals / service
# The check passes only when rho lies in the half-open interval [0, 1).
check_true(rho >= 0.0 and rho < 1.0, "queue utilization is stable")
print("Utilization:", round(rho, 3))
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")

Exercise 10: service-level objectives and service-level agreements (***)

Define the production object, compute a small check, and explain what action the system should take.

Code cell 32

# Your Solution - Exercise 10
# Replace None with your own result; the print below simply echoes it.
answer = None
print("Your answer placeholder:", answer)

Code cell 33

# Solution
header("Exercise 10: Model Serving and Inference Optimization")
scores = np.array([0.15, 0.66, 0.71, 0.88])
threshold = 0.7
# Boolean mask: True where a score reaches the threshold (inclusive).
blocked = threshold <= scores
# Only 0.71 and 0.88 reach 0.70, so exactly two entries should be flagged.
check_true(int(np.count_nonzero(blocked)) == 2, "two events exceed threshold")
print("Blocked mask:", blocked.tolist())
print("\nTakeaway: production ML decisions should be backed by explicit objects, checks, and logged evidence.")