Theory Notebook
Converted from theory.ipynb for web reading.
Serving and Systems Tradeoffs: Theory Notebook
This notebook makes serving tradeoffs concrete: Little's law, queue utilization, latency budgets, batching, KV memory, cost per token, autoscaling, and SLO budgets.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Plotting setup: prefer seaborn theming, and fall back to matplotlib's
# bundled whitegrid style when seaborn is not installed.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared rcParams so every figure in the notebook renders consistently.
_FIGURE_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_FIGURE_DEFAULTS)

np.random.seed(42)  # reproducible synthetic data across reruns
print("Plot setup complete.")
1. Little's law
Code cell 4
# Little's law: average concurrency L = arrival rate (lambda) * time in system (W).
arrival_rate = 20  # requests/sec entering the system
avg_latency = 1.5  # sec each request spends in the system

# Requests in flight at steady state.
concurrency = arrival_rate * avg_latency
print("average concurrency:", concurrency)
2. Utilization and queueing delay intuition
Code cell 6
# M/M/1 intuition: mean response time scales like S / (1 - rho),
# so latency explodes as utilization approaches 1.
utilization = np.linspace(0.1, 0.95, 50)
service_time = 0.2  # sec per request with no contention
mm1_latency = service_time / (1 - utilization)

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(utilization, mm1_latency)
ax.set_title("Queueing delay grows quickly near high utilization")
ax.set_xlabel("utilization rho")
ax.set_ylabel("mean response time proxy")
fig.tight_layout()
plt.show()

# Spot-check one operating point.
print("latency at rho=0.8:", service_time / (1 - 0.8))
3. Latency budget
Code cell 8
# End-to-end latency budget: queue + prefill gives TTFT, then per-token
# decode time dominates, plus fixed post-processing.
queue_ms = 30       # time waiting before the scheduler admits the request
prefill_ms = 250    # prompt processing
output_tokens = 80  # generated tokens
tpot_ms = 25        # time per output token
post_ms = 15        # post-processing (detokenize, filters, response)

ttft = queue_ms + prefill_ms          # time to first token
decode_ms = output_tokens * tpot_ms   # full generation phase
total = ttft + decode_ms + post_ms

print("TTFT ms:", ttft)
print("total latency ms:", total)
4. Batch throughput versus latency
Code cell 10
# Batching tradeoff: throughput saturates (diminishing returns past ~16)
# while queueing wait grows linearly with batch size.
batch = np.arange(1, 65)
throughput = 1000 * (1 - np.exp(-batch / 16))  # tokens/sec proxy, saturating
queue_wait = 2.0 * batch                       # ms proxy, linear in batch

fig, thr_ax = plt.subplots(figsize=(8, 4))
thr_ax.plot(batch, throughput, label="throughput proxy")
thr_ax.set_xlabel("batch size")
thr_ax.set_ylabel("tokens/sec proxy")

wait_ax = thr_ax.twinx()  # second y-axis for the wait curve
wait_ax.plot(batch, queue_wait, color="red", label="queue wait proxy")
wait_ax.set_ylabel("wait ms proxy")

thr_ax.set_title("Batching improves throughput but can add wait")
fig.tight_layout()
plt.show()

print("batch 32 throughput proxy:", throughput[31], "wait proxy:", queue_wait[31])
5. Max concurrency from KV memory
Code cell 12
# Max concurrency bound from KV-cache memory: whatever is left on the GPU
# after model weights and scratch workspace is divided among requests.
gpu_gb = 80         # total device memory
weights_gb = 28     # model weights
workspace_gb = 8    # activations / scratch buffers
available_kv = gpu_gb - weights_gb - workspace_gb
kv_per_request_gb = 0.55  # KV cache footprint per concurrent request

# Bug fix: a bare `available_kv // kv_per_request_gb` yields 79, not 80,
# because 44 / 0.55 lands a hair below 80.0 in binary floating point.
# Add a tiny epsilon before truncating so exact multiples floor correctly.
max_requests = int(available_kv / kv_per_request_gb + 1e-9)

print("available KV GB:", available_kv)
print("max concurrent requests:", max_requests)
6. Cost per million tokens
Code cell 14
# Convert a GPU's hourly price into dollars per one million generated tokens.
gpu_hour_cost = 4.0     # $/hour for the instance
tokens_per_sec = 1200   # sustained decode throughput

# $ per 1M tokens = price per hour / tokens per hour, scaled to 1e6 tokens.
cpm = 1e6 * gpu_hour_cost / (3600 * tokens_per_sec)
print("cost per million tokens:", cpm)
7. Utilization-adjusted cost
Code cell 16
# Idle capacity still costs money: effective cost per million tokens is the
# nominal CPM divided by the fraction of time the GPU does useful work.
nominal_cpm = cpm  # from the previous cell
for utilization in (0.3, 0.5, 0.7, 0.9):
    effective_cpm = nominal_cpm / utilization
    print(f"util={utilization:.1f}: effective CPM={effective_cpm:.3f}")
8. Autoscaling replica count
Code cell 18
# Size the fleet so each replica runs at or below a target utilization,
# leaving headroom for bursts and tail latency.
arrival_rate = 180            # requests/sec across the service
service_rate_per_replica = 40 # requests/sec one replica can handle
target_util = 0.70            # desired per-replica utilization ceiling

usable_rate_per_replica = target_util * service_rate_per_replica
replicas = int(np.ceil(arrival_rate / usable_rate_per_replica))

print("needed replicas:", replicas)
print("resulting utilization:", arrival_rate / (replicas * service_rate_per_replica))
9. Error budget
Code cell 20
# A 99.5% success SLO over a window of 2M requests translates into a
# concrete error budget: the number of requests allowed to fail.
requests = 2_000_000
slo = 0.995

error_budget_fraction = 1 - slo
allowed_bad = int(error_budget_fraction * requests)
print("allowed bad requests per window:", allowed_bad)
10. Tail latency from samples
Code cell 22
# Tail latency from a synthetic lognormal sample (median around 800 ms):
# the p99 sits far above the median, which is why SLOs target percentiles.
rng = np.random.default_rng(9)  # fixed seed for reproducibility
latency = rng.lognormal(mean=np.log(800), sigma=0.45, size=5000)

percentiles = [50, 90, 95, 99]
for p, value in zip(percentiles, np.percentile(latency, percentiles)):
    print(f"p{p} latency ms:", value)
11. Graceful degradation choice
Code cell 24
# Under load, pick the degradation action with the highest quality that
# still fits inside the latency SLO.
actions = [
    {"name": "keep full", "latency": 2500, "quality": 1.00},
    {"name": "smaller model", "latency": 900, "quality": 0.92},
    {"name": "shorter answer", "latency": 1200, "quality": 0.95},
    {"name": "lower retrieval k", "latency": 1500, "quality": 0.97},
]
slo_ms = 1600

# Filter to SLO-compliant actions, then maximize quality among them.
eligible = [choice for choice in actions if choice["latency"] <= slo_ms]
best = max(eligible, key=lambda choice: choice["quality"])
print("best eligible action:", best)
12. Serving trace checklist
Code cell 26
# Minimal per-request fields worth capturing in a serving trace, printed
# as a numbered checklist.
checks = [
    "arrival time, queue time, and scheduler decision",
    "prompt tokens, output tokens, TTFT, and TPOT",
    "prefill time and decode time separately",
    "KV cache memory and batch size over time",
    "model version, quantization format, and fallback path",
    "status, error type, and user-visible latency",
]

for number, item in enumerate(checks, start=1):
    print(f"{number}. {item}")