Theory Notebook
Converted from theory.ipynb for web reading.
Serving and Systems Tradeoffs: Theory Notebook
This notebook makes serving tradeoffs concrete: Little's law, queue utilization, latency budgets, batching, KV memory, cost per token, autoscaling, and SLO budgets.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Plotting setup: prefer seaborn theming, and fall back to matplotlib's
# bundled whitegrid style when seaborn is not installed.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared rcParams so every figure in the notebook renders consistently.
_FIGURE_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_FIGURE_DEFAULTS)

np.random.seed(42)  # reproducible synthetic data across reruns
print("Plot setup complete.")
1. Little's law
Code cell 4
# Little's law: average concurrency L = arrival rate (lambda) * time in system (W).
arrival_rate = 20  # requests/sec entering the system
avg_latency = 1.5  # sec each request spends in the system

# Requests in flight at steady state.
concurrency = arrival_rate * avg_latency
print("average concurrency:", concurrency)
2. Utilization and queueing delay intuition
Code cell 6
# M/M/1 intuition: mean response time scales like S / (1 - rho),
# so latency explodes as utilization approaches 1.
utilization = np.linspace(0.1, 0.95, 50)
service_time = 0.2  # sec per request with no contention
mm1_latency = service_time / (1 - utilization)

fig, ax = plt.subplots(figsize=(7, 4))
ax.plot(utilization, mm1_latency)
ax.set_title("Queueing delay grows quickly near high utilization")
ax.set_xlabel("utilization rho")
ax.set_ylabel("mean response time proxy")
fig.tight_layout()
plt.show()

# Spot-check one operating point.
print("latency at rho=0.8:", service_time / (1 - 0.8))
3. Latency budget
Code cell 8
# End-to-end latency budget: queue + prefill gives TTFT, then per-token
# decode time dominates, plus fixed post-processing.
queue_ms = 30       # time waiting before the scheduler admits the request
prefill_ms = 250    # prompt processing
output_tokens = 80  # generated tokens
tpot_ms = 25        # time per output token
post_ms = 15        # post-processing (detokenize, filters, response)

ttft = queue_ms + prefill_ms          # time to first token
decode_ms = output_tokens * tpot_ms   # full generation phase
total = ttft + decode_ms + post_ms

print("TTFT ms:", ttft)
print("total latency ms:", total)
4. Batch throughput versus latency
Code cell 10
# Batching tradeoff: throughput saturates (diminishing returns past ~16)
# while queueing wait grows linearly with batch size.
batch = np.arange(1, 65)
throughput = 1000 * (1 - np.exp(-batch / 16))  # tokens/sec proxy, saturating
queue_wait = 2.0 * batch                       # ms proxy, linear in batch

fig, thr_ax = plt.subplots(figsize=(8, 4))
thr_ax.plot(batch, throughput, label="throughput proxy")
thr_ax.set_xlabel("batch size")
thr_ax.set_ylabel("tokens/sec proxy")

wait_ax = thr_ax.twinx()  # second y-axis for the wait curve
wait_ax.plot(batch, queue_wait, color="red", label="queue wait proxy")
wait_ax.set_ylabel("wait ms proxy")

thr_ax.set_title("Batching improves throughput but can add wait")
fig.tight_layout()
plt.show()

print("batch 32 throughput proxy:", throughput[31], "wait proxy:", queue_wait[31])
5. Max concurrency from KV memory
Code cell 12
# Max concurrency bound from KV-cache memory: whatever is left on the GPU
# after model weights and scratch workspace is divided among requests.
gpu_gb = 80         # total device memory
weights_gb = 28     # model weights
workspace_gb = 8    # activations / scratch buffers
available_kv = gpu_gb - weights_gb - workspace_gb
kv_per_request_gb = 0.55  # KV cache footprint per concurrent request

# Bug fix: a bare `available_kv // kv_per_request_gb` yields 79, not 80,
# because 44 / 0.55 lands a hair below 80.0 in binary floating point.
# Add a tiny epsilon before truncating so exact multiples floor correctly.
max_requests = int(available_kv / kv_per_request_gb + 1e-9)

print("available KV GB:", available_kv)
print("max concurrent requests:", max_requests)
6. Cost per million tokens
Code cell 14
# Convert a GPU's hourly price into dollars per one million generated tokens.
gpu_hour_cost = 4.0     # $/hour for the instance
tokens_per_sec = 1200   # sustained decode throughput

# $ per 1M tokens = price per hour / tokens per hour, scaled to 1e6 tokens.
cpm = 1e6 * gpu_hour_cost / (3600 * tokens_per_sec)
print("cost per million tokens:", cpm)
7. Utilization-adjusted cost
Code cell 16
# Idle capacity still costs money: effective cost per million tokens is the
# nominal CPM divided by the fraction of time the GPU does useful work.
nominal_cpm = cpm  # from the previous cell
for utilization in (0.3, 0.5, 0.7, 0.9):
    effective_cpm = nominal_cpm / utilization
    print(f"util={utilization:.1f}: effective CPM={effective_cpm:.3f}")
8. Autoscaling replica count
Code cell 18
# Size the fleet so each replica runs at or below a target utilization,
# leaving headroom for bursts and tail latency.
arrival_rate = 180            # requests/sec across the service
service_rate_per_replica = 40 # requests/sec one replica can handle
target_util = 0.70            # desired per-replica utilization ceiling

usable_rate_per_replica = target_util * service_rate_per_replica
replicas = int(np.ceil(arrival_rate / usable_rate_per_replica))

print("needed replicas:", replicas)
print("resulting utilization:", arrival_rate / (replicas * service_rate_per_replica))
9. Error budget
Code cell 20
# A 99.5% success SLO over a window of 2M requests translates into a
# concrete error budget: the number of requests allowed to fail.
requests = 2_000_000
slo = 0.995

error_budget_fraction = 1 - slo
allowed_bad = int(error_budget_fraction * requests)
print("allowed bad requests per window:", allowed_bad)
10. Tail latency from samples
Code cell 22
# Tail latency from a synthetic lognormal sample (median around 800 ms):
# the p99 sits far above the median, which is why SLOs target percentiles.
rng = np.random.default_rng(9)  # fixed seed for reproducibility
latency = rng.lognormal(mean=np.log(800), sigma=0.45, size=5000)

percentiles = [50, 90, 95, 99]
for p, value in zip(percentiles, np.percentile(latency, percentiles)):
    print(f"p{p} latency ms:", value)
11. Graceful degradation choice
Code cell 24
# Under load, pick the degradation action with the highest quality that
# still fits inside the latency SLO.
actions = [
    {"name": "keep full", "latency": 2500, "quality": 1.00},
    {"name": "smaller model", "latency": 900, "quality": 0.92},
    {"name": "shorter answer", "latency": 1200, "quality": 0.95},
    {"name": "lower retrieval k", "latency": 1500, "quality": 0.97},
]
slo_ms = 1600

# Filter to SLO-compliant actions, then maximize quality among them.
eligible = [choice for choice in actions if choice["latency"] <= slo_ms]
best = max(eligible, key=lambda choice: choice["quality"])
print("best eligible action:", best)
12. Serving trace checklist
Code cell 26
# Minimal per-request fields worth capturing in a serving trace, printed
# as a numbered checklist.
checks = [
    "arrival time, queue time, and scheduler decision",
    "prompt tokens, output tokens, TTFT, and TPOT",
    "prefill time and decode time separately",
    "KV cache memory and batch size over time",
    "model version, quantization format, and fallback path",
    "status, error type, and user-visible latency",
]

for number, item in enumerate(checks, start=1):
    print(f"{number}. {item}")