Exercises Notebook
Converted from exercises.ipynb for web reading.
Language Model Probability Math: Exercises
There are ten exercises. Each one has a starter cell and a full solution cell. Work through the starter first, then run the solution to check the probability bookkeeping.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Exercise 1: Chain-rule sequence probability
Compute the sequence probability P(w1 w2 w3) = P(w1) P(w2 | w1) P(w3 | w1, w2) from three conditional probabilities.
Code cell 4
# Your Solution
probs = [0.8, 0.5, 0.25]
sequence_probability = None
print("Starter: multiply the conditional probabilities in probs.")
Code cell 5
# Solution
probs = np.array([0.8, 0.5, 0.25])
sequence_probability = probs.prod()
log_probability = np.log(probs).sum()
print("sequence probability:", sequence_probability)
print("log probability:", log_probability)
assert np.isclose(np.exp(log_probability), sequence_probability)
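For longer sequences the direct product underflows, which is why the solution also tracks the log probability. A quick illustration (not part of the exercise):

# The product of many per-token probabilities underflows float64,
# while the log-probability sum stays finite and comparable.
long_probs = np.full(500, 0.1)                      # 500 tokens, each with probability 0.1
print("direct product:", long_probs.prod())         # underflows to 0.0
print("log-prob sum:", np.log(long_probs).sum())    # about -1151.3, still finite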
Exercise 2: Stable softmax
Implement softmax on logits without overflow.
Code cell 7
# Your Solution
logits = np.array([1000.0, 999.0, 997.0])
print("Starter: subtract logits.max() before exponentiating.")
Code cell 8
# Solution
logits = np.array([1000.0, 999.0, 997.0])
shifted = logits - logits.max()
probs = np.exp(shifted) / np.exp(shifted).sum()
print("probs:", np.round(probs, 6))
print("sum:", probs.sum())
assert np.isclose(probs.sum(), 1.0)
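For contrast, here is what the naive softmax does on the same logits; a minimal illustration, with the overflow warnings suppressed:

# Without the max shift, exp(1000) overflows to inf and the ratio becomes nan.
with np.errstate(over="ignore", invalid="ignore"):
    naive = np.exp(logits) / np.exp(logits).sum()
print("naive softmax:", naive)   # [nan nan nan]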
Exercise 3: Cross-entropy gradient
Show that the gradient of one-hot cross-entropy with respect to the logits is p - y, the softmax probabilities minus the one-hot target.
Code cell 10
# Your Solution
logits = np.array([0.2, 1.1, -0.4])
target = 1
print("Starter: compute p=softmax(logits), then p-one_hot(target).")
Code cell 11
# Solution
logits = np.array([0.2, 1.1, -0.4])
target = 1
shifted = logits - logits.max()
p = np.exp(shifted) / np.exp(shifted).sum()
y = np.eye(3)[target]
grad = p - y
print("p:", np.round(p, 5))
print("grad:", np.round(grad, 5))
assert np.isclose(grad.sum(), 0.0)
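A finite-difference check of the same result, reusing logits, target, and grad from the cell above (a sketch, with the loss written as -log softmax(logits)[target]):

def nll(z, t):
    s = z - z.max()                      # stable log-softmax
    return -(s[t] - np.log(np.exp(s).sum()))

eps = 1e-6
numeric = np.array([
    (nll(logits + eps * np.eye(3)[i], target)
     - nll(logits - eps * np.eye(3)[i], target)) / (2 * eps)
    for i in range(3)
])
print("numeric grad:", np.round(numeric, 5))
assert np.allclose(numeric, grad, atol=1e-5)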
Exercise 4: Masked mean loss
Average token losses only over non-padding tokens.
Code cell 13
# Your Solution
losses = np.array([[1.0, 0.8, 0.0], [1.4, 0.0, 0.0]])
mask = np.array([[1, 1, 0], [1, 0, 0]])
print("Starter: use (losses * mask).sum() / mask.sum().")
Code cell 14
# Solution
losses = np.array([[1.0, 0.8, 0.0], [1.4, 0.0, 0.0]])
mask = np.array([[1, 1, 0], [1, 0, 0]])
masked = (losses * mask).sum() / mask.sum()
print("masked mean:", masked)
assert np.isclose(masked, (1.0 + 0.8 + 1.4) / 3)
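For comparison, the unmasked mean divides by all six positions and is biased low by the padding zeros:

# Averaging over every position dilutes the loss with padded zeros.
print("unmasked mean:", losses.mean())   # 0.5333..., biased low
print("masked mean:", masked)            # 1.0666...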
Exercise 5: Perplexity and bits
Convert average NLL in nats into perplexity and bits per token.
Code cell 16
# Your Solution
avg_nll = 1.75
print("Starter: ppl=exp(avg_nll), bits=avg_nll/log(2).")
Code cell 17
# Solution
avg_nll = 1.75
ppl = np.exp(avg_nll)
bits = avg_nll / np.log(2)
print("perplexity:", round(ppl, 4))
print("bits per token:", round(bits, 4))
assert ppl > 1
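As a sanity check, perplexity and bits per token are two views of the same number, since exp(avg_nll) equals 2 ** (avg_nll / log 2):

# The two unit conversions must agree.
assert np.isclose(ppl, 2 ** bits)
print("2 ** bits:", round(2 ** bits, 4))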
Exercise 6: Length-normalized answer scoring
Compare two answers using total log probability and mean log probability.
Code cell 19
# Your Solution
a = np.array([-0.3, -0.4, -0.3, -0.4])
b = np.array([-0.2, -0.9])
print("Starter: compute sum and mean for both candidates.")
Code cell 20
# Solution
a = np.array([-0.3, -0.4, -0.3, -0.4])
b = np.array([-0.2, -0.9])
print("A total, mean:", a.sum(), a.mean())
print("B total, mean:", b.sum(), b.mean())
print("Best total:", "B" if b.sum() > a.sum() else "A")
print("Best per token:", "B" if b.mean() > a.mean() else "A")
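The two criteria disagree here because the total penalizes length: B wins on total log probability (-1.1 vs -1.4), while A wins per token (-0.35 vs -0.55). A minimal length-normalized pick:

# Rank by mean log probability to remove the length penalty.
best = max([("A", a), ("B", b)], key=lambda kv: kv[1].mean())[0]
print("length-normalized pick:", best)   # A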
Exercise 7: Top-k filtering
Keep only the two highest probability tokens and renormalize.
Code cell 22
# Your Solution
probs = np.array([0.45, 0.25, 0.15, 0.10, 0.05])
print("Starter: zero out all but the two largest entries, then divide by the remaining mass.")
Code cell 23
# Solution
probs = np.array([0.45, 0.25, 0.15, 0.10, 0.05])
keep = np.argsort(probs)[-2:]
filtered = np.zeros_like(probs)
filtered[keep] = probs[keep]
filtered = filtered / filtered.sum()
print("filtered:", filtered)
assert np.isclose(filtered.sum(), 1.0)
assert np.count_nonzero(filtered) == 2
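To actually decode from the truncated distribution, one draw per step might look like this (an illustration, using np.random.choice):

# Sample token indices with the renormalized top-k probabilities.
samples = np.random.choice(len(filtered), size=10, p=filtered)
print("sampled token ids:", samples)     # only ids 0 and 1 can appear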
Exercise 8: Expected calibration error
Compute ECE for two confidence bins.
Code cell 25
# Your Solution
confidence = np.array([0.9, 0.8, 0.7, 0.4])
correct = np.array([1, 0, 1, 0])
print("Starter: make bins (0,0.5] and (0.5,1.0], then weight confidence-accuracy gaps.")
Code cell 26
# Solution
confidence = np.array([0.9, 0.8, 0.7, 0.4])
correct = np.array([1, 0, 1, 0])
ece = 0.0
for lo, hi in [(0.0, 0.5), (0.5, 1.0)]:
    m = (confidence > lo) & (confidence <= hi)
    acc = correct[m].mean()
    conf = confidence[m].mean()
    ece += m.mean() * abs(acc - conf)
    print((lo, hi), "acc", acc, "conf", conf)
print("ECE:", ece)
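The same computation generalizes to any number of equal-width bins; a sketch (skipping empty bins, a case the two-bin version above never hits):

def ece_equal_width(confidence, correct, n_bins=10):
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    total = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        m = (confidence > lo) & (confidence <= hi)
        if m.any():                       # empty bins contribute nothing
            total += m.mean() * abs(correct[m].mean() - confidence[m].mean())
    return total

print("ECE, 2 bins:", ece_equal_width(confidence, correct, n_bins=2))   # 0.2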
Exercise 9: Conditional answer likelihood
Given prompt and answer token log probabilities, compute answer-only NLL.
Code cell 28
# Your Solution
prompt_logp = np.array([-0.1, -0.1, -0.2])
answer_logp = np.array([-0.5, -0.7])
print("Starter: answer NLL is -answer_logp.mean(), not negative full-string mean.")
Code cell 29
# Solution
prompt_logp = np.array([-0.1, -0.1, -0.2])
answer_logp = np.array([-0.5, -0.7])
answer_nll = -answer_logp.mean()
full_nll = -np.r_[prompt_logp, answer_logp].mean()
print("answer-only NLL:", answer_nll)
print("full-string NLL:", full_nll)
assert answer_nll != full_nll
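The same answer-only NLL can be written as a masked mean over the full string, mirroring Exercise 4:

# Zero out prompt positions, then average over answer positions only.
full_logp = np.r_[prompt_logp, answer_logp]
answer_mask = np.r_[np.zeros_like(prompt_logp), np.ones_like(answer_logp)]
masked_nll = -(full_logp * answer_mask).sum() / answer_mask.sum()
assert np.isclose(masked_nll, answer_nll)
print("masked answer NLL:", masked_nll)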
Exercise 10: Debug checklist
Write three implementation checks for an LM probability pipeline.
Code cell 31
# Your Solution
print("Starter: name checks for shift, mask, and normalization.")
Code cell 32
# Solution
checks = [
    "labels are shifted so position i predicts token i+1",
    "padding and prompt masks are applied before averaging",
    "softmax or log-softmax is taken across the vocabulary axis",
]
for check in checks:
    print("-", check)
assert len(checks) == 3
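The first check is easy to make executable; a minimal sketch with made-up token ids:

# Inputs are tokens[:-1], labels are tokens[1:]: position i predicts token i+1.
tokens = np.array([5, 9, 2, 7])          # hypothetical token ids
inputs, labels = tokens[:-1], tokens[1:]
print("inputs:", inputs, "labels:", labels)
assert np.array_equal(inputs[1:], labels[:-1])   # shifted views overlap by one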
Closing Reflection
The same small probability checks appear in real training code: stable log-softmax, target shifting, padding masks, answer-only scoring, and explicit decoding conventions.