Exercises Notebook
Converted from exercises.ipynb for web reading.
Cross-Entropy — Exercises
10 graded exercises covering the full section arc from information-theory identities to LLM-flavored usage.
- Exercises 1-3: core mechanics and derivations
- Exercises 4-6: gradients, stability, and perplexity
- Exercises 7-10: soft targets, weighting, and modern practice
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as la
np.random.seed(42)
np.set_printoptions(precision=6, suppress=True)
def header(title):
    # Pretty-print a section header for the exercise output.
    print('\n' + '=' * len(title))
    print(title)
    print('=' * len(title))

def check_close(name, got, expected, tol=1e-8):
    # Compare a numeric result against its expected value and report PASS/FAIL.
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} — {name}")
    if not ok:
        print('  expected:', expected)
        print('  got     :', got)
    return ok

def check_true(name, cond):
    # Report PASS/FAIL for a boolean condition.
    print(f"{'PASS' if cond else 'FAIL'} — {name}")
    return cond
def entropy(p):
    # Shannon entropy H(p) in nats; zero-probability entries contribute nothing.
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return float(-np.sum(p * np.log(p)))

def cross_entropy(p, q):
    # Cross-entropy H(p, q) in nats; infinite if q puts zero mass where p does not.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    mask = p > 0
    if np.any(mask & (q <= 0)):
        return np.inf
    return float(-np.sum(p[mask] * np.log(q[mask])))

def kl_divergence(p, q):
    # KL divergence D_KL(p || q) in nats.
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    mask = p > 0
    if np.any(mask & (q <= 0)):
        return np.inf
    return float(np.sum(p[mask] * np.log(p[mask] / q[mask])))

def softmax(z):
    # Numerically stable softmax: shift by the maximum logit before exponentiating.
    z = np.asarray(z, dtype=float)
    z = z - np.max(z)
    exps = np.exp(z)
    return exps / np.sum(exps)

def log_softmax(z):
    # Numerically stable log-softmax via the log-sum-exp trick.
    z = np.asarray(z, dtype=float)
    m = np.max(z)
    return z - m - np.log(np.sum(np.exp(z - m)))

def binary_cross_entropy_from_prob(y, p_hat):
    # Bernoulli negative log-likelihood, with clipping to avoid log(0).
    p_hat = np.clip(float(p_hat), 1e-12, 1 - 1e-12)
    return float(-(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat)))

def one_hot(k, K):
    # Length-K one-hot vector with a 1 at index k.
    y = np.zeros(K, dtype=float)
    y[k] = 1.0
    return y
print('Setup complete.')
Exercise 1: Verify the Core Identity ★
Let
[ p=(0.7,0.2,0.1), \qquad q=(0.5,0.3,0.2). ]
Compute:
- the entropy ( H(p) )
- the cross-entropy ( H(p,q) )
- the KL divergence ( D_{\mathrm{KL}}(p\|q) )
Then verify the identity
[ H(p,q)=H(p)+D_{\mathrm{KL}}(p\|q). ]
Code cell 5
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 6
# Solution
# Exercise 1: Solution
p = np.array([0.7, 0.2, 0.1])
q = np.array([0.5, 0.3, 0.2])
Hp = entropy(p)
Hpq = cross_entropy(p, q)
Dkl = kl_divergence(p, q)
header('Exercise 1: Verify the Core Identity')
print(f'H(p) = {Hp:.6f}')
print(f'H(p, q) = {Hpq:.6f}')
print(f'D_KL(p || q) = {Dkl:.6f}')
check_close('identity', Hpq, Hp + Dkl)
print('\nTakeaway: cross-entropy is entropy plus a mismatch penalty measured by KL divergence.')
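For reference, the identity itself is a one-line consequence of the definitions: splitting the logarithm of the ratio gives
[ H(p,q) = -\sum_i p_i \log q_i = -\sum_i p_i \log p_i + \sum_i p_i \log \frac{p_i}{q_i} = H(p) + D_{\mathrm{KL}}(p\|q). ]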
Exercise 2: One-Hot Targets Become Log-Loss ★
Let the true class be index ( c=2 ) (zero-based) in a 4-class problem, and let the model predict
[ \hat{\mathbf{p}}=(0.10, 0.20, 0.60, 0.10). ]
Show that categorical cross-entropy with a one-hot target reduces to ( -\log \hat{p}_c ), and compute the numerical value.
Code cell 8
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 9
# Solution
# Exercise 2: Solution
p_hat = np.array([0.10, 0.20, 0.60, 0.10])
y = one_hot(2, 4)
ce = -np.sum(y * np.log(p_hat))
manual = -np.log(p_hat[2])
header('Exercise 2: One-Hot Targets Become Log-Loss')
print('target =', y)
print('prediction =', p_hat)
print(f'Categorical CE = {ce:.6f}')
print(f'-log p_hat[c] = {manual:.6f}')
check_close('one-hot reduction', ce, manual)
print('\nTakeaway: one-hot categorical cross-entropy is exactly the negative log-probability of the true class.')
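Written out, the reduction is immediate: the one-hot target has ( y_k = 1 ) only at the true class ( c ), so
[ H(\mathbf{y}, \hat{\mathbf{p}}) = -\sum_{k} y_k \log \hat{p}_k = -\log \hat{p}_c. ]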
Exercise 3: Derive Binary Cross-Entropy ★
Starting from the Bernoulli likelihood
[ q(y)=\hat{p}^{\,y}(1-\hat{p})^{1-y}, \qquad y \in \{0,1\}, ]
derive the binary cross-entropy formula by taking the negative logarithm. Then evaluate it for ( y=1 ) and ( \hat{p}=0.8 ).
Code cell 11
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 12
# Solution
# Exercise 3: Solution
y = 1
p_hat = 0.8
bce = -(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat))
manual = -np.log(0.8)
header('Exercise 3: Derive Binary Cross-Entropy')
print('Bernoulli NLL = -[y log(p_hat) + (1-y) log(1-p_hat)]')
print(f'For y=1 and p_hat=0.8: {bce:.6f}')
check_close('reduces to -log(p_hat) when y=1', bce, manual)
print('\nTakeaway: BCE is nothing more than the Bernoulli negative log-likelihood.')
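For reference, the derivation requested in the exercise is a single application of log rules to the Bernoulli likelihood:
[ -\log q(y) = -\log\left[\hat{p}^{\,y}(1-\hat{p})^{1-y}\right] = -\left[y\log\hat{p} + (1-y)\log(1-\hat{p})\right], ]
which is exactly the binary cross-entropy evaluated numerically above.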
Exercise 4: Verify the Softmax-Cross-Entropy Gradient ★★
For logits
[ \mathbf{z}=(0.8,-0.4,1.2) ]
with true class index ( c=2 ) (zero-based), verify numerically that the gradient of the loss with respect to the logits is
[ \hat{\mathbf{p}}-\mathbf{y}. ]
Code cell 14
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 15
# Solution
# Exercise 4: Solution
z = np.array([0.8, -0.4, 1.2])
y = one_hot(2, 3)
p_hat = softmax(z)
closed = p_hat - y
def loss_from_logits(z_vec):
    # Cross-entropy loss for true class index 2, computed from the stable log-softmax.
    return -log_softmax(z_vec)[2]

# Central finite differences around each logit.
h = 1e-6
fd = np.zeros_like(z)
for i in range(len(z)):
    zp = z.copy(); zp[i] += h
    zm = z.copy(); zm[i] -= h
    fd[i] = (loss_from_logits(zp) - loss_from_logits(zm)) / (2 * h)
header('Exercise 4: Verify the Softmax-Cross-Entropy Gradient')
print('closed-form gradient:', closed)
print('finite-diff gradient:', fd)
check_close('gradient formula', closed, fd, tol=1e-6)
print('\nTakeaway: the logit gradient for softmax cross-entropy is predicted minus target.')
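The closed form checked above can also be derived directly: writing the loss as ( L(\mathbf{z}) = -\log \mathrm{softmax}(\mathbf{z})_c = \log\sum_k e^{z_k} - z_c ) and differentiating with respect to ( z_j ) gives
[ \frac{\partial L}{\partial z_j} = \frac{e^{z_j}}{\sum_k e^{z_k}} - \mathbf{1}[j=c] = \hat{p}_j - y_j. ]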
Exercise 5: Perplexity From Token Losses ★★
Suppose a language model assigns the following probabilities to the correct next tokens:
[ (0.30, 0.70, 0.40, 0.10, 0.55). ]
Compute:
- the average token cross-entropy in nats
- the perplexity
Code cell 17
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 18
# Solution
# Exercise 5: Solution
probs = np.array([0.30, 0.70, 0.40, 0.10, 0.55])
avg_ce = -np.mean(np.log(probs))
ppl = np.exp(avg_ce)
header('Exercise 5: Perplexity From Token Losses')
print(f'Average token cross-entropy = {avg_ce:.6f} nats/token')
print(f'Perplexity = {ppl:.6f}')
check_close('PPL = exp(avg CE)', ppl, np.exp(avg_ce))
print('\nTakeaway: perplexity is the exponentiated average token-level cross-entropy for autoregressive models.')
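As a small extension beyond the graded exercise, the same average can be reported in bits rather than nats; the base-2 variant below is a minimal sketch (not part of the original exercise) reusing the probs array from the solution above:
avg_ce_bits = -np.mean(np.log2(probs))   # same average loss, measured in bits/token
ppl_from_bits = 2.0 ** avg_ce_bits       # perplexity computed from the bit-valued loss
print(f'Average token cross-entropy = {avg_ce_bits:.6f} bits/token')
check_close('2**bits matches exp(nats)', ppl_from_bits, ppl)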
Exercise 6: Stable Log-Softmax ★★
Let
[ \mathbf{z}=(1000,1001,999). ]
Show why the naive softmax-then-log computation is numerically unstable, and compute the stable log-softmax instead.
Code cell 20
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 21
# Solution
# Exercise 6: Solution
z = np.array([1000.0, 1001.0, 999.0])
naive = np.exp(z) / np.sum(np.exp(z))
stable = log_softmax(z)
header('Exercise 6: Stable Log-Softmax')
print('Naive softmax:', naive)
print('Stable log-softmax:', stable)
check_true('naive softmax is numerically unstable', not np.all(np.isfinite(naive)))
check_close('stable probabilities sum to 1 after exponentiation', np.sum(np.exp(stable)), 1.0, tol=1e-10)
print('\nTakeaway: subtracting the maximum logit before exponentiating is essential for stable CE computation.')
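One consequence worth spelling out, as an illustrative sketch rather than part of the graded exercise (it reuses z and naive from the solution cell, and the class index 1 is an arbitrary choice): the loss of a chosen class can be read directly off the stable log-softmax, while routing through the naive probabilities yields NaN.
c_true = 1                              # arbitrary class index, for illustration only
stable_ce = -log_softmax(z)[c_true]     # finite, well-defined loss from the stable log-softmax
naive_ce = -np.log(naive[c_true])       # NaN: the naive softmax already overflowed
print('stable CE:', stable_ce)
print('naive CE :', naive_ce)
check_true('stable CE is finite, naive CE is not', np.isfinite(stable_ce) and not np.isfinite(naive_ce))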
Exercise 7: Label Smoothing as a Target Change ★★★
In a 5-class problem with true class index ( c=0 ), construct the label-smoothed target for ( \varepsilon=0.1 ), compare its entropy to the hard one-hot target, and explain what supervision information changed.
Code cell 23
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 24
# Solution
# Exercise 7: Solution
K = 5
eps = 0.1
hard = one_hot(0, K)
smoothed = (1 - eps) * hard + eps / K * np.ones(K)
header('Exercise 7: Label Smoothing as a Target Change')
print('hard target =', hard)
print('smoothed target=', smoothed)
print(f'Entropy(hard) = {entropy(hard):.6f}')
print(f'Entropy(smoothed) = {entropy(smoothed):.6f}')
check_true('smoothed target has higher entropy', entropy(smoothed) > entropy(hard))
print('\nTakeaway: label smoothing modifies the target distribution by injecting a controlled amount of uncertainty.')
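A related consequence, shown as a brief sketch beyond the graded exercise: against a smoothed target the cross-entropy can never reach zero; by Gibbs' inequality its minimum, attained when the prediction equals the smoothed target itself, is the smoothed target's own entropy.
loss_floor = cross_entropy(smoothed, smoothed)   # best achievable loss against the smoothed target
print(f'Minimum achievable CE against the smoothed target = {loss_floor:.6f}')
check_close('loss floor equals the smoothed target entropy', loss_floor, entropy(smoothed))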
Exercise 8: Masked Cross-Entropy Versus Naive Averaging ★★★
Suppose token losses are
[ (0.4, 0.9, 1.2, 0.3, 2.0, 0.6) ]
and the mask is
[ (1,1,0,1,0,1). ]
Compute the correct masked average and compare it to naive averaging after zeroing out masked positions.
Code cell 26
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 27
# Solution
# Exercise 8: Solution
losses = np.array([0.4, 0.9, 1.2, 0.3, 2.0, 0.6])
mask = np.array([1, 1, 0, 1, 0, 1], dtype=float)
correct = np.sum(losses * mask) / np.sum(mask)
naive = np.mean(losses * mask)
header('Exercise 8: Masked Cross-Entropy Versus Naive Averaging')
print(f'Correct masked average = {correct:.6f}')
print(f'Naive zeroed average = {naive:.6f}')
check_true('the two estimators differ', abs(correct - naive) > 1e-8)
print('\nTakeaway: masked cross-entropy must normalize by the number of valid tokens, not by the full padded length.')
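To see why the naive estimator is not just different but padding-dependent, here is a small illustrative sketch (the three extra padding positions are invented for the illustration): appending more padded tokens leaves the masked average unchanged while pulling the naive average further down.
extra_pad = 3                                                  # hypothetical extra padding positions
losses_padded = np.concatenate([losses, np.zeros(extra_pad)])  # padded positions contribute zero loss
mask_padded = np.concatenate([mask, np.zeros(extra_pad)])
correct_padded = np.sum(losses_padded * mask_padded) / np.sum(mask_padded)
naive_padded = np.mean(losses_padded * mask_padded)
print(f'Masked average with extra padding = {correct_padded:.6f}')
print(f'Naive average with extra padding  = {naive_padded:.6f}')
check_close('masked average is unchanged by extra padding', correct_padded, correct)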
Exercise 9: Label Smoothing Changes the Target Distribution
Compute categorical cross-entropy with a one-hot target and with label smoothing. Interpret the change in gradient pressure.
Code cell 29
# Your Solution
print("Compare one-hot and smoothed cross-entropy.")
Code cell 30
# Solution
header("Exercise 9: Label Smoothing")
logits = np.array([3.0, 0.5, -1.0, -2.0])
q = softmax(logits)
y_hard = np.array([1.0, 0.0, 0.0, 0.0])  # hard one-hot target for class 0 (avoids shadowing the one_hot() helper)
eps = 0.1
smooth = (1 - eps) * y_hard + eps / len(y_hard)
ce_one = cross_entropy(y_hard, q)
ce_smooth = cross_entropy(smooth, q)
print("model q:", np.round(q, 4))
print("CE one-hot:", round(ce_one, 6))
print("CE smoothed:", round(ce_smooth, 6))
check_true("smoothed target is still normalized", np.isclose(smooth.sum(), 1.0))
print("Takeaway: label smoothing keeps the model from assigning all probability mass to one class.")
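To make the gradient-pressure interpretation concrete, here is a minimal sketch using the predicted-minus-target gradient verified in Exercise 4 (the hard target is rebuilt locally so the sketch stands on its own): with a one-hot target the true-class logit keeps being pushed until its probability reaches 1, whereas with the smoothed target the gradient vanishes once that probability reaches 1 - eps + eps/K.
hard_target = np.zeros_like(q); hard_target[0] = 1.0   # one-hot target for class 0
grad_hard = q - hard_target                            # logit gradient with the hard target
grad_smooth = q - smooth                               # logit gradient with the smoothed target
print('gradient with one-hot target :', np.round(grad_hard, 4))
print('gradient with smoothed target:', np.round(grad_smooth, 4))
check_true('smoothing reduces pressure on the true-class logit', abs(grad_smooth[0]) < abs(grad_hard[0]))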
Exercise 10: Token Cross-Entropy and Perplexity
Convert average next-token negative log-likelihood into perplexity and verify the exponential relationship.
Code cell 32
# Your Solution
print("Compute perplexity from token negative log-likelihoods.")
Code cell 33
# Solution
header("Exercise 10: Perplexity")
nll = np.array([1.2, 0.7, 2.0, 1.5, 0.9])
ce = float(np.mean(nll))
ppl = float(np.exp(ce))
print("cross-entropy nats:", round(ce, 6))
print("perplexity:", round(ppl, 6))
check_close("perplexity equals exp(cross entropy)", ppl, np.exp(ce))
print("Takeaway: perplexity is the effective branching factor implied by token-level cross-entropy.")
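The branching-factor reading can be sanity-checked with one more small sketch (the vocabulary size of 50 is a made-up value for the check): a model that is uniformly uncertain over V equally likely tokens has per-token cross-entropy log V, so its perplexity is exactly V.
V = 50                                   # hypothetical vocabulary size for the check
uniform_nll = np.full(8, np.log(V))      # every correct token received probability 1/V
uniform_ppl = float(np.exp(np.mean(uniform_nll)))
print('Perplexity of the uniform model:', round(uniform_ppl, 6))
check_close('uniform-over-V model has perplexity V', uniform_ppl, float(V))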