Converted from exercises.ipynb for web reading.

Cross-Entropy — Exercises

10 graded exercises covering the full section arc from information-theory identities to LLM-flavored usage.

  • Exercises 1-3: core mechanics and derivations
  • Exercises 4-6: gradients, stability, and perplexity
  • Exercises 7-10: soft targets, masking, and modern LLM practice

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la

np.random.seed(42)
np.set_printoptions(precision=6, suppress=True)


def header(title):
    print('\n' + '=' * len(title))
    print(title)
    print('=' * len(title))


def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'}{name}")
    if not ok:
        print('  expected:', expected)
        print('  got     :', got)
    return ok


def check_true(name, cond):
    print(f"{'PASS' if cond else 'FAIL'}{name}")
    return cond


def entropy(p):
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return float(-np.sum(p * np.log(p)))


def cross_entropy(p, q):
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    mask = p > 0
    if np.any((p > 0) & (q <= 0)):
        return np.inf
    return float(-np.sum(p[mask] * np.log(q[mask])))


def kl_divergence(p, q):
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    mask = p > 0
    if np.any((p > 0) & (q <= 0)):
        return np.inf
    return float(np.sum(p[mask] * np.log(p[mask] / q[mask])))


def softmax(z):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z)  # shift by the max logit for numerical stability
    exps = np.exp(z)
    return exps / np.sum(exps)


def log_softmax(z):
    z = np.asarray(z, dtype=float)
    m = np.max(z)  # subtract the max before exponentiating to avoid overflow
    return z - m - np.log(np.sum(np.exp(z - m)))


def binary_cross_entropy_from_prob(y, p_hat):
    p_hat = np.clip(float(p_hat), 1e-12, 1 - 1e-12)
    return float(-(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat)))


def one_hot(k, K):
    y = np.zeros(K, dtype=float)
    y[k] = 1.0
    return y

print('Setup complete.')

Exercise 1: Verify the Core Identity ★

Let

$$ p=(0.7,0.2,0.1), \qquad q=(0.5,0.3,0.2). $$

Compute:

  1. $H(p)$
  2. $H(p,q)$
  3. $D_{\mathrm{KL}}(p\|q)$

Then verify the identity

$$ H(p,q)=H(p)+D_{\mathrm{KL}}(p\|q). $$
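
One line of algebra shows why the identity must hold, before checking it numerically:

$$ H(p,q) = -\sum_i p_i \log q_i = -\sum_i p_i \log p_i + \sum_i p_i \log\frac{p_i}{q_i} = H(p) + D_{\mathrm{KL}}(p\|q). $$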

Code cell 5

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 6

# Solution
# Exercise 1: Solution
p = np.array([0.7, 0.2, 0.1])
q = np.array([0.5, 0.3, 0.2])
Hp = entropy(p)
Hpq = cross_entropy(p, q)
Dkl = kl_divergence(p, q)

header('Exercise 1: Verify the Core Identity')
print(f'H(p)         = {Hp:.6f}')
print(f'H(p, q)      = {Hpq:.6f}')
print(f'D_KL(p || q) = {Dkl:.6f}')
check_close('identity', Hpq, Hp + Dkl)
print('\nTakeaway: cross-entropy is entropy plus a mismatch penalty measured by KL divergence.')

Exercise 2: One-Hot Targets Become Log-Loss ★

Let the true class be $c=2$ in a 4-class problem, and let the model predict

$$ \hat{\mathbf{p}}=(0.10, 0.20, 0.60, 0.10). $$

Show that categorical cross-entropy with a one-hot target reduces to $-\log \hat{p}_c$, and compute the numerical value.

Code cell 8

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 9

# Solution
# Exercise 2: Solution
p_hat = np.array([0.10, 0.20, 0.60, 0.10])
y = one_hot(2, 4)
ce = -np.sum(y * np.log(p_hat))
manual = -np.log(p_hat[2])

header('Exercise 2: One-Hot Targets Become Log-Loss')
print('target =', y)
print('prediction =', p_hat)
print(f'Categorical CE   = {ce:.6f}')
print(f'-log p_hat[c]    = {manual:.6f}')
check_close('one-hot reduction', ce, manual)
print('\nTakeaway: one-hot categorical cross-entropy is exactly the negative log-probability of the true class.')
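
In symbols, the reduction is immediate: $y_k = 1$ only for $k = c$, so

$$ -\sum_{k} y_k \log \hat{p}_k = -\log \hat{p}_c = -\log 0.60 \approx 0.5108. $$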

Exercise 3: Derive Binary Cross-Entropy ★

Starting from the Bernoulli likelihood

$$ q(y)=\hat{p}^{\,y}(1-\hat{p})^{1-y}, \qquad y \in \{0,1\}, $$

derive the binary cross-entropy formula by taking the negative logarithm. Then evaluate it for $y=1$ and $\hat{p}=0.8$.

Code cell 11

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 12

# Solution
# Exercise 3: Solution
y = 1
p_hat = 0.8
bce = -(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat))
manual = -np.log(0.8)

header('Exercise 3: Derive Binary Cross-Entropy')
print('Bernoulli NLL = -[y log(p_hat) + (1-y) log(1-p_hat)]')
print(f'For y=1 and p_hat=0.8: {bce:.6f}')
check_close('reduces to -log(p_hat) when y=1', bce, manual)
print('\nTakeaway: BCE is nothing more than the Bernoulli negative log-likelihood.')
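
The derivation is a single application of the log rules:

$$ -\log q(y) = -\log\!\left(\hat{p}^{\,y}(1-\hat{p})^{1-y}\right) = -\bigl(y\log\hat{p} + (1-y)\log(1-\hat{p})\bigr), $$

which for $y=1$ leaves only $-\log\hat{p} = -\log 0.8 \approx 0.2231$.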

Exercise 4: Verify the Softmax-Cross-Entropy Gradient ★★

For logits

$$ \mathbf{z}=(0.8,-0.4,1.2) $$

with true class $c=2$, verify numerically that the gradient of the loss with respect to the logits is

$$ \hat{\mathbf{p}}-\mathbf{y}. $$

Code cell 14

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 15

# Solution
# Exercise 4: Solution
z = np.array([0.8, -0.4, 1.2])
y = one_hot(2, 3)
p_hat = softmax(z)
closed = p_hat - y


def loss_from_logits(z_vec):
    return -log_softmax(z_vec)[2]

h = 1e-6
fd = np.zeros_like(z)
for i in range(len(z)):
    zp = z.copy(); zp[i] += h
    zm = z.copy(); zm[i] -= h
    fd[i] = (loss_from_logits(zp) - loss_from_logits(zm)) / (2 * h)

header('Exercise 4: Verify the Softmax-Cross-Entropy Gradient')
print('closed-form gradient:', closed)
print('finite-diff gradient:', fd)
check_close('gradient formula', closed, fd, tol=1e-6)
print('\nTakeaway: the logit gradient for softmax cross-entropy is predicted minus target.')
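
The closed form drops out of differentiating the log-sum-exp term of the loss:

$$ \mathcal{L}(\mathbf{z}) = -z_c + \log\sum_k e^{z_k}, \qquad \frac{\partial \mathcal{L}}{\partial z_j} = \frac{e^{z_j}}{\sum_k e^{z_k}} - \mathbb{1}[j=c] = \hat{p}_j - y_j. $$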

Exercise 5: Perplexity From Token Losses ★★

Suppose a language model assigns the following probabilities to the correct next tokens:

$$ (0.30,\ 0.70,\ 0.40,\ 0.10,\ 0.55). $$

Compute:

  1. the average token cross-entropy in nats
  2. the perplexity
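
Both quantities come from the same per-token average; perplexity is just its exponential:

$$ \bar{H} = -\frac{1}{N}\sum_{i=1}^{N}\log p_i, \qquad \mathrm{PPL} = e^{\bar{H}}. $$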

Code cell 17

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 18

# Solution
# Exercise 5: Solution
probs = np.array([0.30, 0.70, 0.40, 0.10, 0.55])
avg_ce = -np.mean(np.log(probs))
ppl = np.exp(avg_ce)

header('Exercise 5: Perplexity From Token Losses')
print(f'Average token cross-entropy = {avg_ce:.6f} nats/token')
print(f'Perplexity                  = {ppl:.6f}')
check_close('PPL = exp(avg CE)', ppl, np.exp(avg_ce))
print('\nTakeaway: perplexity is the exponentiated average token-level cross-entropy for autoregressive models.')

Exercise 6: Stable Log-Softmax ★★

Let

$$ \mathbf{z}=(1000,1001,999). $$

Show why naively computing the softmax and then taking its log is numerically unstable, and compute the stable log-softmax instead.

Code cell 20

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 21

# Solution
# Exercise 6: Solution
z = np.array([1000.0, 1001.0, 999.0])
naive = np.exp(z) / np.sum(np.exp(z))  # np.exp(1000) overflows to inf, so this ratio collapses to nan
stable = log_softmax(z)

header('Exercise 6: Stable Log-Softmax')
print('Naive softmax:', naive)
print('Stable log-softmax:', stable)
check_true('naive softmax is numerically unstable', not np.all(np.isfinite(naive)))
check_close('stable probabilities sum to 1 after exponentiation', np.sum(np.exp(stable)), 1.0, tol=1e-10)
print('\nTakeaway: subtracting the maximum logit before exponentiating is essential for stable CE computation.')
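
The log_softmax helper from the setup cell relies on the shift-invariant form, which never exponentiates anything larger than zero:

$$ \log\operatorname{softmax}(\mathbf{z})_i = z_i - m - \log\sum_j e^{z_j - m}, \qquad m = \max_j z_j. $$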

Exercise 7: Label Smoothing as a Target Change ★★★

In a 5-class problem with true class $0$, construct the label-smoothed target for $\varepsilon=0.1$, compare its entropy to the hard one-hot target, and explain what supervision information changed.

Code cell 23

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 24

# Solution
# Exercise 7: Solution
K = 5
eps = 0.1
hard = one_hot(0, K)
smoothed = (1 - eps) * hard + eps / K * np.ones(K)

header('Exercise 7: Label Smoothing as a Target Change')
print('hard target    =', hard)
print('smoothed target=', smoothed)
print(f'Entropy(hard)     = {entropy(hard):.6f}')
print(f'Entropy(smoothed) = {entropy(smoothed):.6f}')
check_true('smoothed target has higher entropy', entropy(smoothed) > entropy(hard))
print('\nTakeaway: label smoothing modifies the target distribution by injecting a controlled amount of uncertainty.')
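
In formula form, the smoothed target mixes the one-hot vector with the uniform distribution,

$$ \tilde{y}_k = (1-\varepsilon)\,\mathbb{1}[k=c] + \frac{\varepsilon}{K}, $$

so here every class keeps at least $\varepsilon/K = 0.02$ of the probability mass.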

Exercise 8: Masked Cross-Entropy Versus Naive Averaging ★★★

Suppose token losses are

$$ (0.4,\ 0.9,\ 1.2,\ 0.3,\ 2.0,\ 0.6) $$

and the mask is

$$ (1,\,1,\,0,\,1,\,0,\,1). $$

Compute the correct masked average and compare it to naive averaging after zeroing out masked positions.

Code cell 26

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 27

# Solution
# Exercise 8: Solution
losses = np.array([0.4, 0.9, 1.2, 0.3, 2.0, 0.6])
mask = np.array([1, 1, 0, 1, 0, 1], dtype=float)
correct = np.sum(losses * mask) / np.sum(mask)
naive = np.mean(losses * mask)

header('Exercise 8: Masked Cross-Entropy Versus Naive Averaging')
print(f'Correct masked average = {correct:.6f}')
print(f'Naive zeroed average   = {naive:.6f}')
check_true('the two estimators differ', abs(correct - naive) > 1e-8)
print('\nTakeaway: masked cross-entropy must normalize by the number of valid tokens, not by the full padded length.')
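
The two estimators differ only in their denominator,

$$ \bar{\ell}_{\text{masked}} = \frac{\sum_i m_i \ell_i}{\sum_i m_i}, \qquad \bar{\ell}_{\text{naive}} = \frac{1}{N}\sum_i m_i \ell_i, $$

so with 4 valid tokens out of 6 the naive average is only $4/6$ of the correct value.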

Exercise 9: Label Smoothing Changes the Target Distribution

Compute categorical cross-entropy with a one-hot target and with label smoothing. Interpret the change in gradient pressure.

Code cell 29

# Your Solution
print("Compare one-hot and smoothed cross-entropy.")

Code cell 30

# Solution
header("Exercise 9: Label Smoothing")
logits = np.array([3.0, 0.5, -1.0, -2.0])
q = softmax(logits)
hard = np.array([1.0, 0.0, 0.0, 0.0])  # renamed so it does not shadow the one_hot() helper from the setup cell
eps = 0.1
smooth = (1 - eps) * hard + eps / len(hard)
ce_one = cross_entropy(hard, q)
ce_smooth = cross_entropy(smooth, q)
print("model q:", np.round(q, 4))
print("CE one-hot:", round(ce_one, 6))
print("CE smoothed:", round(ce_smooth, 6))
check_true("smoothed target is still normalized", np.isclose(smooth.sum(), 1.0))
print("Takeaway: label smoothing keeps the model from assigning all probability mass to one class.")

Exercise 10: Token Cross-Entropy and Perplexity

Convert average next-token negative log-likelihood into perplexity and verify the exponential relationship.

Code cell 32

# Your Solution
print("Compute perplexity from token negative log-likelihoods.")

Code cell 33

# Solution
header("Exercise 10: Perplexity")
nll = np.array([1.2, 0.7, 2.0, 1.5, 0.9])
ce = float(np.mean(nll))
ppl = float(np.exp(ce))
print("cross-entropy nats:", round(ce, 6))
print("perplexity:", round(ppl, 6))
check_close("perplexity equals exp(cross entropy)", ppl, np.exp(ce))
print("Takeaway: perplexity is the effective branching factor implied by token-level cross-entropy.")