Theory Notebook
Converted from theory.ipynb for web reading.
Cross-Entropy
Cross-entropy is the bridge from information theory to the actual loss curves we use to train classifiers and language models.
This notebook is the interactive companion to notes.md. We move from coding intuition to logits, gradients, perplexity, label smoothing, distillation, and stable implementations.
| Block | Focus |
|---|---|
| 1 | Coding intuition and the discrete definition |
| 2 | Entropy + KL decomposition |
| 3 | BCE and categorical CE from probabilities and logits |
| 4 | Stable log-sum-exp, gradients, and Hessian geometry |
| 5 | Label smoothing, distillation, masking, weighting, and perplexity |
Code cell 2
# Plotting environment: prefer seaborn's colorblind-friendly theme and fall
# back to the bundled matplotlib equivalent when seaborn is unavailable.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults applied to every plot in the notebook.
_RC_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_RC_DEFAULTS)

np.random.seed(42)  # reproducible draws for the legacy np.random API
print("Plot setup complete.")
Code cell 3
# Named palette used consistently across all figures in the notebook.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
# Compact array printing for the many probability vectors shown below.
np.set_printoptions(precision=6, suppress=True)
# Optional PyTorch support: the notebook degrades gracefully without it.
try:
    import torch
    import torch.nn.functional as F
except ImportError:
    HAS_TORCH = False
else:
    torch.manual_seed(42)  # deterministic torch examples when available
    HAS_TORCH = True
def entropy(p):
    """Shannon entropy H(p) = -sum_x p(x) log p(x) in nats.

    Zero-probability entries are dropped (0 log 0 is treated as 0).
    """
    probs = np.asarray(p, dtype=float)
    support = probs[probs > 0]
    return float(-(support * np.log(support)).sum())
def cross_entropy(p, q):
    """Discrete cross-entropy H(p, q) = -sum_x p(x) log q(x) in nats.

    Returns inf on support mismatch, i.e. when some outcome has
    p(x) > 0 but q(x) <= 0.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)
    support = p_arr > 0
    if np.any(support & (q_arr <= 0)):
        return np.inf
    return float(-(p_arr[support] * np.log(q_arr[support])).sum())
def kl_divergence(p, q):
    """KL divergence D_KL(p || q) = sum_x p(x) log(p(x)/q(x)) in nats.

    Returns inf when q assigns zero mass where p has positive mass.
    """
    p_arr = np.asarray(p, dtype=float)
    q_arr = np.asarray(q, dtype=float)
    support = p_arr > 0
    if np.any(support & (q_arr <= 0)):
        return np.inf
    ratio = p_arr[support] / q_arr[support]
    return float((p_arr[support] * np.log(ratio)).sum())
def softmax(z):
    """Numerically stable softmax: shift logits by their max before exp."""
    logits = np.asarray(z, dtype=float)
    exps = np.exp(logits - logits.max())
    return exps / exps.sum()
def log_softmax(z):
    """Stable log-softmax via the shifted log-sum-exp identity."""
    logits = np.asarray(z, dtype=float)
    shift = logits.max()
    return logits - shift - np.log(np.exp(logits - shift).sum())
def binary_cross_entropy_from_prob(y, p_hat):
    """Single-example BCE from a predicted probability of the positive class.

    The probability is clipped away from {0, 1} so the loss stays finite.
    The formula is valid for hard labels (0/1) and soft labels in [0, 1].
    """
    prob = np.clip(float(p_hat), 1e-12, 1 - 1e-12)
    return float(-y * np.log(prob) - (1 - y) * np.log(1 - prob))
def sigmoid(z):
    """Overflow-safe logistic function, evaluated branch-wise by sign.

    For z >= 0 uses 1/(1+exp(-z)); for z < 0 uses exp(z)/(1+exp(z)) so the
    exponent is always non-positive and never overflows.
    """
    logits = np.asarray(z, dtype=float)
    result = np.empty_like(logits, dtype=float)
    nonneg = logits >= 0
    result[nonneg] = 1.0 / (1.0 + np.exp(-logits[nonneg]))
    small = np.exp(logits[~nonneg])
    result[~nonneg] = small / (1.0 + small)
    return result
def bce_with_logits(y, z):
    """Numerically stable BCE computed directly from a logit.

    Uses the standard identity max(z, 0) - y*z + log(1 + exp(-|z|)),
    which avoids evaluating sigmoid and log separately.
    """
    logit = float(z)
    return float(max(logit, 0.0) - y * logit + np.log1p(np.exp(-abs(logit))))
def one_hot(k, K):
    """Length-K float vector with a single 1.0 at index k."""
    return np.eye(K, dtype=float)[k]
# Report helper availability once the cell has run.
print(f"Helpers ready. Torch available: {HAS_TORCH}")
1. Coding Intuition
Cross-entropy is easiest to understand as the expected cost of using the wrong code. If the world generates symbols from a true distribution $p$ but we encode them using code lengths derived from a model $q$, the expected code length is $H(p, q) = -\sum_x p(x) \log q(x)$.
Code cell 5
# Block 1: discrete example checking the identity H(p, q) = H(p) + D_KL(p || q).
p = np.array([0.70, 0.20, 0.10])  # true source distribution
q = np.array([0.50, 0.30, 0.20])  # model distribution
Hp = entropy(p)
Hpq = cross_entropy(p, q)
Dkl = kl_divergence(p, q)
print('Discrete source example:')
print('p =', p)
print('q =', q)
print(f'H(p) = {Hp:.6f} nats')
print(f'H(p, q) = {Hpq:.6f} nats')
print(f'D_KL(p || q) = {Dkl:.6f} nats')
# The decomposition should hold exactly, up to float tolerance.
ok = np.allclose(Hpq, Hp + Dkl)
print(f"{'PASS' if ok else 'FAIL'} — cross-entropy equals entropy plus KL divergence")
Code cell 6
# Coding-theory view: -log q(x) is the code length assigned to symbol x.
# Reuses p and q from the previous cell.
symbols = ['A', 'B', 'C']
code_lengths_q = -np.log(q)  # lengths when coding with the model q
code_lengths_p = -np.log(p)  # optimal lengths for the true source p
print('Per-symbol code lengths (nats):')
for s, lp, lq in zip(symbols, code_lengths_p, code_lengths_q):
    print(f' {s}: optimal for p = {lp:.4f}, using q = {lq:.4f}')
if HAS_SNS:
    fig, ax = plt.subplots()
    x = np.arange(len(symbols))
    # Paired bars: optimal code lengths vs. the model's code lengths.
    ax.bar(x - 0.18, code_lengths_p, width=0.36, color=COLORS['primary'], label='Optimal lengths from p')
    ax.bar(x + 0.18, code_lengths_q, width=0.36, color=COLORS['secondary'], label='Lengths from q')
    ax.set_title('Code lengths under the true source and the model')
    ax.set_xlabel('Symbol')
    ax.set_ylabel('Length (nats)')
    ax.set_xticks(x)
    ax.set_xticklabels(symbols)
    ax.legend()
    fig.tight_layout()
    plt.show()
2. Entropy Versus Cross-Entropy
Entropy is intrinsic uncertainty under the true source. Cross-entropy is the surprise of true data when evaluated under the model. The model distribution enters only in the second quantity.
Code cell 8
# Sweep the model's Bernoulli parameter: H(p, q) is minimized where q matches p.
qs = np.linspace(0.01, 0.99, 250)
p_true = np.array([0.8, 0.2])
ce_curve = np.array([cross_entropy(p_true, [q1, 1 - q1]) for q1 in qs])
true_entropy = entropy(p_true)
best_q = qs[np.argmin(ce_curve)]  # grid minimizer; accuracy limited by grid resolution
print(f'True entropy H(p): {true_entropy:.6f} nats')
print(f'Cross-entropy minimum occurs near q(1) = {best_q:.4f}')
print(f"{'PASS' if abs(best_q - 0.8) < 1e-2 else 'FAIL'} — minimum is attained near the true Bernoulli parameter")
fig, ax = plt.subplots()
ax.plot(qs, ce_curve, color=COLORS['primary'], label='H(p, q)')
ax.axhline(true_entropy, color=COLORS['secondary'], linestyle='--', label='H(p)')
ax.axvline(0.8, color=COLORS['highlight'], linestyle=':', label='True parameter')
ax.set_title('Cross-entropy is minimized at the true distribution')
ax.set_xlabel('Model probability q(1)')
ax.set_ylabel('Cross-entropy (nats)')
ax.legend()
fig.tight_layout()
plt.show()
3. Empirical Cross-Entropy as an Average Negative Log-Likelihood
For a dataset, cross-entropy is estimated by an empirical average. This is exactly the negative log-likelihood per example.
Code cell 10
# Monte Carlo check: the sample mean of -log q(x) over x ~ p estimates H(p, q).
# Reuses p and q from the earlier discrete example.
rng = np.random.default_rng(42)
samples = rng.choice(3, size=5000, p=p)  # draws from the true source p
empirical_p = np.bincount(samples, minlength=3) / len(samples)
empirical_ce = np.mean([-np.log(q[x]) for x in samples])
analytic_ce = cross_entropy(p, q)
print('Empirical source estimate:', empirical_p)
print(f'Empirical CE under q: {empirical_ce:.6f}')
print(f'Analytic CE under q: {analytic_ce:.6f}')
ok = abs(empirical_ce - analytic_ce) < 0.03  # loose tolerance for 5000 samples
print(f"{'PASS' if ok else 'FAIL'} — sample average approximates population cross-entropy")
4. Continuous Cross-Entropy Example
For continuous distributions, cross-entropy becomes an expected negative log-density. A Gaussian example lets us compute it in closed form and by Monte Carlo.
Code cell 12
# Gaussian cross-entropy: Monte Carlo estimate vs. the closed-form expression.
mu_p, sigma_p = 0.0, 1.0  # true distribution p = N(0, 1)
mu_q, sigma_q = 0.4, 1.3  # model distribution q = N(0.4, 1.3^2)
# Monte Carlo estimate with samples from p
x = np.random.normal(mu_p, sigma_p, size=30000)
log_q = -0.5 * np.log(2 * np.pi * sigma_q**2) - 0.5 * ((x - mu_q) / sigma_q) ** 2
mc_ce = -np.mean(log_q)
# Closed form for p = N(mu_p, sigma_p^2), q = N(mu_q, sigma_q^2)
closed_form = 0.5 * np.log(2 * np.pi * sigma_q**2) + (sigma_p**2 + (mu_p - mu_q)**2) / (2 * sigma_q**2)
print(f'Monte Carlo Gaussian CE: {mc_ce:.6f}')
print(f'Closed-form Gaussian CE: {closed_form:.6f}')
ok = abs(mc_ce - closed_form) < 0.02  # tolerance for 30k Monte Carlo samples
print(f"{'PASS' if ok else 'FAIL'} — Monte Carlo matches the Gaussian cross-entropy formula")
5. Sequence Cross-Entropy and Perplexity
Autoregressive modeling turns a sequence log-probability into a sum of token-level losses. Average token cross-entropy exponentiates to perplexity.
Code cell 14
# Token-level cross-entropy of a short sequence and its perplexity.
log_probs = np.log(np.array([0.30, 0.70, 0.40, 0.10, 0.55]))  # per-token model probabilities
sequence_nll = -np.sum(log_probs)  # joint sequence NLL (chain rule sums token losses)
avg_ce = -np.mean(log_probs)       # average loss in nats per token
ppl = np.exp(avg_ce)               # perplexity = exp(average token CE)
print(f'Sequence negative log-likelihood: {sequence_nll:.6f}')
print(f'Average token cross-entropy: {avg_ce:.6f} nats/token')
print(f'Perplexity: {ppl:.6f}')
check = np.allclose(ppl, np.exp(avg_ce))
print(f"{'PASS' if check else 'FAIL'} — perplexity is the exponential of average token cross-entropy")
Code cell 15
# Two segmentations of the same hypothetical text can change per-token cross-entropy.
char_log_probs = np.log(np.array([0.85, 0.80, 0.90, 0.82, 0.88, 0.91]))  # many easy tokens
subword_log_probs = np.log(np.array([0.55, 0.61, 0.58]))                 # fewer, harder tokens
char_ce = -np.mean(char_log_probs)
subword_ce = -np.mean(subword_log_probs)
print(f'Character-like tokenization CE: {char_ce:.6f}')
print(f'Subword-like tokenization CE: {subword_ce:.6f}')
print('These values are not directly comparable because the token events are different.')
6. The Identity $H(p,q) = H(p) + D_{\mathrm{KL}}(p \,\|\, q)$
This is the central structural identity for the section. It says cross-entropy is intrinsic uncertainty plus mismatch.
Code cell 17
# Verify H(p, q) = H(p) + D_KL(p || q) on a random pair of distributions.
rng = np.random.default_rng(7)
A = rng.uniform(0.1, 1.0, size=5)
B = rng.uniform(0.1, 1.0, size=5)
p5 = A / A.sum()  # normalize positive weights into probability vectors
q5 = B / B.sum()
lhs = cross_entropy(p5, q5)
rhs = entropy(p5) + kl_divergence(p5, q5)
print('p =', p5)
print('q =', q5)
print(f'H(p, q) = {lhs:.6f}')
print(f'H(p) + D_KL(p || q) = {rhs:.6f}')
ok = np.allclose(lhs, rhs)
print(f"{'PASS' if ok else 'FAIL'} — decomposition holds on a random discrete example")
7. Support Mismatch
If the model assigns zero probability to an event that truly occurs, cross-entropy becomes infinite. This is mathematically correct, not a bug.
Code cell 19
# Support mismatch: q assigns zero probability to an outcome p can produce.
p_bad = np.array([0.6, 0.4])
q_bad = np.array([1.0, 0.0])  # q gives the second outcome zero mass while p does not
value = cross_entropy(p_bad, q_bad)  # the helper returns inf in this case
print('p =', p_bad)
print('q =', q_bad)
print('Cross-entropy value:', value)
print(f"{'PASS' if np.isinf(value) else 'FAIL'} — support mismatch yields infinite cross-entropy")
8. Binary Cross-Entropy from a Bernoulli Model
For binary labels, BCE is the negative log-likelihood of a Bernoulli model. It strongly punishes confident mistakes.
Code cell 21
# BCE table: the loss grows rapidly as confidence in the wrong label grows.
for y in [0, 1]:
    print(f'--- True label y = {y} ---')
    for p_hat in [0.01, 0.10, 0.50, 0.90, 0.99]:
        loss = binary_cross_entropy_from_prob(y, p_hat)
        print(f' p_hat={p_hat:>4.2f} -> BCE={loss:.6f}')
Code cell 22
# Plot both BCE branches as functions of the predicted probability.
ps = np.linspace(1e-4, 1 - 1e-4, 400)  # keep clear of the log singularities at 0 and 1
loss_y1 = np.array([binary_cross_entropy_from_prob(1, p) for p in ps])
loss_y0 = np.array([binary_cross_entropy_from_prob(0, p) for p in ps])
fig, ax = plt.subplots()
ax.plot(ps, loss_y1, color=COLORS['primary'], label='True label y=1')
ax.plot(ps, loss_y0, color=COLORS['secondary'], label='True label y=0')
ax.set_title('Binary cross-entropy curves')
ax.set_xlabel('Predicted probability $p$')
ax.set_ylabel('Loss')
ax.legend()
fig.tight_layout()
plt.show()
9. BCE From Logits
In real training code, BCE should usually be computed from logits rather than post-sigmoid probabilities. This avoids numerical instability.
Code cell 24
# Compare post-sigmoid BCE against the logit-space form for extreme logits.
for z in [-20, -5, 0, 5, 20]:
    p_hat = float(sigmoid(np.array([z]))[0])
    naive = binary_cross_entropy_from_prob(1, p_hat)  # goes through probability clipping
    stable = bce_with_logits(1, z)                    # computed directly in logit space
    print(f'z={z:>3} -> sigmoid(z)={p_hat:.12f}, naive BCE={naive:.12f}, stable BCE={stable:.12f}')
10. Categorical Cross-Entropy From Softmax Logits
For one-hot labels, the multiclass cross-entropy reduces to the negative log-probability of the true class. In logits, it becomes a log-sum-exp expression.
Code cell 26
# Categorical CE: probability-space and logit-space forms agree.
z = np.array([2.2, 0.4, -1.1, 1.7])
true_class = 3
p_hat = softmax(z)
ce_prob = -np.log(p_hat[true_class])
ce_logits = -z[true_class] + np.log(np.sum(np.exp(z)))  # log-sum-exp form of the same loss
print('logits:', z)
print('softmax probabilities:', p_hat)
print(f'CE from probabilities: {ce_prob:.6f}')
print(f'CE from logits: {ce_logits:.6f}')
ok = np.allclose(ce_prob, ce_logits)
print(f"{'PASS' if ok else 'FAIL'} — probability-space and logit-space CE match")
11. Soft Targets
Cross-entropy naturally extends from one-hot labels to full target distributions. This is the mathematical home of label smoothing and distillation.
Code cell 28
# Cross-entropy against a one-hot target vs. a full soft target distribution.
z = np.array([3.0, 0.5, -0.2, -1.5])
p_hat = softmax(z)
one_hot_target = one_hot(0, 4)
soft_target = np.array([0.85, 0.10, 0.03, 0.02])
ce_hard = -np.sum(one_hot_target * np.log(p_hat))  # reduces to -log p_hat[0]
ce_soft = -np.sum(soft_target * np.log(p_hat))     # weights every class's log-probability
print('prediction:', p_hat)
print('hard target:', one_hot_target)
print('soft target:', soft_target)
print(f'Cross-entropy with hard target: {ce_hard:.6f}')
print(f'Cross-entropy with soft target: {ce_soft:.6f}')
12. Label Smoothing
Label smoothing replaces a point mass with a slightly higher-entropy target. It usually discourages extreme overconfidence.
Code cell 30
# Label smoothing mixes the one-hot target with the uniform distribution.
K = 5
true_class = 2
eps = 0.1  # smoothing strength: eps mass spread uniformly over all K classes
hard = one_hot(true_class, K)
smoothed = (1 - eps) * hard + eps / K * np.ones(K)
print('hard target =', hard)
print('smoothed target=', smoothed)
print(f'Entropy(hard) = {entropy(hard):.6f}')
print(f'Entropy(smoothed) = {entropy(smoothed):.6f}')
ok = entropy(smoothed) > entropy(hard)
print(f"{'PASS' if ok else 'FAIL'} — label smoothing increases target entropy")
Code cell 31
# Effect of smoothing on the loss for an already-confident prediction.
pred = np.array([0.93, 0.02, 0.02, 0.02, 0.01])
hard_target = one_hot(0, 5)
soft_target = (1 - 0.1) * hard_target + 0.1 / 5 * np.ones(5)  # eps = 0.1 smoothing
loss_hard = -np.sum(hard_target * np.log(pred))
loss_soft = -np.sum(soft_target * np.log(pred))
print('prediction:', pred)
print(f'Hard-target CE: {loss_hard:.6f}')
print(f'Smoothed-target CE: {loss_soft:.6f}')
print('The smoothed target penalizes overconfidence more broadly across classes.')
13. Knowledge Distillation
Soft teacher targets can carry information about class similarity that hard labels hide. Cross-entropy is the natural comparison tool.
Code cell 33
# Knowledge distillation: compare teacher and student softmaxes across temperatures.
teacher_logits = np.array([7.0, 5.5, 1.0, -0.5])
student_logits = np.array([4.5, 3.7, 0.2, -0.7])

def softmax_temp(z, tau):
    """Temperature-scaled softmax: divide logits by tau, then stable softmax."""
    z = np.asarray(z, dtype=float) / tau
    z = z - np.max(z)  # shift by the max for numerical stability
    exps = np.exp(z)
    return exps / np.sum(exps)

for tau in [1.0, 2.0, 4.0]:
    teacher = softmax_temp(teacher_logits, tau)
    student = softmax_temp(student_logits, tau)
    kd_ce = -np.sum(teacher * np.log(student))  # soft-target (distillation) cross-entropy
    print(f'tau={tau:.1f}')
    print(' teacher:', teacher)
    print(' student:', student)
    print(f' soft-target CE: {kd_ce:.6f}')
14. Stable Log-Sum-Exp in Action
Subtracting the maximum logit prevents overflow while preserving the exact result.
Code cell 35
# Large logits overflow the naive softmax; the stable version shifts by the max first.
z_big = np.array([1000.0, 1001.0, 999.0])
# BUG FIX: under NumPy's default error state, exp() overflow only emits a
# RuntimeWarning and returns inf, so `except FloatingPointError` was dead code
# (the nan check still caught the failure, but a warning leaked to the console).
# Promote overflow/invalid to real exceptions so the except path is live.
try:
    with np.errstate(over='raise', invalid='raise'):
        naive_softmax = np.exp(z_big) / np.sum(np.exp(z_big))
    naive_ok = np.all(np.isfinite(naive_softmax))
except FloatingPointError:
    naive_softmax = np.array([np.nan, np.nan, np.nan])
    naive_ok = False
stable_probs = softmax(z_big)  # max-shifted softmax helper defined earlier
print('Naive softmax:', naive_softmax)
print('Stable softmax:', stable_probs)
print(f"{'PASS' if not naive_ok else 'FAIL'} — naive computation overflows or becomes unstable")
Code cell 36
# Cross-check log_softmax against the explicit shifted formula (reuses z_big above).
val1 = log_softmax(z_big)[1]
val2 = z_big[1] - np.max(z_big) - np.log(np.sum(np.exp(z_big - np.max(z_big))))
print(f'log_softmax component: {val1:.12f}')
print(f'manual stable formula: {val2:.12f}')
print(f"{'PASS' if np.allclose(val1, val2) else 'FAIL'} — stable log-softmax matches the explicit formula")
15. The Softmax-Cross-Entropy Gradient
For one-hot targets, the logit-space gradient is predicted minus target. We can verify that numerically.
Code cell 38
# Verify d(CE)/dz = softmax(z) - y using central finite differences.
z = np.array([0.8, -0.4, 1.2])
y = one_hot(2, 3)
p_hat = softmax(z)
grad_closed = p_hat - y  # closed-form gradient of softmax-CE in logit space

def loss_from_logits(z_vec):
    """Cross-entropy of class 2 as a function of the logit vector."""
    p = softmax(z_vec)
    return -np.log(p[2])

# finite differences
h = 1e-6  # central-difference step
grad_fd = np.zeros_like(z)
for i in range(len(z)):
    zp = z.copy(); zp[i] += h
    zm = z.copy(); zm[i] -= h
    grad_fd[i] = (loss_from_logits(zp) - loss_from_logits(zm)) / (2 * h)
print('softmax probabilities:', p_hat)
print('closed-form gradient :', grad_closed)
print('finite-diff gradient :', grad_fd)
ok = np.allclose(grad_closed, grad_fd, atol=1e-6)
print(f"{'PASS' if ok else 'FAIL'} — gradient equals predicted minus target")
16. Hessian Geometry
The Hessian in logit space is the covariance matrix of the softmax distribution. It is symmetric and positive semidefinite.
Code cell 40
# Hessian of softmax-CE in logit space: diag(p) - p p^T (the softmax covariance).
z = np.array([1.4, 0.1, -0.8, 0.3])
p_hat = softmax(z)
H = np.diag(p_hat) - np.outer(p_hat, p_hat)
eigs = np.linalg.eigvalsh(H)  # eigvalsh is appropriate: H is symmetric by construction
print('softmax probabilities:', p_hat)
print('Hessian matrix:\n', H)
print('eigenvalues:', eigs)
checks = [
    np.allclose(H, H.T),
    np.min(eigs) >= -1e-12,           # PSD up to floating-point noise
    np.allclose(H.sum(axis=1), 0.0),  # rows sum to zero (logit-shift invariance)
]
labels = ['symmetry', 'positive semidefinite up to numerical tolerance', 'row sums are zero']
for name, cond in zip(labels, checks):
    print(f"{'PASS' if cond else 'FAIL'} — {name}")
17. Cross-Entropy as a Strictly Proper Scoring Rule
Expected log-loss is minimized at the true distribution. A binary example makes this very concrete.
Code cell 42
# Strict propriety: expected log-loss is minimized when the forecast equals the truth.
p_true = 0.73
qs = np.linspace(0.01, 0.99, 400)
expected_log_loss = -p_true * np.log(qs) - (1 - p_true) * np.log(1 - qs)
q_star = qs[np.argmin(expected_log_loss)]  # grid minimizer
print(f'True Bernoulli probability: {p_true:.4f}')
print(f'Expected log-loss minimizer: {q_star:.4f}')
ok = abs(q_star - p_true) < 5e-3  # tolerance matched to the grid spacing
print(f"{'PASS' if ok else 'FAIL'} — expected log-loss is minimized at the true probability")
fig, ax = plt.subplots()
ax.plot(qs, expected_log_loss, color=COLORS['primary'])
ax.axvline(p_true, color=COLORS['highlight'], linestyle='--', label='True probability')
ax.set_title('Log-loss is strictly proper in the binary case')
ax.set_xlabel('Forecasted probability q')
ax.set_ylabel('Expected log-loss')
ax.legend()
fig.tight_layout()
plt.show()
18. Calibration and Temperature Scaling
Cross-entropy-trained models can still be overconfident in finite-sample practice. Temperature scaling is a simple post-hoc fix in logit space.
Code cell 44
# Toy validation set for post-hoc temperature scaling: logits and true labels.
logits = np.array([
    [5.0, 0.8, -1.0],
    [4.5, 0.3, -0.2],
    [3.8, 1.1, -0.1],
    [0.2, 1.7, 0.1],
    [1.0, 0.7, 0.4],
])
labels = np.array([0, 0, 1, 1, 2])

def avg_nll(temp):
    """Average negative log-likelihood of the labels at temperature `temp`."""
    losses = []
    for z, y in zip(logits / temp, labels):
        losses.append(-log_softmax(z)[y])
    return float(np.mean(losses))

# Grid-search the temperature that minimizes validation NLL.
temps = np.linspace(0.5, 3.0, 150)
losses = np.array([avg_nll(t) for t in temps])
best_temp = temps[np.argmin(losses)]
print(f'Best temperature on this toy validation set: {best_temp:.4f}')
print(f'NLL at tau=1.0: {avg_nll(1.0):.6f}')
print(f'NLL at best tau: {losses.min():.6f}')
fig, ax = plt.subplots()
ax.plot(temps, losses, color=COLORS['secondary'])
ax.axvline(best_temp, color=COLORS['highlight'], linestyle='--', label='Best tau')
ax.set_title('Temperature scaling tunes validation NLL')
# BUG FIX: '$\tau$' in a non-raw string embeds a literal TAB character via the
# '\t' escape, corrupting the mathtext label. A raw string passes \tau through.
ax.set_xlabel(r'Temperature $\tau$')
ax.set_ylabel('Average negative log-likelihood')
ax.legend()
fig.tight_layout()
plt.show()
19. Weighted and Masked Cross-Entropy
In real systems, CE is often modified by weighting classes or masking invalid positions. These are not just coding details: they change which errors matter most.
Code cell 46
# weighted binary example
probs = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.2])  # predicted P(y = 1) per example
labels = np.array([0, 0, 0, 1, 1, 1])
weights = np.where(labels == 1, 3.0, 1.0)  # positives weighted 3x
plain = np.mean([binary_cross_entropy_from_prob(y, p) for y, p in zip(labels, probs)])
# Weighted mean: per-example losses scaled by weight, normalized by total weight.
weighted = np.sum(weights * np.array([binary_cross_entropy_from_prob(y, p) for y, p in zip(labels, probs)])) / np.sum(weights)
print(f'Unweighted BCE average: {plain:.6f}')
print(f'Weighted BCE average: {weighted:.6f}')
print('The weighted loss emphasizes the positive-class mistakes more strongly.')
Code cell 47
# masked token loss example
losses = np.array([0.4, 0.9, 1.2, 0.3, 2.0, 0.6])  # per-token CE values
mask = np.array([1, 1, 0, 1, 0, 1], dtype=float)   # 1 = token counts toward the loss
masked_avg = float(np.sum(losses * mask) / np.sum(mask))  # mean over kept tokens only
naive_avg = float(np.mean(losses * mask))  # wrong estimator: divides by all 6 positions
print('token losses:', losses)
print('mask :', mask.astype(int))
print(f'Correct masked average: {masked_avg:.6f}')
print(f'Naive average incl. zeros: {naive_avg:.6f}')
print('These are not the same estimator.')
20. When Cross-Entropy Is Not the Whole Story
CE is excellent for honest probabilistic prediction, but some settings require more. Class imbalance, label noise, OOD confidence, or structured decisions can motivate modified objectives.
Code cell 49
# same accuracy, different cross-entropy
preds_A = np.array([[0.51, 0.49], [0.51, 0.49], [0.51, 0.49], [0.49, 0.51]])  # barely confident
preds_B = np.array([[0.99, 0.01], [0.95, 0.05], [0.97, 0.03], [0.01, 0.99]])  # very confident
labels = np.array([0, 0, 0, 1])
acc_A = np.mean(np.argmax(preds_A, axis=1) == labels)
acc_B = np.mean(np.argmax(preds_B, axis=1) == labels)
ce_A = np.mean([-np.log(preds_A[i, y]) for i, y in enumerate(labels)])
ce_B = np.mean([-np.log(preds_B[i, y]) for i, y in enumerate(labels)])
print(f'Accuracy A = {acc_A:.3f}, CE A = {ce_A:.6f}')
print(f'Accuracy B = {acc_B:.3f}, CE B = {ce_B:.6f}')
print('Same accuracy can hide very different probabilistic quality.')
21. Preference and Ranking Preview
Binary and pairwise log-losses are still cross-entropy-family objects. They show up in reward modeling and preference learning.
Code cell 51
# Pairwise preference demo: P(prefer first) = sigmoid(score difference).
score_diff = np.linspace(-4, 4, 9)
preference_prob = sigmoid(score_diff)
preference_loss = -np.log(preference_prob)  # log-loss when the first item is the preferred one
for d, p, l in zip(score_diff, preference_prob, preference_loss):
    print(f'score difference={d:>4.1f} -> preferred-prob={p:.6f}, loss={l:.6f}')
22. Closing Checks
We finish by verifying the key ideas on one random example.
Code cell 53
# Final sanity checks on a random pair of discrete distributions.
rng = np.random.default_rng(123)
p = rng.uniform(0.1, 1.0, size=4)
q = rng.uniform(0.1, 1.0, size=4)
p /= p.sum()  # normalize to probability vectors
q /= q.sum()
Hp = entropy(p)
Hpq = cross_entropy(p, q)
Dkl = kl_divergence(p, q)
checks = {
    'cross-entropy decomposition': np.allclose(Hpq, Hp + Dkl),
    'cross-entropy lower bound': Hpq + 1e-12 >= Hp,  # Gibbs: H(p, q) >= H(p)
    'KL non-negativity': Dkl >= -1e-12,
}
print('p =', p)
print('q =', q)
for name, cond in checks.items():
    print(f"{'PASS' if cond else 'FAIL'} — {name}")
print('\nNotebook complete. Next: use exercises.ipynb to practice the identities, derivations, and implementation patterns.')