Converted from theory.ipynb for web reading.

Cross-Entropy

Cross-entropy is the bridge from information theory to the actual loss curves we use to train classifiers and language models.

This notebook is the interactive companion to notes.md. We move from coding intuition to logits, gradients, perplexity, label smoothing, distillation, and stable implementations.

Block 1: Coding intuition and the discrete definition
Block 2: Entropy + KL decomposition
Block 3: BCE and categorical CE from probabilities and logits
Block 4: Stable log-sum-exp, gradients, and Hessian geometry
Block 5: Label smoothing, distillation, masking, weighting, and perplexity

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

np.set_printoptions(precision=6, suppress=True)

try:
    import torch
    import torch.nn.functional as F
    torch.manual_seed(42)
    HAS_TORCH = True
except ImportError:
    HAS_TORCH = False


def entropy(p):
    p = np.asarray(p, dtype=float)
    p = p[p > 0]
    return float(-np.sum(p * np.log(p)))


def cross_entropy(p, q):
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if np.any((p > 0) & (q <= 0)):
        return np.inf
    mask = p > 0
    return float(-np.sum(p[mask] * np.log(q[mask])))


def kl_divergence(p, q):
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    if np.any((p > 0) & (q <= 0)):
        return np.inf
    mask = p > 0
    return float(np.sum(p[mask] * np.log(p[mask] / q[mask])))


def softmax(z):
    z = np.asarray(z, dtype=float)
    shifted = z - np.max(z)
    exps = np.exp(shifted)
    return exps / np.sum(exps)


def log_softmax(z):
    z = np.asarray(z, dtype=float)
    m = np.max(z)
    return z - m - np.log(np.sum(np.exp(z - m)))


def binary_cross_entropy_from_prob(y, p_hat):
    p_hat = np.clip(float(p_hat), 1e-12, 1 - 1e-12)
    return float(-(y * np.log(p_hat) + (1 - y) * np.log(1 - p_hat)))


def sigmoid(z):
    z = np.asarray(z, dtype=float)
    out = np.empty_like(z, dtype=float)
    pos = z >= 0
    out[pos] = 1.0 / (1.0 + np.exp(-z[pos]))
    exp_z = np.exp(z[~pos])
    out[~pos] = exp_z / (1.0 + exp_z)
    return out


def bce_with_logits(y, z):
    z = float(z)
    return float(max(z, 0.0) - y * z + np.log1p(np.exp(-abs(z))))


def one_hot(k, K):
    y = np.zeros(K, dtype=float)
    y[k] = 1.0
    return y

print(f'Helpers ready. Torch available: {HAS_TORCH}')

1. Coding Intuition

Cross-entropy is easiest to understand as the expected cost of using the wrong code. If the world generates symbols from a true distribution $p$ but we encode them using probabilities from $q$, the expected code length is $H(p, q)$.

Code cell 5

p = np.array([0.70, 0.20, 0.10])
q = np.array([0.50, 0.30, 0.20])

Hp = entropy(p)
Hpq = cross_entropy(p, q)
Dkl = kl_divergence(p, q)

print('Discrete source example:')
print('p =', p)
print('q =', q)
print(f'H(p)           = {Hp:.6f} nats')
print(f'H(p, q)        = {Hpq:.6f} nats')
print(f'D_KL(p || q)   = {Dkl:.6f} nats')
ok = np.allclose(Hpq, Hp + Dkl)
print(f"{'PASS' if ok else 'FAIL'} — cross-entropy equals entropy plus KL divergence")

Code cell 6

symbols = ['A', 'B', 'C']
code_lengths_q = -np.log(q)
code_lengths_p = -np.log(p)

print('Per-symbol code lengths (nats):')
for s, lp, lq in zip(symbols, code_lengths_p, code_lengths_q):
    print(f'  {s}: optimal for p = {lp:.4f}, using q = {lq:.4f}')

if HAS_SNS:
    fig, ax = plt.subplots()
    x = np.arange(len(symbols))
    ax.bar(x - 0.18, code_lengths_p, width=0.36, color=COLORS['primary'], label='Optimal lengths from p')
    ax.bar(x + 0.18, code_lengths_q, width=0.36, color=COLORS['secondary'], label='Lengths from q')
    ax.set_title('Code lengths under the true source and the model')
    ax.set_xlabel('Symbol')
    ax.set_ylabel('Length (nats)')
    ax.set_xticks(x)
    ax.set_xticklabels(symbols)
    ax.legend()
    fig.tight_layout()
    plt.show()

2. Entropy Versus Cross-Entropy

Entropy is intrinsic uncertainty under the true source. Cross-entropy is the surprise of true data when evaluated under the model. The model distribution enters only in the second quantity.

Code cell 8

qs = np.linspace(0.01, 0.99, 250)
p_true = np.array([0.8, 0.2])
ce_curve = np.array([cross_entropy(p_true, [q1, 1 - q1]) for q1 in qs])
true_entropy = entropy(p_true)

best_q = qs[np.argmin(ce_curve)]
print(f'True entropy H(p): {true_entropy:.6f} nats')
print(f'Cross-entropy minimum occurs near q(1) = {best_q:.4f}')
print(f"{'PASS' if abs(best_q - 0.8) < 1e-2 else 'FAIL'} — minimum is attained near the true Bernoulli parameter")

fig, ax = plt.subplots()
ax.plot(qs, ce_curve, color=COLORS['primary'], label='H(p, q)')
ax.axhline(true_entropy, color=COLORS['secondary'], linestyle='--', label='H(p)')
ax.axvline(0.8, color=COLORS['highlight'], linestyle=':', label='True parameter')
ax.set_title('Cross-entropy is minimized at the true distribution')
ax.set_xlabel('Model probability q(1)')
ax.set_ylabel('Cross-entropy (nats)')
ax.legend()
fig.tight_layout()
plt.show()

3. Empirical Cross-Entropy as an Average Negative Log-Likelihood

For a dataset, cross-entropy is estimated by an empirical average. This is exactly the negative log-likelihood per example.

Code cell 10

rng = np.random.default_rng(42)
samples = rng.choice(3, size=5000, p=p)
empirical_p = np.bincount(samples, minlength=3) / len(samples)
empirical_ce = np.mean([-np.log(q[x]) for x in samples])
analytic_ce = cross_entropy(p, q)

print('Empirical source estimate:', empirical_p)
print(f'Empirical CE under q: {empirical_ce:.6f}')
print(f'Analytic  CE under q: {analytic_ce:.6f}')
ok = abs(empirical_ce - analytic_ce) < 0.03
print(f"{'PASS' if ok else 'FAIL'} — sample average approximates population cross-entropy")

4. Continuous Cross-Entropy Example

For continuous distributions, cross-entropy becomes an expected negative log-density. A Gaussian example lets us compute it in closed form and by Monte Carlo.

Code cell 12

mu_p, sigma_p = 0.0, 1.0
mu_q, sigma_q = 0.4, 1.3

# Monte Carlo estimate with samples from p
x = np.random.normal(mu_p, sigma_p, size=30000)
log_q = -0.5 * np.log(2 * np.pi * sigma_q**2) - 0.5 * ((x - mu_q) / sigma_q) ** 2
mc_ce = -np.mean(log_q)

# Closed form for p = N(mu_p, sigma_p^2), q = N(mu_q, sigma_q^2)
closed_form = 0.5 * np.log(2 * np.pi * sigma_q**2) + (sigma_p**2 + (mu_p - mu_q)**2) / (2 * sigma_q**2)

print(f'Monte Carlo Gaussian CE: {mc_ce:.6f}')
print(f'Closed-form Gaussian CE: {closed_form:.6f}')
ok = abs(mc_ce - closed_form) < 0.02
print(f"{'PASS' if ok else 'FAIL'} — Monte Carlo matches the Gaussian cross-entropy formula")

5. Sequence Cross-Entropy and Perplexity

Autoregressive modeling turns a sequence log-probability into a sum of token-level losses. Average token cross-entropy exponentiates to perplexity.

Code cell 14

log_probs = np.log(np.array([0.30, 0.70, 0.40, 0.10, 0.55]))
sequence_nll = -np.sum(log_probs)
avg_ce = -np.mean(log_probs)
ppl = np.exp(avg_ce)

print(f'Sequence negative log-likelihood: {sequence_nll:.6f}')
print(f'Average token cross-entropy:      {avg_ce:.6f} nats/token')
print(f'Perplexity:                       {ppl:.6f}')
check = np.allclose(ppl, np.exp(avg_ce))
print(f"{'PASS' if check else 'FAIL'} — perplexity is the exponential of average token cross-entropy")

Code cell 15

# Two segmentations of the same hypothetical text can change per-token cross-entropy.
char_log_probs = np.log(np.array([0.85, 0.80, 0.90, 0.82, 0.88, 0.91]))
subword_log_probs = np.log(np.array([0.55, 0.61, 0.58]))

char_ce = -np.mean(char_log_probs)
subword_ce = -np.mean(subword_log_probs)

print(f'Character-like tokenization CE: {char_ce:.6f}')
print(f'Subword-like tokenization CE:   {subword_ce:.6f}')
print('These values are not directly comparable because the token events are different.')
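
A tokenization-independent comparison is possible if we normalize by a unit that does not depend on the segmentation, such as characters or bytes. The sketch below assumes, purely for illustration, that both sequences above segment the same hypothetical six-character string.

# Hedged sketch: assume both segmentations cover the same hypothetical 6-character text.
n_chars = 6
char_total_nll = -np.sum(char_log_probs)
subword_total_nll = -np.sum(subword_log_probs)
print(f'Character-like tokenization: {char_total_nll / n_chars:.6f} nats/char')
print(f'Subword-like tokenization:   {subword_total_nll / n_chars:.6f} nats/char')
print('Normalizing by a shared unit makes the two segmentations comparable.')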

6. The Identity $H(p, q) = H(p) + D_{\mathrm{KL}}(p \,\|\, q)$

This is the central structural identity for the section. It says cross-entropy is intrinsic uncertainty plus mismatch.

Code cell 17

rng = np.random.default_rng(7)
A = rng.uniform(0.1, 1.0, size=5)
B = rng.uniform(0.1, 1.0, size=5)
p5 = A / A.sum()
q5 = B / B.sum()

lhs = cross_entropy(p5, q5)
rhs = entropy(p5) + kl_divergence(p5, q5)
print('p =', p5)
print('q =', q5)
print(f'H(p, q)              = {lhs:.6f}')
print(f'H(p) + D_KL(p || q)  = {rhs:.6f}')
ok = np.allclose(lhs, rhs)
print(f"{'PASS' if ok else 'FAIL'} — decomposition holds on a random discrete example")

7. Support Mismatch

If the model assigns zero probability to an event that truly occurs, cross-entropy becomes infinite. This is mathematically correct, not a bug.

Code cell 19

p_bad = np.array([0.6, 0.4])
q_bad = np.array([1.0, 0.0])
value = cross_entropy(p_bad, q_bad)
print('p =', p_bad)
print('q =', q_bad)
print('Cross-entropy value:', value)
print(f"{'PASS' if np.isinf(value) else 'FAIL'} — support mismatch yields infinite cross-entropy")

8. Binary Cross-Entropy from a Bernoulli Model

For binary labels, BCE is the negative log-likelihood of a Bernoulli model. It strongly punishes confident mistakes.

Code cell 21

for y in [0, 1]:
    print(f'--- True label y = {y} ---')
    for p_hat in [0.01, 0.10, 0.50, 0.90, 0.99]:
        loss = binary_cross_entropy_from_prob(y, p_hat)
        print(f'  p_hat={p_hat:>4.2f} -> BCE={loss:.6f}')

Code cell 22

ps = np.linspace(1e-4, 1 - 1e-4, 400)
loss_y1 = np.array([binary_cross_entropy_from_prob(1, p) for p in ps])
loss_y0 = np.array([binary_cross_entropy_from_prob(0, p) for p in ps])

fig, ax = plt.subplots()
ax.plot(ps, loss_y1, color=COLORS['primary'], label='True label y=1')
ax.plot(ps, loss_y0, color=COLORS['secondary'], label='True label y=0')
ax.set_title('Binary cross-entropy curves')
ax.set_xlabel('Predicted probability $p$')
ax.set_ylabel('Loss')
ax.legend()
fig.tight_layout()
plt.show()

9. BCE From Logits

In real training code, BCE should usually be computed from logits rather than post-sigmoid probabilities. This avoids numerical instability.

Code cell 24

for z in [-20, -5, 0, 5, 20]:
    p_hat = float(sigmoid(np.array([z]))[0])
    naive = binary_cross_entropy_from_prob(1, p_hat)
    stable = bce_with_logits(1, z)
    print(f'z={z:>3} -> sigmoid(z)={p_hat:.12f}, naive BCE={naive:.12f}, stable BCE={stable:.12f}')

10. Categorical Cross-Entropy From Softmax Logits

For one-hot labels, the multiclass cross-entropy reduces to the negative log-probability of the true class. In logits, it becomes a log-sum-exp expression.

Code cell 26

z = np.array([2.2, 0.4, -1.1, 1.7])
true_class = 3
p_hat = softmax(z)
ce_prob = -np.log(p_hat[true_class])
ce_logits = -z[true_class] + np.log(np.sum(np.exp(z)))

print('logits:', z)
print('softmax probabilities:', p_hat)
print(f'CE from probabilities: {ce_prob:.6f}')
print(f'CE from logits:        {ce_logits:.6f}')
ok = np.allclose(ce_prob, ce_logits)
print(f"{'PASS' if ok else 'FAIL'} — probability-space and logit-space CE match")
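
The setup cell optionally imports PyTorch but has not used it yet. As a quick, optional cross-check (a sketch that only runs when HAS_TORCH is true), torch.nn.functional.cross_entropy, which also consumes raw logits and an integer class index, should reproduce the same number.

if HAS_TORCH:
    # F.cross_entropy expects logits of shape (N, C) and integer class targets of shape (N,).
    z_t = torch.tensor(z, dtype=torch.float32).unsqueeze(0)
    y_t = torch.tensor([true_class])
    ce_torch = F.cross_entropy(z_t, y_t).item()
    print(f'CE from F.cross_entropy: {ce_torch:.6f}')
    ok = np.allclose(ce_torch, ce_logits, atol=1e-5)
    print(f"{'PASS' if ok else 'FAIL'} — PyTorch matches the NumPy logit-space computation")
else:
    print('PyTorch not available; skipping the cross-check.')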

11. Soft Targets

Cross-entropy naturally extends from one-hot labels to full target distributions. This is the mathematical home of label smoothing and distillation.

Code cell 28

z = np.array([3.0, 0.5, -0.2, -1.5])
p_hat = softmax(z)
one_hot_target = one_hot(0, 4)
soft_target = np.array([0.85, 0.10, 0.03, 0.02])

ce_hard = -np.sum(one_hot_target * np.log(p_hat))
ce_soft = -np.sum(soft_target * np.log(p_hat))

print('prediction:', p_hat)
print('hard target:', one_hot_target)
print('soft target:', soft_target)
print(f'Cross-entropy with hard target: {ce_hard:.6f}')
print(f'Cross-entropy with soft target: {ce_soft:.6f}')

12. Label Smoothing

Label smoothing replaces a point mass with a slightly higher-entropy target. It usually discourages extreme overconfidence.

Code cell 30

K = 5
true_class = 2
eps = 0.1
hard = one_hot(true_class, K)
smoothed = (1 - eps) * hard + eps / K * np.ones(K)

print('hard target    =', hard)
print('smoothed target=', smoothed)
print(f'Entropy(hard)     = {entropy(hard):.6f}')
print(f'Entropy(smoothed) = {entropy(smoothed):.6f}')
ok = entropy(smoothed) > entropy(hard)
print(f"{'PASS' if ok else 'FAIL'} — label smoothing increases target entropy")

Code cell 31

pred = np.array([0.93, 0.02, 0.02, 0.02, 0.01])
hard_target = one_hot(0, 5)
soft_target = (1 - 0.1) * hard_target + 0.1 / 5 * np.ones(5)
loss_hard = -np.sum(hard_target * np.log(pred))
loss_soft = -np.sum(soft_target * np.log(pred))

print('prediction:', pred)
print(f'Hard-target CE:   {loss_hard:.6f}')
print(f'Smoothed-target CE: {loss_soft:.6f}')
print('The smoothed target penalizes overconfidence more broadly across classes.')
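
One way to see why: because the smoothed target is a mixture of the hard target and the uniform distribution, the smoothed loss decomposes exactly into (1 - eps) times the hard-target loss plus eps times the mean negative log-probability over all classes, so every class the model neglects contributes. A small check of that identity:

eps, K = 0.1, 5
decomposed = (1 - eps) * loss_hard + eps * np.mean(-np.log(pred))
print(f'Decomposed smoothed loss: {decomposed:.6f}')
ok = np.allclose(decomposed, loss_soft)
print(f"{'PASS' if ok else 'FAIL'} — smoothed CE = (1 - eps) * hard CE + eps * mean(-log p_hat)")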

13. Knowledge Distillation

Soft teacher targets can carry information about class similarity that hard labels hide. Cross-entropy is the natural comparison tool.

Code cell 33

teacher_logits = np.array([7.0, 5.5, 1.0, -0.5])
student_logits = np.array([4.5, 3.7, 0.2, -0.7])

def softmax_temp(z, tau):
    z = np.asarray(z, dtype=float) / tau
    z = z - np.max(z)
    exps = np.exp(z)
    return exps / np.sum(exps)

for tau in [1.0, 2.0, 4.0]:
    teacher = softmax_temp(teacher_logits, tau)
    student = softmax_temp(student_logits, tau)
    kd_ce = -np.sum(teacher * np.log(student))
    print(f'tau={tau:.1f}')
    print('  teacher:', teacher)
    print('  student:', student)
    print(f'  soft-target CE: {kd_ce:.6f}')

14. Stable Log-Sum-Exp in Action

Subtracting the maximum logit prevents overflow while preserving the exact result.

Code cell 35

z_big = np.array([1000.0, 1001.0, 999.0])

try:
    naive_softmax = np.exp(z_big) / np.sum(np.exp(z_big))
    naive_ok = np.all(np.isfinite(naive_softmax))
except FloatingPointError:
    naive_softmax = np.array([np.nan, np.nan, np.nan])
    naive_ok = False

stable_probs = softmax(z_big)
print('Naive softmax:', naive_softmax)
print('Stable softmax:', stable_probs)
print(f"{'PASS' if not naive_ok else 'FAIL'} — naive computation overflows or becomes unstable")

Code cell 36

val1 = log_softmax(z_big)[1]
val2 = z_big[1] - np.max(z_big) - np.log(np.sum(np.exp(z_big - np.max(z_big))))
print(f'log_softmax component: {val1:.12f}')
print(f'manual stable formula: {val2:.12f}')
print(f"{'PASS' if np.allclose(val1, val2) else 'FAIL'} — stable log-softmax matches the explicit formula")

15. The Softmax-Cross-Entropy Gradient

For one-hot targets, the logit-space gradient is predicted minus target. We can verify that numerically.

Code cell 38

z = np.array([0.8, -0.4, 1.2])
y = one_hot(2, 3)
p_hat = softmax(z)
grad_closed = p_hat - y


def loss_from_logits(z_vec):
    p = softmax(z_vec)
    return -np.log(p[2])

# finite differences
h = 1e-6
grad_fd = np.zeros_like(z)
for i in range(len(z)):
    zp = z.copy(); zp[i] += h
    zm = z.copy(); zm[i] -= h
    grad_fd[i] = (loss_from_logits(zp) - loss_from_logits(zm)) / (2 * h)

print('softmax probabilities:', p_hat)
print('closed-form gradient :', grad_closed)
print('finite-diff gradient :', grad_fd)
ok = np.allclose(grad_closed, grad_fd, atol=1e-6)
print(f"{'PASS' if ok else 'FAIL'} — gradient equals predicted minus target")
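
The same closed form holds for any target distribution that sums to one, not only one-hot vectors. The sketch below repeats the finite-difference check with a made-up soft target and the same logits.

soft_y = np.array([0.7, 0.2, 0.1])   # hypothetical soft target that sums to 1
h_fd = 1e-6

def soft_loss(z_vec):
    return float(-np.sum(soft_y * log_softmax(z_vec)))

grad_soft_closed = softmax(z) - soft_y
grad_soft_fd = np.zeros_like(z)
for i in range(len(z)):
    zp = z.copy(); zp[i] += h_fd
    zm = z.copy(); zm[i] -= h_fd
    grad_soft_fd[i] = (soft_loss(zp) - soft_loss(zm)) / (2 * h_fd)

print('soft-target closed-form gradient:', grad_soft_closed)
print('soft-target finite-diff gradient:', grad_soft_fd)
ok = np.allclose(grad_soft_closed, grad_soft_fd, atol=1e-6)
print(f"{'PASS' if ok else 'FAIL'} — gradient is still predicted minus target for soft targets")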

16. Hessian Geometry

The Hessian in logit space is the covariance matrix of the softmax distribution. It is symmetric and positive semidefinite.

Code cell 40

z = np.array([1.4, 0.1, -0.8, 0.3])
p_hat = softmax(z)
H = np.diag(p_hat) - np.outer(p_hat, p_hat)
eigs = np.linalg.eigvalsh(H)

print('softmax probabilities:', p_hat)
print('Hessian matrix:\n', H)
print('eigenvalues:', eigs)
checks = [
    np.allclose(H, H.T),
    np.min(eigs) >= -1e-12,
    np.allclose(H.sum(axis=1), 0.0),
]
labels = ['symmetry', 'positive semidefinite up to numerical tolerance', 'row sums are zero']
for name, cond in zip(labels, checks):
    print(f"{'PASS' if cond else 'FAIL'} — {name}")

17. Cross-Entropy as a Strictly Proper Scoring Rule

Expected log-loss is minimized at the true distribution. A binary example makes this very concrete.

Code cell 42

p_true = 0.73
qs = np.linspace(0.01, 0.99, 400)
expected_log_loss = -p_true * np.log(qs) - (1 - p_true) * np.log(1 - qs)
q_star = qs[np.argmin(expected_log_loss)]

print(f'True Bernoulli probability: {p_true:.4f}')
print(f'Expected log-loss minimizer: {q_star:.4f}')
ok = abs(q_star - p_true) < 5e-3
print(f"{'PASS' if ok else 'FAIL'} — expected log-loss is minimized at the true probability")

fig, ax = plt.subplots()
ax.plot(qs, expected_log_loss, color=COLORS['primary'])
ax.axvline(p_true, color=COLORS['highlight'], linestyle='--', label='True probability')
ax.set_title('Log-loss is strictly proper in the binary case')
ax.set_xlabel('Forecasted probability q')
ax.set_ylabel('Expected log-loss')
ax.legend()
fig.tight_layout()
plt.show()

18. Calibration and Temperature Scaling

Cross-entropy-trained models can still be overconfident in finite-sample practice. Temperature scaling is a simple post-hoc fix in logit space.

Code cell 44

logits = np.array([
    [5.0, 0.8, -1.0],
    [4.5, 0.3, -0.2],
    [3.8, 1.1, -0.1],
    [0.2, 1.7, 0.1],
    [1.0, 0.7, 0.4],
])
labels = np.array([0, 0, 1, 1, 2])

def avg_nll(temp):
    losses = []
    for z, y in zip(logits / temp, labels):
        losses.append(-log_softmax(z)[y])
    return float(np.mean(losses))

temps = np.linspace(0.5, 3.0, 150)
losses = np.array([avg_nll(t) for t in temps])
best_temp = temps[np.argmin(losses)]

print(f'Best temperature on this toy validation set: {best_temp:.4f}')
print(f'NLL at tau=1.0: {avg_nll(1.0):.6f}')
print(f'NLL at best tau: {losses.min():.6f}')

fig, ax = plt.subplots()
ax.plot(temps, losses, color=COLORS['secondary'])
ax.axvline(best_temp, color=COLORS['highlight'], linestyle='--', label='Best tau')
ax.set_title('Temperature scaling tunes validation NLL')
ax.set_xlabel(r'Temperature $\tau$')
ax.set_ylabel('Average negative log-likelihood')
ax.legend()
fig.tight_layout()
plt.show()
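
Because dividing all logits by a positive temperature never changes which logit is largest, temperature scaling can only move probabilities, not predictions: accuracy is untouched while validation NLL improves. A quick check on the same toy logits:

preds_before = np.argmax(logits, axis=1)
preds_after = np.argmax(logits / best_temp, axis=1)
ok = np.array_equal(preds_before, preds_after)
print('argmax at tau = 1.0 :', preds_before)
print('argmax at best tau  :', preds_after)
print(f"{'PASS' if ok else 'FAIL'} — temperature scaling leaves the argmax predictions unchanged")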

19. Weighted and Masked Cross-Entropy

In real systems, CE is often modified by weighting classes or masking invalid positions. These are not just coding details: they change which errors matter most.

Code cell 46

# weighted binary example
probs = np.array([0.9, 0.8, 0.7, 0.4, 0.3, 0.2])
labels = np.array([0, 0, 0, 1, 1, 1])
weights = np.where(labels == 1, 3.0, 1.0)

plain = np.mean([binary_cross_entropy_from_prob(y, p) for y, p in zip(labels, probs)])
weighted = np.sum(weights * np.array([binary_cross_entropy_from_prob(y, p) for y, p in zip(labels, probs)])) / np.sum(weights)

print(f'Unweighted BCE average: {plain:.6f}')
print(f'Weighted   BCE average: {weighted:.6f}')
print('The weighted loss emphasizes the positive-class mistakes more strongly.')

Code cell 47

# masked token loss example
losses = np.array([0.4, 0.9, 1.2, 0.3, 2.0, 0.6])
mask = np.array([1, 1, 0, 1, 0, 1], dtype=float)
masked_avg = float(np.sum(losses * mask) / np.sum(mask))
naive_avg = float(np.mean(losses * mask))

print('token losses:', losses)
print('mask        :', mask.astype(int))
print(f'Correct masked average: {masked_avg:.6f}')
print(f'Naive average incl. zeros: {naive_avg:.6f}')
print('These are not the same estimator.')
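
In PyTorch training code, the same masked average is usually obtained with the ignore_index argument of cross_entropy instead of an explicit mask. The sketch below uses a made-up three-class batch with one padded position and only runs when HAS_TORCH is true.

if HAS_TORCH:
    # Hypothetical toy batch: 4 token positions, 3-class vocabulary, one padded position.
    tok_logits = torch.tensor([[2.0, 0.1, -1.0],
                               [0.3, 1.5, -0.2],
                               [0.0, 0.0, 0.0],   # padding position, should not count
                               [1.2, -0.4, 0.8]])
    tok_targets = torch.tensor([0, 1, -100, 2])   # -100 is the default ignore_index
    mean_loss = F.cross_entropy(tok_logits, tok_targets, ignore_index=-100)
    per_token = F.cross_entropy(tok_logits, tok_targets, ignore_index=-100, reduction='none')
    manual = per_token[tok_targets != -100].mean()
    print(f'ignore_index mean loss: {mean_loss.item():.6f}')
    print(f'manual masked average : {manual.item():.6f}')
    ok = torch.allclose(mean_loss, manual)
    print(f"{'PASS' if ok else 'FAIL'} — ignore_index averages over unmasked positions only")
else:
    print('PyTorch not available; skipping the ignore_index example.')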

20. When Cross-Entropy Is Not the Whole Story

CE is excellent for honest probabilistic prediction, but some settings require more. Class imbalance, label noise, OOD confidence, or structured decisions can motivate modified objectives.

Code cell 49

# same accuracy, different cross-entropy
preds_A = np.array([[0.51, 0.49], [0.51, 0.49], [0.51, 0.49], [0.49, 0.51]])
preds_B = np.array([[0.99, 0.01], [0.95, 0.05], [0.97, 0.03], [0.01, 0.99]])
labels = np.array([0, 0, 0, 1])

acc_A = np.mean(np.argmax(preds_A, axis=1) == labels)
acc_B = np.mean(np.argmax(preds_B, axis=1) == labels)
ce_A = np.mean([-np.log(preds_A[i, y]) for i, y in enumerate(labels)])
ce_B = np.mean([-np.log(preds_B[i, y]) for i, y in enumerate(labels)])

print(f'Accuracy A = {acc_A:.3f}, CE A = {ce_A:.6f}')
print(f'Accuracy B = {acc_B:.3f}, CE B = {ce_B:.6f}')
print('Same accuracy can hide very different probabilistic quality.')

21. Preference and Ranking Preview

Binary and pairwise log-losses are still cross-entropy-family objects. They show up in reward modeling and preference learning.

Code cell 51

score_diff = np.linspace(-4, 4, 9)
preference_prob = sigmoid(score_diff)
preference_loss = -np.log(preference_prob)

for d, p, l in zip(score_diff, preference_prob, preference_loss):
    print(f'score difference={d:>4.1f} -> preferred-prob={p:.6f}, loss={l:.6f}')
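
This pairwise loss is exactly binary cross-entropy on the score difference with the preferred outcome as the positive label, so the numerically stable bce_with_logits helper from the setup cell reproduces the same values.

stable_losses = np.array([bce_with_logits(1, d) for d in score_diff])
ok = np.allclose(stable_losses, preference_loss)
print(f"{'PASS' if ok else 'FAIL'} — -log(sigmoid(score difference)) equals BCE-with-logits at label 1")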

22. Closing Checks

We finish by verifying the key ideas on one random example.

Code cell 53

rng = np.random.default_rng(123)
p = rng.uniform(0.1, 1.0, size=4)
q = rng.uniform(0.1, 1.0, size=4)
p /= p.sum()
q /= q.sum()

Hp = entropy(p)
Hpq = cross_entropy(p, q)
Dkl = kl_divergence(p, q)

checks = {
    'cross-entropy decomposition': np.allclose(Hpq, Hp + Dkl),
    'cross-entropy lower bound': Hpq + 1e-12 >= Hp,
    'KL non-negativity': Dkl >= -1e-12,
}

print('p =', p)
print('q =', q)
for name, cond in checks.items():
    print(f"{'PASS' if cond else 'FAIL'} — {name}")
print('\nNotebook complete. Next: use exercises.ipynb to practice the identities, derivations, and implementation patterns.')