Theory Notebook

Converted from theory.ipynb for web reading.

Preference Optimization RLHF and DPO

Preference optimization learns from comparisons, either by fitting a reward model and optimizing a KL-regularized policy or by directly optimizing policy log-ratios.
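For reference, these objectives have the following standard forms in the notation the demos below use (reward model $r_\phi$, policy $\pi_\theta$, reference policy $\pi_{\mathrm{ref}}$, KL coefficient $\beta$, sigmoid $\sigma$):

$$P(y_w \succ y_l \mid x) = \sigma\big(r_\phi(x, y_w) - r_\phi(x, y_l)\big) \qquad \text{(Bradley-Terry)}$$

$$\max_{\pi_\theta}\; \mathbb{E}_{x \sim \mathcal{D},\, y \sim \pi_\theta(\cdot \mid x)}\big[r_\phi(x, y)\big] \;-\; \beta\, \mathrm{KL}\big(\pi_\theta(\cdot \mid x)\,\|\,\pi_{\mathrm{ref}}(\cdot \mid x)\big) \qquad \text{(RLHF)}$$

$$\mathcal{L}_{\mathrm{DPO}} = -\,\mathbb{E}_{(x, y_w, y_l)}\left[\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} - \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\right)\right] \qquad \text{(DPO)}$$

The DPO loss is what remains of the KL-regularized solution once the reward model is eliminated, which is why the demos below treat $\beta \log \frac{\pi_\theta}{\pi_{\mathrm{ref}}}$ as an implicit reward.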

This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objectives, tradeoffs, and feedback mechanics can be studied without external files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - z.max(axis=axis, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / exp_z.sum(axis=axis, keepdims=True)

def log_softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    shifted = z - z.max(axis=axis, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))

def check_true(condition, message):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {message}")
    assert ok

def check_close(actual, expected, tol=1e-8, message="values close"):
    ok = abs(float(actual) - float(expected)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={float(actual):.6f}, expected={float(expected):.6f}")
    assert ok

print("Alignment helper functions loaded.")

Demo 1: Preferences optimize choices rather than demonstrations

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 5

header("Demo 1 - Preferences optimize choices rather than demonstrations: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 2: Reward models as learned proxies

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 7

header("Demo 2 - Reward models as learned proxies: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")
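In practice the log ratios above would come from sequence log-probabilities: each entry is log π_θ(y|x) - log π_ref(y|x), summed over the tokens of the response. A minimal sketch of that computation on made-up per-token logits (the logits, vocabulary size, and token ids here are purely illustrative) using the log_softmax helper from Code cell 3:

# Hypothetical per-token logits over a 4-token vocabulary for a 3-token response.
policy_logits = np.array([[2.0, 0.1, -1.0, 0.3],
                          [0.5, 1.5, 0.0, -0.5],
                          [1.0, -0.2, 0.8, 0.1]])
ref_logits = np.array([[1.0, 0.2, -0.5, 0.4],
                       [0.4, 1.0, 0.1, -0.2],
                       [0.6, 0.0, 0.9, 0.2]])
response_tokens = np.array([0, 1, 2])  # token actually generated at each step

steps = np.arange(len(response_tokens))
# Sequence log-prob = sum of the generated tokens' log-probabilities.
logp_policy = log_softmax(policy_logits)[steps, response_tokens].sum()
logp_ref = log_softmax(ref_logits)[steps, response_tokens].sum()
log_ratio = logp_policy - logp_ref
print(f"log pi_theta(y|x)={logp_policy:.4f}, log pi_ref(y|x)={logp_ref:.4f}, log ratio={log_ratio:.4f}")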

Demo 3: Policy shift under a KL budget

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 9

header("Demo 3 - Policy shift under a KL budget: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")
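For the quadratic proxy plotted above, objective(s) = s - β·s², the unconstrained maximizer has the closed form s* = 1/(2β), so doubling β halves the optimal shift. A quick check on the same grid (for the smaller β values the analytic optimum lies outside the plotted range, so the grid argmax saturates at the right edge):

reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    s_star_analytic = 1.0 / (2.0 * beta)        # argmax of s - beta * s^2
    objective = reward - beta * reward**2
    s_star_grid = reward[np.argmax(objective)]  # best point on the plotted grid (clipped at 2.0)
    print(f"beta={beta}: analytic optimum={s_star_analytic:.3f}, grid optimum={s_star_grid:.3f}")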

Demo 4: DPO as direct reward-model-free training

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 11

header("Demo 4 - DPO as direct reward-model-free training: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
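Under this synthetic label model, reviewer B disagrees with reviewer A exactly when a flip happens, so the expected raw agreement is 1 - 0.12 = 0.88; the 300-sample estimate above should land near that. A sanity check with a larger sample to shrink the noise (the sample size here is arbitrary):

flip_rate = 0.12
n = 100_000
labels_a = np.random.choice([0, 1], size=n, p=[0.7, 0.3])
labels_b = np.where(np.random.rand(n) < flip_rate, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
print(f"expected agreement={1 - flip_rate:.3f}, simulated agreement={agree:.3f}")
check_true(abs(agree - (1 - flip_rate)) < 0.01, "raw agreement matches 1 - flip rate")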

Demo 5: Why preference data is noisy but valuable

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 13

header("Demo 5 - Why preference data is noisy but valuable: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 6: Preference pair $(x, y_w, y_l)$

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 15

header("Demo 6 - Preference pair $(x,y_w,y_l)$: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 7: Reward model $r_\phi(x, y)$

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 17

header("Demo 7 - Reward model $r_\phi(x,y)$: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")

Demo 8: Bradley-Terry preference model

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 19

header("Demo 8 - Bradley-Terry preference model: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 9: Reference policy $\pi_{\mathrm{ref}}$

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 21

header("Demo 9 - Reference policy $\pi_{\mathrm{ref}}$: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 10: KL coefficient and inverse temperature $\beta$

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 23

header("Demo 10 - KL coefficient and inverse temperature $\beta$: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 11: Pairwise logistic loss

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 25

header("Demo 11 - Pairwise logistic loss: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 12: Reward margins

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 27

header("Demo 12 - Reward margins: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")

Demo 13: Annotator disagreement

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 29

header("Demo 13 - Annotator disagreement: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 14: Reward calibration

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 31

header("Demo 14 - Reward calibration: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 15: RewardBench-style evaluation

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 33

header("Demo 15 - RewardBench-style evaluation: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 16: SFT policy initialization

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 35

header("Demo 16 - SFT policy initialization: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 17: Reward model training

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 37

header("Demo 17 - Reward model training: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")

Demo 18: PPO objective

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 39

header("Demo 18 - PPO objective: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 19: KL penalty

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 41

header("Demo 19 - KL penalty: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 20: Reward hacking controls

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 43

header("Demo 20 - Reward hacking controls: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 21: KL-constrained optimal policy

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 45

header("Demo 21 - KL-constrained optimal policy: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")
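The heading refers to the closed-form solution of the KL-regularized objective, π*(y|x) ∝ π_ref(y|x)·exp(r(x,y)/β). A minimal sketch on a toy discrete candidate set (the reference probabilities and rewards below are invented for illustration) uses the softmax helper from Code cell 3 and shows that a small β concentrates the optimal policy on high-reward responses while a large β keeps it close to the reference:

ref_probs = np.array([0.4, 0.3, 0.2, 0.1])  # toy reference policy over 4 candidate responses
rewards = np.array([0.2, 1.0, 2.0, 0.5])    # toy reward scores for the same candidates
for beta in [0.05, 0.2, 0.8]:
    # pi_star(y) proportional to pi_ref(y) * exp(r(y) / beta); softmax normalizes in log space.
    pi_star = softmax(np.log(ref_probs) + rewards / beta)
    kl = np.sum(pi_star * np.log(pi_star / ref_probs))
    print(f"beta={beta}: pi_star={np.round(pi_star, 3)}, KL(pi_star || pi_ref)={kl:.3f}")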

Demo 22: Implicit reward

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 47

header("Demo 22 - Implicit reward: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")
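DPO's implicit reward is r(x, y) = β·log(π_θ(y|x)/π_ref(y|x)), up to a prompt-only term that cancels inside a preference pair. Feeding those implicit rewards into the Bradley-Terry expression from the earlier demos reproduces exactly the probabilities the DPO loss above is fitting, which this check (reusing beta, the log-ratio arrays, and loss from the cell above) makes explicit:

implicit_r_win = beta * log_ratio_win    # implicit reward of the chosen responses
implicit_r_lose = beta * log_ratio_lose  # implicit reward of the rejected responses
p_bt = sigmoid(implicit_r_win - implicit_r_lose)
print("Bradley-Terry probabilities from implicit rewards:", np.round(p_bt, 3))
check_close(-np.log(p_bt).mean(), loss, message="DPO loss equals the pairwise loss on implicit rewards")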

Demo 23: DPO loss

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 49

header("Demo 23 - DPO loss: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 24: $\beta$ tradeoff

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 51

header("Demo 24 - $\beta$ tradeoff: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 25: Gradient interpretation

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 53

header("Demo 25 - Gradient interpretation: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
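The "gradient weights" printed in the DPO cells are the factor that scales each pair's gradient: for a pair with margin Δ = log-ratio(win) - log-ratio(lose), the per-pair loss is -log σ(βΔ) and its derivative with respect to Δ is -β·σ(-βΔ), so hard pairs (small or negative margins) get large weights and easy pairs get small ones. A finite-difference check of that derivative, using the same β and log-ratio arrays as the DPO demos and an ad-hoc helper per_pair_loss:

beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
delta = log_ratio_win - log_ratio_lose          # per-pair margin in log-ratio space

def per_pair_loss(d):
    return -np.log(sigmoid(beta * d))           # DPO loss for a single pair with margin d

eps = 1e-6
numeric_grad = (per_pair_loss(delta + eps) - per_pair_loss(delta - eps)) / (2 * eps)
analytic_grad = -beta * sigmoid(-beta * delta)  # derivative of -log(sigmoid(beta * d)) w.r.t. d
print("numeric gradient: ", np.round(numeric_grad, 5))
print("analytic gradient:", np.round(analytic_grad, 5))
check_true(np.allclose(numeric_grad, analytic_grad, atol=1e-6), "gradient matches -beta * sigmoid(-beta * margin)")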