Theory Notebook

Converted from theory.ipynb for web reading.

Preference Optimization RLHF and DPO

Preference optimization learns from comparisons, either by fitting a reward model and optimizing a KL-regularized policy or by directly optimizing policy log-ratios.
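For reference, these objectives have the following standard forms in the notation the demos below use (reward model $r_\phi$, policy $\pi_\theta$, reference policy $\pi_{\mathrm{ref}}$, KL coefficient $\beta$, sigmoid $\sigma$):

$$P(y_w \succ y_l \mid x) = \sigma\big(r_\phi(x, y_w) - r_\phi(x, y_l)\big) \qquad \text{(Bradley-Terry)}$$

$$\max_{\pi_\theta}\; \mathbb{E}_{x \sim \mathcal{D},\, y \sim \pi_\theta(\cdot \mid x)}\big[r_\phi(x, y)\big] \;-\; \beta\, \mathrm{KL}\big(\pi_\theta(\cdot \mid x)\,\|\,\pi_{\mathrm{ref}}(\cdot \mid x)\big) \qquad \text{(RLHF)}$$

$$\mathcal{L}_{\mathrm{DPO}} = -\,\mathbb{E}_{(x, y_w, y_l)}\left[\log \sigma\!\left(\beta \log \frac{\pi_\theta(y_w \mid x)}{\pi_{\mathrm{ref}}(y_w \mid x)} - \beta \log \frac{\pi_\theta(y_l \mid x)}{\pi_{\mathrm{ref}}(y_l \mid x)}\right)\right] \qquad \text{(DPO)}$$

The DPO loss is what remains of the KL-regularized solution once the reward model is eliminated, which is why the demos below treat $\beta \log \frac{\pi_\theta}{\pi_{\mathrm{ref}}}$ as an implicit reward.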

This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objectives, tradeoffs, and feedback mechanics can be studied without external files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - z.max(axis=axis, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / exp_z.sum(axis=axis, keepdims=True)

def log_softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    shifted = z - z.max(axis=axis, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))

def check_true(condition, message):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {message}")
    assert ok

def check_close(actual, expected, tol=1e-8, message="values close"):
    ok = abs(float(actual) - float(expected)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={float(actual):.6f}, expected={float(expected):.6f}")
    assert ok

print("Alignment helper functions loaded.")

Demo 1: Preferences optimize choices rather than demonstrations

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 5

header("Demo 1 - Preferences optimize choices rather than demonstrations: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 2: Reward models as learned proxies

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 7

header("Demo 2 - Reward models as learned proxies: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")
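In practice the log ratios above would come from sequence log-probabilities: each entry is log π_θ(y|x) - log π_ref(y|x), summed over the tokens of the response. A minimal sketch of that computation on made-up per-token logits (the logits, vocabulary size, and token ids here are purely illustrative) using the log_softmax helper from Code cell 3:

# Hypothetical per-token logits over a 4-token vocabulary for a 3-token response.
policy_logits = np.array([[2.0, 0.1, -1.0, 0.3],
                          [0.5, 1.5, 0.0, -0.5],
                          [1.0, -0.2, 0.8, 0.1]])
ref_logits = np.array([[1.0, 0.2, -0.5, 0.4],
                       [0.4, 1.0, 0.1, -0.2],
                       [0.6, 0.0, 0.9, 0.2]])
response_tokens = np.array([0, 1, 2])  # token actually generated at each step

steps = np.arange(len(response_tokens))
# Sequence log-prob = sum of the generated tokens' log-probabilities.
logp_policy = log_softmax(policy_logits)[steps, response_tokens].sum()
logp_ref = log_softmax(ref_logits)[steps, response_tokens].sum()
log_ratio = logp_policy - logp_ref
print(f"log pi_theta(y|x)={logp_policy:.4f}, log pi_ref(y|x)={logp_ref:.4f}, log ratio={log_ratio:.4f}")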

Demo 3: Policy shift under a KL budget

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 9

header("Demo 3 - Policy shift under a KL budget: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")
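For the quadratic proxy plotted above, objective(s) = s - β·s², the unconstrained maximizer has the closed form s* = 1/(2β), so doubling β halves the optimal shift. A quick check on the same grid (for the smaller β values the analytic optimum lies outside the plotted range, so the grid argmax saturates at the right edge):

reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    s_star_analytic = 1.0 / (2.0 * beta)        # argmax of s - beta * s^2
    objective = reward - beta * reward**2
    s_star_grid = reward[np.argmax(objective)]  # best point on the plotted grid (clipped at 2.0)
    print(f"beta={beta}: analytic optimum={s_star_analytic:.3f}, grid optimum={s_star_grid:.3f}")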

Demo 4: DPO as direct reward-model-free training

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 11

header("Demo 4 - DPO as direct reward-model-free training: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
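Under this synthetic label model, reviewer B disagrees with reviewer A exactly when a flip happens, so the expected raw agreement is 1 - 0.12 = 0.88; the 300-sample estimate above should land near that. A sanity check with a larger sample to shrink the noise (the sample size here is arbitrary):

flip_rate = 0.12
n = 100_000
labels_a = np.random.choice([0, 1], size=n, p=[0.7, 0.3])
labels_b = np.where(np.random.rand(n) < flip_rate, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
print(f"expected agreement={1 - flip_rate:.3f}, simulated agreement={agree:.3f}")
check_true(abs(agree - (1 - flip_rate)) < 0.01, "raw agreement matches 1 - flip rate")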

Demo 5: Why preference data is noisy but valuable

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 13

header("Demo 5 - Why preference data is noisy but valuable: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 6: Preference pair $(x, y_w, y_l)$

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 15

header("Demo 6 - Preference pair $(x,y_w,y_l)$: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 7: Reward model $r_\phi(x, y)$

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 17

header("Demo 7 - Reward model $r_\phi(x,y)$: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")

Demo 8: Bradley-Terry preference model

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 19

header("Demo 8 - Bradley-Terry preference model: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 9: Reference policy $\pi_{\mathrm{ref}}$

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 21

header("Demo 9 - Reference policy $\pi_{\mathrm{ref}}$: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 10: KL coefficient and inverse temperature $\beta$

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 23

header("Demo 10 - KL coefficient and inverse temperature $\beta$: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 11: Pairwise logistic loss

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 25

header("Demo 11 - Pairwise logistic loss: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 12: Reward margins

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 27

header("Demo 12 - Reward margins: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")

Demo 13: Annotator disagreement

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 29

header("Demo 13 - Annotator disagreement: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 14: Reward calibration

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 31

header("Demo 14 - Reward calibration: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 15: RewardBench-style evaluation

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 33

header("Demo 15 - RewardBench-style evaluation: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 16: SFT policy initialization

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 35

header("Demo 16 - SFT policy initialization: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")

Demo 17: Reward model training

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 37

header("Demo 17 - Reward model training: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")

Demo 18: PPO objective

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 39

header("Demo 18 - PPO objective: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 19: KL penalty

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 41

header("Demo 19 - KL penalty: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 20: Reward hacking controls

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 43

header("Demo 20 - Reward hacking controls: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 21: KL-constrained optimal policy

This cell scores four synthetic chosen/rejected response pairs with the Bradley-Terry model: the reward margin passes through a sigmoid to give the preference probability, and the mean negative log-probability is the pairwise reward-model loss.

Code cell 45

header("Demo 21 - KL-constrained optimal policy: Bradley-Terry preference probability")
r_win = np.array([2.1, 0.7, 1.4, 0.2])
r_lose = np.array([0.3, 0.4, 0.8, 0.5])
margin = r_win - r_lose
p = sigmoid(margin)
loss = -np.log(p).mean()
print("preference probabilities:", np.round(p, 3))
print(f"pairwise reward-model loss={loss:.4f}")
check_true(np.all((p > 0) & (p < 1)), "Bradley-Terry probabilities are valid")
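The heading refers to the closed-form solution of the KL-regularized objective, π*(y|x) ∝ π_ref(y|x)·exp(r(x,y)/β). A minimal sketch on a toy discrete candidate set (the reference probabilities and rewards below are invented for illustration) uses the softmax helper from Code cell 3 and shows that a small β concentrates the optimal policy on high-reward responses while a large β keeps it close to the reference:

ref_probs = np.array([0.4, 0.3, 0.2, 0.1])  # toy reference policy over 4 candidate responses
rewards = np.array([0.2, 1.0, 2.0, 0.5])    # toy reward scores for the same candidates
for beta in [0.05, 0.2, 0.8]:
    # pi_star(y) proportional to pi_ref(y) * exp(r(y) / beta); softmax normalizes in log space.
    pi_star = softmax(np.log(ref_probs) + rewards / beta)
    kl = np.sum(pi_star * np.log(pi_star / ref_probs))
    print(f"beta={beta}: pi_star={np.round(pi_star, 3)}, KL(pi_star || pi_ref)={kl:.3f}")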

Demo 22: Implicit reward

This cell computes the DPO loss for four synthetic pairs directly from policy/reference log ratios: the β-scaled log-ratio margin passes through a sigmoid, and the gradient weights σ(-β·margin) show which pairs contribute most to the update.

Code cell 47

header("Demo 22 - Implicit reward: DPO loss from log ratios")
beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
adv = beta * (log_ratio_win - log_ratio_lose)
loss = -np.log(sigmoid(adv)).mean()
weights = sigmoid(-adv)
print("DPO advantages:", np.round(adv, 3))
print("gradient weights:", np.round(weights, 3))
print(f"DPO loss={loss:.4f}")
check_true(loss > 0, "DPO loss is positive")
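DPO's implicit reward is r(x, y) = β·log(π_θ(y|x)/π_ref(y|x)), up to a prompt-only term that cancels inside a preference pair. Feeding those implicit rewards into the Bradley-Terry expression from the earlier demos reproduces exactly the probabilities the DPO loss above is fitting, which this check (reusing beta, the log-ratio arrays, and loss from the cell above) makes explicit:

implicit_r_win = beta * log_ratio_win    # implicit reward of the chosen responses
implicit_r_lose = beta * log_ratio_lose  # implicit reward of the rejected responses
p_bt = sigmoid(implicit_r_win - implicit_r_lose)
print("Bradley-Terry probabilities from implicit rewards:", np.round(p_bt, 3))
check_close(-np.log(p_bt).mean(), loss, message="DPO loss equals the pairwise loss on implicit rewards")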

Demo 23: DPO loss

This cell plots a quadratic proxy for the KL-regularized objective (reward minus β times a squared policy-shift penalty) for several values of β, showing that a larger β keeps the optimal shift smaller.

Code cell 49

header("Demo 23 - DPO loss: reward and KL tradeoff")
reward = np.linspace(0.0, 2.0, 50)
for beta in [0.05, 0.2, 0.8]:
    objective = reward - beta * reward**2
    plt.plot(reward, objective, label=f"beta={beta}")
plt.title("KL-regularized reward proxy")
plt.xlabel("Policy shift proxy")
plt.ylabel("Reward minus KL penalty")
plt.legend()
plt.tight_layout()
plt.show()
print("larger beta makes the optimal shift more conservative")
check_true(True, "tradeoff curve rendered")

Demo 24: $\beta$ tradeoff

This cell simulates two reviewers whose labels differ at a 12% flip rate and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.

Code cell 51

header("Demo 24 - $\beta$ tradeoff: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 25: Gradient interpretation

This cell plots synthetic safety and helpfulness scores over successive preference-data refresh rounds: safety rises as feedback accumulates while helpfulness drifts slightly, illustrating the tradeoff a feedback loop has to manage.

Code cell 53

header("Demo 25 - Gradient interpretation: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
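The "gradient weights" printed in the DPO cells are the factor that scales each pair's gradient: for a pair with margin Δ = log-ratio(win) - log-ratio(lose), the per-pair loss is -log σ(βΔ) and its derivative with respect to Δ is -β·σ(-βΔ), so hard pairs (small or negative margins) get large weights and easy pairs get small ones. A finite-difference check of that derivative, using the same β and log-ratio arrays as the DPO demos and an ad-hoc helper per_pair_loss:

beta = 0.1
log_ratio_win = np.array([1.2, 0.8, 0.4, 1.5])
log_ratio_lose = np.array([-0.2, 0.1, 0.5, 0.0])
delta = log_ratio_win - log_ratio_lose          # per-pair margin in log-ratio space

def per_pair_loss(d):
    return -np.log(sigmoid(beta * d))           # DPO loss for a single pair with margin d

eps = 1e-6
numeric_grad = (per_pair_loss(delta + eps) - per_pair_loss(delta - eps)) / (2 * eps)
analytic_grad = -beta * sigmoid(-beta * delta)  # derivative of -log(sigmoid(beta * d)) w.r.t. d
print("numeric gradient: ", np.round(numeric_grad, 5))
print("analytic gradient:", np.round(analytic_grad, 5))
check_true(np.allclose(numeric_grad, analytic_grad, atol=1e-6), "gradient matches -beta * sigmoid(-beta * margin)")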