Theory Notebook — Math for LLMs

Human in the Loop and Monitoring

Alignment and Safety / Human in the Loop and Monitoring

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Human in the Loop and Monitoring

Human-in-the-loop alignment treats review, escalation, feedback, and retraining as a controlled feedback system rather than a one-time annotation job.

This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objective, threshold, and feedback mechanics can be studied without external files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's colorblind-safe theme; fall back to matplotlib's
# bundled port of the same style when seaborn is not installed.
try:
    import seaborn as sns
except ImportError:
    HAS_SNS = False
    plt.style.use("seaborn-v0_8-whitegrid")
else:
    HAS_SNS = True
    sns.set_theme(style="whitegrid", palette="colorblind")

# Shared figure defaults for every demo: readable fonts, no top/right
# spines, and tight high-DPI saves.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)  # fixed seed so every synthetic demo below is reproducible
print("Plot setup complete.")

Code cell 3

# Colorblind-friendly hex palette shared by all demo plots.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)

def header(title):
    """Print *title* framed by two 80-character '=' rules."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")

def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    The naive form overflows (RuntimeWarning, inf intermediates) when x is a
    large negative value; evaluating through exp(-|x|) keeps the exponent
    non-positive so the result is exact for any magnitude. Accepts scalars or
    arrays and broadcasts elementwise.
    """
    z = np.exp(-np.abs(x))
    # x >= 0: 1 / (1 + e^-x);  x < 0: algebraically equal e^x / (1 + e^x).
    return np.where(np.asarray(x) >= 0, 1.0 / (1.0 + z), z / (1.0 + z))

def softmax(z, axis=-1):
    """Return probabilities along *axis* using the max-shift softmax trick."""
    arr = np.asarray(z, dtype=float)
    # Subtracting the row max keeps every exponent <= 0, avoiding overflow.
    stabilized = np.exp(arr - arr.max(axis=axis, keepdims=True))
    return stabilized / stabilized.sum(axis=axis, keepdims=True)

def log_softmax(z, axis=-1):
    """Return log-probabilities along *axis* without overflow (max-shift trick)."""
    arr = np.asarray(z, dtype=float)
    arr = arr - arr.max(axis=axis, keepdims=True)
    # log softmax = shifted - logsumexp(shifted); exponents are <= 0 here.
    return arr - np.log(np.exp(arr).sum(axis=axis, keepdims=True))

def check_true(condition, message):
    """Print a PASS/FAIL line for *message*, then assert the condition holds."""
    ok = bool(condition)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {message}")
    assert ok

def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL comparing *actual* to *expected* within absolute *tol*, then assert."""
    a = float(actual)
    e = float(expected)
    ok = abs(a - e) <= tol
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {message}: actual={a:.6f}, expected={e:.6f}")
    assert ok

print("Alignment helper functions loaded.")

Demo 1: Alignment as a feedback system

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 5

# Rank synthetic feedback items by a weighted labeling-priority score.
header("Demo 1 - Alignment as a feedback system: active-learning priority")
n_items = 200
# Three priority signals per item, drawn in this exact order from the shared RNG.
risk_score = np.random.beta(2, 5, size=n_items)
uncertainty_score = np.random.beta(3, 3, size=n_items)
diversity_score = np.random.rand(n_items)
# Risk dominates the blend; uncertainty and diversity break ties.
priority = 0.5 * risk_score + 0.35 * uncertainty_score + 0.15 * diversity_score
selected = np.argsort(priority)[-10:]
print("top feedback item utilities:", np.round(priority[selected], 3))
check_true(priority[selected].mean() > priority.mean(), "selected items have above-average utility")

Demo 2: Humans as sparse high-value sensors

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 7

# Two simulated reviewers disagree ~12% of the time; compute raw agreement
# and chance-corrected agreement (Cohen's kappa for binary labels).
header("Demo 2 - Humans as sparse high-value sensors: reviewer agreement")
rater_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip_mask = np.random.rand(300) < 0.12
rater_b = np.where(flip_mask, 1 - rater_a, rater_a)
observed = (rater_a == rater_b).mean()
rate_a = rater_a.mean()
rate_b = rater_b.mean()
# Expected agreement if the raters were independent with these marginals.
expected = rate_a * rate_b + (1 - rate_a) * (1 - rate_b)
kappa = (observed - expected) / (1 - expected)
print(f"agreement={observed:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 3: Escalation as risk control

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 9

# Track safety and helpfulness scores across successive feedback-refresh rounds.
header("Demo 3 - Escalation as risk control: feedback loop improvement")
refresh = np.arange(6)
# Safety saturates toward 0.77; helpfulness dips slightly, then partially recovers.
safety = 0.72 + 0.05 * (1 - np.exp(-refresh / 2))
helpfulness = 0.78 - 0.015 * refresh + 0.008 * np.sqrt(refresh)
fig, ax = plt.subplots()
series = [("safety", safety, COLORS["primary"]),
          ("helpfulness", helpfulness, COLORS["secondary"])]
for label, values, color in series:
    ax.plot(refresh, values, color=color, marker="o", label=label)
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 4: Feedback loops versus production dashboards

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 11

# Estimate attack success rate (severity >= 2 counts as a violation)
# with a normal-approximation 95% confidence interval.
header("Demo 4 - Feedback loops versus production dashboards: attack success rate")
n_attacks = 300
severity = np.random.choice([0, 1, 2, 3], size=n_attacks, p=[0.68, 0.18, 0.10, 0.04])
asr = (severity >= 2).mean()
std_err = np.sqrt(asr * (1 - asr) / n_attacks)
lo, hi = asr - 1.96 * std_err, asr + 1.96 * std_err
print(f"ASR={asr:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")

Demo 5: Why monitoring must become data

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 13

# Sweep a guardrail threshold and trace block rates for harmful vs benign traffic.
header("Demo 5 - Why monitoring must become data: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)   # harmful-content scores (skew high)
scores_good = np.random.beta(2, 6, size=500)  # benign-content scores (skew low)
thresholds = np.linspace(0.05, 0.95, 30)
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")

Demo 6: Feedback event

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 15

# Rank synthetic feedback items by a weighted labeling-priority score.
header("Demo 6 - Feedback event: active-learning priority")
n_items = 200
# Three priority signals per item, drawn in this exact order from the shared RNG.
risk_score = np.random.beta(2, 5, size=n_items)
uncertainty_score = np.random.beta(3, 3, size=n_items)
diversity_score = np.random.rand(n_items)
# Risk dominates the blend; uncertainty and diversity break ties.
priority = 0.5 * risk_score + 0.35 * uncertainty_score + 0.15 * diversity_score
selected = np.argsort(priority)[-10:]
print("top feedback item utilities:", np.round(priority[selected], 3))
check_true(priority[selected].mean() > priority.mean(), "selected items have above-average utility")

Demo 7: Label budget

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 17

# Two simulated reviewers disagree ~12% of the time; compute raw agreement
# and chance-corrected agreement (Cohen's kappa for binary labels).
header("Demo 7 - Label budget: reviewer agreement")
rater_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip_mask = np.random.rand(300) < 0.12
rater_b = np.where(flip_mask, 1 - rater_a, rater_a)
observed = (rater_a == rater_b).mean()
rate_a = rater_a.mean()
rate_b = rater_b.mean()
# Expected agreement if the raters were independent with these marginals.
expected = rate_a * rate_b + (1 - rate_a) * (1 - rate_b)
kappa = (observed - expected) / (1 - expected)
print(f"agreement={observed:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 8: Active learning score

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 19

# Track safety and helpfulness scores across successive feedback-refresh rounds.
header("Demo 8 - Active learning score: feedback loop improvement")
refresh = np.arange(6)
# Safety saturates toward 0.77; helpfulness dips slightly, then partially recovers.
safety = 0.72 + 0.05 * (1 - np.exp(-refresh / 2))
helpfulness = 0.78 - 0.015 * refresh + 0.008 * np.sqrt(refresh)
fig, ax = plt.subplots()
series = [("safety", safety, COLORS["primary"]),
          ("helpfulness", helpfulness, COLORS["secondary"])]
for label, values, color in series:
    ax.plot(refresh, values, color=color, marker="o", label=label)
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 9: Escalation policy

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 21

# Estimate attack success rate (severity >= 2 counts as a violation)
# with a normal-approximation 95% confidence interval.
header("Demo 9 - Escalation policy: attack success rate")
n_attacks = 300
severity = np.random.choice([0, 1, 2, 3], size=n_attacks, p=[0.68, 0.18, 0.10, 0.04])
asr = (severity >= 2).mean()
std_err = np.sqrt(asr * (1 - asr) / n_attacks)
lo, hi = asr - 1.96 * std_err, asr + 1.96 * std_err
print(f"ASR={asr:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")

Demo 10: Preference queue

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 23

# Sweep a guardrail threshold and trace block rates for harmful vs benign traffic.
header("Demo 10 - Preference queue: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)   # harmful-content scores (skew high)
scores_good = np.random.beta(2, 6, size=500)  # benign-content scores (skew low)
thresholds = np.linspace(0.05, 0.95, 30)
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")

Demo 11: Rankings

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 25

# Rank synthetic feedback items by a weighted labeling-priority score.
header("Demo 11 - Rankings: active-learning priority")
n_items = 200
# Three priority signals per item, drawn in this exact order from the shared RNG.
risk_score = np.random.beta(2, 5, size=n_items)
uncertainty_score = np.random.beta(3, 3, size=n_items)
diversity_score = np.random.rand(n_items)
# Risk dominates the blend; uncertainty and diversity break ties.
priority = 0.5 * risk_score + 0.35 * uncertainty_score + 0.15 * diversity_score
selected = np.argsort(priority)[-10:]
print("top feedback item utilities:", np.round(priority[selected], 3))
check_true(priority[selected].mean() > priority.mean(), "selected items have above-average utility")

Demo 12: Ratings

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 27

# Two simulated reviewers disagree ~12% of the time; compute raw agreement
# and chance-corrected agreement (Cohen's kappa for binary labels).
header("Demo 12 - Ratings: reviewer agreement")
rater_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip_mask = np.random.rand(300) < 0.12
rater_b = np.where(flip_mask, 1 - rater_a, rater_a)
observed = (rater_a == rater_b).mean()
rate_a = rater_a.mean()
rate_b = rater_b.mean()
# Expected agreement if the raters were independent with these marginals.
expected = rate_a * rate_b + (1 - rate_a) * (1 - rate_b)
kappa = (observed - expected) / (1 - expected)
print(f"agreement={observed:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 13: Edits

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 29

# Track safety and helpfulness scores across successive feedback-refresh rounds.
header("Demo 13 - Edits: feedback loop improvement")
refresh = np.arange(6)
# Safety saturates toward 0.77; helpfulness dips slightly, then partially recovers.
safety = 0.72 + 0.05 * (1 - np.exp(-refresh / 2))
helpfulness = 0.78 - 0.015 * refresh + 0.008 * np.sqrt(refresh)
fig, ax = plt.subplots()
series = [("safety", safety, COLORS["primary"]),
          ("helpfulness", helpfulness, COLORS["secondary"])]
for label, values, color in series:
    ax.plot(refresh, values, color=color, marker="o", label=label)
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 14: Demonstrations

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 31

# Estimate attack success rate (severity >= 2 counts as a violation)
# with a normal-approximation 95% confidence interval.
header("Demo 14 - Demonstrations: attack success rate")
n_attacks = 300
severity = np.random.choice([0, 1, 2, 3], size=n_attacks, p=[0.68, 0.18, 0.10, 0.04])
asr = (severity >= 2).mean()
std_err = np.sqrt(asr * (1 - asr) / n_attacks)
lo, hi = asr - 1.96 * std_err, asr + 1.96 * std_err
print(f"ASR={asr:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")

Demo 15: Natural-language critiques

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 33

# Sweep a guardrail threshold and trace block rates for harmful vs benign traffic.
header("Demo 15 - Natural-language critiques: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)   # harmful-content scores (skew high)
scores_good = np.random.beta(2, 6, size=500)  # benign-content scores (skew low)
thresholds = np.linspace(0.05, 0.95, 30)
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")

Demo 16: Uncertainty sampling

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 35

# Rank synthetic feedback items by a weighted labeling-priority score.
header("Demo 16 - Uncertainty sampling: active-learning priority")
n_items = 200
# Three priority signals per item, drawn in this exact order from the shared RNG.
risk_score = np.random.beta(2, 5, size=n_items)
uncertainty_score = np.random.beta(3, 3, size=n_items)
diversity_score = np.random.rand(n_items)
# Risk dominates the blend; uncertainty and diversity break ties.
priority = 0.5 * risk_score + 0.35 * uncertainty_score + 0.15 * diversity_score
selected = np.argsort(priority)[-10:]
print("top feedback item utilities:", np.round(priority[selected], 3))
check_true(priority[selected].mean() > priority.mean(), "selected items have above-average utility")

Demo 17: Diversity sampling

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 37

# Two simulated reviewers disagree ~12% of the time; compute raw agreement
# and chance-corrected agreement (Cohen's kappa for binary labels).
header("Demo 17 - Diversity sampling: reviewer agreement")
rater_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip_mask = np.random.rand(300) < 0.12
rater_b = np.where(flip_mask, 1 - rater_a, rater_a)
observed = (rater_a == rater_b).mean()
rate_a = rater_a.mean()
rate_b = rater_b.mean()
# Expected agreement if the raters were independent with these marginals.
expected = rate_a * rate_b + (1 - rate_a) * (1 - rate_b)
kappa = (observed - expected) / (1 - expected)
print(f"agreement={observed:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 18: Risk-weighted sampling

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 39

# Track safety and helpfulness scores across successive feedback-refresh rounds.
header("Demo 18 - Risk-weighted sampling: feedback loop improvement")
refresh = np.arange(6)
# Safety saturates toward 0.77; helpfulness dips slightly, then partially recovers.
safety = 0.72 + 0.05 * (1 - np.exp(-refresh / 2))
helpfulness = 0.78 - 0.015 * refresh + 0.008 * np.sqrt(refresh)
fig, ax = plt.subplots()
series = [("safety", safety, COLORS["primary"]),
          ("helpfulness", helpfulness, COLORS["secondary"])]
for label, values, color in series:
    ax.plot(refresh, values, color=color, marker="o", label=label)
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 19: Marginal value of labels

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 41

# Estimate attack success rate (severity >= 2 counts as a violation)
# with a normal-approximation 95% confidence interval.
header("Demo 19 - Marginal value of labels: attack success rate")
n_attacks = 300
severity = np.random.choice([0, 1, 2, 3], size=n_attacks, p=[0.68, 0.18, 0.10, 0.04])
asr = (severity >= 2).mean()
std_err = np.sqrt(asr * (1 - asr) / n_attacks)
lo, hi = asr - 1.96 * std_err, asr + 1.96 * std_err
print(f"ASR={asr:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")

Demo 20: Exploration budget

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 43

# Sweep a guardrail threshold and trace block rates for harmful vs benign traffic.
header("Demo 20 - Exploration budget: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)   # harmful-content scores (skew high)
scores_good = np.random.beta(2, 6, size=500)  # benign-content scores (skew low)
thresholds = np.linspace(0.05, 0.95, 30)
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")

Demo 21: Escalation

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 45

# Rank synthetic feedback items by a weighted labeling-priority score.
header("Demo 21 - Escalation: active-learning priority")
n_items = 200
# Three priority signals per item, drawn in this exact order from the shared RNG.
risk_score = np.random.beta(2, 5, size=n_items)
uncertainty_score = np.random.beta(3, 3, size=n_items)
diversity_score = np.random.rand(n_items)
# Risk dominates the blend; uncertainty and diversity break ties.
priority = 0.5 * risk_score + 0.35 * uncertainty_score + 0.15 * diversity_score
selected = np.argsort(priority)[-10:]
print("top feedback item utilities:", np.round(priority[selected], 3))
check_true(priority[selected].mean() > priority.mean(), "selected items have above-average utility")

Demo 22: Triage

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 47

# Two simulated reviewers disagree ~12% of the time; compute raw agreement
# and chance-corrected agreement (Cohen's kappa for binary labels).
header("Demo 22 - Triage: reviewer agreement")
rater_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip_mask = np.random.rand(300) < 0.12
rater_b = np.where(flip_mask, 1 - rater_a, rater_a)
observed = (rater_a == rater_b).mean()
rate_a = rater_a.mean()
rate_b = rater_b.mean()
# Expected agreement if the raters were independent with these marginals.
expected = rate_a * rate_b + (1 - rate_a) * (1 - rate_b)
kappa = (observed - expected) / (1 - expected)
print(f"agreement={observed:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")

Demo 23: Reviewer calibration

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 49

# Track safety and helpfulness scores across successive feedback-refresh rounds.
header("Demo 23 - Reviewer calibration: feedback loop improvement")
refresh = np.arange(6)
# Safety saturates toward 0.77; helpfulness dips slightly, then partially recovers.
safety = 0.72 + 0.05 * (1 - np.exp(-refresh / 2))
helpfulness = 0.78 - 0.015 * refresh + 0.008 * np.sqrt(refresh)
fig, ax = plt.subplots()
series = [("safety", safety, COLORS["primary"]),
          ("helpfulness", helpfulness, COLORS["secondary"])]
for label, values, color in series:
    ax.plot(refresh, values, color=color, marker="o", label=label)
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")

Demo 24: Rubric drift

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 51

# Estimate attack success rate (severity >= 2 counts as a violation)
# with a normal-approximation 95% confidence interval.
header("Demo 24 - Rubric drift: attack success rate")
n_attacks = 300
severity = np.random.choice([0, 1, 2, 3], size=n_attacks, p=[0.68, 0.18, 0.10, 0.04])
asr = (severity >= 2).mean()
std_err = np.sqrt(asr * (1 - asr) / n_attacks)
lo, hi = asr - 1.96 * std_err, asr + 1.96 * std_err
print(f"ASR={asr:.3f}, 95% CI=[{lo:.3f}, {hi:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")

Demo 25: Adjudication

This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.

Code cell 53

# Sweep a guardrail threshold and trace block rates for harmful vs benign traffic.
header("Demo 25 - Adjudication: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)   # harmful-content scores (skew high)
scores_good = np.random.beta(2, 6, size=500)  # benign-content scores (skew low)
thresholds = np.linspace(0.05, 0.95, 30)
tpr = [(scores_bad >= tau).mean() for tau in thresholds]
fpr = [(scores_good >= tau).mean() for tau in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")