Theory Notebook
Converted from theory.ipynb for web reading.
Human-in-the-Loop and Monitoring
Human-in-the-loop alignment treats review, escalation, feedback, and retraining as a controlled feedback system rather than a one-time annotation job.
This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objective, threshold, and feedback mechanics can be studied without external files.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 80)
    print(title)
    print("=" * 80)

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    z = z - z.max(axis=axis, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / exp_z.sum(axis=axis, keepdims=True)

def log_softmax(z, axis=-1):
    z = np.asarray(z, dtype=float)
    shifted = z - z.max(axis=axis, keepdims=True)
    return shifted - np.log(np.exp(shifted).sum(axis=axis, keepdims=True))

def check_true(condition, message):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {message}")
    assert ok

def check_close(actual, expected, tol=1e-8, message="values close"):
    ok = abs(float(actual) - float(expected)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={float(actual):.6f}, expected={float(expected):.6f}")
    assert ok

print("Alignment helper functions loaded.")
Demo 1: Alignment as a feedback system
This cell makes the feedback-system framing executable: each item gets a utility score blending risk, model uncertainty, and diversity, and the ten highest-utility items are routed to human review. The weighted blend is a proxy objective, so watch for one signal dominating the queue.
Code cell 5
header("Demo 1 - Alignment as a feedback system: active-learning priority")
n = 200
risk = np.random.beta(2, 5, size=n)
uncertainty = np.random.beta(3, 3, size=n)
diversity = np.random.rand(n)
utility = 0.5 * risk + 0.35 * uncertainty + 0.15 * diversity
top = np.argsort(utility)[-10:]
print("top feedback item utilities:", np.round(utility[top], 3))
check_true(utility[top].mean() > utility.mean(), "selected items have above-average utility")
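A pure top-k on blended utility can fill the review queue with near-duplicates. Below is a minimal greedy sketch of diversity-aware selection; the one-dimensional embedding and the lam distance-bonus weight are assumptions for illustration, not part of the demo above.

def greedy_diverse_topk(utility, embedding, k=10, lam=0.3):
    # Greedy selection: utility plus a bonus for distance to already-chosen items.
    chosen, candidates = [], set(range(len(utility)))
    for _ in range(k):
        def score(i):
            dist = min((abs(embedding[i] - embedding[j]) for j in chosen), default=1.0)
            return utility[i] + lam * dist
        best = max(candidates, key=score)
        chosen.append(best)
        candidates.remove(best)
    return chosen

embedding = np.random.rand(n)  # hypothetical 1-D item embedding
diverse_top = greedy_diverse_topk(utility, embedding)
print("diversity-aware selection:", sorted(diverse_top))

In practice the embedding would come from the model and lam from a tuning pass; the point is only that the selection objective, not just the utility weights, is a design choice.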
Demo 2: Humans as sparse high-value sensors
If humans are sparse, high-value sensors, their readings must be consistent. This cell simulates two reviewers who disagree on roughly 12% of items and reports raw agreement alongside Cohen's kappa, which discounts the agreement expected by chance.
Code cell 7
header("Demo 2 - Humans as sparse high-value sensors: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
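As a cross-check of the hand-rolled statistic, the same quantity is available as sklearn.metrics.cohen_kappa_score; a minimal sketch, assuming scikit-learn is installed:

from sklearn.metrics import cohen_kappa_score  # assumes scikit-learn is available

kappa_sklearn = cohen_kappa_score(labels_a, labels_b)
print(f"manual kappa={kappa:.6f}, sklearn kappa={kappa_sklearn:.6f}")
check_close(kappa, kappa_sklearn, tol=1e-6, message="manual kappa matches sklearn")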
Demo 3: Escalation as risk control
This cell tracks the loop's output over six feedback refresh rounds: safety climbs with diminishing returns while helpfulness slowly erodes. The failure mode to watch is paying an ever-growing helpfulness tax for marginal safety gains.
Code cell 9
header("Demo 3 - Escalation as risk control: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
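A dashboard like this should also automate the regression check rather than rely on eyeballing the plot. A minimal sketch, where the 0.75 helpfulness floor is an assumed service target, not a value from the notebook:

HELPFULNESS_FLOOR = 0.75  # assumed service target for illustration

for r in rounds:
    status = "ok" if helpfulness[r] >= HELPFULNESS_FLOOR else "regression: pause refresh"
    print(f"round {r}: safety={safety[r]:.3f} helpfulness={helpfulness[r]:.3f} -> {status}")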
Demo 4: Feedback loops versus production dashboards
This cell computes the kind of number a production dashboard reports: sampled violation severities, with severity >= 2 counted as a violation, summarized as an attack success rate (ASR) with a normal-approximation 95% confidence interval.
Code cell 11
header("Demo 4 - Feedback loops versus production dashboards: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
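The normal-approximation interval above degrades when the violation count is small and collapses to zero width at asr = 0. A minimal sketch of the Wilson score interval, a standard better-behaved alternative:

def wilson_interval(successes, trials, z=1.96):
    # Wilson score interval for a binomial proportion.
    p = successes / trials
    denom = 1 + z**2 / trials
    center = (p + z**2 / (2 * trials)) / denom
    half = (z / denom) * np.sqrt(p * (1 - p) / trials + z**2 / (4 * trials**2))
    return center - half, center + half

lo, hi = wilson_interval(violated.sum(), n)
print(f"Wilson 95% CI: [{lo:.3f}, {hi:.3f}]")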
Demo 5: Why monitoring must become data
This cell sweeps a guardrail threshold across synthetic harmful and benign score distributions and plots the block rate for each population. Every threshold trades missed harms against blocked benign traffic, which is why the operating point should be set from monitored data rather than guessed.
Code cell 13
header("Demo 5 - Why monitoring must become data: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
    tpr.append((scores_bad >= tau).mean())
    fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
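Choosing the operating point usually means weighting the two error types unequally. A minimal sketch that picks the cost-minimizing threshold; the 5:1 cost ratio is an assumption for illustration:

C_MISS, C_FALSE_BLOCK = 5.0, 1.0  # assumed costs: a missed harm hurts 5x more

costs = [C_MISS * (scores_bad < t).mean() + C_FALSE_BLOCK * (scores_good >= t).mean()
         for t in thresholds]
best_tau = thresholds[int(np.argmin(costs))]
print(f"cost-minimizing threshold: {best_tau:.2f} (expected cost {min(costs):.3f})")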
Demo 6: Feedback event
The selection mechanics of Demo 1, framed as a single feedback event: blend risk, uncertainty, and diversity into a utility score and take the top ten items.
Code cell 15
header("Demo 6 - Feedback event: active-learning priority")
n = 200
risk = np.random.beta(2, 5, size=n)
uncertainty = np.random.beta(3, 3, size=n)
diversity = np.random.rand(n)
utility = 0.5 * risk + 0.35 * uncertainty + 0.15 * diversity
top = np.argsort(utility)[-10:]
print("top feedback item utilities:", np.round(utility[top], 3))
check_true(utility[top].mean() > utility.mean(), "selected items have above-average utility")
Demo 7: Label budget
The agreement mechanics of Demo 2, framed around the label budget: when reviewers disagree, part of the budget has to go to adjudication instead of new labels.
Code cell 17
header("Demo 7 - Label budget: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 8: Active learning score
The refresh-round mechanics of Demo 3: an active-learning score is ultimately judged by curves like these, where safety gains diminish while helpfulness slowly declines.
Code cell 19
header("Demo 8 - Active learning score: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 9: Escalation policy
The ASR mechanics of Demo 4: an escalation policy needs a trigger metric, and the confidence interval indicates how much weight the trigger can bear.
Code cell 21
header("Demo 9 - Escalation policy: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 10: Preference queue
The threshold sweep of Demo 5: the operating point decides which borderline outputs are blocked and which flow onward, for example into a preference queue.
Code cell 23
header("Demo 10 - Preference queue: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
    tpr.append((scores_bad >= tau).mean())
    fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 11: Rankings
The selection mechanics of Demo 1 again, read as a ranking: the utility ordering itself is the artifact reviewers consume.
Code cell 25
header("Demo 11 - Rankings: active-learning priority")
n = 200
risk = np.random.beta(2, 5, size=n)
uncertainty = np.random.beta(3, 3, size=n)
diversity = np.random.rand(n)
utility = 0.5 * risk + 0.35 * uncertainty + 0.15 * diversity
top = np.argsort(utility)[-10:]
print("top feedback item utilities:", np.round(utility[top], 3))
check_true(utility[top].mean() > utility.mean(), "selected items have above-average utility")
Demo 12: Ratings
The agreement mechanics of Demo 2 again: ratings are only as informative as the inter-rater reliability behind them.
Code cell 27
header("Demo 12 - Ratings: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 13: Edits
The refresh-round mechanics of Demo 3 again: edits feed the same loop, so the same safety-versus-helpfulness curves apply.
Code cell 29
header("Demo 13 - Edits: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 14: Demonstrations
The ASR mechanics of Demo 4 again, now as the metric that collected demonstrations are meant to drive down.
Code cell 31
header("Demo 14 - Demonstrations: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 15: Natural-language critiques
The threshold sweep of Demo 5 again, with the same harmful-versus-benign block-rate tradeoff.
Code cell 33
header("Demo 15 - Natural-language critiques: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
    tpr.append((scores_bad >= tau).mean())
    fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 16: Uncertainty sampling
The selection mechanics of Demo 1 again: with the risk and diversity weights zeroed out this would be pure uncertainty sampling; here uncertainty carries a 0.35 weight.
Code cell 35
header("Demo 16 - Uncertainty sampling: active-learning priority")
n = 200
risk = np.random.beta(2, 5, size=n)
uncertainty = np.random.beta(3, 3, size=n)
diversity = np.random.rand(n)
utility = 0.5 * risk + 0.35 * uncertainty + 0.15 * diversity
top = np.argsort(utility)[-10:]
print("top feedback item utilities:", np.round(utility[top], 3))
check_true(utility[top].mean() > utility.mean(), "selected items have above-average utility")
Demo 17: Diversity sampling
The agreement mechanics of Demo 2 again: diversity sampling widens the input distribution, which makes consistent review harder and kappa worth tracking.
Code cell 37
header("Demo 17 - Diversity sampling: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 18: Risk-weighted sampling
The refresh-round mechanics of Demo 3 again, as the downstream outcome a risk-weighted sampling scheme is judged by.
Code cell 39
header("Demo 18 - Risk-weighted sampling: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 19: Marginal value of labels
The ASR mechanics of Demo 4 again: the marginal value of extra labels shows up as a narrower confidence interval around the same rate, since the standard error shrinks with n.
Code cell 41
header("Demo 19 - Marginal value of labels: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 20: Exploration budget
The threshold sweep of Demo 5 again; scores near the threshold are where an exploration budget buys the most information.
Code cell 43
header("Demo 20 - Exploration budget: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
    tpr.append((scores_bad >= tau).mean())
    fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 21: Escalation
The selection mechanics of Demo 1 again: the same utility score can drive escalation, with the top items routed to more senior review.
Code cell 45
header("Demo 21 - Escalation: active-learning priority")
n = 200
risk = np.random.beta(2, 5, size=n)
uncertainty = np.random.beta(3, 3, size=n)
diversity = np.random.rand(n)
utility = 0.5 * risk + 0.35 * uncertainty + 0.15 * diversity
top = np.argsort(utility)[-10:]
print("top feedback item utilities:", np.round(utility[top], 3))
check_true(utility[top].mean() > utility.mean(), "selected items have above-average utility")
Demo 22: Triage
The agreement mechanics of Demo 2 again: triage decisions are binary labels from two reviewers, so agreement and kappa apply directly.
Code cell 47
header("Demo 22 - Triage: reviewer agreement")
labels_a = np.random.choice([0, 1], size=300, p=[0.7, 0.3])
flip = np.random.rand(300) < 0.12
labels_b = np.where(flip, 1 - labels_a, labels_a)
agree = (labels_a == labels_b).mean()
p_yes_a, p_yes_b = labels_a.mean(), labels_b.mean()
chance = p_yes_a * p_yes_b + (1 - p_yes_a) * (1 - p_yes_b)
kappa = (agree - chance) / (1 - chance)
print(f"agreement={agree:.3f}, Cohen kappa={kappa:.3f}")
check_true(kappa <= 1, "kappa is bounded above by 1")
Demo 23: Reviewer calibration
The refresh-round mechanics of Demo 3 again: if reviewer calibration drifts, curves like these are where it first shows.
Code cell 49
header("Demo 23 - Reviewer calibration: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 24: Rubric drift
The ASR mechanics of Demo 4 again: rubric drift changes what counts as severity >= 2, which silently moves the measured rate.
Code cell 51
header("Demo 24 - Rubric drift: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 25: Adjudication
The threshold sweep of Demo 5 one more time: adjudication lives in the disputed region between the two curves, where neither blocking nor allowing is clearly right.
Code cell 53
header("Demo 25 - Adjudication: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
    tpr.append((scores_bad >= tau).mean())
    fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")