Theory Notebook
Converted from theory.ipynb for web reading.
Red Teaming and Safety Evaluations
Red teaming treats harmful behavior discovery as an adaptive search problem over prompts, contexts, policies, and scoring rules.
This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objective, threshold, and feedback mechanics can be studied without external files.
Code cell 2
# Plotting and numerics setup for the whole notebook.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn styling when installed; otherwise fall back to matplotlib's
# bundled seaborn-like style. HAS_SNS records which path was taken.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # The bundled style was renamed across matplotlib releases ("seaborn-whitegrid"
    # -> "seaborn-v0_8-whitegrid"), so guard the lookup: setup should still
    # complete on versions where this exact name does not exist.
    try:
        plt.style.use("seaborn-v0_8-whitegrid")
    except OSError:
        plt.style.use("default")
    HAS_SNS = False

# Shared figure defaults applied to every demo below.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

np.random.seed(42)  # reproducible synthetic data across notebook runs
print("Plot setup complete.")
Code cell 3
# Shared colorblind-friendly hex palette used by every demo plot below.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def header(title):
    """Print *title* between two 80-character rules, preceded by a blank line."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Accepts scalars or array-likes. The naive form ``1 / (1 + np.exp(-x))``
    overflows (RuntimeWarning, exp -> inf) for large negative ``x``; here
    exp is only ever evaluated at non-positive arguments, so no overflow
    can occur for any finite input.
    """
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
def softmax(z, axis=-1):
    """Return softmax probabilities of *z* along *axis* (max-shifted for stability)."""
    arr = np.asarray(z, dtype=float)
    stable = np.exp(arr - arr.max(axis=axis, keepdims=True))
    return stable / stable.sum(axis=axis, keepdims=True)
def log_softmax(z, axis=-1):
    """Return log-softmax of *z* along *axis*, computed in shifted form for stability."""
    arr = np.asarray(z, dtype=float)
    arr = arr - arr.max(axis=axis, keepdims=True)
    return arr - np.log(np.exp(arr).sum(axis=axis, keepdims=True))
def check_true(condition, message):
    """Print a PASS/FAIL line for *message*, then assert that the condition holds."""
    verdict = bool(condition)
    print(("PASS" if verdict else "FAIL") + " - " + message)
    assert verdict
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL comparing *actual* to *expected* within *tol*, then assert."""
    a = float(actual)
    e = float(expected)
    ok = abs(a - e) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={a:.6f}, expected={e:.6f}")
    assert ok
print("Alignment helper functions loaded.")
Demo 1: Red teaming searches before users find failures
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 5
header("Demo 1 - Red teaming searches before users find failures: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 2: Safety failures are rare-event measurements
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 7
header("Demo 2 - Safety failures are rare-event measurements: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 3: Attackers adapt to defenses
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 9
header("Demo 3 - Attackers adapt to defenses: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 4: Red-team findings become training data
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 11
header("Demo 4 - Red-team findings become training data: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 5: Safety evaluation versus general robustness
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 13
header("Demo 5 - Safety evaluation versus general robustness: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 6: Harm taxonomy
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 15
header("Demo 6 - Harm taxonomy: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 7: Attack prompt
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 17
header("Demo 7 - Attack prompt: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 8: Target model
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 19
header("Demo 8 - Target model: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 9: Violation score
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 21
header("Demo 9 - Violation score: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 10: Attack success rate
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 23
header("Demo 10 - Attack success rate: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 11: Protocols
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 25
header("Demo 11 - Protocols: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 12: Severity labels
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 27
header("Demo 12 - Severity labels: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 13: Inter-rater agreement
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 29
header("Demo 13 - Inter-rater agreement: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 14: Coverage gaps
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 31
header("Demo 14 - Coverage gaps: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 15: Triage and reproduction
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 33
header("Demo 15 - Triage and reproduction: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 16: Attacker models
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 35
header("Demo 16 - Attacker models: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 17: Prompt mutation
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 37
header("Demo 17 - Prompt mutation: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 18: Search objectives
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 39
header("Demo 18 - Search objectives: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 19: Adaptive attacks
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 41
header("Demo 19 - Adaptive attacks: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 20: Budgeted exploration
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 43
header("Demo 20 - Budgeted exploration: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 21: SafetyPrompts
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 45
header("Demo 21 - SafetyPrompts: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 22: HarmBench-style suites
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 47
header("Demo 22 - HarmBench-style suites: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 23: Jailbreak sets
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 49
header("Demo 23 - Jailbreak sets: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 24: Refusal measurement
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 51
header("Demo 24 - Refusal measurement: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 25: Over-refusal measurement
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 53
header("Demo 25 - Over-refusal measurement: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")