Theory Notebook
Converted from theory.ipynb for web reading.
Red Teaming and Safety Evaluations
Red teaming treats harmful behavior discovery as an adaptive search problem over prompts, contexts, policies, and scoring rules.
This notebook is the executable companion to notes.md. It uses synthetic alignment data so the objective, threshold, and feedback mechanics can be studied without external files.
Code cell 2
# Plotting and numerics setup for the whole notebook.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn styling when installed; otherwise fall back to matplotlib's
# bundled seaborn-like style. HAS_SNS records which path was taken.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # The bundled style was renamed across matplotlib releases ("seaborn-whitegrid"
    # -> "seaborn-v0_8-whitegrid"), so guard the lookup: setup should still
    # complete on versions where this exact name does not exist.
    try:
        plt.style.use("seaborn-v0_8-whitegrid")
    except OSError:
        plt.style.use("default")
    HAS_SNS = False

# Shared figure defaults applied to every demo below.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

np.random.seed(42)  # reproducible synthetic data across notebook runs
print("Plot setup complete.")
Code cell 3
# Shared colorblind-friendly hex palette used by every demo plot below.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def header(title):
    """Print *title* between two 80-character rules, preceded by a blank line."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")
def sigmoid(x):
    """Numerically stable logistic function 1 / (1 + exp(-x)).

    Accepts scalars or array-likes. The naive form ``1 / (1 + np.exp(-x))``
    overflows (RuntimeWarning, exp -> inf) for large negative ``x``; here
    exp is only ever evaluated at non-positive arguments, so no overflow
    can occur for any finite input.
    """
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
def softmax(z, axis=-1):
    """Return softmax probabilities of *z* along *axis* (max-shifted for stability)."""
    arr = np.asarray(z, dtype=float)
    stable = np.exp(arr - arr.max(axis=axis, keepdims=True))
    return stable / stable.sum(axis=axis, keepdims=True)
def log_softmax(z, axis=-1):
    """Return log-softmax of *z* along *axis*, computed in shifted form for stability."""
    arr = np.asarray(z, dtype=float)
    arr = arr - arr.max(axis=axis, keepdims=True)
    return arr - np.log(np.exp(arr).sum(axis=axis, keepdims=True))
def check_true(condition, message):
    """Print a PASS/FAIL line for *message*, then assert that the condition holds."""
    verdict = bool(condition)
    print(("PASS" if verdict else "FAIL") + " - " + message)
    assert verdict
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL comparing *actual* to *expected* within *tol*, then assert."""
    a = float(actual)
    e = float(expected)
    ok = abs(a - e) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {message}: actual={a:.6f}, expected={e:.6f}")
    assert ok
print("Alignment helper functions loaded.")
Demo 1: Red teaming searches before users find failures
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 5
header("Demo 1 - Red teaming searches before users find failures: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 2: Safety failures are rare-event measurements
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 7
header("Demo 2 - Safety failures are rare-event measurements: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 3: Attackers adapt to defenses
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 9
header("Demo 3 - Attackers adapt to defenses: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 4: Red-team findings become training data
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 11
header("Demo 4 - Red-team findings become training data: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 5: Safety evaluation versus general robustness
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 13
header("Demo 5 - Safety evaluation versus general robustness: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 6: Harm taxonomy
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 15
header("Demo 6 - Harm taxonomy: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 7: Attack prompt
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 17
header("Demo 7 - Attack prompt: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 8: Target model
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 19
header("Demo 8 - Target model: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 9: Violation score
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 21
header("Demo 9 - Violation score: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 10: Attack success rate
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 23
header("Demo 10 - Attack success rate: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 11: Protocols
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 25
header("Demo 11 - Protocols: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 12: Severity labels
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 27
header("Demo 12 - Severity labels: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 13: Inter-rater agreement
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 29
header("Demo 13 - Inter-rater agreement: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 14: Coverage gaps
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 31
header("Demo 14 - Coverage gaps: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 15: Triage and reproduction
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 33
header("Demo 15 - Triage and reproduction: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 16: Attacker models
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 35
header("Demo 16 - Attacker models: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 17: Prompt mutation
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 37
header("Demo 17 - Prompt mutation: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 18: Search objectives
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 39
header("Demo 18 - Search objectives: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 19: Adaptive attacks
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 41
header("Demo 19 - Adaptive attacks: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 20: Budgeted exploration
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 43
header("Demo 20 - Budgeted exploration: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")
Demo 21: SafetyPrompts
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 45
header("Demo 21 - SafetyPrompts: attack success rate")
n = 300
severity = np.random.choice([0, 1, 2, 3], size=n, p=[0.68, 0.18, 0.10, 0.04])
violated = severity >= 2
asr = violated.mean()
se = np.sqrt(asr * (1 - asr) / n)
print(f"ASR={asr:.3f}, 95% CI=[{asr-1.96*se:.3f}, {asr+1.96*se:.3f}]")
check_true(0 <= asr <= 1, "attack success rate is a probability")
Demo 22: HarmBench-style suites
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 47
header("Demo 22 - HarmBench-style suites: adaptive red-team search")
rounds = np.arange(1, 21)
best = 1 - np.exp(-rounds / 6)
noise = np.random.normal(0, 0.02, size=len(rounds))
score = np.clip(best + noise, 0, 1)
fig, ax = plt.subplots()
ax.plot(rounds, score, color=COLORS["error"], marker="o", label="best violation score")
ax.set_title("Adaptive red-team search improves found severity")
ax.set_xlabel("Search round")
ax.set_ylabel("Best violation score")
ax.legend()
fig.tight_layout()
plt.show()
check_true(score[-1] >= score[0], "search finds stronger attacks over time")
Demo 23: Jailbreak sets
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 49
header("Demo 23 - Jailbreak sets: guardrail threshold tradeoff")
scores_bad = np.random.beta(6, 2, size=500)
scores_good = np.random.beta(2, 6, size=500)
thresholds = np.linspace(0.05, 0.95, 30)
tpr, fpr = [], []
for tau in thresholds:
tpr.append((scores_bad >= tau).mean())
fpr.append((scores_good >= tau).mean())
fig, ax = plt.subplots()
ax.plot(thresholds, tpr, color=COLORS["primary"], label="block harmful")
ax.plot(thresholds, fpr, color=COLORS["secondary"], label="block benign")
ax.set_title("Guardrail threshold tradeoff")
ax.set_xlabel("Threshold")
ax.set_ylabel("Rate")
ax.legend()
fig.tight_layout()
plt.show()
check_true(max(tpr) <= 1 and min(fpr) >= 0, "rates stay in [0, 1]")
Demo 24: Refusal measurement
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 51
header("Demo 24 - Refusal measurement: policy category matrix")
matrix = np.array([
[0.02, 0.08, 0.25],
[0.04, 0.20, 0.62],
[0.12, 0.45, 0.88],
])
fig, ax = plt.subplots(figsize=(7, 5))
im = ax.imshow(matrix, cmap="viridis")
fig.colorbar(im, ax=ax, label="block probability")
ax.set_title("Guardrail block probability by intent and harm")
ax.set_xlabel("Harm severity")
ax.set_ylabel("User intent risk")
fig.tight_layout()
plt.show()
print("highest-risk cell:", matrix.max())
check_true(matrix.max() > matrix.min(), "policy matrix distinguishes risk levels")
Demo 25: Over-refusal measurement
This cell turns the approved TOC into a small executable alignment experiment. Watch the estimator, the proxy objective, and the failure mode.
Code cell 53
header("Demo 25 - Over-refusal measurement: feedback loop improvement")
rounds = np.arange(6)
safety = 0.72 + 0.05 * (1 - np.exp(-rounds / 2))
helpfulness = 0.78 - 0.015 * rounds + 0.008 * np.sqrt(rounds)
fig, ax = plt.subplots()
ax.plot(rounds, safety, color=COLORS["primary"], marker="o", label="safety")
ax.plot(rounds, helpfulness, color=COLORS["secondary"], marker="o", label="helpfulness")
ax.set_title("Feedback loop metrics")
ax.set_xlabel("Refresh round")
ax.set_ylabel("Score")
ax.legend()
fig.tight_layout()
plt.show()
print(f"final safety={safety[-1]:.3f}, final helpfulness={helpfulness[-1]:.3f}")
check_true(safety[-1] > safety[0], "safety improves after feedback refreshes")