Exercises Notebook
Converted from exercises.ipynb for web reading.
Exercises: Human in the Loop and Monitoring
There are 10 exercises. Exercises 1-3 cover mechanics, 4-6 cover theory, and 7-10 connect alignment math to AI safety workflows.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn styling when it is installed; otherwise fall back to the
# matplotlib-bundled port of the same whitegrid look.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults: readable fonts, no top/right spines, tight saves.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

np.random.seed(42)  # deterministic runs for every exercise below
print("Plot setup complete.")
Code cell 3
# Colorblind-safe hex palette shared by every figure in this notebook.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def header(title):
    """Print *title* framed by 80-character rules, preceded by a blank line."""
    rule = "=" * 80
    print(f"\n{rule}\n{title}\n{rule}")
def sigmoid(x):
    """Logistic function mapping a real (scalar or array) input into (0, 1)."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator
def softmax(z, axis=-1):
    """Numerically stable softmax along *axis* (max-shift before exponentiating)."""
    scores = np.asarray(z, dtype=float)
    shifted = scores - scores.max(axis=axis, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=axis, keepdims=True)
def log_softmax(z, axis=-1):
    """Numerically stable log-softmax along *axis*."""
    scores = np.asarray(z, dtype=float)
    shifted = scores - scores.max(axis=axis, keepdims=True)
    log_norm = np.log(np.exp(shifted).sum(axis=axis, keepdims=True))
    return shifted - log_norm
def check_true(condition, message):
    """Print a PASS/FAIL line for *message*, then assert the condition holds."""
    ok = bool(condition)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {message}")
    assert ok
def check_close(actual, expected, tol=1e-8, message="values close"):
    """Print PASS/FAIL for *message*, then assert |actual - expected| <= tol."""
    a = float(actual)
    e = float(expected)
    ok = abs(a - e) <= tol
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {message}: actual={a:.6f}, expected={e:.6f}")
    assert ok

print("Alignment helper functions loaded.")
Exercise 1: Alignment as a feedback system (*)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 5
# Your Solution - Exercise 1
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 6
# Solution
header("Exercise 1: Response-only SFT loss")
# Per-token probabilities for a 3-token example; token 0 is the prompt token.
token_logp = np.log(np.array([0.7, 0.2, 0.6]))
response_mask = np.array([0, 1, 1], dtype=float)
# Average negative log-likelihood over response tokens only.
masked_loss = -(token_logp * response_mask).sum() / response_mask.sum()
reference = -np.log(np.array([0.2, 0.6])).mean()
check_close(masked_loss, reference, message="masked loss")
print("\nTakeaway: SFT should usually train on assistant response tokens, not user prompt tokens.")
Exercise 2: Humans as sparse high-value sensors (*)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 8
# Your Solution - Exercise 2
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 9
# Solution
header("Exercise 2: Bradley-Terry probability")
reward_winner, reward_loser = 1.2, 0.4
# Bradley-Terry: P(winner preferred) = sigmoid(reward margin).
win_probability = sigmoid(reward_winner - reward_loser)
check_true(0.5 < win_probability < 1.0, "winner has probability above one half")
print("\nTakeaway: Pairwise reward models convert reward margins into preference probabilities.")
Exercise 3: Escalation as risk control (*)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 11
# Your Solution - Exercise 3
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 12
# Solution
header("Exercise 3: DPO margin")
beta = 0.2
# Policy-vs-reference log-ratios for the chosen and rejected responses.
chosen_logratio, rejected_logratio = 1.0, -0.5
margin = beta * (chosen_logratio - rejected_logratio)
loss = -np.log(sigmoid(margin))
check_true(loss > 0, "DPO loss is positive")
print("\nTakeaway: DPO increases the policy log-ratio of the preferred response over the rejected one.")
Exercise 4: Feedback loops versus production dashboards (**)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 14
# Your Solution - Exercise 4
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 15
# Solution
header("Exercise 4: Attack success rate")
# 1 = the red-team prompt produced a policy violation, 0 = it did not.
attack_outcomes = np.array([0, 1, 0, 0, 1, 1, 0, 0], dtype=float)
check_close(attack_outcomes.mean(), 0.375, message="attack success rate")
print("\nTakeaway: Safety rates need denominators and uncertainty, not anecdotes.")
Exercise 5: Why monitoring must become data (**)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 17
# Your Solution - Exercise 5
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 18
# Solution
header("Exercise 5: Guardrail action")
risk_score, block_threshold = 0.73, 0.60
# Runtime rule: block whenever the calibrated score reaches the threshold.
should_block = risk_score >= block_threshold
check_true(should_block, "score above threshold triggers block")
print("\nTakeaway: Thresholds turn calibrated risk scores into runtime actions.")
Exercise 6: Feedback event (**)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 20
# Your Solution - Exercise 6
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 21
# Solution
header("Exercise 6: Refusal tradeoff")
compliance_on_safe = 0.82
refusal_on_benign = 0.09
# Charge over-refusal at half the weight of safe compliance.
adjusted_utility = compliance_on_safe - 0.5 * refusal_on_benign
check_close(adjusted_utility, 0.775, message="simple refusal-adjusted utility")
print("\nTakeaway: Safety metrics should not reward blanket refusal without measuring helpfulness.")
Exercise 7: Label budget (***)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 23
# Your Solution - Exercise 7
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 24
# Solution
header("Exercise 7: Active-learning priority")
risk, uncertainty, diversity = 0.8, 0.6, 0.3
# Weighted blend: risk dominates, then uncertainty, then diversity.
priority = 0.5 * risk + 0.35 * uncertainty + 0.15 * diversity
check_close(priority, 0.655, message="feedback priority")
print("\nTakeaway: Human review should prioritize high-risk and high-information items.")
Exercise 8: Active learning score (***)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 26
# Your Solution - Exercise 8
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 27
# Solution
header("Exercise 8: Reviewer agreement")
reviewer_a = np.array([1, 0, 1, 0, 1])
reviewer_b = np.array([1, 0, 0, 0, 1])
# Raw (chance-uncorrected) fraction of items where both reviewers agree.
raw_agreement = (reviewer_a == reviewer_b).mean()
check_close(raw_agreement, 0.8, message="raw agreement")
print("\nTakeaway: Human feedback quality must be measured before it becomes training signal.")
Exercise 9: Escalation policy (***)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 29
# Your Solution - Exercise 9
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 30
# Solution
header("Exercise 9: KL budget intuition")
reward_gain, kl_cost, beta = 0.12, 0.04, 2.0
# Net objective after charging beta per unit of KL drift from the reference.
penalized_objective = reward_gain - beta * kl_cost
check_close(penalized_objective, 0.04, message="KL-regularized improvement")
print("\nTakeaway: KL penalties make aggressive alignment updates pay for policy drift.")
Exercise 10: Preference queue (***)
Write the relevant alignment object, compute a small synthetic quantity, and explain what regression metric would protect against a false win.
Code cell 32
# Your Solution - Exercise 10
answer = None  # TODO: replace None with your computed quantity
print(f"answer = {answer}")
Code cell 33
# Solution
header("Exercise 10: Regression gate")
safety_delta, capability_delta = 0.04, -0.01
# Ship only if safety clearly improves AND capability does not regress much.
gate_passes = safety_delta > 0.02 and capability_delta > -0.02
check_true(gate_passes, "release gate passes both safety and capability checks")
print("\nTakeaway: Alignment releases need multi-metric gates, not a single proxy win.")