Exercises Notebook
Converted from exercises.ipynb for web reading.
Loss Functions - Exercises
Ten graded exercises. Each exercise has a problem statement, a runnable learner scaffold, and a complete solution cell.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_close(name, value, expected, tol=1e-7):
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok

def check_true(name, condition):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def logsumexp(logits, axis=-1):
    m = np.max(logits, axis=axis, keepdims=True)
    return np.squeeze(m + np.log(np.sum(np.exp(logits - m), axis=axis, keepdims=True)), axis=axis)

def softmax(logits, axis=-1):
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)
print("Exercise helpers ready.")
Exercise 1: MSE gradient (*)
Compute the mean squared error and its gradient with respect to y_pred, where the residual is y_pred - y_true.
Code cell 5
# Your Solution
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.5, 1.0, 5.0])
mse = None
grad = None
print("mse:", mse)
print("grad:", grad)
Code cell 6
# Solution
header("Exercise 1: MSE gradient")
y_true = np.array([1.0, 2.0, 3.0])
y_pred = np.array([1.5, 1.0, 5.0])
residual = y_pred - y_true
mse = np.mean(residual ** 2)
grad = 2 * residual / len(residual)
check_close("mse", mse, (0.5**2 + (-1.0)**2 + 2.0**2) / 3)
check_close("gradient", grad, np.array([1.0, -2.0, 4.0]) / 3)
print("\nTakeaway: MSE gives large residuals proportionally larger gradients.")
Exercise 2: Huber loss (*)
Implement Huber loss for residuals with delta=1.
Code cell 8
# Your Solution
residual = np.array([-2.0, -0.5, 0.0, 0.5, 3.0])
delta = 1.0
huber = None
print("huber:", huber)
Code cell 9
# Solution
header("Exercise 2: Huber loss")
residual = np.array([-2.0, -0.5, 0.0, 0.5, 3.0])
delta = 1.0
huber = np.where(np.abs(residual) <= delta, 0.5 * residual**2, delta * (np.abs(residual) - 0.5 * delta))
expected = np.array([1.5, 0.125, 0.0, 0.125, 2.5])
check_close("Huber values", huber, expected)
print("\nTakeaway: Huber is quadratic near zero and linear in the tails.")
Exercise 3: BCE from logits (*)
Compute binary cross-entropy directly from logits.
Code cell 11
# Your Solution
z = np.array([-2.0, 0.0, 2.0])
y = np.array([0.0, 1.0, 1.0])
loss = None
print("loss:", loss)
Code cell 12
# Solution
header("Exercise 3: BCE from logits")
z = np.array([-2.0, 0.0, 2.0])
y = np.array([0.0, 1.0, 1.0])
loss = np.maximum(z, 0) - z * y + np.log1p(np.exp(-np.abs(z)))
prob_loss = -(y * np.log(sigmoid(z)) + (1 - y) * np.log(1 - sigmoid(z)))
check_close("stable BCE equals probability BCE", loss, prob_loss)
print("\nTakeaway: Fused logit-space BCE avoids unstable probability logs.")
Exercise 4: Stable softmax CE (**)
Implement multiclass cross-entropy from logits using log-sum-exp.
Code cell 14
# Your Solution
logits = np.array([[2.0, 1.0, 0.0], [1000.0, 999.0, 998.0]])
targets = np.array([0, 2])
ce = None
print("ce:", ce)
Code cell 15
# Solution
header("Exercise 4: Stable softmax CE")
logits = np.array([[2.0, 1.0, 0.0], [1000.0, 999.0, 998.0]])
targets = np.array([0, 2])
ce = -logits[np.arange(len(targets)), targets] + logsumexp(logits, axis=1)
expected_first = -2.0 + np.log(np.exp(2.0) + np.exp(1.0) + np.exp(0.0))
check_close("first CE", ce[0], expected_first)
check_true("huge-logit CE finite", np.isfinite(ce).all())
print("\nTakeaway: subtracting the max inside log-sum-exp makes CE finite.")
Exercise 5: Masked sequence loss (**)
Compute a valid-token mean from token losses and a binary mask.
Code cell 17
# Your Solution
losses = np.array([[0.2, 0.5, 0.0], [0.3, 0.0, 0.0]])
mask = np.array([[1, 1, 0], [1, 0, 0]])
masked_mean = None
print("masked_mean:", masked_mean)
Code cell 18
# Solution
header("Exercise 5: Masked sequence loss")
losses = np.array([[0.2, 0.5, 0.0], [0.3, 0.0, 0.0]])
mask = np.array([[1, 1, 0], [1, 0, 0]])
masked_mean = np.sum(losses * mask) / np.sum(mask)
check_close("masked mean", masked_mean, (0.2 + 0.5 + 0.3) / 3)
print("\nTakeaway: sequence losses should divide by valid tokens, not padded length.")
Exercise 6: Focal loss (**)
Compare BCE and focal loss for easy and hard positive examples.
Code cell 20
# Your Solution
pt = np.array([0.95, 0.55, 0.10])
gamma = 2.0
focal = None
print("focal:", focal)
Code cell 21
# Solution
header("Exercise 6: Focal loss")
pt = np.array([0.95, 0.55, 0.10])
gamma = 2.0
bce = -np.log(pt)
focal = ((1 - pt) ** gamma) * bce
check_true("easy example downweighted most", focal[0] / bce[0] < focal[1] / bce[1] < focal[2] / bce[2])
print("BCE:", bce)
print("Focal:", focal)
print("\nTakeaway: focal loss preserves hard examples and suppresses easy ones.")
Exercise 7: InfoNCE (**)
Compute InfoNCE as row-wise cross-entropy over a similarity matrix.
Code cell 23
# Your Solution
sim = np.array([[2.0, 0.5, 0.1], [0.0, 1.5, 0.2], [0.1, 0.4, 1.0]])
temperature = 0.5
loss = None
print("loss:", loss)
Code cell 24
# Solution
header("Exercise 7: InfoNCE")
sim = np.array([[2.0, 0.5, 0.1], [0.0, 1.5, 0.2], [0.1, 0.4, 1.0]])
temperature = 0.5
labels = np.arange(sim.shape[0])
logits = sim / temperature
loss = -logits[np.arange(3), labels] + logsumexp(logits, axis=1)
check_true("positive diagonal gives low mean loss", loss.mean() < 0.5)
print("InfoNCE losses:", loss)
print("\nTakeaway: InfoNCE is cross-entropy where the correct class is the positive pair.")
Exercise 8: Triplet loss (***)
Compute triplet loss and identify whether the triplet violates the margin.
Code cell 26
# Your Solution
anchor = np.array([0.0, 0.0])
positive = np.array([0.4, 0.0])
negative = np.array([0.6, 0.0])
margin = 0.5
loss = None
print("loss:", loss)
Code cell 27
# Solution
header("Exercise 8: Triplet loss")
anchor = np.array([0.0, 0.0])
positive = np.array([0.4, 0.0])
negative = np.array([0.6, 0.0])
margin = 0.5
d_pos = np.sum((anchor - positive) ** 2)  # squared Euclidean distance to the positive
d_neg = np.sum((anchor - negative) ** 2)  # squared Euclidean distance to the negative
loss = max(0.0, d_pos - d_neg + margin)
check_close("triplet loss", loss, 0.16 - 0.36 + 0.5)
check_true("triplet violates margin", loss > 0)
print("\nTakeaway: triplet loss trains only when the positive is not closer by enough margin.")
Exercise 9: Preference loss (***)
Implement a DPO-style pairwise logistic loss.
Code cell 29
# Your Solution
logp_w = np.array([-2.0, -1.5])
logp_l = np.array([-3.0, -1.0])
ref_w = np.array([-2.2, -1.4])
ref_l = np.array([-2.8, -1.1])
beta = 0.2
loss = None
print("loss:", loss)
Code cell 30
# Solution
header("Exercise 9: Preference loss")
logp_w = np.array([-2.0, -1.5])
logp_l = np.array([-3.0, -1.0])
ref_w = np.array([-2.2, -1.4])
ref_l = np.array([-2.8, -1.1])
beta = 0.2
advantage = beta * ((logp_w - ref_w) - (logp_l - ref_l))
loss = np.logaddexp(0.0, -advantage)
check_true("preference loss finite", np.isfinite(loss).all())
check_true("better relative advantage lowers logistic loss", loss[0] < np.log(2))
print("\nTakeaway: preference losses optimize relative chosen-vs-rejected log probabilities.")
Exercise 10: Loss balancing (***)
Compute contribution shares for a weighted multi-term objective.
Code cell 32
# Your Solution
terms = {"ce": 0.8, "kl": 0.05, "aux": 2.0}
weights = {"ce": 1.0, "kl": 10.0, "aux": 0.1}
shares = None
print("shares:", shares)
Code cell 33
# Solution
header("Exercise 10: Loss balancing")
terms = {"ce": 0.8, "kl": 0.05, "aux": 2.0}
weights = {"ce": 1.0, "kl": 10.0, "aux": 0.1}
total = sum(terms[k] * weights[k] for k in terms)
shares = {k: terms[k] * weights[k] / total for k in terms}
check_close("shares sum to one", sum(shares.values()), 1.0)
check_true("CE dominates after weighting", shares["ce"] > shares["aux"])
print("shares:", shares)
print("\nTakeaway: loss coefficients should be judged by weighted contribution, not by raw value.")