Converted from exercises.ipynb for web reading.

Exercises: Reinforcement Learning

There are 10 exercises. Exercises 1-3 cover MDP mechanics, 4-7 cover TD/control/policy gradients, and 8-10 connect actor-critic, PPO, and preference optimization to modern AI systems.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def softmax(logits):
    logits = np.asarray(logits, dtype=float)
    shifted = logits - np.max(logits)
    exp = np.exp(shifted)
    return exp / exp.sum()

def chain_mdp():
    # Three nonterminal states plus one terminal state; action 0 moves left,
    # action 1 moves right, and stepping right into the terminal state pays reward 1.
    n_states, n_actions, terminal = 4, 2, 3
    P = np.zeros((n_states, n_actions, n_states))
    R = np.zeros((n_states, n_actions, n_states))
    for s in range(n_states):
        if s == terminal:
            P[s, :, terminal] = 1.0
            continue
        left = max(0, s - 1)
        right = min(terminal, s + 1)
        P[s, 0, left] = 1.0
        P[s, 1, right] = 1.0
        if right == terminal:
            R[s, 1, terminal] = 1.0
    return P, R, terminal

def policy_transition_reward(P, R, policy):
    # P_pi[s, s'] = sum_a policy[s, a] * P[s, a, s']: state-to-state transitions under the policy.
    P_pi = np.einsum("sa,san->sn", policy, P)
    # r_pi[s]: expected one-step reward from state s under the policy.
    r_pi = np.einsum("sa,san,san->s", policy, P, R)
    return P_pi, r_pi

def policy_evaluation(P, R, policy, gamma=0.9):
    # Solve the linear Bellman system (I - gamma * P_pi) V = r_pi exactly.
    P_pi, r_pi = policy_transition_reward(P, R, policy)
    A = np.eye(P.shape[0]) - gamma * P_pi
    return np.linalg.solve(A, r_pi)

def bellman_optimality_backup(P, R, V, gamma=0.9):
    # q[s, a] = sum_n P[s, a, n] * (R[s, a, n] + gamma * V[n]); greedy max over actions.
    q = np.einsum("san,san->sa", P, R + gamma * V[None, None, :])
    return q.max(axis=1), q.argmax(axis=1), q

def value_iteration(P, R, gamma=0.9, steps=25):
    V = np.zeros(P.shape[0])
    history = []
    for _ in range(steps):
        new_V, policy, q = bellman_optimality_backup(P, R, V, gamma)
        history.append(float(np.max(np.abs(new_V - V))))
        V = new_V
    return V, policy, np.array(history)

def td_update(v_s, reward, v_next, alpha=0.2, gamma=0.9, done=False):
    target = reward if done else reward + gamma * v_next
    return v_s + alpha * (target - v_s), target

def q_learning_update(q_sa, reward, next_q, alpha=0.3, gamma=0.9, done=False):
    target = reward if done else reward + gamma * np.max(next_q)
    return q_sa + alpha * (target - q_sa), target

def sarsa_update(q_sa, reward, q_next_action, alpha=0.3, gamma=0.9, done=False):
    target = reward if done else reward + gamma * q_next_action
    return q_sa + alpha * (target - q_sa), target

def discounted_returns(rewards, gamma=0.9):
    out = []
    g = 0.0
    for r in reversed(rewards):
        g = float(r) + gamma * g
        out.append(g)
    return np.array(list(reversed(out)))

def generalized_advantages(rewards, values, gamma=0.9, lam=0.95):
    # values must have length len(rewards) + 1 so the final step can bootstrap.
    rewards = np.asarray(rewards, dtype=float)
    values = np.asarray(values, dtype=float)
    adv = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        # One-step TD error, then the exponentially weighted backward recursion.
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        adv[t] = gae
    return adv

def ppo_clip_objective(ratio, advantage, eps=0.2):
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1 - eps, 1 + eps) * advantage
    return np.minimum(unclipped, clipped)

print("Reinforcement-learning helpers ready.")

Exercise 1: Bellman policy evaluation (*)

Solve a fixed-policy value equation in a tiny chain MDP. State the object being estimated, compute the numeric answer, and interpret the result.
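
One way to make the target explicit before computing: the object is the fixed-policy state value $V^\pi$, defined by the Bellman expectation equation

$V^\pi(s) = \sum_a \pi(a \mid s) \sum_{s'} P(s' \mid s, a)\,\big[R(s, a, s') + \gamma V^\pi(s')\big],$

or in matrix form $V^\pi = (I - \gamma P^\pi)^{-1} r^\pi$, which is the linear system the policy_evaluation helper solves.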

Code cell 5

# Your Solution - Exercise 1
answer = None
print("Your answer placeholder:", answer)

Code cell 6

# Solution - Exercise 1
header("Exercise 1: Bellman policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.4, 0.6], [0.3, 0.7], [0.2, 0.8], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "value increases near terminal reward")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 2: Value iteration (*)

Apply optimal Bellman backups and inspect the greedy policy. State the object being estimated, compute the numeric answer, and interpret the result.
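
The target here is the optimal value function $V^*$, reached by iterating the Bellman optimality backup

$V_{k+1}(s) = \max_a \sum_{s'} P(s' \mid s, a)\,\big[R(s, a, s') + \gamma V_k(s')\big],$

with the greedy policy read off as the maximizing action. The history returned by value_iteration records $\max_s |V_{k+1}(s) - V_k(s)|$, which shrinks toward zero for $\gamma < 1$.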

Code cell 8

# Your Solution - Exercise 2
answer = None
print("Your answer placeholder:", answer)

Code cell 9

# Solution - Exercise 2
header("Exercise 2: Value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=8)
print("History:", np.round(history, 6).tolist())
print("Policy:", policy.tolist())
check_true(history[-1] <= history[0], "Bellman changes shrink")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 3: Monte Carlo return (*)

Compute delayed credit from a sampled episode. State the object being estimated, compute the numeric answer, and interpret the result.
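
The object is the Monte Carlo return, the discounted sum of sampled rewards

$G_t = r_t + \gamma r_{t+1} + \gamma^2 r_{t+2} + \dots = r_t + \gamma G_{t+1},$

which the discounted_returns helper computes with the backward recursion (rewards indexed by the step at which they are received).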

Code cell 11

# Your Solution - Exercise 3
answer = None
print("Your answer placeholder:", answer)

Code cell 12

# Solution - Exercise 3
header("Exercise 3: Monte Carlo return")
returns = discounted_returns([0.0, 0.0, 1.0], gamma=0.9)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="two-step delayed reward")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 4: TD target (**)

Compare current value with a bootstrapped one-step target. State the object being estimated, compute the numeric answer, and interpret the result.
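
The object is the one-step temporal-difference target $r + \gamma V(s')$, and the update moves the current estimate toward it:

$V(s) \leftarrow V(s) + \alpha\,\big[r + \gamma V(s') - V(s)\big].$

The bracketed term is the TD error; on a terminal transition the bootstrap term $\gamma V(s')$ is dropped.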

Code cell 14

# Your Solution - Exercise 4
answer = None
print("Your answer placeholder:", answer)

Code cell 15

# Solution - Exercise 4
header("Exercise 4: TD target")
new_v, target = td_update(0.2, reward=0.5, v_next=0.6, alpha=0.25, gamma=0.9)
print("Target:", round(target, 4), "new value:", round(new_v, 4))
check_close(target, 1.04, name="TD target")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 5: SARSA and Q-learning (**)

Compute on-policy and off-policy control targets. State the object being estimated, compute the numeric answer, and interpret the result.
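
Both updates estimate action values but bootstrap differently: SARSA uses the value of the next action actually taken (on-policy), while Q-learning uses the greedy maximum (off-policy):

SARSA target: $r + \gamma Q(s', a')$; Q-learning target: $r + \gamma \max_{a'} Q(s', a')$.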

Code cell 17

# Your Solution - Exercise 5
answer = None
print("Your answer placeholder:", answer)

Code cell 18

# Solution - Exercise 5
header("Exercise 5: SARSA and Q-learning")
next_q = np.array([0.1, 0.8])
q_sarsa, _ = sarsa_update(0.2, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, _ = q_learning_update(0.2, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA:", round(q_sarsa, 4), "Q-learning:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "Q-learning uses greedy next value")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 6: Epsilon greedy policy (**)

Build a valid exploration distribution. State the object being estimated, compute the numeric answer, and interpret the result.
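
The object is a full action distribution, not a single action: with probability $\epsilon$ the agent explores uniformly and otherwise acts greedily,

$\pi(a \mid s) = \frac{\epsilon}{|\mathcal{A}|} + (1 - \epsilon)\,\mathbf{1}\big[a = \arg\max_{a'} Q(s, a')\big],$

which sums to one by construction; the check in the solution verifies exactly that.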

Code cell 20

# Your Solution - Exercise 6
answer = None
print("Your answer placeholder:", answer)

Code cell 21

# Solution - Exercise 6
header("Exercise 6: Epsilon greedy policy")
q = np.array([0.0, 2.0, 1.0, -1.0])
eps = 0.2
probs = np.ones_like(q) * eps / len(q)
probs[np.argmax(q)] += 1 - eps
print("Probabilities:", np.round(probs, 4).tolist())
check_close(probs.sum(), 1.0, name="epsilon greedy normalizes")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 7: Policy gradient score (**)

Estimate a softmax policy-gradient direction. State the object being estimated, compute the numeric answer, and interpret the result.
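
The object is the score-function gradient for one sampled action under a softmax policy. For logits $z$ and probabilities $\pi = \mathrm{softmax}(z)$, the gradient of $\log \pi(a)$ with respect to the logits is $e_a - \pi$ (one-hot minus probabilities), so the advantage-weighted estimate is

$\hat{g} = (e_a - \pi)\,\hat{A},$

whose components sum to zero because the softmax probabilities sum to one.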

Code cell 23

# Your Solution - Exercise 7
answer = None
print("Your answer placeholder:", answer)

Code cell 24

# Solution - Exercise 7
header("Exercise 7: Policy gradient score")
logits = np.array([0.3, -0.2, 0.1])
probs = softmax(logits)
action = 2
advantage = 1.2
grad = (np.eye(3)[action] - probs) * advantage
print("Gradient:", np.round(grad, 4).tolist())
check_close(grad.sum(), 0.0, name="score gradient conservation")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 8: Generalized advantage estimation (***)

Compute smoothed advantages from rewards and values. State the object being estimated, compute the numeric answer, and interpret the result.
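
The object is the GAE advantage, an exponentially weighted sum of one-step TD errors

$\delta_t = r_t + \gamma V(s_{t+1}) - V(s_t), \qquad \hat{A}_t = \sum_{k \ge 0} (\gamma \lambda)^k \delta_{t+k} = \delta_t + \gamma \lambda \hat{A}_{t+1},$

computed backwards over the trajectory; the helper expects one more value than reward so the final step can bootstrap.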

Code cell 26

# Your Solution - Exercise 8
answer = None
print("Your answer placeholder:", answer)

Code cell 27

# Solution - Exercise 8
header("Exercise 8: Generalized advantage estimation")
rewards = [0.0, 0.5, 1.0]
values = [0.2, 0.3, 0.4, 0.0]
adv = generalized_advantages(rewards, values, gamma=0.9, lam=0.95)
print("GAE:", np.round(adv, 4).tolist())
check_true(np.all(adv > 0), "positive rewards create positive advantages here")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 9: PPO clipping (***)

Evaluate a clipped surrogate update. State the object being estimated, compute the numeric answer, and interpret the result.
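
The object is the clipped surrogate objective evaluated per sample. With probability ratio $r_t = \pi_\theta(a_t \mid s_t) / \pi_{\theta_{\text{old}}}(a_t \mid s_t)$ and advantage $\hat{A}_t$,

$L^{\text{clip}}_t = \min\big(r_t \hat{A}_t,\ \mathrm{clip}(r_t, 1 - \epsilon, 1 + \epsilon)\,\hat{A}_t\big),$

so ratios that drift outside $[1 - \epsilon, 1 + \epsilon]$ stop contributing additional objective in the direction of the advantage.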

Code cell 29

# Your Solution - Exercise 9
answer = None
print("Your answer placeholder:", answer)

Code cell 30

# Solution - Exercise 9
header("Exercise 9: PPO clipping")
ratios = np.array([0.6, 1.0, 1.4])
adv = np.ones_like(ratios)
obj = ppo_clip_objective(ratios, adv, eps=0.2)
print("Objective:", np.round(obj, 4).tolist())
check_close(obj[-1], 1.2, name="ratio clipped at upper bound")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")

Exercise 10: Preference optimization (***)

Compute a reward-model probability and DPO-style loss. State the object being estimated, compute the numeric answer, and interpret the result.
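
The object is a Bradley-Terry preference probability under a DPO-style parameterization. Reading policy_gap and reference_gap in the solution as the chosen-minus-rejected log-probability margins under the policy and the reference model (an interpretation of the variable names, not stated in the prompt),

$p = \sigma\big(\beta\,[\Delta_\theta - \Delta_{\text{ref}}]\big), \qquad \mathcal{L} = -\log p,$

so the loss shrinks as the policy widens its margin on the preferred response relative to the reference.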

Code cell 32

# Your Solution - Exercise 10
answer = None
print("Your answer placeholder:", answer)

Code cell 33

# Solution - Exercise 10
header("Exercise 10: Preference optimization")
policy_gap = 1.1
reference_gap = 0.4
beta = 0.5
prob = 1 / (1 + np.exp(-beta * (policy_gap - reference_gap)))
loss = -np.log(prob)
print("Preference probability:", round(float(prob), 4))
print("Loss:", round(float(loss), 4))
check_true(loss > 0, "negative log-likelihood is positive")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")