Exercises Notebook
Converted from exercises.ipynb for web reading.
Exercises: Reinforcement Learning
There are 10 exercises. Exercises 1-3 cover MDP mechanics, 4-7 cover TD/control/policy gradients, and 8-10 connect actor-critic, PPO, and preference optimization to modern AI systems.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def softmax(logits):
    logits = np.asarray(logits, dtype=float)
    shifted = logits - np.max(logits)
    exp = np.exp(shifted)
    return exp / exp.sum()

def chain_mdp():
    # Three nonterminal states plus one terminal state.
    n_states, n_actions, terminal = 4, 2, 3
    P = np.zeros((n_states, n_actions, n_states))
    R = np.zeros((n_states, n_actions, n_states))
    for s in range(n_states):
        if s == terminal:
            P[s, :, terminal] = 1.0
            continue
        left = max(0, s - 1)
        right = min(terminal, s + 1)
        P[s, 0, left] = 1.0
        P[s, 1, right] = 1.0
        if right == terminal:
            R[s, 1, terminal] = 1.0
    return P, R, terminal

def policy_transition_reward(P, R, policy):
    P_pi = np.einsum("sa,san->sn", policy, P)
    r_pi = np.einsum("sa,san,san->s", policy, P, R)
    return P_pi, r_pi

def policy_evaluation(P, R, policy, gamma=0.9):
    P_pi, r_pi = policy_transition_reward(P, R, policy)
    A = np.eye(P.shape[0]) - gamma * P_pi
    return np.linalg.solve(A, r_pi)

def bellman_optimality_backup(P, R, V, gamma=0.9):
    q = np.einsum("san,san->sa", P, R + gamma * V[None, None, :])
    return q.max(axis=1), q.argmax(axis=1), q

def value_iteration(P, R, gamma=0.9, steps=25):
    V = np.zeros(P.shape[0])
    history = []
    for _ in range(steps):
        new_V, policy, q = bellman_optimality_backup(P, R, V, gamma)
        history.append(float(np.max(np.abs(new_V - V))))
        V = new_V
    return V, policy, np.array(history)

def td_update(v_s, reward, v_next, alpha=0.2, gamma=0.9, done=False):
    target = reward if done else reward + gamma * v_next
    return v_s + alpha * (target - v_s), target

def q_learning_update(q_sa, reward, next_q, alpha=0.3, gamma=0.9, done=False):
    target = reward if done else reward + gamma * np.max(next_q)
    return q_sa + alpha * (target - q_sa), target

def sarsa_update(q_sa, reward, q_next_action, alpha=0.3, gamma=0.9, done=False):
    target = reward if done else reward + gamma * q_next_action
    return q_sa + alpha * (target - q_sa), target

def discounted_returns(rewards, gamma=0.9):
    out = []
    g = 0.0
    for r in reversed(rewards):
        g = float(r) + gamma * g
        out.append(g)
    return np.array(list(reversed(out)))

def generalized_advantages(rewards, values, gamma=0.9, lam=0.95):
    rewards = np.asarray(rewards, dtype=float)
    values = np.asarray(values, dtype=float)
    adv = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] - values[t]
        gae = delta + gamma * lam * gae
        adv[t] = gae
    return adv

def ppo_clip_objective(ratio, advantage, eps=0.2):
    unclipped = ratio * advantage
    clipped = np.clip(ratio, 1 - eps, 1 + eps) * advantage
    return np.minimum(unclipped, clipped)
print("Reinforcement-learning helpers ready.")
Exercise 1: Bellman policy evaluation (*)
Solve a fixed-policy value equation in a tiny chain MDP. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 5
# Your Solution - Exercise 1
answer = None
print("Your answer placeholder:", answer)
Code cell 6
# Solution - Exercise 1
header("Exercise 1: Bellman policy evaluation")
P, R, terminal = chain_mdp()
policy = np.array([[0.4, 0.6], [0.3, 0.7], [0.2, 0.8], [0.5, 0.5]])
V = policy_evaluation(P, R, policy, gamma=0.9)
print("Value:", np.round(V, 4).tolist())
check_true(V[2] > V[1] > V[0], "value increases near terminal reward")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 2: Value iteration (*)
Apply optimal Bellman backups and inspect the greedy policy. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 8
# Your Solution - Exercise 2
answer = None
print("Your answer placeholder:", answer)
Code cell 9
# Solution - Exercise 2
header("Exercise 2: Value iteration")
P, R, terminal = chain_mdp()
V, policy, history = value_iteration(P, R, gamma=0.9, steps=8)
print("History:", np.round(history, 6).tolist())
print("Policy:", policy.tolist())
check_true(history[-1] <= history[0], "Bellman changes shrink")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 3: Monte Carlo return (*)
Compute delayed credit from a sampled episode. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 11
# Your Solution - Exercise 3
answer = None
print("Your answer placeholder:", answer)
Code cell 12
# Solution - Exercise 3
header("Exercise 3: Monte Carlo return")
returns = discounted_returns([0.0, 0.0, 1.0], gamma=0.9)
print("Returns:", np.round(returns, 4).tolist())
check_close(returns[0], 0.81, name="two-step delayed reward")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 4: TD target (**)
Compare current value with a bootstrapped one-step target. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 14
# Your Solution - Exercise 4
answer = None
print("Your answer placeholder:", answer)
Code cell 15
# Solution - Exercise 4
header("Exercise 4: TD target")
new_v, target = td_update(0.2, reward=0.5, v_next=0.6, alpha=0.25, gamma=0.9)
print("Target:", round(target, 4), "new value:", round(new_v, 4))
check_close(target, 1.04, name="TD target")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 5: SARSA and Q-learning (**)
Compute on-policy and off-policy control targets. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 17
# Your Solution - Exercise 5
answer = None
print("Your answer placeholder:", answer)
Code cell 18
# Solution - Exercise 5
header("Exercise 5: SARSA and Q-learning")
next_q = np.array([0.1, 0.8])
q_sarsa, _ = sarsa_update(0.2, reward=0.0, q_next_action=next_q[0], alpha=0.5, gamma=0.9)
q_ql, _ = q_learning_update(0.2, reward=0.0, next_q=next_q, alpha=0.5, gamma=0.9)
print("SARSA:", round(q_sarsa, 4), "Q-learning:", round(q_ql, 4))
check_true(q_ql > q_sarsa, "Q-learning uses greedy next value")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 6: Epsilon greedy policy (**)
Build a valid exploration distribution. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 20
# Your Solution - Exercise 6
answer = None
print("Your answer placeholder:", answer)
Code cell 21
# Solution - Exercise 6
header("Exercise 6: Epsilon greedy policy")
q = np.array([0.0, 2.0, 1.0, -1.0])
eps = 0.2
probs = np.ones_like(q) * eps / len(q)
probs[np.argmax(q)] += 1 - eps
print("Probabilities:", np.round(probs, 4).tolist())
check_close(probs.sum(), 1.0, name="epsilon greedy normalizes")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 7: Policy gradient score (**)
Estimate a softmax policy-gradient direction. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 23
# Your Solution - Exercise 7
answer = None
print("Your answer placeholder:", answer)
Code cell 24
# Solution - Exercise 7
header("Exercise 7: Policy gradient score")
logits = np.array([0.3, -0.2, 0.1])
probs = softmax(logits)
action = 2
advantage = 1.2
grad = (np.eye(3)[action] - probs) * advantage
print("Gradient:", np.round(grad, 4).tolist())
check_close(grad.sum(), 0.0, name="score gradient conservation")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 8: Generalized advantage estimation (***)
Compute smoothed advantages from rewards and values. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 26
# Your Solution - Exercise 8
answer = None
print("Your answer placeholder:", answer)
Code cell 27
# Solution - Exercise 8
header("Exercise 8: Generalized advantage estimation")
rewards = [0.0, 0.5, 1.0]
values = [0.2, 0.3, 0.4, 0.0]
adv = generalized_advantages(rewards, values, gamma=0.9, lam=0.95)
print("GAE:", np.round(adv, 4).tolist())
check_true(np.all(adv > 0), "positive rewards create positive advantages here")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 9: PPO clipping (***)
Evaluate a clipped surrogate update. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 29
# Your Solution - Exercise 9
answer = None
print("Your answer placeholder:", answer)
Code cell 30
# Solution - Exercise 9
header("Exercise 9: PPO clipping")
ratios = np.array([0.6, 1.0, 1.4])
adv = np.ones_like(ratios)
obj = ppo_clip_objective(ratios, adv, eps=0.2)
print("Objective:", np.round(obj, 4).tolist())
check_close(obj[-1], 1.2, name="ratio clipped at upper bound")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")
Exercise 10: Preference optimization (***)
Compute a reward-model probability and DPO-style loss. State the object being estimated, compute the numeric answer, and interpret the result.
Code cell 32
# Your Solution - Exercise 10
answer = None
print("Your answer placeholder:", answer)
Code cell 33
# Solution - Exercise 10
header("Exercise 10: Preference optimization")
policy_gap = 1.1
reference_gap = 0.4
beta = 0.5
prob = 1 / (1 + np.exp(-beta * (policy_gap - reference_gap)))
loss = -np.log(prob)
print("Preference probability:", round(float(prob), 4))
print("Loss:", round(float(loss), 4))
check_true(loss > 0, "negative log-likelihood is positive")
print("\nTakeaway: reinforcement-learning math is safest when the target equation and sampling assumption are explicit.")