Exercises Notebook — Math for LLMs

Derivatives and Differentiation

Calculus Fundamentals / Derivatives and Differentiation

Run notebook
Exercises Notebook

Exercises Notebook

Converted from exercises.ipynb for web reading.

Derivatives and Differentiation - Exercises

10 graded exercises covering the full section arc, from core calculus mechanics to ML-facing applications.

Format — Description
Problem — Markdown cell with task description
Your Solution — Code cell for learner work
Solution — Reference solution with checks

Difficulty: straightforward -> moderate -> challenging.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Use seaborn styling when it is installed; otherwise approximate it with
# matplotlib's bundled "seaborn-v0_8-whitegrid" style sheet.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Notebook-wide figure defaults: readable fonts, no top/right spines,
# tight bounding boxes and higher DPI on saved figures.
_FIGURE_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_FIGURE_DEFAULTS)

np.random.seed(42)  # reproducible randomness in every cell below
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches

# Colorblind-friendly hex palette shared by the plotting code in this chapter.
COLORS = {
    "primary": "#0077BB",    # blue
    "secondary": "#EE7733",  # orange
    "tertiary": "#009988",   # teal
    "error": "#CC3311",      # red
    "neutral": "#555555",    # grey
    "highlight": "#EE3377",  # magenta
}
HAS_MPL = True  # NOTE(review): set but not read in the visible cells

# Compact, deterministic numeric output for all printed arrays.
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)

def header(title):
    """Print *title* framed above and below by '=' rules of matching width."""
    rule = "=" * len(title)
    print(f"\n{rule}\n{title}\n{rule}")

def check_true(name, cond):
    """Print PASS/FAIL for a truthiness check and return the boolean result."""
    ok = bool(cond)
    verdict = "PASS" if ok else "FAIL"
    print(f"{verdict} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    """Print PASS/FAIL for a numeric comparison (tol used as both atol and
    rtol) and return the result of the comparison."""
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    verdict = "PASS" if ok else "FAIL"
    print(f"{verdict} - {name}: got {got}, expected {expected}")
    return ok



def centered_diff(f, x, h=1e-6):
    """Two-sided finite-difference estimate of f'(x) with O(h^2) error."""
    spread = f(x + h) - f(x - h)
    return spread / (2 * h)

def forward_diff(f, x, h=1e-6):
    """One-sided forward-difference estimate of f'(x) with O(h) error."""
    ahead = f(x + h)
    return (ahead - f(x)) / h

def backward_diff(f, x, h=1e-6):
    """One-sided backward-difference estimate of f'(x) with O(h) error."""
    here = f(x)
    return (here - f(x - h)) / h



def grad_check(f, x, analytic_grad, h=1e-6):
    """Relative error between *analytic_grad* and a centered-difference
    gradient of the scalar function *f* at *x*.

    Each coordinate is perturbed by ±h in turn; the error is the norm of
    the difference divided by the sum of the two gradient norms (plus a
    small constant to avoid division by zero).
    """
    x = np.asarray(x, dtype=float)
    analytic_grad = np.asarray(analytic_grad, dtype=float)
    numeric_grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        bumped_up = x.copy()
        bumped_dn = x.copy()
        bumped_up[idx] += h
        bumped_dn[idx] -= h
        numeric_grad[idx] = (f(bumped_up) - f(bumped_dn)) / (2 * h)
    scale = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / scale



def check(name, got, expected, tol=1e-8):
    # Convenience alias: defers to check_close with identical arguments.
    return check_close(name, got, expected, tol=tol)

print("Chapter helper setup complete.")

Exercise 1 (★): Basic Differentiation Rules

Implement analytical derivatives for three functions using the power, product, and quotient rules.

(a) $f(x) = 3x^4 - 2x^2 + 7x - 1$

(b) $g(x) = \sqrt{x}\,e^x$ (product rule)

(c) $h(x) = \dfrac{x^2 + 1}{x - 1}$ (quotient rule, $x \neq 1$)

For each, implement the derivative analytically, then verify against centered_diff.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: define f, g, h and their analytic derivatives, then verify each
# against centered_diff at a few sample points.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

def f(x):
    # (a) quartic polynomial
    return 3*x**4 - 2*x**2 + 7*x - 1

def f_prime(x):
    # Power rule applied term by term.
    return 12*x**3 - 4*x + 7

def g(x):
    # (b) sqrt(x) * e^x
    return np.sqrt(x) * np.exp(x)

def g_prime(x):
    # Product rule with u = sqrt(x), v = e^x: u'v + uv'.
    return (1/(2*np.sqrt(x))) * np.exp(x) + np.sqrt(x) * np.exp(x)

def h(x):
    # (c) rational function, defined for x != 1
    return (x**2 + 1) / (x - 1)

def h_prime(x):
    # Quotient rule: (u'v - uv') / v^2.
    return (2*x*(x-1) - (x**2+1)) / (x-1)**2

probe = 2.0
header('Exercise 1: Basic Differentiation Rules')

# Spot value, then each analytic derivative against the numerical estimate.
check_close('f\'(2.0) = 12(8)-8+7 = 95', f_prime(probe), 95.0)
for fn, fn_prime, label in ((f, f_prime, 'f'), (g, g_prime, 'g'), (h, h_prime, 'h')):
    check_close(f'{label}\' matches numerical', fn_prime(probe), centered_diff(fn, probe))

# Cross-check f' and g' at several points at once.
sample_xs = np.array([0.5, 1.0, 2.0, 3.5])
for fn, fn_prime, label in ((f, f_prime, 'f'), (g, g_prime, 'g')):
    worst = max(abs(fn_prime(x) - centered_diff(fn, x)) for x in sample_xs)
    check_true(f'{label}\'(x) accurate at 4 test points', worst < 1e-5)

print('\nTakeaway: Power rule gives polynomial derivatives; product/quotient rules handle compositions.')

print("Exercise 1 solution complete.")

Exercise 2 (★): Chain Rule

Differentiate using the chain rule and verify numerically.

(a) $p(x) = \sin(x^3)$

(b) $q(x) = \ln(\cos x)$ for $x \in (0, \pi/2)$

(c) $r(x) = e^{-x^2/2}$ (Gaussian kernel)

(d) $s(x) = (1 + x^2)^{10}$

Identify the outer and inner functions for each.

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: for each function identify the outer and inner pieces, then apply
# outer'(inner) * inner' and verify with centered_diff.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

def p(x):
    # outer = sin, inner = x^3
    return np.sin(x**3)

def p_prime(x):
    return np.cos(x**3) * 3*x**2

def q(x):
    # outer = ln, inner = cos x
    return np.log(np.cos(x))

def q_prime(x):
    # -sin(x)/cos(x), i.e. -tan(x)
    return -np.sin(x) / np.cos(x)

def r(x):
    # Gaussian kernel: outer = exp, inner = -x^2/2
    return np.exp(-x**2/2)

def r_prime(x):
    return np.exp(-x**2/2) * (-x)

def s(x):
    # outer = u^10, inner = 1 + x^2
    return (1 + x**2)**10

def s_prime(x):
    return 10*(1+x**2)**9 * 2*x

x_test = 1.0
header('Exercise 2: Chain Rule')

for label, fn, deriv, pt in (
    ('p\' matches numerical at x=1', p, p_prime, x_test),
    ('q\' matches numerical at x=0.5', q, q_prime, 0.5),
    ('r\' matches numerical at x=1', r, r_prime, x_test),
    ('s\' matches numerical at x=1', s, s_prime, x_test),
):
    check_close(label, deriv(pt), centered_diff(fn, pt))

# Confirm the simplified identity q'(x) = -tan(x).
check_close('q\'(0.5) = -tan(0.5)', q_prime(0.5), -np.tan(0.5))

print('\nTakeaway: Chain rule = differentiate outside × keep inside × derivative of inside.')

print("Exercise 2 solution complete.")

Exercise 3 (★): Implicit Differentiation

The ellipse $\dfrac{x^2}{4} + \dfrac{y^2}{9} = 1$ defines $y$ implicitly as a function of $x$.

(a) Differentiate both sides implicitly with respect to $x$ to find $dy/dx$.

(b) At the point $(1, \frac{3\sqrt{3}}{2})$ on the upper ellipse, compute the slope of the tangent line.

(c) Verify numerically: define $y(x) = 3\sqrt{1 - x^2/4}$ explicitly and compare your formula to centered_diff(y, 1.0).

(d) Find all points where the tangent is horizontal (slope = 0).

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: differentiate x^2/4 + y^2/9 = 1 term by term, solve for dy/dx,
# and verify against the explicit upper branch y(x) = 3*sqrt(1 - x^2/4).
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

import numpy as np

# (a) Differentiating x^2/4 + y^2/9 = 1 implicitly:
#     x/2 + (2y/9) y' = 0  =>  y' = -9x / (4y)
def dydx_ellipse(x, y):
    return -9*x / (4*y)

def y_upper(x):
    # Explicit upper branch, used only to verify the implicit formula.
    return 3 * np.sqrt(1 - x**2/4)

def _slope_numeric(x):
    # Centered difference of the explicit branch at x.
    return (y_upper(x+1e-6) - y_upper(x-1e-6)) / 2e-6

# (b) Slope at (1, 3*sqrt(3)/2).
x0 = 1.0
y0 = 3 * np.sqrt(1 - x0**2/4)
slope_at_point = dydx_ellipse(x0, y0)

header('Exercise 3: Implicit Differentiation — Ellipse')
print(f'Point: ({x0}, {y0:.6f})')
check_close('Implicit slope at (1, y0)', slope_at_point, _slope_numeric(x0))

# (c) Cross-check the implicit formula at several x on the upper branch.
for x in (0.5, 1.0, 1.5):
    y = 3*np.sqrt(1-x**2/4)
    check_close(f'slope at x={x}', dydx_ellipse(x, y), _slope_numeric(x), tol=1e-5)

# (d) Horizontal tangent needs y' = 0, i.e. x = 0 => points (0, ±3).
check_true('Horizontal tangent at (0, ±3)', abs(dydx_ellipse(0.0, 3.0)) < 1e-10)

print('\nTakeaway: Implicit differentiation avoids solving for y — differentiate the equation directly.')

print("Exercise 3 solution complete.")

Exercise 4 (★★): Activation Function Derivatives

(a) Derive $\sigma'(x) = \sigma(x)(1-\sigma(x))$ from scratch using the quotient rule.

(b) Implement sigmoid and its derivative, verify at $x \in \{-3, 0, 3\}$.

(c) Derive $\tanh'(x) = 1 - \tanh^2(x)$. Verify numerically.

(d) For a network with $L$ sigmoid layers, each with pre-activation near $z = 2$, compute the product $\prod_{l=1}^L \sigma'(z)$ for $L \in \{1, 5, 10, 20\}$. Explain the vanishing gradient problem.

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: implement sigmoid, sigmoid', and tanh'; then compute sigma'(2)^L
# for L in {1, 5, 10, 20} to see the vanishing-gradient effect.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

import numpy as np

def sigmoid(x):
    # Numerically stable logistic: branch on sign so exp never overflows.
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def sigmoid_prime(x):
    # (a) sigma' = sigma (1 - sigma), derived via the quotient rule.
    s = sigmoid(x)
    return s * (1 - s)

def tanh_prime(x):
    # (c) tanh' = 1 - tanh^2
    return 1 - np.tanh(x)**2

header('Exercise 4: Activation Function Derivatives')

# (b) Compare analytic sigma' with a centered difference at three points.
for x in (-3.0, 0.0, 3.0):
    numeric = (sigmoid(x+1e-6) - sigmoid(x-1e-6)) / 2e-6
    check_close(f'sigma\'({x:+.0f})', sigmoid_prime(x), numeric, tol=1e-5)

# The derivative peaks at x = 0 with value 1/4.
check_close('sigma\'(0) = 0.25', sigmoid_prime(0.0), 0.25)

# (c) Same comparison for tanh'.
for x in (-1.0, 0.0, 1.5):
    numeric = (np.tanh(x+1e-6) - np.tanh(x-1e-6)) / 2e-6
    check_close(f'tanh\'({x:+.1f})', tanh_prime(x), numeric, tol=1e-5)

# (d) Product of per-layer sigmoid gradients at z = 2.
print('\nVanishing gradient at z=2.0:')
slope = sigmoid_prime(2.0)
print(f'sigma\'(2.0) = {slope:.4f}')
for depth in (1, 5, 10, 20):
    shrink = slope**depth
    check_true(f'L={depth:2d}: product={shrink:.2e} (vanishing for L>=5)', depth < 4 or shrink < 1e-2)

print('\nTakeaway: Sigmoid gradients max at 0.25 — products vanish exponentially with depth.')

print("Exercise 4 solution complete.")

Exercise 5 (★★): Critical Points and Extrema

Consider $f(x) = x^4 - 4x^3 + 6x^2 - 4x + 1 = (x-1)^4$.

(a) Find all critical points analytically (where $f'(x) = 0$).

(b) Apply the second derivative test. What does it tell you? Is the test conclusive?

(c) Apply the first derivative test (check the sign of $f'$ on both sides of the critical point).

(d) Find the global minimum and maximum of $f$ on $[-1, 3]$.

(e) Plot $f$, $f'$, and $f''$ on $[-0.5, 2.5]$.

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: f(x) = (x-1)^4, so f'(x) = 4(x-1)^3 — check what the second
# derivative test says at x = 1 before trusting it.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

import numpy as np

def f(x):
    # The expanded quartic from the prompt equals (x-1)^4.
    return (x-1)**4

def f_prime(x):
    return 4*(x-1)**3

def f_double_prime(x):
    return 12*(x-1)**2

header('Exercise 5: Critical Points and Extrema')

# (a) f'(x) = 4(x-1)^3 vanishes only at x = 1 (a triple root).
x_crit = 1.0
check_close('f\'(1) = 0', f_prime(x_crit), 0.0)

# (b) Second derivative test gives f''(1) = 0, so it decides nothing here.
print(f'f\'\'(1) = {f_double_prime(x_crit)} — INCONCLUSIVE (= 0, but f = (x-1)^4 >= 0)')

# (c) First derivative test: f' changes sign from - to + across x = 1.
check_true('f\' sign change: - to + at x=1 (local min)', f_prime(0.5) < 0 and f_prime(1.5) > 0)

# (d) Compare the critical point against both interval endpoints.
candidates = {'x=-1': (-1.0, f(-1.0)), 'x=1 (crit)': (1.0, f(1.0)), 'x=3': (3.0, f(3.0))}
for name, (x, fx) in candidates.items():
    print(f'  {name}: f = {fx:.4f}')
check_close('Global min = f(1) = 0', f(1.0), 0.0)
check_close('Global max = f(-1) = 16 or f(3) = 16', f(-1.0), f(3.0))  # both endpoints give 16

# (e) f, f', f'' side by side around the critical point.
try:
    import matplotlib.pyplot as plt
    xs = np.linspace(-0.5, 2.5, 300)
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    for ax, fn, nm in zip(axes, [f, f_prime, f_double_prime], ['f(x)', "f'(x)", "f''(x)"]):
        ax.plot(xs, fn(xs), lw=2)
        ax.axhline(0, color='k', lw=0.7)
        ax.axvline(1, color='r', ls='--', alpha=0.5, label='x=1')
        ax.set_title(nm)
        ax.legend()
    plt.tight_layout()
    plt.show()
except ImportError:
    pass

print('\nTakeaway: f\'\'=0 is INCONCLUSIVE — always verify with sign of f\' or higher derivatives.')

print("Exercise 5 solution complete.")

Exercise 6 (★★): Mean Value Theorem Application

(a) Prove that $|\sin a - \sin b| \leq |a - b|$ for all $a, b \in \mathbb{R}$ using the MVT.

(b) Use the MVT to show that $|\ln x - \ln y| \leq \frac{|x-y|}{\min(x,y)}$ for $x, y > 0$.

(c) For $f(x) = x^3 - x$ on $[-2, 2]$, find all $c$ guaranteed by the MVT and verify numerically.

(d) The gradient descent lemma: if $|f''(x)| \leq L$ for all $x$, show that $|f'(x) - f'(y)| \leq L|x-y|$ (the gradient is $L$-Lipschitz). Verify for $f(x) = x^2$.

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: parts (a) and (b) follow from bounding |f'(c)| in the MVT identity
# f(b) - f(a) = f'(c)(b - a); part (c) solves 3c^2 - 1 = secant slope.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution
# Fix: the prompt's part (b) (log Lipschitz bound) was missing from the
# original solution; it is now verified alongside parts (a), (c), (d).

import numpy as np

def f(x):
    """Part (c) test function: f(x) = x^3 - x."""
    return x**3 - x

def f_prime(x):
    """Analytic derivative: f'(x) = 3x^2 - 1."""
    return 3*x**2 - 1

header('Exercise 6: Mean Value Theorem')

# (a) MVT: sin a - sin b = cos(c)(a - b) for some c, and |cos c| <= 1,
#     so |sin a - sin b| <= |a - b|. Spot-check on random pairs.
np.random.seed(42)
for _ in range(20):
    a, b = np.random.uniform(-10, 10, 2)
    lhs = abs(np.sin(a) - np.sin(b))
    rhs = abs(a - b)
    if lhs > rhs + 1e-10:
        print(f'FAIL at a={a:.2f}, b={b:.2f}')
        break
else:
    print('PASS — |sin a - sin b| <= |a-b| verified on 20 random pairs')

# (b) MVT with f(t) = ln t: ln x - ln y = (x - y)/c for some c between x
#     and y, and 1/c <= 1/min(x, y), so |ln x - ln y| <= |x-y|/min(x,y).
for _ in range(20):
    u, v = np.random.uniform(0.1, 10.0, 2)
    if abs(np.log(u) - np.log(v)) > abs(u - v) / min(u, v) + 1e-10:
        print(f'FAIL at x={u:.2f}, y={v:.2f}')
        break
else:
    print('PASS — |ln x - ln y| <= |x-y|/min(x,y) verified on 20 random pairs')

# (c) Find the MVT points c in (-2, 2) where f'(c) equals the secant slope.
a, b = -2.0, 2.0
secant_slope = (f(b) - f(a)) / (b - a)
print(f'\nSecant slope: {secant_slope:.4f}')
# Solve 3c^2 - 1 = secant_slope => c = ±sqrt((secant_slope + 1)/3).
c_sq = (secant_slope + 1) / 3
c_pos = np.sqrt(c_sq); c_neg = -np.sqrt(c_sq)
print(f'MVT c values: ±{c_pos:.4f}')
check_close('f\'(c_pos) = secant slope', f_prime(c_pos), secant_slope)
check_close('f\'(c_neg) = secant slope', f_prime(c_neg), secant_slope)
check_true('c_pos in (-2,2)', -2 < c_pos < 2)

# (d) |g''| <= L implies g' is L-Lipschitz: apply the MVT to g'.
def g(x):
    """Quadratic used to illustrate the descent lemma."""
    return x**2

def g_prime(x):
    """g'(x) = 2x; g''(x) = 2 everywhere, so L = 2."""
    return 2*x

L = 2.0
x1, x2 = 1.0, 3.0
lhs = abs(g_prime(x1) - g_prime(x2))
rhs = L * abs(x1 - x2)
print(f'\nLipschitz bound: {lhs:.4f} <= {rhs:.4f}')
check_true('|g\'(x1)-g\'(x2)| <= L|x1-x2|', lhs <= rhs + 1e-10)

print('\nTakeaway: MVT provides Lipschitz bounds that determine safe learning rates (eta <= 1/L).')

print("Exercise 6 solution complete.")

Exercise 7 (★★★): Numerical Differentiation and Gradient Checking

(a) Implement forward, backward, and centered finite differences for $f'(x)$.

(b) For $f(x) = x \ln(x + e^{-x})$, compute the analytic derivative and verify against each finite difference method at $x = 1.5$.

(c) Experiment with $h \in \{10^{-1}, 10^{-3}, 10^{-6}, 10^{-10}, 10^{-15}\}$. Record the error for each. Identify the optimal $h$ for centered differences in float64.

(d) Implement grad_check_vector for a vector-to-scalar function. Test it on the L2 loss $\mathcal{L}(\mathbf{w}) = \|\mathbf{w} - \mathbf{w}^*\|^2$ at $\mathbf{w} = [1, 2, 3]$, $\mathbf{w}^* = [0, 0, 0]$.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: sweep h over several orders of magnitude and watch truncation error
# shrink while round-off error grows — the best h sits in between.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

import numpy as np

# (a) The three finite-difference schemes with an explicit step size.
def forward_diff(f, x, h):
    return (f(x+h) - f(x)) / h

def backward_diff(f, x, h):
    return (f(x) - f(x-h)) / h

def centered_diff_h(f, x, h):
    return (f(x+h) - f(x-h)) / (2*h)

# (b) f(x) = x ln(x + e^{-x}) and its analytic derivative.
def f(x):
    return x * np.log(x + np.exp(-x))

def f_prime_analytic(x):
    # Product + chain rule:
    #   f'(x) = ln(x + e^{-x}) + x (1 - e^{-x}) / (x + e^{-x})
    return np.log(x + np.exp(-x)) + x * (1 - np.exp(-x)) / (x + np.exp(-x))

header('Exercise 7: Gradient Checking')

x = 1.5
true_deriv = f_prime_analytic(x)
print(f'Analytic f\'(1.5) = {true_deriv:.8f}')

# (c) Sweep h and track the best centered-difference error.
print(f'\n{"h":>8} {"Forward err":>14} {"Centered err":>14}')
best_ctr_h, best_ctr_err = None, np.inf
for h in (1e-1, 1e-3, 1e-6, 1e-10, 1e-15):
    fwd_err = abs(forward_diff(f, x, h) - true_deriv)
    ctr_err = abs(centered_diff_h(f, x, h) - true_deriv)
    if ctr_err < best_ctr_err:
        best_ctr_h, best_ctr_err = h, ctr_err
    print(f'{h:8.0e} {fwd_err:14.2e} {ctr_err:14.2e}')
print(f'\nOptimal h for centered: {best_ctr_h:.0e} (theoretical: ~1e-6)')

# (d) Vector gradient check on the L2 loss ||w||^2 (target w* = 0).
def L2_loss(w):
    return np.sum(w**2)

def L2_grad(w):
    return 2*w

w = np.array([1.0, 2.0, 3.0])
err = grad_check(L2_loss, w, L2_grad(w))
print(f'\nVector gradient check relative error: {err:.2e}')
check_true('Gradient check passes (err < 1e-5)', err < 1e-5)

print('\nTakeaway: Centered differences have O(h^2) error — optimal h ≈ 1e-6 for float64.')

print("Exercise 7 solution complete.")

Exercise 8 (★★★): Backpropagation via Chain Rule

A 2-layer network with sigmoid activations and MSE loss:

$$z_1 = w_1 x + b_1, \quad a_1 = \sigma(z_1), \quad z_2 = w_2 a_1 + b_2, \quad \hat{y} = \sigma(z_2), \quad \mathcal{L} = \tfrac{1}{2}(\hat{y} - y)^2$$

(a) Derive $\partial\mathcal{L}/\partial w_2$ and $\partial\mathcal{L}/\partial b_2$ using the chain rule.

(b) Derive $\partial\mathcal{L}/\partial w_1$ and $\partial\mathcal{L}/\partial b_1$ (requires backpropagating through 2 sigmoid layers).

(c) Implement the forward and backward passes. Verify all 4 gradients against numerical estimates.

(d) Run 200 gradient descent steps with $\eta = 0.5$ on the single example $(x=1.5, y=0.7)$. Plot the loss curve.

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: cache z1, a1, z2, a2 in the forward pass; the backward pass multiplies
# local derivatives from the loss back to each parameter.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

import numpy as np

def sigmoid(x):
    # Clip the argument so exp never overflows.
    return 1/(1+np.exp(-np.clip(x,-500,500)))

def sigmoid_prime(x):
    s = sigmoid(x)
    return s*(1-s)

def forward(x, y, w1, b1, w2, b2):
    # Forward pass; returns the loss and every intermediate backprop needs.
    z1 = w1*x + b1
    a1 = sigmoid(z1)
    z2 = w2*a1 + b2
    a2 = sigmoid(z2)
    loss = 0.5*(a2 - y)**2
    return loss, {'z1':z1,'a1':a1,'z2':z2,'a2':a2,'x':x,'y':y}

def backward(cache, y, w1, b1, w2, b2):
    # Chain rule applied from the loss back to each parameter.
    z1 = cache['z1']
    a1 = cache['a1']
    z2 = cache['z2']
    a2 = cache['a2']
    x = cache['x']
    dL_dz2 = (a2 - y) * sigmoid_prime(z2)          # dL/da2 * da2/dz2
    dL_dz1 = dL_dz2 * w2 * sigmoid_prime(z1)       # back through layer 1
    return {'w1': dL_dz1 * x, 'b1': dL_dz1, 'w2': dL_dz2 * a1, 'b2': dL_dz2}

x, y = 1.5, 0.7
w1, b1, w2, b2 = 0.5, -0.3, 1.2, 0.1
params = [w1, b1, w2, b2]

header('Exercise 8: Backpropagation')
loss, cache = forward(x, y, w1, b1, w2, b2)
grads = backward(cache, y, w1, b1, w2, b2)

# (c) Perturb each parameter in turn and compare with the analytic gradient.
h = 1e-5
for i, pname in enumerate(('w1', 'b1', 'w2', 'b2')):
    p_plus = params[:]
    p_minus = params[:]
    p_plus[i] += h
    p_minus[i] -= h
    num_grad = (forward(x, y, *p_plus)[0] - forward(x, y, *p_minus)[0]) / (2*h)
    check_close(f'd{pname}', grads[pname], num_grad, tol=1e-6)

# (d) Plain gradient descent on the single training example.
eta = 0.5
losses = []
for _ in range(200):
    loss, cache = forward(x, y, w1, b1, w2, b2)
    losses.append(loss)
    grads = backward(cache, y, w1, b1, w2, b2)
    w1 -= eta*grads['w1']
    b1 -= eta*grads['b1']
    w2 -= eta*grads['w2']
    b2 -= eta*grads['b2']

check_true('Loss decreased (training works)', losses[-1] < losses[0])
print(f'Initial loss: {losses[0]:.6f}, Final loss: {losses[-1]:.6f}')

try:
    import matplotlib.pyplot as plt
    plt.figure(figsize=(8,5))
    plt.semilogy(losses, 'b-', lw=2)
    plt.xlabel('Step')
    plt.ylabel('Loss (log)')
    plt.title('2-Layer Network Training Loss')
    plt.tight_layout()
    plt.show()
except ImportError:
    pass

print('\nTakeaway: Backpropagation = chain rule applied layer by layer from output to input.')

print("Exercise 8 solution complete.")

Exercise 9 (★★★): Binary Cross-Entropy Gradient from Logits

For a label $y \in \{0,1\}$ and logit $a$, the binary cross-entropy is

$$\ell(a,y) = -y\log \sigma(a) - (1-y)\log(1-\sigma(a)), \qquad \sigma(a)=\frac{1}{1+e^{-a}}.$$

Derive and verify the compact derivative

$$\frac{\partial \ell}{\partial a} = \sigma(a)-y.$$

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Derive d loss / d logit and verify it with finite differences.
# Hint: write the loss directly in terms of the logit, differentiate, and
# simplify — the result collapses to sigmoid(a) - y.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - BCE logit gradient
header("Exercise 9: BCE gradient from logits")

def sigmoid(a):
    # Stable logistic, branching on the sign of the logit.
    a = np.asarray(a, dtype=float)
    return np.where(a >= 0, 1 / (1 + np.exp(-a)), np.exp(a) / (1 + np.exp(a)))

def bce_from_logit(a, y):
    # Stable equivalent of -y log(sigmoid(a)) - (1-y) log(1-sigmoid(a)).
    return np.maximum(a, 0) - a * y + np.log1p(np.exp(-np.abs(a)))

def centered_diff(f, x, h=1e-6):
    # Local two-sided difference (redefines the chapter helper in this cell).
    return (f(x + h) - f(x - h)) / (2 * h)

def _report(a, y):
    # Compare the compact analytic gradient sigma(a) - y with a numeric one.
    analytic = float(sigmoid(a) - y)
    numeric = centered_diff(lambda t: bce_from_logit(t, y), a)
    print(f"a={a:+.2f}, y={y:.0f}: analytic={analytic:+.8f}, numeric={numeric:+.8f}")
    check_close("gradient check", analytic, numeric, tol=1e-6)

for logit, label in ((-2.0, 0.0), (-0.4, 1.0), (1.7, 1.0), (3.0, 0.0)):
    _report(logit, label)
print("Takeaway: classification backprop sends prediction error, sigmoid(a)-y, through the graph.")

Exercise 10 (★★★): Manual Backprop for a Tiny Tanh Network

Consider

$$h = \tanh(Wx+b), \qquad \hat{y} = v^\top h + c, \qquad L=\frac{1}{2}(\hat{y}-y)^2.$$

Derive gradients for $v, c, W, b, x$ and verify one component by finite differences.

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Work backward from dL/dyhat through the tanh hidden layer.
# Hint: dL/dyhat = yhat - y; push it through v, then through tanh'(a) =
# 1 - tanh(a)^2, and finally into W, b, and x.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - manual backprop through tanh network
header("Exercise 10: manual backprop through tanh network")

# Fixed parameters: 2 inputs, 3 tanh hidden units, scalar output.
W = np.array([[0.4, -0.2], [0.1, 0.3], [-0.5, 0.2]])
b = np.array([0.1, -0.2, 0.05])
v = np.array([0.7, -0.4, 0.2])
c = -0.1
x = np.array([1.2, -0.7])
y = 0.35

def forward(W, b, v, c, x, y):
    # h = tanh(Wx + b), yhat = v.h + c, L = (yhat - y)^2 / 2
    a = W @ x + b
    h = np.tanh(a)
    yhat = v @ h + c
    loss = 0.5 * (yhat - y)**2
    return loss, (a, h, yhat)

loss, (a, h, yhat) = forward(W, b, v, c, x, y)

# Backward pass: reuse dL/dyhat for every downstream gradient.
dyhat = yhat - y
grad_v = dyhat * h
grad_c = dyhat
grad_h = dyhat * v
grad_a = grad_h * (1 - h**2)       # tanh'(a) = 1 - tanh(a)^2
grad_W = np.outer(grad_a, x)
grad_b = grad_a
grad_x = W.T @ grad_a

# Finite-difference check on a single weight, W[1,0].
def loss_w10(w10):
    perturbed = W.copy()
    perturbed[1,0] = w10
    return forward(perturbed, b, v, c, x, y)[0]

num = (loss_w10(W[1,0] + 1e-6) - loss_w10(W[1,0] - 1e-6)) / (2e-6)
print("loss:", loss)
print("grad_W:\n", grad_W)
print("grad_x:", grad_x)
check_close("finite difference W[1,0]", grad_W[1,0], num, tol=1e-6)
print("Takeaway: backprop is the chain rule organized so every local derivative is reused once.")

What to Review After Finishing

  • Can you apply the chain rule to a 3+ layer composition without looking it up?
  • Can you derive $\sigma'(x) = \sigma(x)(1-\sigma(x))$ from scratch?
  • Do you understand why centered differences have $O(h^2)$ error vs. $O(h)$ for forward differences?
  • Can you explain the vanishing gradient problem quantitatively?
  • Can you implement a full forward/backward pass for a 2-layer network?
  • Do you know the second derivative test — including when it's inconclusive?

References

  1. Stewart, J. Calculus: Early Transcendentals, 8th ed. — Chapters 3–4
  2. Goodfellow, Bengio, Courville. Deep Learning — Chapter 6 (backpropagation)
  3. Rumelhart et al. (1986). Learning representations by back-propagating errors. Nature.
  4. He et al. (2015). Delving deep into rectifiers (ReLU/PReLU analysis)
  5. Kingma & Ba (2015). Adam: A method for stochastic optimization