Exercises Notebook — Math for LLMs

Derivatives and Differentiation

Calculus Fundamentals / Derivatives and Differentiation

Run notebook
Exercises Notebook

Exercises Notebook

Converted from exercises.ipynb for web reading.

Derivatives and Differentiation - Exercises

10 graded exercises covering the full section arc, from core calculus mechanics to ML-facing applications.

Format — Description
Problem — Markdown cell with task description
Your Solution — Code cell for learner work
Solution — Reference solution with checks

Difficulty: straightforward -> moderate -> challenging.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Use seaborn styling when it is installed; otherwise approximate it with
# matplotlib's bundled "seaborn-v0_8-whitegrid" style sheet.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Notebook-wide figure defaults: readable fonts, no top/right spines,
# tight bounding boxes and higher DPI on saved figures.
_FIGURE_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_FIGURE_DEFAULTS)

np.random.seed(42)  # reproducible randomness in every cell below
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches

# Colorblind-friendly hex palette shared by the plotting code in this chapter.
COLORS = {
    "primary": "#0077BB",    # blue
    "secondary": "#EE7733",  # orange
    "tertiary": "#009988",   # teal
    "error": "#CC3311",      # red
    "neutral": "#555555",    # grey
    "highlight": "#EE3377",  # magenta
}
HAS_MPL = True  # NOTE(review): set but not read in the visible cells

# Compact, deterministic numeric output for all printed arrays.
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)

def header(title):
    """Print *title* framed above and below by '=' rules of matching width."""
    rule = "=" * len(title)
    print(f"\n{rule}\n{title}\n{rule}")

def check_true(name, cond):
    """Print PASS/FAIL for a truthiness check and return the boolean result."""
    ok = bool(cond)
    verdict = "PASS" if ok else "FAIL"
    print(f"{verdict} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    """Print PASS/FAIL for a numeric comparison (tol used as both atol and
    rtol) and return the result of the comparison."""
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    verdict = "PASS" if ok else "FAIL"
    print(f"{verdict} - {name}: got {got}, expected {expected}")
    return ok



def centered_diff(f, x, h=1e-6):
    """Two-sided finite-difference estimate of f'(x) with O(h^2) error."""
    spread = f(x + h) - f(x - h)
    return spread / (2 * h)

def forward_diff(f, x, h=1e-6):
    """One-sided forward-difference estimate of f'(x) with O(h) error."""
    ahead = f(x + h)
    return (ahead - f(x)) / h

def backward_diff(f, x, h=1e-6):
    """One-sided backward-difference estimate of f'(x) with O(h) error."""
    here = f(x)
    return (here - f(x - h)) / h



def grad_check(f, x, analytic_grad, h=1e-6):
    """Relative error between *analytic_grad* and a centered-difference
    gradient of the scalar function *f* at *x*.

    Each coordinate is perturbed by ±h in turn; the error is the norm of
    the difference divided by the sum of the two gradient norms (plus a
    small constant to avoid division by zero).
    """
    x = np.asarray(x, dtype=float)
    analytic_grad = np.asarray(analytic_grad, dtype=float)
    numeric_grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        bumped_up = x.copy()
        bumped_dn = x.copy()
        bumped_up[idx] += h
        bumped_dn[idx] -= h
        numeric_grad[idx] = (f(bumped_up) - f(bumped_dn)) / (2 * h)
    scale = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / scale



def check(name, got, expected, tol=1e-8):
    # Convenience alias: defers to check_close with identical arguments.
    return check_close(name, got, expected, tol=tol)

print("Chapter helper setup complete.")

Exercise 1 (★): Basic Differentiation Rules

Implement analytical derivatives for three functions using the power, product, and quotient rules.

(a) $f(x) = 3x^4 - 2x^2 + 7x - 1$

(b) $g(x) = \sqrt{x}\,e^x$ (product rule)

(c) $h(x) = \dfrac{x^2 + 1}{x - 1}$ (quotient rule, $x \neq 1$)

For each, implement the derivative analytically, then verify against centered_diff.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: define f, g, h and their analytic derivatives, then verify each
# against centered_diff at a few sample points.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

def f(x):
    # (a) quartic polynomial
    return 3*x**4 - 2*x**2 + 7*x - 1

def f_prime(x):
    # Power rule applied term by term.
    return 12*x**3 - 4*x + 7

def g(x):
    # (b) sqrt(x) * e^x
    return np.sqrt(x) * np.exp(x)

def g_prime(x):
    # Product rule with u = sqrt(x), v = e^x: u'v + uv'.
    return (1/(2*np.sqrt(x))) * np.exp(x) + np.sqrt(x) * np.exp(x)

def h(x):
    # (c) rational function, defined for x != 1
    return (x**2 + 1) / (x - 1)

def h_prime(x):
    # Quotient rule: (u'v - uv') / v^2.
    return (2*x*(x-1) - (x**2+1)) / (x-1)**2

probe = 2.0
header('Exercise 1: Basic Differentiation Rules')

# Spot value, then each analytic derivative against the numerical estimate.
check_close('f\'(2.0) = 12(8)-8+7 = 95', f_prime(probe), 95.0)
for fn, fn_prime, label in ((f, f_prime, 'f'), (g, g_prime, 'g'), (h, h_prime, 'h')):
    check_close(f'{label}\' matches numerical', fn_prime(probe), centered_diff(fn, probe))

# Cross-check f' and g' at several points at once.
sample_xs = np.array([0.5, 1.0, 2.0, 3.5])
for fn, fn_prime, label in ((f, f_prime, 'f'), (g, g_prime, 'g')):
    worst = max(abs(fn_prime(x) - centered_diff(fn, x)) for x in sample_xs)
    check_true(f'{label}\'(x) accurate at 4 test points', worst < 1e-5)

print('\nTakeaway: Power rule gives polynomial derivatives; product/quotient rules handle compositions.')

print("Exercise 1 solution complete.")

Exercise 2 (★): Chain Rule

Differentiate using the chain rule and verify numerically.

(a) $p(x) = \sin(x^3)$

(b) $q(x) = \ln(\cos x)$ for $x \in (0, \pi/2)$

(c) $r(x) = e^{-x^2/2}$ (Gaussian kernel)

(d) $s(x) = (1 + x^2)^{10}$

Identify the outer and inner functions for each.

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: for each function identify the outer and inner pieces, then apply
# outer'(inner) * inner' and verify with centered_diff.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

def p(x):
    # outer = sin, inner = x^3
    return np.sin(x**3)

def p_prime(x):
    return np.cos(x**3) * 3*x**2

def q(x):
    # outer = ln, inner = cos x
    return np.log(np.cos(x))

def q_prime(x):
    # -sin(x)/cos(x), i.e. -tan(x)
    return -np.sin(x) / np.cos(x)

def r(x):
    # Gaussian kernel: outer = exp, inner = -x^2/2
    return np.exp(-x**2/2)

def r_prime(x):
    return np.exp(-x**2/2) * (-x)

def s(x):
    # outer = u^10, inner = 1 + x^2
    return (1 + x**2)**10

def s_prime(x):
    return 10*(1+x**2)**9 * 2*x

x_test = 1.0
header('Exercise 2: Chain Rule')

for label, fn, deriv, pt in (
    ('p\' matches numerical at x=1', p, p_prime, x_test),
    ('q\' matches numerical at x=0.5', q, q_prime, 0.5),
    ('r\' matches numerical at x=1', r, r_prime, x_test),
    ('s\' matches numerical at x=1', s, s_prime, x_test),
):
    check_close(label, deriv(pt), centered_diff(fn, pt))

# Confirm the simplified identity q'(x) = -tan(x).
check_close('q\'(0.5) = -tan(0.5)', q_prime(0.5), -np.tan(0.5))

print('\nTakeaway: Chain rule = differentiate outside × keep inside × derivative of inside.')

print("Exercise 2 solution complete.")

Exercise 3 (★): Implicit Differentiation

The ellipse $\dfrac{x^2}{4} + \dfrac{y^2}{9} = 1$ defines $y$ implicitly as a function of $x$.

(a) Differentiate both sides implicitly with respect to $x$ to find $dy/dx$.

(b) At the point $(1, \frac{3\sqrt{3}}{2})$ on the upper ellipse, compute the slope of the tangent line.

(c) Verify numerically: define $y(x) = 3\sqrt{1 - x^2/4}$ explicitly and compare your formula to centered_diff(y, 1.0).

(d) Find all points where the tangent is horizontal (slope = 0).

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: differentiate x^2/4 + y^2/9 = 1 term by term, solve for dy/dx,
# and verify against the explicit upper branch y(x) = 3*sqrt(1 - x^2/4).
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

import numpy as np

# (a) Differentiating x^2/4 + y^2/9 = 1 implicitly:
#     x/2 + (2y/9) y' = 0  =>  y' = -9x / (4y)
def dydx_ellipse(x, y):
    return -9*x / (4*y)

def y_upper(x):
    # Explicit upper branch, used only to verify the implicit formula.
    return 3 * np.sqrt(1 - x**2/4)

def _slope_numeric(x):
    # Centered difference of the explicit branch at x.
    return (y_upper(x+1e-6) - y_upper(x-1e-6)) / 2e-6

# (b) Slope at (1, 3*sqrt(3)/2).
x0 = 1.0
y0 = 3 * np.sqrt(1 - x0**2/4)
slope_at_point = dydx_ellipse(x0, y0)

header('Exercise 3: Implicit Differentiation — Ellipse')
print(f'Point: ({x0}, {y0:.6f})')
check_close('Implicit slope at (1, y0)', slope_at_point, _slope_numeric(x0))

# (c) Cross-check the implicit formula at several x on the upper branch.
for x in (0.5, 1.0, 1.5):
    y = 3*np.sqrt(1-x**2/4)
    check_close(f'slope at x={x}', dydx_ellipse(x, y), _slope_numeric(x), tol=1e-5)

# (d) Horizontal tangent needs y' = 0, i.e. x = 0 => points (0, ±3).
check_true('Horizontal tangent at (0, ±3)', abs(dydx_ellipse(0.0, 3.0)) < 1e-10)

print('\nTakeaway: Implicit differentiation avoids solving for y — differentiate the equation directly.')

print("Exercise 3 solution complete.")

Exercise 4 (★★): Activation Function Derivatives

(a) Derive $\sigma'(x) = \sigma(x)(1-\sigma(x))$ from scratch using the quotient rule.

(b) Implement sigmoid and its derivative, verify at $x \in \{-3, 0, 3\}$.

(c) Derive $\tanh'(x) = 1 - \tanh^2(x)$. Verify numerically.

(d) For a network with $L$ sigmoid layers, each with pre-activation near $z = 2$, compute the product $\prod_{l=1}^L \sigma'(z)$ for $L \in \{1, 5, 10, 20\}$. Explain the vanishing gradient problem.

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: implement sigmoid, sigmoid', and tanh'; then compute sigma'(2)^L
# for L in {1, 5, 10, 20} to see the vanishing-gradient effect.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

import numpy as np

def sigmoid(x):
    # Numerically stable logistic: branch on sign so exp never overflows.
    return np.where(x >= 0, 1/(1+np.exp(-x)), np.exp(x)/(1+np.exp(x)))

def sigmoid_prime(x):
    # (a) sigma' = sigma (1 - sigma), derived via the quotient rule.
    s = sigmoid(x)
    return s * (1 - s)

def tanh_prime(x):
    # (c) tanh' = 1 - tanh^2
    return 1 - np.tanh(x)**2

header('Exercise 4: Activation Function Derivatives')

# (b) Compare analytic sigma' with a centered difference at three points.
for x in (-3.0, 0.0, 3.0):
    numeric = (sigmoid(x+1e-6) - sigmoid(x-1e-6)) / 2e-6
    check_close(f'sigma\'({x:+.0f})', sigmoid_prime(x), numeric, tol=1e-5)

# The derivative peaks at x = 0 with value 1/4.
check_close('sigma\'(0) = 0.25', sigmoid_prime(0.0), 0.25)

# (c) Same comparison for tanh'.
for x in (-1.0, 0.0, 1.5):
    numeric = (np.tanh(x+1e-6) - np.tanh(x-1e-6)) / 2e-6
    check_close(f'tanh\'({x:+.1f})', tanh_prime(x), numeric, tol=1e-5)

# (d) Product of per-layer sigmoid gradients at z = 2.
print('\nVanishing gradient at z=2.0:')
slope = sigmoid_prime(2.0)
print(f'sigma\'(2.0) = {slope:.4f}')
for depth in (1, 5, 10, 20):
    shrink = slope**depth
    check_true(f'L={depth:2d}: product={shrink:.2e} (vanishing for L>=5)', depth < 4 or shrink < 1e-2)

print('\nTakeaway: Sigmoid gradients max at 0.25 — products vanish exponentially with depth.')

print("Exercise 4 solution complete.")

Exercise 5 (★★): Critical Points and Extrema

Consider $f(x) = x^4 - 4x^3 + 6x^2 - 4x + 1 = (x-1)^4$.

(a) Find all critical points analytically (where $f'(x) = 0$).

(b) Apply the second derivative test. What does it tell you? Is the test conclusive?

(c) Apply the first derivative test (check the sign of $f'$ on both sides of the critical point).

(d) Find the global minimum and maximum of $f$ on $[-1, 3]$.

(e) Plot $f$, $f'$, and $f''$ on $[-0.5, 2.5]$.

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: f(x) = (x-1)^4, so f'(x) = 4(x-1)^3 — check what the second
# derivative test says at x = 1 before trusting it.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

import numpy as np

def f(x):
    # The expanded quartic from the prompt equals (x-1)^4.
    return (x-1)**4

def f_prime(x):
    return 4*(x-1)**3

def f_double_prime(x):
    return 12*(x-1)**2

header('Exercise 5: Critical Points and Extrema')

# (a) f'(x) = 4(x-1)^3 vanishes only at x = 1 (a triple root).
x_crit = 1.0
check_close('f\'(1) = 0', f_prime(x_crit), 0.0)

# (b) Second derivative test gives f''(1) = 0, so it decides nothing here.
print(f'f\'\'(1) = {f_double_prime(x_crit)} — INCONCLUSIVE (= 0, but f = (x-1)^4 >= 0)')

# (c) First derivative test: f' changes sign from - to + across x = 1.
check_true('f\' sign change: - to + at x=1 (local min)', f_prime(0.5) < 0 and f_prime(1.5) > 0)

# (d) Compare the critical point against both interval endpoints.
candidates = {'x=-1': (-1.0, f(-1.0)), 'x=1 (crit)': (1.0, f(1.0)), 'x=3': (3.0, f(3.0))}
for name, (x, fx) in candidates.items():
    print(f'  {name}: f = {fx:.4f}')
check_close('Global min = f(1) = 0', f(1.0), 0.0)
check_close('Global max = f(-1) = 16 or f(3) = 16', f(-1.0), f(3.0))  # both endpoints give 16

# (e) f, f', f'' side by side around the critical point.
try:
    import matplotlib.pyplot as plt
    xs = np.linspace(-0.5, 2.5, 300)
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    for ax, fn, nm in zip(axes, [f, f_prime, f_double_prime], ['f(x)', "f'(x)", "f''(x)"]):
        ax.plot(xs, fn(xs), lw=2)
        ax.axhline(0, color='k', lw=0.7)
        ax.axvline(1, color='r', ls='--', alpha=0.5, label='x=1')
        ax.set_title(nm)
        ax.legend()
    plt.tight_layout()
    plt.show()
except ImportError:
    pass

print('\nTakeaway: f\'\'=0 is INCONCLUSIVE — always verify with sign of f\' or higher derivatives.')

print("Exercise 5 solution complete.")

Exercise 6 (★★): Mean Value Theorem Application

(a) Prove that $|\sin a - \sin b| \leq |a - b|$ for all $a, b \in \mathbb{R}$ using the MVT.

(b) Use the MVT to show that $|\ln x - \ln y| \leq \frac{|x-y|}{\min(x,y)}$ for $x, y > 0$.

(c) For $f(x) = x^3 - x$ on $[-2, 2]$, find all $c$ guaranteed by the MVT and verify numerically.

(d) The gradient descent lemma: if $|f''(x)| \leq L$ for all $x$, show that $|f'(x) - f'(y)| \leq L|x-y|$ (the gradient is $L$-Lipschitz). Verify for $f(x) = x^2$.

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: parts (a) and (b) follow from bounding |f'(c)| in the MVT identity
# f(b) - f(a) = f'(c)(b - a); part (c) solves 3c^2 - 1 = secant slope.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution
# Fix: the prompt's part (b) (log Lipschitz bound) was missing from the
# original solution; it is now verified alongside parts (a), (c), (d).

import numpy as np

def f(x):
    """Part (c) test function: f(x) = x^3 - x."""
    return x**3 - x

def f_prime(x):
    """Analytic derivative: f'(x) = 3x^2 - 1."""
    return 3*x**2 - 1

header('Exercise 6: Mean Value Theorem')

# (a) MVT: sin a - sin b = cos(c)(a - b) for some c, and |cos c| <= 1,
#     so |sin a - sin b| <= |a - b|. Spot-check on random pairs.
np.random.seed(42)
for _ in range(20):
    a, b = np.random.uniform(-10, 10, 2)
    lhs = abs(np.sin(a) - np.sin(b))
    rhs = abs(a - b)
    if lhs > rhs + 1e-10:
        print(f'FAIL at a={a:.2f}, b={b:.2f}')
        break
else:
    print('PASS — |sin a - sin b| <= |a-b| verified on 20 random pairs')

# (b) MVT with f(t) = ln t: ln x - ln y = (x - y)/c for some c between x
#     and y, and 1/c <= 1/min(x, y), so |ln x - ln y| <= |x-y|/min(x,y).
for _ in range(20):
    u, v = np.random.uniform(0.1, 10.0, 2)
    if abs(np.log(u) - np.log(v)) > abs(u - v) / min(u, v) + 1e-10:
        print(f'FAIL at x={u:.2f}, y={v:.2f}')
        break
else:
    print('PASS — |ln x - ln y| <= |x-y|/min(x,y) verified on 20 random pairs')

# (c) Find the MVT points c in (-2, 2) where f'(c) equals the secant slope.
a, b = -2.0, 2.0
secant_slope = (f(b) - f(a)) / (b - a)
print(f'\nSecant slope: {secant_slope:.4f}')
# Solve 3c^2 - 1 = secant_slope => c = ±sqrt((secant_slope + 1)/3).
c_sq = (secant_slope + 1) / 3
c_pos = np.sqrt(c_sq); c_neg = -np.sqrt(c_sq)
print(f'MVT c values: ±{c_pos:.4f}')
check_close('f\'(c_pos) = secant slope', f_prime(c_pos), secant_slope)
check_close('f\'(c_neg) = secant slope', f_prime(c_neg), secant_slope)
check_true('c_pos in (-2,2)', -2 < c_pos < 2)

# (d) |g''| <= L implies g' is L-Lipschitz: apply the MVT to g'.
def g(x):
    """Quadratic used to illustrate the descent lemma."""
    return x**2

def g_prime(x):
    """g'(x) = 2x; g''(x) = 2 everywhere, so L = 2."""
    return 2*x

L = 2.0
x1, x2 = 1.0, 3.0
lhs = abs(g_prime(x1) - g_prime(x2))
rhs = L * abs(x1 - x2)
print(f'\nLipschitz bound: {lhs:.4f} <= {rhs:.4f}')
check_true('|g\'(x1)-g\'(x2)| <= L|x1-x2|', lhs <= rhs + 1e-10)

print('\nTakeaway: MVT provides Lipschitz bounds that determine safe learning rates (eta <= 1/L).')

print("Exercise 6 solution complete.")

Exercise 7 (★★★): Numerical Differentiation and Gradient Checking

(a) Implement forward, backward, and centered finite differences for $f'(x)$.

(b) For $f(x) = x \ln(x + e^{-x})$, compute the analytic derivative and verify against each finite difference method at $x = 1.5$.

(c) Experiment with $h \in \{10^{-1}, 10^{-3}, 10^{-6}, 10^{-10}, 10^{-15}\}$. Record the error for each. Identify the optimal $h$ for centered differences in float64.

(d) Implement grad_check_vector for a vector-to-scalar function. Test it on the L2 loss $\mathcal{L}(\mathbf{w}) = \|\mathbf{w} - \mathbf{w}^*\|^2$ at $\mathbf{w} = [1, 2, 3]$, $\mathbf{w}^* = [0, 0, 0]$.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: sweep h over several orders of magnitude and watch truncation error
# shrink while round-off error grows — the best h sits in between.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

import numpy as np

# (a) The three finite-difference schemes with an explicit step size.
def forward_diff(f, x, h):
    return (f(x+h) - f(x)) / h

def backward_diff(f, x, h):
    return (f(x) - f(x-h)) / h

def centered_diff_h(f, x, h):
    return (f(x+h) - f(x-h)) / (2*h)

# (b) f(x) = x ln(x + e^{-x}) and its analytic derivative.
def f(x):
    return x * np.log(x + np.exp(-x))

def f_prime_analytic(x):
    # Product + chain rule:
    #   f'(x) = ln(x + e^{-x}) + x (1 - e^{-x}) / (x + e^{-x})
    return np.log(x + np.exp(-x)) + x * (1 - np.exp(-x)) / (x + np.exp(-x))

header('Exercise 7: Gradient Checking')

x = 1.5
true_deriv = f_prime_analytic(x)
print(f'Analytic f\'(1.5) = {true_deriv:.8f}')

# (c) Sweep h and track the best centered-difference error.
print(f'\n{"h":>8} {"Forward err":>14} {"Centered err":>14}')
best_ctr_h, best_ctr_err = None, np.inf
for h in (1e-1, 1e-3, 1e-6, 1e-10, 1e-15):
    fwd_err = abs(forward_diff(f, x, h) - true_deriv)
    ctr_err = abs(centered_diff_h(f, x, h) - true_deriv)
    if ctr_err < best_ctr_err:
        best_ctr_h, best_ctr_err = h, ctr_err
    print(f'{h:8.0e} {fwd_err:14.2e} {ctr_err:14.2e}')
print(f'\nOptimal h for centered: {best_ctr_h:.0e} (theoretical: ~1e-6)')

# (d) Vector gradient check on the L2 loss ||w||^2 (target w* = 0).
def L2_loss(w):
    return np.sum(w**2)

def L2_grad(w):
    return 2*w

w = np.array([1.0, 2.0, 3.0])
err = grad_check(L2_loss, w, L2_grad(w))
print(f'\nVector gradient check relative error: {err:.2e}')
check_true('Gradient check passes (err < 1e-5)', err < 1e-5)

print('\nTakeaway: Centered differences have O(h^2) error — optimal h ≈ 1e-6 for float64.')

print("Exercise 7 solution complete.")

Exercise 8 (★★★): Backpropagation via Chain Rule

A 2-layer network with sigmoid activations and MSE loss:

$$z_1 = w_1 x + b_1, \quad a_1 = \sigma(z_1), \quad z_2 = w_2 a_1 + b_2, \quad \hat{y} = \sigma(z_2), \quad \mathcal{L} = \tfrac{1}{2}(\hat{y} - y)^2$$

(a) Derive $\partial\mathcal{L}/\partial w_2$ and $\partial\mathcal{L}/\partial b_2$ using the chain rule.

(b) Derive $\partial\mathcal{L}/\partial w_1$ and $\partial\mathcal{L}/\partial b_1$ (requires backpropagating through 2 sigmoid layers).

(c) Implement the forward and backward passes. Verify all 4 gradients against numerical estimates.

(d) Run 200 gradient descent steps with $\eta = 0.5$ on the single example $(x=1.5, y=0.7)$. Plot the loss curve.

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
# Hint: cache z1, a1, z2, a2 in the forward pass; the backward pass multiplies
# local derivatives from the loss back to each parameter.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

import numpy as np

def sigmoid(x):
    # Clip the argument so exp never overflows.
    return 1/(1+np.exp(-np.clip(x,-500,500)))

def sigmoid_prime(x):
    s = sigmoid(x)
    return s*(1-s)

def forward(x, y, w1, b1, w2, b2):
    # Forward pass; returns the loss and every intermediate backprop needs.
    z1 = w1*x + b1
    a1 = sigmoid(z1)
    z2 = w2*a1 + b2
    a2 = sigmoid(z2)
    loss = 0.5*(a2 - y)**2
    return loss, {'z1':z1,'a1':a1,'z2':z2,'a2':a2,'x':x,'y':y}

def backward(cache, y, w1, b1, w2, b2):
    # Chain rule applied from the loss back to each parameter.
    z1 = cache['z1']
    a1 = cache['a1']
    z2 = cache['z2']
    a2 = cache['a2']
    x = cache['x']
    dL_dz2 = (a2 - y) * sigmoid_prime(z2)          # dL/da2 * da2/dz2
    dL_dz1 = dL_dz2 * w2 * sigmoid_prime(z1)       # back through layer 1
    return {'w1': dL_dz1 * x, 'b1': dL_dz1, 'w2': dL_dz2 * a1, 'b2': dL_dz2}

x, y = 1.5, 0.7
w1, b1, w2, b2 = 0.5, -0.3, 1.2, 0.1
params = [w1, b1, w2, b2]

header('Exercise 8: Backpropagation')
loss, cache = forward(x, y, w1, b1, w2, b2)
grads = backward(cache, y, w1, b1, w2, b2)

# (c) Perturb each parameter in turn and compare with the analytic gradient.
h = 1e-5
for i, pname in enumerate(('w1', 'b1', 'w2', 'b2')):
    p_plus = params[:]
    p_minus = params[:]
    p_plus[i] += h
    p_minus[i] -= h
    num_grad = (forward(x, y, *p_plus)[0] - forward(x, y, *p_minus)[0]) / (2*h)
    check_close(f'd{pname}', grads[pname], num_grad, tol=1e-6)

# (d) Plain gradient descent on the single training example.
eta = 0.5
losses = []
for _ in range(200):
    loss, cache = forward(x, y, w1, b1, w2, b2)
    losses.append(loss)
    grads = backward(cache, y, w1, b1, w2, b2)
    w1 -= eta*grads['w1']
    b1 -= eta*grads['b1']
    w2 -= eta*grads['w2']
    b2 -= eta*grads['b2']

check_true('Loss decreased (training works)', losses[-1] < losses[0])
print(f'Initial loss: {losses[0]:.6f}, Final loss: {losses[-1]:.6f}')

try:
    import matplotlib.pyplot as plt
    plt.figure(figsize=(8,5))
    plt.semilogy(losses, 'b-', lw=2)
    plt.xlabel('Step')
    plt.ylabel('Loss (log)')
    plt.title('2-Layer Network Training Loss')
    plt.tight_layout()
    plt.show()
except ImportError:
    pass

print('\nTakeaway: Backpropagation = chain rule applied layer by layer from output to input.')

print("Exercise 8 solution complete.")

Exercise 9 (★★★): Binary Cross-Entropy Gradient from Logits

For a label $y \in \{0,1\}$ and logit $a$, the binary cross-entropy is

$$\ell(a,y) = -y\log \sigma(a) - (1-y)\log(1-\sigma(a)), \qquad \sigma(a)=\frac{1}{1+e^{-a}}.$$

Derive and verify the compact derivative

$$\frac{\partial \ell}{\partial a} = \sigma(a)-y.$$

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Derive d loss / d logit and verify it with finite differences.
# Hint: write the loss directly in terms of the logit, differentiate, and
# simplify — the result collapses to sigmoid(a) - y.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - BCE logit gradient
header("Exercise 9: BCE gradient from logits")

def sigmoid(a):
    # Stable logistic, branching on the sign of the logit.
    a = np.asarray(a, dtype=float)
    return np.where(a >= 0, 1 / (1 + np.exp(-a)), np.exp(a) / (1 + np.exp(a)))

def bce_from_logit(a, y):
    # Stable equivalent of -y log(sigmoid(a)) - (1-y) log(1-sigmoid(a)).
    return np.maximum(a, 0) - a * y + np.log1p(np.exp(-np.abs(a)))

def centered_diff(f, x, h=1e-6):
    # Local two-sided difference (redefines the chapter helper in this cell).
    return (f(x + h) - f(x - h)) / (2 * h)

def _report(a, y):
    # Compare the compact analytic gradient sigma(a) - y with a numeric one.
    analytic = float(sigmoid(a) - y)
    numeric = centered_diff(lambda t: bce_from_logit(t, y), a)
    print(f"a={a:+.2f}, y={y:.0f}: analytic={analytic:+.8f}, numeric={numeric:+.8f}")
    check_close("gradient check", analytic, numeric, tol=1e-6)

for logit, label in ((-2.0, 0.0), (-0.4, 1.0), (1.7, 1.0), (3.0, 0.0)):
    _report(logit, label)
print("Takeaway: classification backprop sends prediction error, sigmoid(a)-y, through the graph.")

Exercise 10 (★★★): Manual Backprop for a Tiny Tanh Network

Consider

$$h = \tanh(Wx+b), \qquad \hat{y} = v^\top h + c, \qquad L=\frac{1}{2}(\hat{y}-y)^2.$$

Derive gradients for $v, c, W, b, x$ and verify one component by finite differences.

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Work backward from dL/dyhat through the tanh hidden layer.
# Hint: dL/dyhat = yhat - y; push it through v, then through tanh'(a) =
# 1 - tanh(a)^2, and finally into W, b, and x.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - manual backprop through tanh network
header("Exercise 10: manual backprop through tanh network")

# Fixed parameters: 2 inputs, 3 tanh hidden units, scalar output.
W = np.array([[0.4, -0.2], [0.1, 0.3], [-0.5, 0.2]])
b = np.array([0.1, -0.2, 0.05])
v = np.array([0.7, -0.4, 0.2])
c = -0.1
x = np.array([1.2, -0.7])
y = 0.35

def forward(W, b, v, c, x, y):
    # h = tanh(Wx + b), yhat = v.h + c, L = (yhat - y)^2 / 2
    a = W @ x + b
    h = np.tanh(a)
    yhat = v @ h + c
    loss = 0.5 * (yhat - y)**2
    return loss, (a, h, yhat)

loss, (a, h, yhat) = forward(W, b, v, c, x, y)

# Backward pass: reuse dL/dyhat for every downstream gradient.
dyhat = yhat - y
grad_v = dyhat * h
grad_c = dyhat
grad_h = dyhat * v
grad_a = grad_h * (1 - h**2)       # tanh'(a) = 1 - tanh(a)^2
grad_W = np.outer(grad_a, x)
grad_b = grad_a
grad_x = W.T @ grad_a

# Finite-difference check on a single weight, W[1,0].
def loss_w10(w10):
    perturbed = W.copy()
    perturbed[1,0] = w10
    return forward(perturbed, b, v, c, x, y)[0]

num = (loss_w10(W[1,0] + 1e-6) - loss_w10(W[1,0] - 1e-6)) / (2e-6)
print("loss:", loss)
print("grad_W:\n", grad_W)
print("grad_x:", grad_x)
check_close("finite difference W[1,0]", grad_W[1,0], num, tol=1e-6)
print("Takeaway: backprop is the chain rule organized so every local derivative is reused once.")

What to Review After Finishing

  • Can you apply the chain rule to a 3+ layer composition without looking it up?
  • Can you derive $\sigma'(x) = \sigma(x)(1-\sigma(x))$ from scratch?
  • Do you understand why centered differences have $O(h^2)$ error vs. $O(h)$ for forward differences?
  • Can you explain the vanishing gradient problem quantitatively?
  • Can you implement a full forward/backward pass for a 2-layer network?
  • Do you know the second derivative test — including when it's inconclusive?

References

  1. Stewart, J. Calculus: Early Transcendentals, 8th ed. — Chapters 3–4
  2. Goodfellow, Bengio, Courville. Deep Learning — Chapter 6 (backpropagation)
  3. Rumelhart et al. (1986). Learning representations by back-propagating errors. Nature.
  4. He et al. (2015). Delving deep into rectifiers (ReLU/PReLU analysis)
  5. Kingma & Ba (2015). Adam: A method for stochastic optimization