Exercises Notebook
Converted from exercises.ipynb for web reading.
Floating-Point Arithmetic — Exercises
"To understand a number system, break it."
10 graded exercises covering: bit-level encoding, machine epsilon, catastrophic cancellation, Kahan summation, condition numbers, stable algorithms, mixed precision, and error analysis.
Companion: notes.md | theory.ipynb
Difficulty: ★ Easy | ★★ Medium | ★★★ Hard
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as nla
from scipy import stats
import struct
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams.update({'figure.figsize': (10, 5), 'figure.dpi': 110})
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.set_printoptions(precision=10, suppress=False)
np.random.seed(42)
print('Setup complete.')
print("Chapter helper setup complete.")
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
print(f"{'PASS' if cond else 'FAIL'} - {name}")
return bool(cond)
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
if not ok:
print(' expected:', expected)
print(' got :', got)
return bool(ok)
print("Chapter helper setup complete.")
Exercise 1 ★ — IEEE 754 Bit Encoding
Understanding floating-point at the bit level is essential for debugging precision issues.
Task: Implement a function that decodes an fp32 number into its three IEEE 754 components.
(a) Write decode_fp32(x) that returns (sign, exponent, mantissa, value) where:
- sign is 0 or 1
- exponent is the raw 8-bit biased exponent (0 to 255)
- mantissa is the raw 23-bit mantissa (0 to 2²³ - 1)
- value is the reconstructed floating-point value
(b) Use your function to decode: 1.0, -2.5, 0.0, float('inf'), float('nan'), 1e-40 (subnormal)
(c) Verify that for normal numbers, the reconstructed value equals the input.
Hint: Use struct.pack('>f', np.float32(x)) to get the 4 bytes of a float32.
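For example, the hint in action (a minimal sketch that just inspects the raw bits rather than decoding them):
import struct
import numpy as np
bits = int.from_bytes(struct.pack('>f', np.float32(1.0)), 'big')
print(f'{bits:032b}')  # 00111111100000000000000000000000: sign 0, exponent 01111111 (127), mantissa 0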
Code cell 5
# Your Solution
# Exercise 1: IEEE 754 Bit Encoding
def decode_fp32(x):
"""Decode fp32 into (sign, biased_exp, mantissa, reconstructed_value)."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 6
# Solution
def decode_fp32_solution(x):
"""Decode fp32 into (sign, biased_exp, mantissa, reconstructed_value)."""
packed = struct.pack('>f', np.float32(x))
bits = int.from_bytes(packed, 'big')
sign = (bits >> 31) & 1
exp = (bits >> 23) & 0xFF
mant = bits & 0x7FFFFF
# Reconstruct value
if exp == 255:
val = float('nan') if mant != 0 else float('inf') * (-1)**sign
elif exp == 0:
val = (-1)**sign * (mant / 2**23) * 2**(-126) # Subnormal
else:
val = (-1)**sign * (1 + mant / 2**23) * 2**(exp - 127) # Normal
return sign, exp, mant, val
print('Decoded fp32 values:')
test_values = [1.0, -2.5, 0.0, float('inf'), float('nan'), 1e-40]
for v in test_values:
s, e, m, rec = decode_fp32_solution(v)
err = abs(rec - float(np.float32(v))) if np.isfinite(v) else 0.0
print(f'{str(v):>8}: sign={s}, exp={e:3d}, mant={m:>8d}, reconstructed={rec:.8e}, err={err:.2e}')
# Verify reconstruction for normal numbers
normal_vals = [1.0, -2.5, 3.14159, 100.0, 0.001]
all_ok = all(abs(decode_fp32_solution(v)[3] - float(np.float32(v))) < 1e-30
for v in normal_vals if np.isfinite(v))
print(f"{'PASS' if all_ok else 'FAIL'} - reconstruction matches for normal numbers")
Exercise 2 ★ — Machine Epsilon by Experiment
Machine epsilon is the smallest number $\varepsilon$ such that $\mathrm{fl}(1 + \varepsilon) > 1$ in floating point.
(a) Compute machine epsilon for fp64, fp32, and fp16 by successive halving.
(b) Verify your results against np.finfo(dtype).eps.
(c) Show that $\varepsilon = 2^{1-p}$, where $p$ is the number of significand bits: fp64: $p = 53$, fp32: $p = 24$, fp16: $p = 11$.
(d) Demonstrate that 1.0 + eps/2 == 1.0 but 1.0 + eps > 1.0 (in the appropriate dtype).
Code cell 8
# Your Solution
# Exercise 2: Machine Epsilon
# (a) Compute by successive halving
def compute_machine_epsilon(dtype):
"""Compute machine epsilon for given numpy dtype."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 9
# Solution
def compute_machine_epsilon_solution(dtype):
one = dtype(1.0)
eps = dtype(1.0)
while one + dtype(eps) / dtype(2.0) > one:
eps = dtype(eps) / dtype(2.0)
return eps
print(f'{"Format":>8} {"Computed eps":>16} {"np.finfo eps":>16} {"2^(1-p)":>16} {"match":>6}')
for dtype, p in [(np.float64, 53), (np.float32, 24), (np.float16, 11)]:
eps_computed = compute_machine_epsilon_solution(dtype)
eps_theory = dtype(2**(1-p))
eps_numpy = np.finfo(dtype).eps
match = np.isclose(float(eps_computed), float(eps_numpy), rtol=1e-3)
print(f'{dtype.__name__:>8} {float(eps_computed):16.6e} '
f'{float(eps_numpy):16.6e} {float(eps_theory):16.6e} {"YES" if match else "NO":>6}')
print()
# (d) Verify boundary
eps32 = np.finfo(np.float32).eps
print(f'fp32: 1.0 + eps > 1.0: {np.float32(1.0) + eps32 > np.float32(1.0)}')
print(f'fp32: 1.0 + eps/2 == 1.0: {np.float32(1.0) + eps32/2 == np.float32(1.0)}')
ok = (np.float32(1.0) + eps32 > np.float32(1.0)) and \
(np.float32(1.0) + eps32/2 == np.float32(1.0))
print(f"{'PASS' if ok else 'FAIL'} - machine epsilon boundary verified")
Exercise 3 ★★ — Catastrophic Cancellation and Algebraic Fixes
Catastrophic cancellation destroys precision when subtracting nearly-equal numbers.
(a) Implement the naive formula $f(x) = \sqrt{x+1} - \sqrt{x}$ for large $x$. Show it loses precision as $x \to \infty$.
(b) Derive and implement an algebraically equivalent stable formula by rationalizing: $\sqrt{x+1} - \sqrt{x} = \dfrac{1}{\sqrt{x+1} + \sqrt{x}}$.
(c) Compare errors across $x \in [10, 10^{12}]$, using fp64 as the reference for the fp32 computation.
(d) Plot the relative error of both methods vs $x$ on a log-log scale.
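A rough size estimate explains the failure: since $\sqrt{x+1} - \sqrt{x} = 1/(\sqrt{x+1} + \sqrt{x}) \approx 1/(2\sqrt{x})$, the difference is about $2x$ times smaller than the operands $\sqrt{x}$, so roughly $\log_{10}(2x)$ leading digits cancel; fp32's roughly 7 significant digits are exhausted by $x \approx 10^7$.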
Code cell 11
# Your Solution
# Exercise 3: Catastrophic Cancellation
def sqrt_diff_naive(x):
"""Naive: sqrt(x+1) - sqrt(x)."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 12
# Solution
def sqrt_diff_naive_sol(x):
x = np.float32(x)
return np.sqrt(x + np.float32(1.0)) - np.sqrt(x)
def sqrt_diff_stable_sol(x):
x = np.float32(x)
return np.float32(1.0) / (np.sqrt(x + np.float32(1.0)) + np.sqrt(x))
x_vals = np.logspace(1, 12, 60, dtype=np.float64)
x32 = x_vals.astype(np.float32)
ref_vals = 1.0 / (np.sqrt(x_vals + 1) + np.sqrt(x_vals)) # fp64 reference
naive_err = np.abs(np.vectorize(sqrt_diff_naive_sol)(x32).astype(np.float64) - ref_vals) / ref_vals
stable_err = np.abs(np.vectorize(sqrt_diff_stable_sol)(x32).astype(np.float64) - ref_vals) / ref_vals
print(f'{"x":>12} {"Naive err":>12} {"Stable err":>12}')
for i in [0, 15, 30, 45, 55]:
print(f'{x_vals[i]:12.2e} {naive_err[i]:12.3e} {stable_err[i]:12.3e}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
valid_n = naive_err > 0
valid_s = stable_err > 0
ax.loglog(x_vals[valid_n], naive_err[valid_n], color='#CC3311', label='Naive fp32')
ax.loglog(x_vals[valid_s], stable_err[valid_s] + 1e-16, color='#0077BB', label='Stable fp32')
ax.axhline(float(np.finfo(np.float32).eps), color='gray', linestyle='--', label='fp32 eps')
ax.set_xlabel('x'); ax.set_ylabel('Relative error'); ax.legend()
ax.set_title('sqrt(x+1) - sqrt(x): naive vs stable')
plt.tight_layout(); plt.show()
ok = np.median(stable_err[30:]) < np.median(naive_err[30:]) * 0.001
print(f"{'PASS' if ok else 'FAIL'} - stable formula has much lower error")
Exercise 4 ★★ — Kahan Summation
Kahan compensated summation achieves $O(\varepsilon)$ error regardless of the number of terms $n$.
(a) Implement Kahan summation in fp32.
(b) Sum the series $\sum_{k=1}^{n} 1/k^2$ for $n$ up to $10^4$. The true value converges to $\pi^2/6 \approx 1.6449341$.
(c) Compare naive fp32, Kahan fp32, and fp64 naive. Which is most accurate?
(d) ★★★ Extension: Show that pairwise summation (divide-and-conquer) achieves $O(\varepsilon \log n)$ error. Implement pairwise summation and compare all three methods.
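The key identity for part (a), stated informally (assuming round-to-nearest): after $t = \mathrm{fl}(s + y)$, the quantity $c = \mathrm{fl}(\mathrm{fl}(t - s) - y) \approx t - (s + y)$ is exactly the rounding error just committed in forming $t$; subtracting $c$ from the next addend cancels these errors before they accumulate, which is why the total error stays $O(\varepsilon)$ rather than the naive $O(n\varepsilon)$.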
Code cell 14
# Your Solution
# Exercise 4: Kahan Summation
def kahan_sum(arr):
"""Kahan compensated sum — compute in fp32."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 15
# Solution
def kahan_sum_sol(arr):
s = np.float32(0.0)
c = np.float32(0.0)
for x in arr:
y = np.float32(x) - c
t = s + y
c = (t - s) - y
s = t
return float(s)
def pairwise_sum(arr):
"""Pairwise summation — O(log n * eps) error."""
arr = arr.copy().astype(np.float32)
n = len(arr)
while n > 1:
half = n // 2
arr[:half] = arr[:2*half:2] + arr[1:2*half:2]
if n % 2 == 1:
arr[half] = arr[n-1]
n = half + 1
else:
n = half
return float(arr[0])
true_val = np.pi**2 / 6
print(f'True pi^2/6 = {true_val:.15f}')
print()
print(f'{"n":>8} {"Kahan err":>12} {"Naive err":>12} {"Pairwise err":>14} {"fp64 err":>12}')
for n in [10, 100, 1000, 10000]:
arr = np.array([1/k**2 for k in range(1, n+1)], dtype=np.float32)
k_val = kahan_sum_sol(arr)
n_val = float(np.sum(arr))
p_val = pairwise_sum(arr)
f_val = float(np.sum(arr.astype(np.float64)))
ref = float(np.sum(np.array([1/k**2 for k in range(1,n+1)], dtype=np.float64)))
print(f'{n:8d} {abs(k_val-ref):12.3e} {abs(n_val-ref):12.3e} '
f'{abs(p_val-ref):14.3e} {abs(f_val-ref):12.3e}')
print()
n = 10000
arr = np.array([1/k**2 for k in range(1, n+1)], dtype=np.float32)
ref = float(np.sum(arr.astype(np.float64)))
ok = abs(kahan_sum_sol(arr) - ref) < abs(float(np.sum(arr)) - ref)
print(f"{'PASS' if ok else 'FAIL'} - Kahan more accurate than naive sum")
Exercise 5 ★★ — Condition Numbers and Ill-Conditioned Systems
The condition number bounds how much errors in the input amplify in the output.
(a) Compute $\kappa(A)$ using the 2-norm (i.e., $\kappa_2(A) = \sigma_{\max}/\sigma_{\min}$) for:
- the $5 \times 5$ identity matrix
- the diagonal matrix with entries $1, 2, 4, 8, 16$
- the $5 \times 5$ Hilbert matrix $H_{ij} = 1/(i+j-1)$
- a random $5 \times 5$ Gaussian matrix
(b) For the Hilbert matrix with $n = 5, 8, 10, 12$, solve $Hx = b$ where $b = H\mathbf{1}$ (so the true solution is $x = \mathbf{1}$). Report the relative error $\|\hat{x} - x\| / \|x\|$.
(c) Verify that the relative error is $\lesssim \kappa(H)\,\varepsilon_{64}$, up to a modest constant factor.
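The bound in (c) comes from standard perturbation theory for a backward-stable solver: the computed $\hat{x}$ exactly solves a nearby system, and to first order
$$\frac{\|\hat{x} - x\|}{\|x\|} \lesssim \kappa(A)\,\varepsilon_{\text{mach}},$$
so the condition number converts rounding-level backward error into forward error.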
Code cell 17
# Your Solution
# Exercise 5: Condition Numbers
import numpy.linalg as nla
# (a) Compute condition numbers
def condition_number(A):
"""Compute 2-norm condition number via SVD."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 18
# Solution
def condition_number_sol(A):
sv = nla.svd(A, compute_uv=False)
return sv[0] / sv[-1]
def hilbert_sol(n):
i, j = np.meshgrid(np.arange(1,n+1), np.arange(1,n+1), indexing='ij')
return 1.0 / (i + j - 1)
# (a)
print('(a) Condition numbers:')
matrices = {
'Identity(5)': np.eye(5),
'Diag[1..16]': np.diag([1.,2.,4.,8.,16.]),
'Hilbert(5)': hilbert_sol(5),
'Random(5x5)': np.random.randn(5, 5),
}
for name, A in matrices.items():
kappa = condition_number_sol(A)
print(f' {name:>14}: kappa = {kappa:.4e}')
# (b, c) Hilbert system solve
print()
print('(b,c) Hilbert system solve:')
eps64 = np.finfo(np.float64).eps
print(f'{"n":>4} {"kappa":>14} {"rel_error":>14} {"kappa*eps":>14} {"bound_holds":>12}')
for n in [5, 8, 10, 12]:
H = hilbert_sol(n)
x_true = np.ones(n)
b = H @ x_true
x_comp = nla.solve(H, b)
kappa = condition_number_sol(H)
rel_err = nla.norm(x_comp - x_true) / nla.norm(x_true)
bound = kappa * eps64
holds = rel_err <= bound * 100 # Factor 100 for rounding in norm estimation
print(f'{n:4d} {kappa:14.4e} {rel_err:14.4e} {bound:14.4e} {"YES" if holds else "NO":>12}')
Exercise 6 ★★ — Implementing Stable Softmax and Log-Sum-Exp
(a) Implement stable_softmax(x) that:
1. Subtracts max(x) before computing exp
2. Returns a probability distribution summing to 1
3. Handles both 1D and 2D inputs (axis=-1 for rows)
(b) Implement stable_log_softmax(x) using $\log \mathrm{softmax}(x) = x - \mathrm{logsumexp}(x)$, with the log-sum-exp computed in shifted form.
(c) Implement nll_loss(log_probs, targets) (negative log-likelihood).
(d) Verify that exp(log_softmax(x)) equals softmax(x) for varied inputs.
(e) Test with extreme logits: x = [1000, 1001, 999] and x = [-1000, -999, -1001].
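The identity behind parts (a) and (b), valid for any shift $m$ and usually applied with $m = \max_i x_i$:
$$\log \sum_i e^{x_i} = m + \log \sum_i e^{x_i - m}, \qquad \mathrm{softmax}(x) = \mathrm{softmax}(x - m\mathbf{1}).$$
After shifting, the largest exponent argument is 0, so every $e^{x_i - m} \in (0, 1]$ and neither overflow nor total underflow can occur.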
Code cell 20
# Your Solution
# Exercise 6: Stable Softmax and Log-Softmax
def stable_softmax(x, axis=-1):
"""Numerically stable softmax."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 21
# Solution
def stable_softmax_sol(x, axis=-1):
x = np.asarray(x, dtype=np.float64)
x_shifted = x - np.max(x, axis=axis, keepdims=True)
e = np.exp(x_shifted)
return e / e.sum(axis=axis, keepdims=True)
def stable_log_softmax_sol(x, axis=-1):
x = np.asarray(x, dtype=np.float64)
c = np.max(x, axis=axis, keepdims=True)
log_sum_exp = c.squeeze() + np.log(np.sum(np.exp(x - c), axis=axis))
return x - log_sum_exp if x.ndim == 1 else x - log_sum_exp[..., np.newaxis]
def nll_loss_sol(log_probs, targets):
return -np.mean([log_probs[i, t] for i, t in enumerate(targets)])
# (d) Verify exp(log_softmax) == softmax
test_cases = [
np.array([1.0, 2.0, 3.0]),
np.array([1000.0, 1001.0, 999.0]),
np.array([-1000.0, -999.0, -1001.0]),
np.array([0.001, 0.002, 0.003]),
]
print('Verifying exp(log_softmax) == softmax:')
all_ok = True
for x in test_cases:
s1 = stable_softmax_sol(x)
s2 = np.exp(stable_log_softmax_sol(x))
ok = np.allclose(s1, s2, atol=1e-10)
all_ok = all_ok and ok
print(f' x={x}: {"PASS" if ok else "FAIL"}, sum={s1.sum():.10f}')
# Batch test
X = np.random.randn(5, 10)
s_batch = stable_softmax_sol(X, axis=-1)
sums_ok = np.allclose(s_batch.sum(axis=-1), 1.0)
print(f"{'PASS' if all_ok and sums_ok else 'FAIL'} - all verifications passed")
Exercise 7 ★★ — Mixed Precision and Loss Scaling
Mixed-precision training uses bf16/fp16 compute with fp32 master weights.
(a) Simulate gradient underflow: generate gradients uniformly in $[10^{-6}, 10^{-5}]$ and quantize to fp16. What fraction underflows to zero?
(b) Apply loss scaling with $S \in \{1, 16, 256, 4096, 65536\}$ and report the fraction of gradients that survive (neither underflow nor overflow).
(c) Implement a simple dynamic loss scaler:
- increase the scale by a factor of 2 after every 100 steps without overflow
- decrease the scale by a factor of 2 on overflow
- report the scale trajectory over 500 simulated steps
(d) What happens when the scale is too large? Demonstrate overflow in fp16.
Code cell 23
# Your Solution
# Exercise 7: Mixed Precision Loss Scaling
# (a) Gradient underflow
n_params = 10000
grads_fp32 = np.random.uniform(1e-6, 1e-5, n_params)
# YOUR CODE: convert to fp16 and count underflows
# grads_fp16 = ...
# n_underflow = ...
# print(f'Fraction underflowing: {n_underflow/n_params:.2%}')
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 24
# Solution
n_params = 10000
np.random.seed(42)
grads_fp32 = np.random.uniform(1e-6, 1e-5, n_params).astype(np.float32)
# (a) Fraction underflowing without scaling
grads_fp16 = grads_fp32.astype(np.float16).astype(np.float32)
n_underflow_none = np.sum(grads_fp16 == 0)
print(f'(a) No scaling: {n_underflow_none}/{n_params} ({n_underflow_none/n_params:.1%}) underflow to 0')
# (b) Effect of loss scaling
print()
print('(b) Effect of loss scaling:')
print(f'{"Scale":>8} {"Underflows":>12} {"Overflows":>10} {"Useful":>10}')
fp16_max = np.finfo(np.float16).max
for S in [1, 16, 256, 4096, 65536]:
scaled = grads_fp32 * S
overflow = np.sum(np.abs(scaled) > fp16_max)
fp16_q = scaled.astype(np.float16)
underflow = np.sum(fp16_q == 0)
useful = n_params - underflow - overflow
print(f'{S:8d} {underflow:12d} {overflow:10d} {useful:10d}')
# (c) Dynamic loss scaler
print()
print('(c) Dynamic loss scaler trajectory:')
scale = 1024.0
growth_interval = 100
steps_since_increase = 0
scale_history = []
np.random.seed(0)
for step in range(500):
# Simulate: overflow becomes more likely once the scale grows past 32768
overflow_prob = min(0.5, max(0.0, (scale - 32768) / 65504))
overflow = np.random.random() < overflow_prob
if overflow:
scale /= 2
steps_since_increase = 0
else:
steps_since_increase += 1
if steps_since_increase >= growth_interval:
scale = min(scale * 2, 2**24) # Cap
steps_since_increase = 0
scale_history.append(scale)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 4))
ax.semilogy(scale_history, color='#0077BB')
ax.set_xlabel('Training step'); ax.set_ylabel('Loss scale (log)')
ax.set_title('Dynamic loss scale trajectory')
plt.tight_layout(); plt.show()
print(f'Final scale: {scale_history[-1]:.0f}')
ok = n_underflow_none > 0 # Some underflow without scaling
print(f"{'PASS' if ok else 'FAIL'} - underflow occurs without scaling")
Exercise 8 ★★★ — Forward Error Analysis of Matrix-Vector Product
Derive and verify the forward error bound for the matrix-vector product $y = Ax$.
(a) The computed result $\hat{y}$ of $y = Ax$ satisfies the componentwise forward error bound
$$|\hat{y}_i - y_i| \le n\,\varepsilon_{\text{mach}} \sum_j |A_{ij}|\,|x_j|.$$
Implement a function that computes this error bound.
(b) Generate random $A$ and $x$ in fp32. Compute $Ax$ in fp32 and in fp64 (reference). Verify the bound holds for each component.
(c) Experiment with an ill-conditioned $A$ (Hilbert matrix). Does the bound still hold?
(d) ★★★ Extension: Show empirically that the worst-case error bound is nearly tight by constructing $A$ and $x$ that maximize rounding errors (e.g., alternating signs in $x$).
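Where the part (a) bound comes from (a sketch, to first order in $\varepsilon_{\text{mach}}$): the standard rounding model says each operation returns $\mathrm{fl}(a \odot b) = (a \odot b)(1 + \delta)$ with $|\delta| \le \varepsilon_{\text{mach}}$; a length-$n$ dot product therefore accumulates at most $n$ such factors per term, giving $|\hat{y}_i - y_i| \le n\,\varepsilon_{\text{mach}} \sum_j |A_{ij}|\,|x_j| + O(\varepsilon_{\text{mach}}^2)$.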
Code cell 26
# Your Solution
# Exercise 8: Error Analysis
# (a) Implement error bound
def matvec_error_bound(A, x, eps_mach):
"""
Compute componentwise forward error bound for y = A @ x.
Bound: n * eps_mach * sum_j |A_ij| |x_j|
"""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 27
# Solution
import numpy.linalg as nla
def matvec_error_bound_sol(A, x, eps_mach):
n = A.shape[1]
return n * eps_mach * (np.abs(A) @ np.abs(x))
def hilbert_mat(n):
i, j = np.meshgrid(np.arange(1,n+1), np.arange(1,n+1), indexing='ij')
return 1.0 / (i + j - 1)
eps32 = np.finfo(np.float32).eps
np.random.seed(42)
# (b) Random matrix
print('(b) Random 10x10 matrix:')
A = np.random.randn(10, 10).astype(np.float32)
x = np.random.randn(10).astype(np.float32)
y_ref = A.astype(np.float64) @ x.astype(np.float64) # fp64 reference
y_fp32 = A @ x # fp32 computation
actual_err = np.abs(y_fp32.astype(np.float64) - y_ref)
bound_vec = matvec_error_bound_sol(A.astype(np.float64), x.astype(np.float64), eps32)
all_hold = np.all(actual_err <= bound_vec + 1e-30)
print(f' Max actual error: {actual_err.max():.4e}')
print(f' Max bound: {bound_vec.max():.4e}')
print(f' Bound holds for all components: {all_hold}')
# (c) Hilbert matrix
print()
print('(c) Hilbert 10x10 matrix:')
H = hilbert_mat(10).astype(np.float32)
x_h = np.ones(10, dtype=np.float32)
y_ref_h = H.astype(np.float64) @ x_h.astype(np.float64)
y_fp32_h = H @ x_h
actual_err_h = np.abs(y_fp32_h.astype(np.float64) - y_ref_h)
bound_vec_h = matvec_error_bound_sol(H.astype(np.float64), x_h.astype(np.float64), eps32)
all_hold_h = np.all(actual_err_h <= bound_vec_h * 1000) # Looser factor for ill-conditioned
print(f' Max actual error: {actual_err_h.max():.4e}')
print(f' Max bound: {bound_vec_h.max():.4e}')
print(f' Bound holds (up to 1000x): {all_hold_h}')
ok = all_hold and all_hold_h
print(f"\n{'PASS' if ok else 'FAIL'} - error bounds verified")
Exercise 9 ★★★ — Stable Cross-Entropy from Logits
Implement cross-entropy directly from logits using log-sum-exp, then compare it to a naive softmax implementation on very large logits.
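The identity to implement, writing $z$ for the logits and $y$ for the target index:
$$\mathrm{CE}(z, y) = -\log \mathrm{softmax}(z)_y = -z_y + \log \sum_j e^{z_j},$$
with the log-sum-exp evaluated in shifted form exactly as in Exercise 6.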
Code cell 30
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 31
# Solution
def logsumexp(z):
    z = np.asarray(z, dtype=float)
    m = z.max()  # shift by the max for stability
    return m + np.log(np.exp(z - m).sum())
def stable_ce(logits, y):
    # Cross-entropy directly from logits: -z_y + logsumexp(z)
    return -logits[y] + logsumexp(logits)
def naive_ce(logits, y):
    # Naive route through softmax: exp(1000) overflows to inf
    p = np.exp(logits) / np.exp(logits).sum()
    return -np.log(p[y])
header('Exercise 9: Stable Cross-Entropy')
z = np.array([1000., 999., 995.])
ce = stable_ce(z, 0)
print(f'stable CE={ce:.6f}')
try:
    bad = naive_ce(z, 0)
    print(f'naive CE={bad}')
except Exception as e:
    print('naive CE failed:', type(e).__name__)
check_true('stable CE is finite', np.isfinite(ce))
print('Takeaway: log-sum-exp is the numerical core of stable softmax cross-entropy.')
Exercise 10 ★★★ — Dynamic Loss Scaling for fp16 Gradients
Simulate tiny gradients in fp16, then show how multiplying the loss before backpropagation and unscaling afterward preserves information.
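For orientation: fp16's smallest positive subnormal is $2^{-24} \approx 5.96 \times 10^{-8}$, so gradients below roughly $3 \times 10^{-8}$ round to zero on a direct cast. A quick check (a sketch; the exact np.finfo fields available vary by NumPy version):
import numpy as np
print(np.finfo(np.float16).tiny)  # ~6.10e-05, smallest normal fp16
print(2.0 ** -24)                 # ~5.96e-08, smallest subnormal fp16
print(np.float16(1e-8))           # 0.0 -- below half the smallest subnormal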
Code cell 33
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 34
# Solution
header('Exercise 10: Dynamic Loss Scaling')
g = np.array([1e-8, 2e-8, 5e-8, 1e-7], dtype=np.float32)
g16 = g.astype(np.float16)                    # direct cast: entries below ~6e-8 flush to 0
scale = 2**14
scaled = (g * scale).astype(np.float16)       # scale in fp32, then cast to fp16
unscaled = scaled.astype(np.float32) / scale  # unscale back in fp32
print('fp32:', g)
print('direct fp16:', g16)
print('scaled/unscaled:', unscaled)
check_true('direct fp16 underflows some entries', np.count_nonzero(g16) < len(g16))
check_true('loss scaling preserves more nonzero entries', np.count_nonzero(unscaled) > np.count_nonzero(g16))
print('Takeaway: loss scaling shifts gradients into fp16 representable range, then restores scale in fp32.')
Summary
You have practiced:
- IEEE 754 encoding — bit-level decomposition of sign, exponent, mantissa
- Machine epsilon — the fundamental precision limit of each format
- Catastrophic cancellation — identifying and fixing precision loss via algebraic identities
- Kahan summation — achieving $O(\varepsilon)$ error independent of $n$
- Condition numbers — measuring sensitivity of linear systems to perturbations
- Stable softmax — log-sum-exp trick, translation invariance, log-softmax
- Mixed precision — loss scaling dynamics, gradient underflow/overflow
- Error analysis — componentwise forward error bounds for matrix operations
- Stable cross-entropy — computing the loss directly from logits via log-sum-exp
- Loss scaling in practice — preserving tiny fp16 gradients by scaling before the cast and unscaling in fp32
Next: Numerical Linear Algebra →