Exercises Notebook
Converted from exercises.ipynb for web reading.
Floating-Point Arithmetic — Exercises
"To understand a number system, break it."
10 graded exercises covering: bit-level encoding, machine epsilon, catastrophic cancellation, Kahan summation, condition numbers, stable algorithms, mixed precision, and error analysis.
Companion: notes.md | theory.ipynb
Difficulty: ★ Easy | ★★ Medium | ★★★ Hard
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
import seaborn as sns
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import numpy.linalg as nla
from scipy import stats
import struct
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams.update({'figure.figsize': (10, 5), 'figure.dpi': 110})
HAS_MPL = True
except ImportError:
HAS_MPL = False
np.set_printoptions(precision=10, suppress=False)
np.random.seed(42)
print('Setup complete.')
print("Chapter helper setup complete.")
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def check_true(name, cond):
print(f"{'PASS' if cond else 'FAIL'} - {name}")
return bool(cond)
def check_close(name, got, expected, tol=1e-8):
ok = np.allclose(got, expected, atol=tol, rtol=tol)
print(f"{'PASS' if ok else 'FAIL'} - {name}")
if not ok:
print(' expected:', expected)
print(' got :', got)
return bool(ok)
print("Chapter helper setup complete.")
Exercise 1 ★ — IEEE 754 Bit Encoding
Understanding floating-point at the bit level is essential for debugging precision issues.
Task: Implement a function that decodes an fp32 number into its three IEEE 754 components.
(a) Write decode_fp32(x) that returns (sign, exponent, mantissa, value) where:
- sign is 0 or 1
- exponent is the raw 8-bit biased exponent (0 to 255)
- mantissa is the raw 23-bit mantissa (0 to 2²³ - 1)
- value is the reconstructed floating-point value
(b) Use your function to decode: 1.0, -2.5, 0.0, float('inf'), float('nan'), 1e-40 (subnormal)
(c) Verify that for normal numbers, the reconstructed value equals the input.
Hint: Use struct.pack('>f', np.float32(x)) to get the 4 bytes of a float32.
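For example, the hint in action (a minimal sketch that just inspects the raw bits rather than decoding them):
import struct
import numpy as np
bits = int.from_bytes(struct.pack('>f', np.float32(1.0)), 'big')
print(f'{bits:032b}')  # 00111111100000000000000000000000: sign 0, exponent 01111111 (127), mantissa 0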
Code cell 5
# Your Solution
# Exercise 1: IEEE 754 Bit Encoding
def decode_fp32(x):
"""Decode fp32 into (sign, biased_exp, mantissa, reconstructed_value)."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 6
# Solution
def decode_fp32_solution(x):
"""Decode fp32 into (sign, biased_exp, mantissa, reconstructed_value)."""
packed = struct.pack('>f', np.float32(x))
bits = int.from_bytes(packed, 'big')
sign = (bits >> 31) & 1
exp = (bits >> 23) & 0xFF
mant = bits & 0x7FFFFF
# Reconstruct value
if exp == 255:
val = float('nan') if mant != 0 else float('inf') * (-1)**sign
elif exp == 0:
val = (-1)**sign * (mant / 2**23) * 2**(-126) # Subnormal
else:
val = (-1)**sign * (1 + mant / 2**23) * 2**(exp - 127) # Normal
return sign, exp, mant, val
print('Decoded fp32 values:')
test_values = [1.0, -2.5, 0.0, float('inf'), float('nan'), 1e-40]
for v in test_values:
s, e, m, rec = decode_fp32_solution(v)
err = abs(rec - float(np.float32(v))) if np.isfinite(v) else 0.0
print(f'{str(v):>8}: sign={s}, exp={e:3d}, mant={m:>8d}, reconstructed={rec:.8e}, err={err:.2e}')
# Verify reconstruction for normal numbers
normal_vals = [1.0, -2.5, 3.14159, 100.0, 0.001]
all_ok = all(abs(decode_fp32_solution(v)[3] - float(np.float32(v))) < 1e-30
for v in normal_vals if np.isfinite(v))
print(f"{'PASS' if all_ok else 'FAIL'} - reconstruction matches for normal numbers")
Exercise 2 ★ — Machine Epsilon by Experiment
Machine epsilon is the smallest number $\varepsilon$ such that $\mathrm{fl}(1 + \varepsilon) > 1$ in floating point.
(a) Compute machine epsilon for fp64, fp32, and fp16 by successive halving.
(b) Verify your results against np.finfo(dtype).eps.
(c) Show that $\varepsilon = 2^{1-p}$, where $p$ is the number of significand bits: fp64: $p = 53$, fp32: $p = 24$, fp16: $p = 11$.
(d) Demonstrate that 1.0 + eps/2 == 1.0 but 1.0 + eps > 1.0 (in the appropriate dtype).
Code cell 8
# Your Solution
# Exercise 2: Machine Epsilon
# (a) Compute by successive halving
def compute_machine_epsilon(dtype):
"""Compute machine epsilon for given numpy dtype."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 9
# Solution
def compute_machine_epsilon_solution(dtype):
one = dtype(1.0)
eps = dtype(1.0)
while one + dtype(eps) / dtype(2.0) > one:
eps = dtype(eps) / dtype(2.0)
return eps
print(f'{"Format":>8} {"Computed eps":>16} {"np.finfo eps":>16} {"2^(1-p)":>16} {"match":>6}')
for dtype, p in [(np.float64, 53), (np.float32, 24), (np.float16, 11)]:
eps_computed = compute_machine_epsilon_solution(dtype)
eps_theory = dtype(2**(1-p))
eps_numpy = np.finfo(dtype).eps
match = np.isclose(float(eps_computed), float(eps_numpy), rtol=1e-3)
print(f'{dtype.__name__:>8} {float(eps_computed):16.6e} '
f'{float(eps_numpy):16.6e} {float(eps_theory):16.6e} {"YES" if match else "NO":>6}')
print()
# (d) Verify boundary
eps32 = np.finfo(np.float32).eps
print(f'fp32: 1.0 + eps > 1.0: {np.float32(1.0) + eps32 > np.float32(1.0)}')
print(f'fp32: 1.0 + eps/2 == 1.0: {np.float32(1.0) + eps32/2 == np.float32(1.0)}')
ok = (np.float32(1.0) + eps32 > np.float32(1.0)) and \
(np.float32(1.0) + eps32/2 == np.float32(1.0))
print(f"{'PASS' if ok else 'FAIL'} - machine epsilon boundary verified")
Exercise 3 ★★ — Catastrophic Cancellation and Algebraic Fixes
Catastrophic cancellation destroys precision when subtracting nearly-equal numbers.
(a) Implement the naive formula $f(x) = \sqrt{x+1} - \sqrt{x}$ for large $x$. Show it loses precision as $x \to \infty$.
(b) Derive and implement an algebraically equivalent stable formula by rationalizing: $\sqrt{x+1} - \sqrt{x} = \dfrac{1}{\sqrt{x+1} + \sqrt{x}}$.
(c) Compare errors across $x \in [10, 10^{12}]$, using fp64 as the reference for the fp32 computation.
(d) Plot the relative error of both methods vs $x$ on a log-log scale.
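A rough size estimate explains the failure: since $\sqrt{x+1} - \sqrt{x} = 1/(\sqrt{x+1} + \sqrt{x}) \approx 1/(2\sqrt{x})$, the difference is about $2x$ times smaller than the operands $\sqrt{x}$, so roughly $\log_{10}(2x)$ leading digits cancel; fp32's roughly 7 significant digits are exhausted by $x \approx 10^7$.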
Code cell 11
# Your Solution
# Exercise 3: Catastrophic Cancellation
def sqrt_diff_naive(x):
"""Naive: sqrt(x+1) - sqrt(x)."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 12
# Solution
def sqrt_diff_naive_sol(x):
x = np.float32(x)
return np.sqrt(x + np.float32(1.0)) - np.sqrt(x)
def sqrt_diff_stable_sol(x):
x = np.float32(x)
return np.float32(1.0) / (np.sqrt(x + np.float32(1.0)) + np.sqrt(x))
x_vals = np.logspace(1, 12, 60, dtype=np.float64)
x32 = x_vals.astype(np.float32)
ref_vals = 1.0 / (np.sqrt(x_vals + 1) + np.sqrt(x_vals)) # fp64 reference
naive_err = np.abs(np.vectorize(sqrt_diff_naive_sol)(x32).astype(np.float64) - ref_vals) / ref_vals
stable_err = np.abs(np.vectorize(sqrt_diff_stable_sol)(x32).astype(np.float64) - ref_vals) / ref_vals
print(f'{"x":>12} {"Naive err":>12} {"Stable err":>12}')
for i in [0, 15, 30, 45, 55]:
print(f'{x_vals[i]:12.2e} {naive_err[i]:12.3e} {stable_err[i]:12.3e}')
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 5))
valid_n = naive_err > 0
valid_s = stable_err > 0
ax.loglog(x_vals[valid_n], naive_err[valid_n], color='#CC3311', label='Naive fp32')
ax.loglog(x_vals[valid_s], stable_err[valid_s] + 1e-16, color='#0077BB', label='Stable fp32')
ax.axhline(float(np.finfo(np.float32).eps), color='gray', linestyle='--', label='fp32 eps')
ax.set_xlabel('x'); ax.set_ylabel('Relative error'); ax.legend()
ax.set_title('sqrt(x+1) - sqrt(x): naive vs stable')
plt.tight_layout(); plt.show()
ok = np.median(stable_err[30:]) < np.median(naive_err[30:]) * 0.001
print(f"{'PASS' if ok else 'FAIL'} - stable formula has much lower error")
Exercise 4 ★★ — Kahan Summation
Kahan compensated summation achieves $O(\varepsilon)$ error regardless of the number of terms $n$.
(a) Implement Kahan summation in fp32.
(b) Sum the series $\sum_{k=1}^{n} 1/k^2$ for $n$ up to $10^4$. The true value converges to $\pi^2/6 \approx 1.6449341$.
(c) Compare naive fp32, Kahan fp32, and fp64 naive. Which is most accurate?
(d) ★★★ Extension: Show that pairwise summation (divide-and-conquer) achieves $O(\varepsilon \log n)$ error. Implement pairwise summation and compare all three methods.
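The key identity for part (a), stated informally (assuming round-to-nearest): after $t = \mathrm{fl}(s + y)$, the quantity $c = \mathrm{fl}(\mathrm{fl}(t - s) - y) \approx t - (s + y)$ is exactly the rounding error just committed in forming $t$; subtracting $c$ from the next addend cancels these errors before they accumulate, which is why the total error stays $O(\varepsilon)$ rather than the naive $O(n\varepsilon)$.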
Code cell 14
# Your Solution
# Exercise 4: Kahan Summation
def kahan_sum(arr):
"""Kahan compensated sum — compute in fp32."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 15
# Solution
def kahan_sum_sol(arr):
s = np.float32(0.0)
c = np.float32(0.0)
for x in arr:
y = np.float32(x) - c
t = s + y
c = (t - s) - y
s = t
return float(s)
def pairwise_sum(arr):
"""Pairwise summation — O(log n * eps) error."""
arr = arr.copy().astype(np.float32)
n = len(arr)
while n > 1:
half = n // 2
arr[:half] = arr[:2*half:2] + arr[1:2*half:2]
if n % 2 == 1:
arr[half] = arr[n-1]
n = half + 1
else:
n = half
return float(arr[0])
true_val = np.pi**2 / 6
print(f'True pi^2/6 = {true_val:.15f}')
print()
print(f'{"n":>8} {"Kahan err":>12} {"Naive err":>12} {"Pairwise err":>14} {"fp64 err":>12}')
for n in [10, 100, 1000, 10000]:
arr = np.array([1/k**2 for k in range(1, n+1)], dtype=np.float32)
k_val = kahan_sum_sol(arr)
n_val = float(np.sum(arr))
p_val = pairwise_sum(arr)
f_val = float(np.sum(arr.astype(np.float64)))
ref = float(np.sum(np.array([1/k**2 for k in range(1,n+1)], dtype=np.float64)))
print(f'{n:8d} {abs(k_val-ref):12.3e} {abs(n_val-ref):12.3e} '
f'{abs(p_val-ref):14.3e} {abs(f_val-ref):12.3e}')
print()
n = 10000
arr = np.array([1/k**2 for k in range(1, n+1)], dtype=np.float32)
ref = float(np.sum(arr.astype(np.float64)))
ok = abs(kahan_sum_sol(arr) - ref) < abs(float(np.sum(arr)) - ref)
print(f"{'PASS' if ok else 'FAIL'} - Kahan more accurate than naive sum")
Exercise 5 ★★ — Condition Numbers and Ill-Conditioned Systems
The condition number bounds how much errors in the input amplify in the output.
(a) Compute $\kappa(A)$ using the 2-norm (i.e., $\kappa_2(A) = \sigma_{\max}/\sigma_{\min}$) for:
- the $5 \times 5$ identity matrix
- the diagonal matrix with entries $1, 2, 4, 8, 16$
- the $5 \times 5$ Hilbert matrix $H_{ij} = 1/(i+j-1)$
- a random $5 \times 5$ Gaussian matrix
(b) For the Hilbert matrix with $n = 5, 8, 10, 12$, solve $Hx = b$ where $b = H\mathbf{1}$ (so the true solution is $x = \mathbf{1}$). Report the relative error $\|\hat{x} - x\| / \|x\|$.
(c) Verify that the relative error is $\lesssim \kappa(H)\,\varepsilon_{64}$, up to a modest constant factor.
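The bound in (c) comes from standard perturbation theory for a backward-stable solver: the computed $\hat{x}$ exactly solves a nearby system, and to first order
$$\frac{\|\hat{x} - x\|}{\|x\|} \lesssim \kappa(A)\,\varepsilon_{\text{mach}},$$
so the condition number converts rounding-level backward error into forward error.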
Code cell 17
# Your Solution
# Exercise 5: Condition Numbers
import numpy.linalg as nla
# (a) Compute condition numbers
def condition_number(A):
"""Compute 2-norm condition number via SVD."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 18
# Solution
def condition_number_sol(A):
sv = nla.svd(A, compute_uv=False)
return sv[0] / sv[-1]
def hilbert_sol(n):
i, j = np.meshgrid(np.arange(1,n+1), np.arange(1,n+1), indexing='ij')
return 1.0 / (i + j - 1)
# (a)
print('(a) Condition numbers:')
matrices = {
'Identity(5)': np.eye(5),
'Diag[1..16]': np.diag([1.,2.,4.,8.,16.]),
'Hilbert(5)': hilbert_sol(5),
'Random(5x5)': np.random.randn(5, 5),
}
for name, A in matrices.items():
kappa = condition_number_sol(A)
print(f' {name:>14}: kappa = {kappa:.4e}')
# (b, c) Hilbert system solve
print()
print('(b,c) Hilbert system solve:')
eps64 = np.finfo(np.float64).eps
print(f'{"n":>4} {"kappa":>14} {"rel_error":>14} {"kappa*eps":>14} {"bound_holds":>12}')
for n in [5, 8, 10, 12]:
H = hilbert_sol(n)
x_true = np.ones(n)
b = H @ x_true
x_comp = nla.solve(H, b)
kappa = condition_number_sol(H)
rel_err = nla.norm(x_comp - x_true) / nla.norm(x_true)
bound = kappa * eps64
holds = rel_err <= bound * 100 # Factor 100 for rounding in norm estimation
print(f'{n:4d} {kappa:14.4e} {rel_err:14.4e} {bound:14.4e} {"YES" if holds else "NO":>12}')
Exercise 6 ★★ — Implementing Stable Softmax and Log-Sum-Exp
(a) Implement stable_softmax(x) that:
1. Subtracts max(x) before computing exp
2. Returns a probability distribution summing to 1
3. Handles both 1D and 2D inputs (axis=-1 for rows)
(b) Implement stable_log_softmax(x) using $\log \mathrm{softmax}(x) = x - \mathrm{logsumexp}(x)$, with the log-sum-exp computed in shifted form.
(c) Implement nll_loss(log_probs, targets) (negative log-likelihood).
(d) Verify that exp(log_softmax(x)) equals softmax(x) for varied inputs.
(e) Test with extreme logits: x = [1000, 1001, 999] and x = [-1000, -999, -1001].
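The identity behind parts (a) and (b), valid for any shift $m$ and usually applied with $m = \max_i x_i$:
$$\log \sum_i e^{x_i} = m + \log \sum_i e^{x_i - m}, \qquad \mathrm{softmax}(x) = \mathrm{softmax}(x - m\mathbf{1}).$$
After shifting, the largest exponent argument is 0, so every $e^{x_i - m} \in (0, 1]$ and neither overflow nor total underflow can occur.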
Code cell 20
# Your Solution
# Exercise 6: Stable Softmax and Log-Softmax
def stable_softmax(x, axis=-1):
"""Numerically stable softmax."""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 21
# Solution
def stable_softmax_sol(x, axis=-1):
x = np.asarray(x, dtype=np.float64)
x_shifted = x - np.max(x, axis=axis, keepdims=True)
e = np.exp(x_shifted)
return e / e.sum(axis=axis, keepdims=True)
def stable_log_softmax_sol(x, axis=-1):
x = np.asarray(x, dtype=np.float64)
c = np.max(x, axis=axis, keepdims=True)
log_sum_exp = c.squeeze() + np.log(np.sum(np.exp(x - c), axis=axis))
return x - log_sum_exp if x.ndim == 1 else x - log_sum_exp[..., np.newaxis]
def nll_loss_sol(log_probs, targets):
return -np.mean([log_probs[i, t] for i, t in enumerate(targets)])
# (d) Verify exp(log_softmax) == softmax
test_cases = [
np.array([1.0, 2.0, 3.0]),
np.array([1000.0, 1001.0, 999.0]),
np.array([-1000.0, -999.0, -1001.0]),
np.array([0.001, 0.002, 0.003]),
]
print('Verifying exp(log_softmax) == softmax:')
all_ok = True
for x in test_cases:
s1 = stable_softmax_sol(x)
s2 = np.exp(stable_log_softmax_sol(x))
ok = np.allclose(s1, s2, atol=1e-10)
all_ok = all_ok and ok
print(f' x={x}: {"PASS" if ok else "FAIL"}, sum={s1.sum():.10f}')
# Batch test
X = np.random.randn(5, 10)
s_batch = stable_softmax_sol(X, axis=-1)
sums_ok = np.allclose(s_batch.sum(axis=-1), 1.0)
print(f"{'PASS' if all_ok and sums_ok else 'FAIL'} - all verifications passed")
Exercise 7 ★★ — Mixed Precision and Loss Scaling
Mixed-precision training uses bf16/fp16 compute with fp32 master weights.
(a) Simulate gradient underflow: generate gradients uniformly in $[10^{-6}, 10^{-5}]$ and quantize to fp16. What fraction underflows to zero?
(b) Apply loss scaling with $S \in \{1, 16, 256, 4096, 65536\}$ and report the fraction of gradients that survive (neither underflow nor overflow).
(c) Implement a simple dynamic loss scaler:
- increase the scale by a factor of 2 after every 100 steps without overflow
- decrease the scale by a factor of 2 on overflow
- report the scale trajectory over 500 simulated steps
(d) What happens when the scale is too large? Demonstrate overflow in fp16.
Code cell 23
# Your Solution
# Exercise 7: Mixed Precision Loss Scaling
# (a) Gradient underflow
n_params = 10000
grads_fp32 = np.random.uniform(1e-6, 1e-5, n_params)
# YOUR CODE: convert to fp16 and count underflows
# grads_fp16 = ...
# n_underflow = ...
# print(f'Fraction underflowing: {n_underflow/n_params:.2%}')
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 24
# Solution
n_params = 10000
np.random.seed(42)
grads_fp32 = np.random.uniform(1e-6, 1e-5, n_params).astype(np.float32)
# (a) Fraction underflowing without scaling
grads_fp16 = grads_fp32.astype(np.float16).astype(np.float32)
n_underflow_none = np.sum(grads_fp16 == 0)
print(f'(a) No scaling: {n_underflow_none}/{n_params} ({n_underflow_none/n_params:.1%}) underflow to 0')
# (b) Effect of loss scaling
print()
print('(b) Effect of loss scaling:')
print(f'{"Scale":>8} {"Underflows":>12} {"Overflows":>10} {"Useful":>10}')
fp16_max = np.finfo(np.float16).max
for S in [1, 16, 256, 4096, 65536]:
scaled = grads_fp32 * S
overflow = np.sum(np.abs(scaled) > fp16_max)
fp16_q = scaled.astype(np.float16)
underflow = np.sum(fp16_q == 0)
useful = n_params - underflow - overflow
print(f'{S:8d} {underflow:12d} {overflow:10d} {useful:10d}')
# (c) Dynamic loss scaler
print()
print('(c) Dynamic loss scaler trajectory:')
scale = 1024.0
growth_interval = 100
steps_since_increase = 0
scale_history = []
np.random.seed(0)
for step in range(500):
# Simulate: overflow becomes more likely once the scale grows past 32768
overflow_prob = min(0.5, max(0.0, (scale - 32768) / 65504))
overflow = np.random.random() < overflow_prob
if overflow:
scale /= 2
steps_since_increase = 0
else:
steps_since_increase += 1
if steps_since_increase >= growth_interval:
scale = min(scale * 2, 2**24) # Cap
steps_since_increase = 0
scale_history.append(scale)
if HAS_MPL:
fig, ax = plt.subplots(figsize=(10, 4))
ax.semilogy(scale_history, color='#0077BB')
ax.set_xlabel('Training step'); ax.set_ylabel('Loss scale (log)')
ax.set_title('Dynamic loss scale trajectory')
plt.tight_layout(); plt.show()
print(f'Final scale: {scale_history[-1]:.0f}')
ok = n_underflow_none > 0 # Some underflow without scaling
print(f"{'PASS' if ok else 'FAIL'} - underflow occurs without scaling")
Exercise 8 ★★★ — Forward Error Analysis of Matrix-Vector Product
Derive and verify the forward error bound for the matrix-vector product $y = Ax$.
(a) The computed result $\hat{y}$ of $y = Ax$ satisfies the componentwise forward error bound
$$|\hat{y}_i - y_i| \le n\,\varepsilon_{\text{mach}} \sum_j |A_{ij}|\,|x_j|.$$
Implement a function that computes this error bound.
(b) Generate random $A$ and $x$ in fp32. Compute $Ax$ in fp32 and in fp64 (reference). Verify the bound holds for each component.
(c) Experiment with an ill-conditioned $A$ (Hilbert matrix). Does the bound still hold?
(d) ★★★ Extension: Show empirically that the worst-case error bound is nearly tight by constructing $A$ and $x$ that maximize rounding errors (e.g., alternating signs in $x$).
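Where the part (a) bound comes from (a sketch, to first order in $\varepsilon_{\text{mach}}$): the standard rounding model says each operation returns $\mathrm{fl}(a \odot b) = (a \odot b)(1 + \delta)$ with $|\delta| \le \varepsilon_{\text{mach}}$; a length-$n$ dot product therefore accumulates at most $n$ such factors per term, giving $|\hat{y}_i - y_i| \le n\,\varepsilon_{\text{mach}} \sum_j |A_{ij}|\,|x_j| + O(\varepsilon_{\text{mach}}^2)$.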
Code cell 26
# Your Solution
# Exercise 8: Error Analysis
# (a) Implement error bound
def matvec_error_bound(A, x, eps_mach):
"""
Compute componentwise forward error bound for y = A @ x.
Bound: n * eps_mach * sum_j |A_ij| |x_j|
"""
print("Learner scaffold loaded; write your solution here, then compare below.")
Code cell 27
# Solution
import numpy.linalg as nla
def matvec_error_bound_sol(A, x, eps_mach):
n = A.shape[1]
return n * eps_mach * (np.abs(A) @ np.abs(x))
def hilbert_mat(n):
i, j = np.meshgrid(np.arange(1,n+1), np.arange(1,n+1), indexing='ij')
return 1.0 / (i + j - 1)
eps32 = np.finfo(np.float32).eps
np.random.seed(42)
# (b) Random matrix
print('(b) Random 10x10 matrix:')
A = np.random.randn(10, 10).astype(np.float32)
x = np.random.randn(10).astype(np.float32)
y_ref = A.astype(np.float64) @ x.astype(np.float64) # fp64 reference
y_fp32 = A @ x # fp32 computation
actual_err = np.abs(y_fp32.astype(np.float64) - y_ref)
bound_vec = matvec_error_bound_sol(A.astype(np.float64), x.astype(np.float64), eps32)
all_hold = np.all(actual_err <= bound_vec + 1e-30)
print(f' Max actual error: {actual_err.max():.4e}')
print(f' Max bound: {bound_vec.max():.4e}')
print(f' Bound holds for all components: {all_hold}')
# (c) Hilbert matrix
print()
print('(c) Hilbert 10x10 matrix:')
H = hilbert_mat(10).astype(np.float32)
x_h = np.ones(10, dtype=np.float32)
y_ref_h = H.astype(np.float64) @ x_h.astype(np.float64)
y_fp32_h = H @ x_h
actual_err_h = np.abs(y_fp32_h.astype(np.float64) - y_ref_h)
bound_vec_h = matvec_error_bound_sol(H.astype(np.float64), x_h.astype(np.float64), eps32)
all_hold_h = np.all(actual_err_h <= bound_vec_h * 1000) # Looser factor for ill-conditioned
print(f' Max actual error: {actual_err_h.max():.4e}')
print(f' Max bound: {bound_vec_h.max():.4e}')
print(f' Bound holds (up to 1000x): {all_hold_h}')
ok = all_hold and all_hold_h
print(f"\n{'PASS' if ok else 'FAIL'} - error bounds verified")
Exercise 9 ★★★ — Stable Cross-Entropy from Logits
Implement cross-entropy directly from logits using log-sum-exp, then compare it to a naive softmax implementation on very large logits.
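The identity to implement, writing $z$ for the logits and $y$ for the target index:
$$\mathrm{CE}(z, y) = -\log \mathrm{softmax}(z)_y = -z_y + \log \sum_j e^{z_j},$$
with the log-sum-exp evaluated in shifted form exactly as in Exercise 6.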
Code cell 30
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 31
# Solution
def logsumexp(z):
    z = np.asarray(z, dtype=float)
    m = z.max()  # shift by the max for stability
    return m + np.log(np.exp(z - m).sum())
def stable_ce(logits, y):
    # Cross-entropy directly from logits: -z_y + logsumexp(z)
    return -logits[y] + logsumexp(logits)
def naive_ce(logits, y):
    # Naive route through softmax: exp(1000) overflows to inf
    p = np.exp(logits) / np.exp(logits).sum()
    return -np.log(p[y])
header('Exercise 9: Stable Cross-Entropy')
z = np.array([1000., 999., 995.])
ce = stable_ce(z, 0)
print(f'stable CE={ce:.6f}')
try:
    bad = naive_ce(z, 0)
    print(f'naive CE={bad}')
except Exception as e:
    print('naive CE failed:', type(e).__name__)
check_true('stable CE is finite', np.isfinite(ce))
print('Takeaway: log-sum-exp is the numerical core of stable softmax cross-entropy.')
Exercise 10 ★★★ — Dynamic Loss Scaling for fp16 Gradients
Simulate tiny gradients in fp16, then show how multiplying the loss before backpropagation and unscaling afterward preserves information.
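For orientation: fp16's smallest positive subnormal is $2^{-24} \approx 5.96 \times 10^{-8}$, so gradients below roughly $3 \times 10^{-8}$ round to zero on a direct cast. A quick check (a sketch; the exact np.finfo fields available vary by NumPy version):
import numpy as np
print(np.finfo(np.float16).tiny)  # ~6.10e-05, smallest normal fp16
print(2.0 ** -24)                 # ~5.96e-08, smallest subnormal fp16
print(np.float16(1e-8))           # 0.0 -- below half the smallest subnormal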
Code cell 33
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 34
# Solution
header('Exercise 10: Dynamic Loss Scaling')
g = np.array([1e-8, 2e-8, 5e-8, 1e-7], dtype=np.float32)
g16 = g.astype(np.float16)                    # direct cast: entries below ~6e-8 flush to 0
scale = 2**14
scaled = (g * scale).astype(np.float16)       # scale in fp32, then cast to fp16
unscaled = scaled.astype(np.float32) / scale  # unscale back in fp32
print('fp32:', g)
print('direct fp16:', g16)
print('scaled/unscaled:', unscaled)
check_true('direct fp16 underflows some entries', np.count_nonzero(g16) < len(g16))
check_true('loss scaling preserves more nonzero entries', np.count_nonzero(unscaled) > np.count_nonzero(g16))
print('Takeaway: loss scaling shifts gradients into fp16 representable range, then restores scale in fp32.')
Summary
You have practiced:
- IEEE 754 encoding — bit-level decomposition of sign, exponent, mantissa
- Machine epsilon — the fundamental precision limit of each format
- Catastrophic cancellation — identifying and fixing precision loss via algebraic identities
- Kahan summation — achieving $O(\varepsilon)$ error independent of $n$
- Condition numbers — measuring sensitivity of linear systems to perturbations
- Stable softmax — log-sum-exp trick, translation invariance, log-softmax
- Mixed precision — loss scaling dynamics, gradient underflow/overflow
- Error analysis — componentwise forward error bounds for matrix operations
- Stable cross-entropy — computing the loss directly from logits via log-sum-exp
- Loss scaling in practice — preserving tiny fp16 gradients by scaling before the cast and unscaling in fp32
Next: Numerical Linear Algebra →