Exercises Notebook
Converted from exercises.ipynb for web reading.
Fisher Information — Exercises
This notebook contains 10 exercises covering score functions, scalar and matrix Fisher information, KL curvature, Jeffreys priors, empirical Fisher approximations, and ML-facing applications.
Difficulty: Exercises 1-3 are mechanics, 4-6 are theory, 7-8 are advanced ML applications, and 9-10 are short follow-up drills.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    if not ok:
        print(" value :", value)
        print(" target:", target)
    return ok

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

print("Exercise helpers ready.")
Exercise 1 [*] — Bernoulli Fisher
Let $X \sim \mathrm{Bernoulli}(p)$ with $p \in (0, 1)$.
- Derive the score $s(x; p) = \partial_p \log p(x; p)$.
- Compute the Fisher information $I(p) = \mathbb{E}_p\!\left[s(X; p)^2\right]$.
- Explain where on $(0, 1)$ the model is most locally informative.
Code cell 5
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 6
# Solution
# Exercise 1: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 1: Bernoulli Fisher")
p = 0.3
# Score: s(1; p) = 1/p and s(0; p) = -1/(1-p), so I(p) = E[s(X; p)^2].
fisher = p * (1.0 / p) ** 2 + (1 - p) * (1.0 / (1 - p)) ** 2
check_close("I(p) = 1/(p(1-p))", fisher, 1.0 / (p * (1 - p)))
print("The model is most locally informative near the boundaries p -> 0 or p -> 1.")
print("\nTakeaway: Bernoulli Fisher grows near the boundaries because rare outcomes are highly informative about edge probabilities.")
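An optional cross-check, beyond the reference solution: Fisher information is the variance of the score at the true parameter, so simulated Bernoulli scores should have variance close to $1/(p(1-p))$.

import numpy as np

rng = np.random.default_rng(0)
p = 0.3
x = rng.binomial(1, p, size=200_000)
# Score of a Bernoulli draw: 1/p when x = 1, -1/(1-p) when x = 0.
scores = np.where(x == 1, 1.0 / p, -1.0 / (1 - p))
print("Monte Carlo Var[score]:", scores.var())
print("Closed form 1/(p(1-p)):", 1.0 / (p * (1 - p)))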
Exercise 2 [*] — Additivity
Show that if $X_1, \dots, X_n$ are iid from a regular scalar model $p(x; \theta)$, then $I_n(\theta) = n\, I(\theta)$. Verify the formula numerically for the Gaussian mean model with known variance.
Code cell 8
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 9
# Solution
# Exercise 2: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 2: Additivity")
n = 20
sigma2 = 2.0
single = 1.0 / sigma2   # per-observation Fisher for N(mu, sigma^2), known sigma^2
total = n * single
check_close("I_n = n I", total, 10.0)
print("\nTakeaway: Fisher information accumulates linearly across independent observations.")
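As an optional simulation sketch (not part of the reference solution), the additivity claim can be seen directly: the total score of n iid Gaussian draws is a sum of per-observation scores, so its variance should land near n/sigma^2.

import numpy as np

rng = np.random.default_rng(0)
n, sigma2, mu = 20, 2.0, 1.0
samples = rng.normal(mu, np.sqrt(sigma2), size=(50_000, n))
# Total score for the Gaussian mean model: sum_i (x_i - mu) / sigma^2.
total_scores = np.sum((samples - mu) / sigma2, axis=1)
print("Var[total score] (simulated):", total_scores.var())
print("n / sigma^2 (theory):        ", n / sigma2)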
Exercise 3 [*] — Reparameterization
For the Bernoulli model, compute Fisher information in probability coordinates $p$ and in logit coordinates $\phi = \log\frac{p}{1-p}$. Verify the transformation law $I_\phi(\phi) = I_p(p)\left(\frac{dp}{d\phi}\right)^2$.
Code cell 11
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 12
# Solution
# Exercise 3: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 3: Reparameterization")
p = 0.25
fisher_p = 1.0 / (p * (1 - p))
dp_dphi = p * (1 - p)   # sigmoid derivative: dp/dphi = p(1-p)
fisher_phi = fisher_p * dp_dphi**2
check_close("I_phi = I_p (dp/dphi)^2", fisher_phi, p * (1 - p))
print("\nTakeaway: Raw Fisher values change under coordinates, but the metric meaning is preserved.")
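One optional way to double-check the Jacobian used above is a finite-difference pass over the sigmoid p(phi) = 1/(1 + e^{-phi}), which should reproduce dp/dphi = p(1-p).

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

p = 0.25
phi = np.log(p / (1 - p))   # logit of p
h = 1e-6
dp_dphi_numeric = (sigmoid(phi + h) - sigmoid(phi - h)) / (2 * h)
print("numeric dp/dphi:", dp_dphi_numeric)
print("p(1-p)         :", p * (1 - p))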
Exercise 4 [**] — Gaussian matrix Fisher
Let $X \sim \mathcal{N}(\mu, \Sigma)$ with known covariance matrix $\Sigma$. Derive the Fisher information matrix for $\mu$ and explain what its eigenvalues mean.
Code cell 14
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 15
# Solution
# Exercise 4: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 4: Gaussian matrix Fisher")
Sigma = np.array([[2.0, 0.5], [0.5, 1.0]])
fisher = np.linalg.inv(Sigma)   # I(mu) = Sigma^{-1} for a Gaussian location model
check_close("Fisher equals Sigma^{-1}", fisher @ Sigma, np.eye(2), tol=1e-7)
print("Eigenvalues describe directional information strength in parameter space.")
print("\nTakeaway: In Gaussian location models, covariance and Fisher information are exact inverses.")
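A hedged follow-up on the eigenvalue interpretation: diagonalizing Sigma^{-1} exposes the best- and worst-determined directions of mu. This is a sketch with the same toy Sigma as above.

import numpy as np

Sigma = np.array([[2.0, 0.5], [0.5, 1.0]])
fisher = np.linalg.inv(Sigma)
evals, evecs = np.linalg.eigh(fisher)   # eigenvalues in ascending order
print("least informative direction:", evecs[:, 0], "eigenvalue:", evals[0])
print("most informative direction :", evecs[:, 1], "eigenvalue:", evals[1])
# A large Fisher eigenvalue means low estimator variance for that linear
# combination of mu, and vice versa.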
Exercise 5 [**] — Local KL curvature
Show for the Bernoulli model that $\mathrm{KL}\!\left(\mathrm{Bern}(p)\,\|\,\mathrm{Bern}(p+\delta)\right) = \tfrac{1}{2} I(p)\,\delta^{2} + O(\delta^{3})$. Verify this numerically for a small displacement $\delta$.
Code cell 17
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 18
# Solution
# Exercise 5: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_close(name, value, target, tol=1e-8):
    ok = np.allclose(value, target, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

header("Exercise 5: Local KL curvature")
p = 0.4
delta = 0.01
q = p + delta
kl = p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))
fisher = 1.0 / (p * (1 - p))
quad = 0.5 * fisher * delta**2
check_close("Local KL matches the Fisher quadratic term", kl, quad, tol=5e-6)
print("\nTakeaway: Fisher information is the second-order curvature hidden inside local KL divergence.")
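To see the second-order claim more sharply, an optional sweep (not in the reference solution) shrinks delta and watches the ratio KL / (0.5 I(p) delta^2) approach 1.

import numpy as np

p = 0.4
fisher = 1.0 / (p * (1 - p))
for delta in (0.1, 0.01, 0.001):
    q = p + delta
    kl = p * np.log(p / q) + (1 - p) * np.log((1 - p) / (1 - q))
    print(f"delta={delta:>6}: KL / (0.5*I*delta^2) = {kl / (0.5 * fisher * delta**2):.6f}")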
Exercise 6 [**] — Jeffreys prior
Derive the Jeffreys prior for Bernoulli($p$) and for the exponential rate parameter $\lambda$. State which prior is proper and which is improper.
Code cell 20
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 21
# Solution
# Exercise 6: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

header("Exercise 6: Jeffreys prior")
print("Bernoulli: pi_J(p) proportional to 1/sqrt(p(1-p)) -> proper Beta(1/2, 1/2).")
print("Exponential rate: pi_J(lambda) proportional to 1/lambda -> improper on (0, infinity).")
check_true("Bernoulli Jeffreys prior is proper", True)
check_true("Exponential Jeffreys prior is improper", True)
print("\nTakeaway: Jeffreys priors are invariant by construction, but not always normalizable.")
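Numeric evidence for the proper/improper claim, sketched with a crude trapezoid rule: the mass of 1/sqrt(p(1-p)) over (0,1) converges (to pi), while truncated integrals of 1/lambda grow without bound as the lower cutoff shrinks.

import numpy as np

# Finite mass: integral over (0,1) of 1/sqrt(p(1-p)) dp = pi.
p = np.linspace(1e-6, 1 - 1e-6, 1_000_001)
f = 1.0 / np.sqrt(p * (1 - p))
mass = np.sum(0.5 * (f[1:] + f[:-1]) * np.diff(p))   # trapezoid rule
print("Bernoulli Jeffreys mass ~", mass, "(pi =", np.pi, ")")
# Infinite mass: integral over (eps, 1] of 1/lambda dlambda = -log(eps) -> infinity.
for eps in (1e-2, 1e-6, 1e-12):
    print(f"integral of 1/lambda over ({eps}, 1] = {-np.log(eps):.2f}")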
Exercise 7 [***] — Empirical Fisher versus Hessian
Build a toy logistic-regression example and compare the empirical Fisher with the observed Hessian. Explain why they need not match on finite data.
Code cell 23
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 24
# Solution
# Exercise 7: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

header("Exercise 7: Empirical Fisher versus Hessian")
rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
w = np.array([0.8, -0.4])
probs = sigmoid(X @ w)
y = rng.binomial(1, probs)
W = probs * (1 - probs)
# Observed Hessian of the average negative log-likelihood: mean of p_i(1-p_i) x_i x_i^T.
hessian = (X.T * W) @ X / len(X)
# Empirical Fisher: mean of outer products of per-example score vectors.
grads = ((y - probs)[:, None]) * X
empirical = grads.T @ grads / len(X)
diff = np.linalg.norm(empirical - hessian)
print("||Empirical Fisher - Hessian||_F =", diff)
check_true("The difference is nonzero on finite data", diff > 1e-4)
print("\nTakeaway: The empirical Fisher is a convenient approximation, not the same object by definition.")
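A hedged extra experiment: at the true parameter, E[(y - p)^2 | x] = p(1 - p), so averaging the empirical Fisher over many label redraws should pull it toward the Hessian. The sketch below reuses the same toy construction.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(0)
X = rng.normal(size=(300, 2))
w = np.array([0.8, -0.4])
probs = sigmoid(X @ w)
W = probs * (1 - probs)
hessian = (X.T * W) @ X / len(X)
# Average the empirical Fisher over many label redraws at the true parameter.
acc = np.zeros((2, 2))
for _ in range(2000):
    y = rng.binomial(1, probs)
    g = (y - probs)[:, None] * X
    acc += g.T @ g / len(X)
print("||avg empirical Fisher - Hessian||_F =", np.linalg.norm(acc / 2000 - hessian))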
Exercise 8 [***] — EWC-style Fisher importance
Use a diagonal Fisher estimate from a first binary-classification task to construct an EWC penalty. Show that larger moves along important parameters incur larger penalties.
Code cell 26
# Your Solution
print("Write your solution here, then compare with the reference solution below.")
Code cell 27
# Solution
# Exercise 8: Solution
import numpy as np

def header(title):
    print("\n" + "=" * 78)
    print(title)
    print("=" * 78)

def check_true(name, condition):
    print(f"{'PASS' if condition else 'FAIL'} - {name}")
    return condition

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

header("Exercise 8: EWC-style Fisher importance")
rng = np.random.default_rng(1)
X = rng.normal(size=(300, 2))
w_star = np.array([0.9, -0.7])
probs = sigmoid(X @ w_star)
y = rng.binomial(1, probs)
# Diagonal Fisher estimate: mean squared per-example gradient, coordinatewise.
diag_fisher = np.mean((((y - probs)[:, None]) * X) ** 2, axis=0)
old = w_star
big_move = np.array([1.4, -0.1])
small_move = np.array([0.95, -0.65])
penalty_big = 0.5 * np.sum(diag_fisher * (big_move - old) ** 2)
penalty_small = 0.5 * np.sum(diag_fisher * (small_move - old) ** 2)
print("big-move penalty   =", penalty_big)
print("small-move penalty =", penalty_small)
check_true("Larger movement creates a larger EWC penalty", penalty_big > penalty_small)
print("\nTakeaway: EWC reuses Fisher information as a local importance weighting on parameters.")
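One more hedged probe, rebuilding the same toy task so it runs standalone: two updates of equal Euclidean length, one along each coordinate, show that EWC charges more for moving whichever parameter the first task pinned down more tightly.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

rng = np.random.default_rng(1)
X = rng.normal(size=(300, 2))
w_star = np.array([0.9, -0.7])
probs = sigmoid(X @ w_star)
y = rng.binomial(1, probs)
diag_fisher = np.mean((((y - probs)[:, None]) * X) ** 2, axis=0)
step = 0.3
for axis in (0, 1):
    d = np.zeros(2)
    d[axis] = step   # same Euclidean length along each coordinate
    penalty = 0.5 * np.sum(diag_fisher * d ** 2)
    print(f"axis {axis}: Fisher weight {diag_fisher[axis]:.4f}, EWC penalty {penalty:.5f}")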
Exercise 9 [*] — Bernoulli Fisher Information
Compute the Fisher information of a Bernoulli model and verify the closed form away from the boundary.
Code cell 29
# Your Solution
print("Compute Bernoulli Fisher information at p=0.3.")
Code cell 30
# Solution
header("Exercise 9: Bernoulli Fisher")
p = 0.3
score1 = 1.0 / p          # score when x = 1
score0 = -1.0 / (1 - p)   # score when x = 0
I = p * score1**2 + (1 - p) * score0**2
closed = 1.0 / (p * (1 - p))
print("computed:", round(float(I), 6))
print("closed form:", round(float(closed), 6))
check_close("Bernoulli Fisher", I, closed)
print("\nTakeaway: Fisher information blows up near p=0 or p=1 because the likelihood becomes extremely sensitive to p there.")
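A small optional identity check, reusing p, score1, score0, and check_close from the cells above: the score has mean zero at the true parameter, which is exactly why its variance (the Fisher information) is the natural curvature measure.

mean_score = p * score1 + (1 - p) * score0   # E[s(X; p)] at the true p
check_close("score has mean zero", mean_score, 0.0)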
Exercise 10 [**] — Natural Gradient Rescaling
Compare an ordinary gradient step and a natural-gradient step for a two-parameter diagonal Fisher matrix.
Code cell 32
# Your Solution
print("Rescale a gradient by inverse Fisher curvature.")
Code cell 33
# Solution
header("Exercise 10: Natural Gradient")
g = np.array([2.0, 2.0])
F = np.diag([10.0, 0.5])
eta = 0.1
ordinary = -eta * g
natural = -eta * np.linalg.solve(F, g)
print("ordinary step:", ordinary)
print("natural step:", natural)
check_true("high-curvature coordinate is damped", abs(natural[0]) < abs(ordinary[0]))
check_true("low-curvature coordinate is amplified", abs(natural[1]) > abs(ordinary[1]))
print("Takeaway: natural gradient measures step size in distribution space, not raw parameter space.")
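As a final hedged sketch, measure each step's movement in distribution space with the local KL proxy 0.5 d^T F d: the ordinary step spends nearly all of its KL budget on the stiff coordinate, while the natural step shifts effort toward the direction with room to move.

import numpy as np

g = np.array([2.0, 2.0])
F = np.diag([10.0, 0.5])
eta = 0.1
ordinary = -eta * g
natural = -eta * np.linalg.solve(F, g)
for name, d in (("ordinary", ordinary), ("natural", natural)):
    per_coord = 0.5 * np.diag(F) * d ** 2   # per-coordinate KL contribution
    print(f"{name:>8}: per-coordinate KL {per_coord}, total {per_coord.sum():.4f}")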