Exercises Notebook — Math for LLMs

Normalization Techniques

ML Specific Math / Normalization Techniques

Run notebook
Exercises Notebook

Exercises Notebook

Converted from exercises.ipynb for web reading.

Normalization Techniques - Exercises

Ten graded exercises for normalization axes and diagnostics.

Code cell 2

# Shared plotting setup for all exercise cells below.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn theming when available; otherwise fall back to matplotlib's
# bundled seaborn-compatible stylesheet.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Uniform figure geometry, typography, and export settings.
_PLOT_DEFAULTS = {
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
}
mpl.rcParams.update(_PLOT_DEFAULTS)

np.random.seed(42)  # reproducible randomness across notebook runs
print("Plot setup complete.")

Code cell 3

import numpy as np

def header(title):
    """Print *title* framed above and below by a 72-character rule."""
    bar = "=" * 72
    print("\n" + bar + "\n" + title + "\n" + bar)


def check_close(name, value, expected, tol=1e-7):
    """Print PASS/FAIL for a numeric closeness check and return the verdict.

    *tol* is used for both the absolute and relative tolerance.
    """
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return ok


def check_true(name, condition):
    """Print PASS/FAIL for a boolean condition and return it as a bool."""
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok


print("Exercise helpers ready.")

Exercise 1: Normalize a vector (*)

Compute mean, variance, and normalized vector.

Code cell 5

# Your Solution
# TODO: standardize x — set y to (x - mean) / sqrt(var + eps).
x=np.array([1.,2.,3.])
y=None
print(y)

Code cell 6

# Solution
header("Exercise 1: Normalize a vector")
x = np.array([1., 2., 3.])
mu = x.mean()
var = x.var()
# Small eps inside the sqrt keeps the divide stable near zero variance.
y = (x - mu) / np.sqrt(var + 1e-5)
check_close("mean zero", y.mean(), 0.0, tol=1e-6)
print("\nTakeaway: normalization subtracts a chosen mean and divides by a stabilized scale.")

Exercise 2: BatchNorm (*)

Normalize a B x D matrix over batch axis.

Code cell 8

# Your Solution
# TODO: normalize each feature (column) of X over the batch axis (axis=0).
X=np.array([[1.,2.],[3.,4.]])
Y=None
print(Y)

Code cell 9

# Solution
header("Exercise 2: BatchNorm")
X = np.array([[1., 2.], [3., 4.]])
# Statistics per feature, taken across the batch axis (axis=0).
batch_mu = X.mean(axis=0, keepdims=True)
batch_sd = np.sqrt(X.var(axis=0, keepdims=True) + 1e-5)
Y = (X - batch_mu) / batch_sd
check_close("feature means zero", Y.mean(axis=0), np.zeros(2), tol=1e-5)
print("\nTakeaway: BatchNorm couples examples through feature statistics.")

Exercise 3: LayerNorm (*)

Normalize a B x D matrix over feature axis.

Code cell 11

# Your Solution
# TODO: normalize each example (row) of X over the feature axis (axis=1).
X=np.array([[1.,2.],[3.,7.]])
Y=None
print(Y)

Code cell 12

# Solution
header("Exercise 3: LayerNorm")
X = np.array([[1., 2.], [3., 7.]])
# Statistics per example, taken across the feature axis (axis=1).
row_mu = X.mean(axis=1, keepdims=True)
row_sd = np.sqrt(X.var(axis=1, keepdims=True) + 1e-5)
Y = (X - row_mu) / row_sd
check_close("row means zero", Y.mean(axis=1), np.zeros(2), tol=1e-5)
print("\nTakeaway: LayerNorm couples features inside each example.")

Exercise 4: Batch dependence (**)

Show same sample changes under BatchNorm when batch changes.

Code cell 14

# Your Solution
# TODO: BatchNorm the same sample inside two different batches and compare.
sample=np.array([[1.,2.]])
out=None
print(out)

Code cell 15

# Solution
header("Exercise 4: Batch dependence")
sample = np.array([[1., 2.]])
# Same leading sample, two very different batch contexts.
A = np.vstack([sample, [[2., 3.], [3., 4.]]])
B = np.vstack([sample, [[100., 200.], [110., 210.]]])

def bn(X):
    # Batch-axis normalization with the usual eps for stability.
    return (X - X.mean(0, keepdims=True)) / np.sqrt(X.var(0, keepdims=True) + 1e-5)

YA = bn(A)
YB = bn(B)
check_true("same sample differs", np.linalg.norm(YA[0] - YB[0]) > 1)
print("\nTakeaway: BatchNorm output depends on other examples in the mini-batch.")

Exercise 5: Layer independence (**)

Show LayerNorm ignores other batch examples.

Code cell 17

# Your Solution
# TODO: LayerNorm the same sample inside two different batches; outputs match.
sample=np.array([[1.,2.,4.]])
out=None
print(out)

Code cell 18

# Solution
header("Exercise 5: Layer independence")
sample = np.array([[1., 2., 4.]])
# Same leading sample, wildly different companions in each batch.
A = np.vstack([sample, np.zeros((2, 3))])
B = np.vstack([sample, 100 * np.ones((2, 3))])

def ln(X):
    """Feature-axis (row-wise) normalization with eps-stabilized scale."""
    mu = X.mean(1, keepdims=True)
    sd = np.sqrt(X.var(1, keepdims=True) + 1e-5)
    return (X - mu) / sd

check_close("same sample same LN", ln(A)[0], ln(B)[0])
print("\nTakeaway: LayerNorm is stable across batch composition.")

Exercise 6: RMSNorm (**)

Implement RMSNorm and compare mean.

Code cell 20

# Your Solution
# TODO: divide X by its root-mean-square over axis=1 (no mean subtraction).
X=np.array([[1.,2.,3.]])
Y=None
print(Y)

Code cell 21

# Solution
header("Exercise 6: RMSNorm")
X = np.array([[1., 2., 3.]])
# RMS over the feature axis; note there is no centering step.
rms = np.sqrt(np.mean(X ** 2, axis=1, keepdims=True) + 1e-5)
Y = X / rms
check_close("unit RMS", np.sqrt(np.mean(Y ** 2, axis=1)), np.ones(1), tol=1e-5)
check_true("mean not zero", abs(Y.mean()) > 0.1)
print("\nTakeaway: RMSNorm controls scale without centering.")

Exercise 7: GroupNorm (**)

Normalize two groups per example.

Code cell 23

# Your Solution
# TODO: split the 4 features into 2 groups of 2 and normalize within each group.
X=np.array([[1.,2.,10.,12.]])
Y=None
print(Y)

Code cell 24

# Solution
header("Exercise 7: GroupNorm")
X = np.array([[1., 2., 10., 12.]])
# Reshape to (batch, group, channels-per-group), normalize within each group.
grouped = X.reshape(1, 2, 2)
centered = grouped - grouped.mean(2, keepdims=True)
normalized = centered / np.sqrt(grouped.var(2, keepdims=True) + 1e-5)
Y = normalized.reshape(1, 4)
check_close("group means", Y.reshape(1, 2, 2).mean(2), np.zeros((1, 2)), tol=1e-5)
print("\nTakeaway: GroupNorm normalizes channel groups within each example.")

Exercise 8: WeightNorm (***)

Verify weight norm equals learned scale.

Code cell 26

# Your Solution
# TODO: set w = g * v / ||v|| so that ||w|| equals the learned scale g.
v=np.array([3.,4.]); g=2.
w=None
print(w)

Code cell 27

# Solution
header("Exercise 8: WeightNorm")
v = np.array([3., 4.])
g = 2.
# Factor the weight into a unit direction and a learned magnitude g.
direction = v / np.linalg.norm(v)
w = g * direction
check_close("norm equals g", np.linalg.norm(w), g)
print("\nTakeaway: WeightNorm separates direction from magnitude.")

Exercise 9: SpectralNorm (***)

Normalize a matrix by largest singular value.

Code cell 29

# Your Solution
# TODO: divide W by its largest singular value (spectral norm).
W=np.array([[3.,0.],[0.,1.]])
Wn=None
print(Wn)

Code cell 30

# Solution
header("Exercise 9: SpectralNorm")
W = np.array([[3., 0.], [0., 1.]])
# svd returns singular values sorted descending, so [0] is the spectral norm.
singular_values = np.linalg.svd(W, compute_uv=False)
sigma = singular_values[0]
Wn = W / sigma
check_close("spectral norm one", np.linalg.svd(Wn, compute_uv=False)[0], 1.0)
print("\nTakeaway: SpectralNorm controls operator scale, not activation statistics.")

Exercise 10: Broadcasting bug (***)

Check gamma shape for a B x T x D tensor.

Code cell 32

# Your Solution
# TODO: verify that a (D,)-shaped gamma broadcasts over a (B, T, D) tensor.
X=np.zeros((2,3,4)); gamma=np.ones((4,))
ok=None
print(ok)

Code cell 33

# Solution
header("Exercise 10: Broadcasting bug")
X = np.zeros((2, 3, 4))
gamma = np.ones((4,))
# A (4,) gamma aligns with the trailing D axis and broadcasts over B and T.
Y = X + gamma
check_true("broadcasts across B and T", Y.shape == X.shape)
check_close("last dimension gamma", Y[0, 0], gamma)
print("\nTakeaway: normalization parameters must align with the intended feature axis.")