Activation Functions - Exercises

Converted from exercises.ipynb for web reading.

Ten graded exercises covering activation values and derivatives, gating, softmax, initialization, and diagnostics.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_close(name, value, expected, tol=1e-7):
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok

def check_true(name, condition):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def sigmoid(x): return 1/(1+np.exp(-x))
def softmax(z):
    shifted = z - np.max(z)
    e = np.exp(shifted)
    return e/e.sum()
print("Exercise helpers ready.")

Exercise 1: Sigmoid derivative (*)

Derive and compute the sigmoid derivative.

Code cell 5

# Your Solution
x=np.array([-1.0,0.0,1.0])
deriv=None
print(deriv)

Code cell 6

# Solution
header("Exercise 1: Sigmoid derivative")
x=np.array([-1.0,0.0,1.0])
s=sigmoid(x)
deriv=s*(1-s)
check_close("center derivative", deriv[1], 0.25)
print("\nTakeaway: sigmoid saturates because its derivative approaches zero in both tails.")

Exercise 2: Tanh derivative (*)

Compute the tanh derivative.

Code cell 8

# Your Solution
x=np.array([-1.0,0.0,1.0])
deriv=None
print(deriv)

Code cell 9

# Solution
header("Exercise 2: Tanh derivative")
x=np.array([-1.0,0.0,1.0])
deriv=1-np.tanh(x)**2
check_close("center derivative", deriv[1], 1.0)
print("\nTakeaway: tanh is zero-centered but still saturates.")

Exercise 3: ReLU family (*)

Compute ReLU and Leaky ReLU outputs.

Code cell 11

# Your Solution
x=np.array([-2.0,0.0,3.0])
y=None
print(y)

Code cell 12

# Solution
header("Exercise 3: ReLU family")
x=np.array([-2.0,0.0,3.0])
relu=np.maximum(0,x)
leaky=np.where(x>0,x,0.1*x)
check_close("relu", relu, np.array([0.0,0.0,3.0]))
check_true("leaky keeps negative signal", leaky[0]<0)
print("\nTakeaway: Leaky ReLU reduces the dead-neuron zero-gradient region.")

Exercise 4: Affine collapse (**)

Show that two stacked affine layers collapse to a single affine map when no activation separates them.

Code cell 14

# Your Solution
W1=np.eye(2); W2=2*np.eye(2); x=np.ones(2)
out=None
print(out)

Code cell 15

# Solution
header("Exercise 4: Affine collapse")
W1=np.array([[1.,2.],[0.,1.]]); W2=np.array([[2.,0.],[1.,1.]])
b1=np.array([1.,-1.]); b2=np.array([0.5,0.5]); x=np.array([2.,3.])
out=W2@(W1@x+b1)+b2
A=W2@W1; c=W2@b1+b2
check_close("collapsed affine", out, A@x+c)
print("\nTakeaway: activations are required for nonlinear depth.")

Exercise 5: Stable softmax (**)

Implement a numerically stable softmax and test its shift invariance.

Code cell 17

# Your Solution
z=np.array([1000.,999.,998.])
p=None
print(p)

Code cell 18

# Solution
header("Exercise 5: Stable softmax")
z=np.array([1000.,999.,998.])
p=softmax(z)
check_close("sums to one", p.sum(), 1.0)
check_close("shift invariant", p, softmax(z-1000))
print("\nTakeaway: subtracting the max protects softmax from overflow.")

Exercise 6: Softmax Jacobian (**)

Compute the softmax Jacobian.

Code cell 20

# Your Solution
s=np.array([0.2,0.3,0.5])
J=None
print(J)

Code cell 21

# Solution
header("Exercise 6: Softmax Jacobian")
s=np.array([0.2,0.3,0.5])
J=np.diag(s)-np.outer(s,s)
check_close("row sums zero", J.sum(axis=1), np.zeros(3))
check_true("PSD", np.linalg.eigvalsh(J).min()>-1e-10)
print("\nTakeaway: softmax derivatives are coupled across classes.")

Exercise 7: GELU vs SiLU (**)

Compute the smooth GELU and SiLU activations at sample inputs.

Code cell 23

# Your Solution
x=np.array([-1.,0.,1.])
gel=None
print(gel)

Code cell 24

# Solution
header("Exercise 7: GELU vs SiLU")
x=np.array([-1.,0.,1.])
gel=0.5*x*(1+np.tanh(np.sqrt(2/np.pi)*(x+0.044715*x**3)))
silu=x*sigmoid(x)
check_true("finite smooth activations", np.isfinite(gel).all() and np.isfinite(silu).all())
print("GELU", gel, "SiLU", silu)
print("\nTakeaway: smooth activations keep small negative outputs instead of hard zeroing.")

Exercise 8: GLU (***)

Compute the GLU output and its local derivatives.

Code cell 26

# Your Solution
a=np.array([2.,-1.]); b=np.array([0.,2.])
out=None
print(out)

Code cell 27

# Solution
header("Exercise 8: GLU")
a=np.array([2.,-1.]); b=np.array([0.,2.])
g=sigmoid(b); out=a*g
dy_da=g             # gradient through the content path: dy/da = sigmoid(b)
dy_db=a*g*(1-g)     # gradient through the gate path: dy/db = a*sigmoid'(b)
check_close("GLU output", out[0], 1.0)  # 2*sigmoid(0) = 1
check_close("dy/da", dy_da[0], 0.5)     # sigmoid(0) = 0.5
check_close("dy/db", dy_db[0], 0.5)     # 2*0.5*0.5 = 0.5
print("\nTakeaway: gated activations create separate content and gate gradient paths.")

Exercise 9: He variance (***)

Compute the Xavier and He initialization variances.

Code cell 29

# Your Solution
n_in=128; n_out=64
var=None
print(var)

Code cell 30

# Solution
header("Exercise 9: He variance")
n_in=128; n_out=64
xavier=2/(n_in+n_out); he=2/n_in
check_true("He greater than Xavier here", he>xavier)
print("xavier", xavier, "he", he)
print("\nTakeaway: initialization depends on activation statistics.")

Exercise 10: Dead ReLU diagnostic (***)

Detect dead units from preactivation statistics.

Code cell 32

# Your Solution
preacts=np.array([[-1.,2.],[-2.,3.]])
dead=None
print(dead)

Code cell 33

# Solution
header("Exercise 10: Dead ReLU diagnostic")
preacts=np.array([[-2.,1.],[-1.,2.],[-3.,3.]])
active_fraction=np.mean(preacts>0, axis=0)
dead=active_fraction<0.01
check_true("first unit dead", dead[0])
print("\nTakeaway: dead ReLUs are diagnosed by persistently inactive preactivations.")