Activation Functions - Theory Notebook

Executable companion to notes.md. Converted from theory.ipynb for web reading.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

Code cell 4

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def relu(x):
    return np.maximum(0, x)

def leaky_relu(x, alpha=0.1):
    return np.where(x > 0, x, alpha * x)

def softplus(x):
    # log(1 + exp(x)), evaluated stably via logaddexp
    return np.logaddexp(0, x)

def gelu(x):
    # tanh approximation of GELU(x) = x * Phi(x)
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))

def silu(x):
    # also known as Swish: x * sigmoid(x)
    return x * sigmoid(x)

def mish(x):
    return x * np.tanh(softplus(x))

def softmax(z, axis=-1):
    # subtract the max before exponentiating: the result is unchanged, but overflow is avoided
    z = np.asarray(z, dtype=float)
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)

def finite_diff(f, x, eps=1e-5):
    # central-difference estimate of f'(x)
    return (f(x + eps) - f(x - eps)) / (2 * eps)

def check_close(name, value, expected, tol=1e-6):
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok

def check_true(name, condition):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

print("Activation helpers ready.")

1. Curves

Code cell 6

x = np.linspace(-6, 6, 500)
curves = {"sigmoid": sigmoid(x), "tanh": tanh(x), "ReLU": relu(x), "GELU": gelu(x), "SiLU": silu(x)}
fig, ax = plt.subplots()
for (name, y), color in zip(curves.items(), [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"], COLORS["error"]]):
    ax.plot(x, y, label=name, color=color)
ax.set_title("Activation function curves")
ax.set_xlabel("Input $x$")
ax.set_ylabel("Activation")
ax.legend()
fig.tight_layout()
plt.show()
print("Plotted common activation curves.")

2. Sigmoid derivative

Code cell 8

x = np.linspace(-8, 8, 500)
s = sigmoid(x)
deriv = s * (1 - s)
print("max sigmoid derivative:", deriv.max())
check_true("sigmoid derivative <= 0.25", deriv.max() <= 0.25001)

3. Tanh derivative

Code cell 10

t = tanh(x)
deriv_t = 1 - t**2
print("max tanh derivative:", deriv_t.max())
check_true("tanh derivative <= 1", deriv_t.max() <= 1.00001)

4. Derivative comparison plot

Code cell 12

fig, ax = plt.subplots()
ax.plot(x, s * (1 - s), label="sigmoid'", color=COLORS["primary"])
ax.plot(x, 1 - tanh(x)**2, label="tanh'", color=COLORS["secondary"])
ax.plot(x, (x > 0).astype(float), label="ReLU'", color=COLORS["tertiary"])
ax.plot(x, finite_diff(gelu, x), label="GELU'", color=COLORS["highlight"])
ax.set_title("Activation derivatives")
ax.set_xlabel("Input $x$")
ax.set_ylabel("Derivative")
ax.legend()
fig.tight_layout()
plt.show()
print("Derivative comparison complete.")

5. Saturation

Code cell 14

for value in [-10, -5, 0, 5, 10]:
    sig_grad = sigmoid(value) * (1 - sigmoid(value))
    tanh_grad = 1 - np.tanh(value) ** 2
    print(f"x={value:>3}: sigmoid'={sig_grad:.6f}, tanh'={tanh_grad:.6f}")

6. Affine collapse

Code cell 16

W1 = np.array([[1.0, 2.0], [0.0, 1.0]])
W2 = np.array([[2.0, -1.0], [1.0, 0.5]])
b1 = np.array([0.5, -1.0])
b2 = np.array([1.0, 2.0])
A = W2 @ W1
c = W2 @ b1 + b2
x0 = np.array([3.0, -2.0])
stacked = W2 @ (W1 @ x0 + b1) + b2
collapsed = A @ x0 + c
check_close("stacked affine equals collapsed affine", stacked, collapsed)

7. ReLU sparsity

Code cell 18

z = np.random.normal(size=(1000,))
a = relu(z)
print("Fraction zeros after ReLU:", np.mean(a == 0))
check_true("roughly half zero for standard normal", 0.4 < np.mean(a == 0) < 0.6)

8. Leaky ReLU keeps negative gradient

Code cell 20

vals = np.array([-2.0, -0.5, 0.0, 1.0])
print("ReLU:", relu(vals))
print("Leaky ReLU:", leaky_relu(vals, alpha=0.1))
check_true("negative values are not zeroed by Leaky ReLU", leaky_relu(vals)[0] < 0)

9. Softplus approximates ReLU

Code cell 22

sp = softplus(x)
err_pos = abs(softplus(8.0) - relu(8.0))
err_neg = abs(softplus(-8.0) - relu(-8.0))
print("softplus(8)-relu(8):", err_pos)
print("softplus(-8)-relu(-8):", err_neg)
check_true("softplus close to ReLU in tails", err_pos < 1e-3 and err_neg < 1e-3)

10. GELU and SiLU negative tails

Code cell 24

values = np.array([-3.0, -1.0, 0.0, 1.0, 3.0])
print("GELU:", np.round(gelu(values), 4))
print("SiLU:", np.round(silu(values), 4))
check_true("GELU allows small negative outputs", gelu(np.array([-1.0]))[0] < 0)

11. Smooth activation derivatives

Code cell 26

for name, f in [("GELU", gelu), ("SiLU", silu), ("Mish", mish)]:
    d0 = finite_diff(f, np.array([0.0]))[0]
    print(f"{name} derivative near 0: {d0:.4f}")

12. GLU

Code cell 28

a = np.array([1.0, -2.0, 3.0])
b = np.array([-2.0, 0.0, 2.0])
glu = a * sigmoid(b)
print("gate:", np.round(sigmoid(b), 4))
print("GLU output:", np.round(glu, 4))
check_true("gate values lie in (0,1)", np.all((sigmoid(b) > 0) & (sigmoid(b) < 1)))

13. GEGLU and SwiGLU

Code cell 30

geglu = a * gelu(b)
swiglu = a * silu(b)
print("GEGLU:", np.round(geglu, 4))
print("SwiGLU:", np.round(swiglu, 4))
check_true("gated outputs finite", np.isfinite(geglu).all() and np.isfinite(swiglu).all())

14. Bilinear gate gradients

Code cell 32

a = np.array([2.0, -1.0])
g = np.array([0.25, 0.75])
y = a * g
print("dy/da = gate:", g)
print("dy/dg = content:", a)
check_close("output", y, np.array([0.5, -0.75]))

15. Softmax probabilities

Code cell 34

z = np.array([2.0, 1.0, -1.0])
s = softmax(z)
print("softmax:", np.round(s, 6))
check_close("softmax sums to 1", s.sum(), 1.0)

16. Stable softmax shift invariance

Code cell 36

shifted = softmax(z + 1000)
check_close("softmax shift invariance", shifted, s)

17. Temperature softmax

Code cell 38

for tau in [0.5, 1.0, 2.0]:
    st = softmax(z / tau)
    print(f"tau={tau}: probs={np.round(st, 4)}, entropy={-np.sum(st*np.log(st)):.4f}")

18. Softmax Jacobian

Code cell 40

J = np.diag(s) - np.outer(s, s)
print("Jacobian:\n", np.round(J, 5))
check_close("Jacobian row sums zero", J.sum(axis=1), np.zeros_like(s))
check_true("Jacobian PSD", np.linalg.eigvalsh(J).min() > -1e-10)

19. Softmax saturation

Code cell 42

sharp = softmax(np.array([20.0, 0.0, -20.0]))
print("sharp softmax:", sharp)
check_true("largest class nearly one", sharp[0] > 0.999999)

20. Activation variance

Code cell 44

X = np.random.normal(size=(5000, 128))
for name, f in [("tanh", tanh), ("ReLU", relu), ("GELU", gelu), ("SiLU", silu)]:
    A = f(X)
    print(f"{name:>5}: mean={A.mean(): .4f}, var={A.var():.4f}")

21. Xavier vs He scale

Code cell 46

n_in, n_out = 256, 128
var_xavier = 2 / (n_in + n_out)
var_he = 2 / n_in
print("Xavier variance:", var_xavier)
print("He variance:", var_he)
check_true("He variance larger for ReLU", var_he > var_xavier)

22. One-layer signal propagation

Code cell 48

X = np.random.normal(size=(4096, 256))
W_xavier = np.random.normal(scale=np.sqrt(var_xavier), size=(256, 128))
W_he = np.random.normal(scale=np.sqrt(var_he), size=(256, 128))
A_tanh = tanh(X @ W_xavier)
A_relu = relu(X @ W_he)
print("tanh with Xavier var:", A_tanh.var())
print("ReLU with He var:", A_relu.var())
check_true("activation variances finite", np.isfinite(A_tanh.var()) and np.isfinite(A_relu.var()))

23. Vanishing gradient toy product

Code cell 50

slopes = np.full(40, 0.25)
product = np.prod(slopes)
print("Product of 40 sigmoid max slopes:", product)
check_true("deep sigmoid slope product tiny", product < 1e-20)

24. ReLU active path product

Code cell 52

relu_slopes = np.random.binomial(1, 0.5, size=40)
print("Random ReLU slopes:", relu_slopes)
print("Path survives:", bool(np.prod(relu_slopes)))

25. Dead neuron diagnostic

Code cell 54

preacts = np.random.normal(loc=-3.0, scale=0.5, size=(1000, 32))
active_fraction = np.mean(preacts > 0, axis=0)
dead = active_fraction < 0.01
print("Dead-like units:", dead.sum(), "out of", dead.size)
check_true("negative-biased layer has dead-like units", dead.sum() > 0)