Theory Notebook
Converted from theory.ipynb for web reading.
Activation Functions - Theory Notebook
Executable companion to notes.md.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
Code cell 4
def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def tanh(x):
    return np.tanh(x)

def relu(x):
    return np.maximum(0, x)

def leaky_relu(x, alpha=0.1):
    return np.where(x > 0, x, alpha * x)

def softplus(x):
    # log(1 + exp(x)), computed stably via logaddexp.
    return np.logaddexp(0, x)

def gelu(x):
    # Common tanh approximation of GELU.
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x**3)))

def silu(x):
    return x * sigmoid(x)

def mish(x):
    return x * np.tanh(softplus(x))

def softmax(z, axis=-1):
    # Subtract the max before exponentiating for numerical stability.
    z = np.asarray(z, dtype=float)
    shifted = z - np.max(z, axis=axis, keepdims=True)
    exp_shifted = np.exp(shifted)
    return exp_shifted / np.sum(exp_shifted, axis=axis, keepdims=True)

def finite_diff(f, x, eps=1e-5):
    # Central finite difference, used to spot-check analytic derivatives.
    return (f(x + eps) - f(x - eps)) / (2 * eps)

def check_close(name, value, expected, tol=1e-6):
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok

def check_true(name, condition):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok
print("Activation helpers ready.")
1. Curves
Code cell 6
x = np.linspace(-6, 6, 500)
curves = {"sigmoid": sigmoid(x), "tanh": tanh(x), "ReLU": relu(x), "GELU": gelu(x), "SiLU": silu(x)}
fig, ax = plt.subplots()
for (name, y), color in zip(curves.items(), [COLORS["primary"], COLORS["secondary"], COLORS["tertiary"], COLORS["highlight"], COLORS["error"]]):
    ax.plot(x, y, label=name, color=color)
ax.set_title("Activation function curves")
ax.set_xlabel("Input $x$")
ax.set_ylabel("Activation")
ax.legend()
fig.tight_layout()
plt.show()
print("Plotted common activation curves.")
2. Sigmoid derivative
Code cell 8
x = np.linspace(-8, 8, 500)
s = sigmoid(x)
deriv = s * (1 - s)
print("max sigmoid derivative:", deriv.max())
check_true("sigmoid derivative <= 0.25", deriv.max() <= 0.25001)
3. Tanh derivative
Code cell 10
t = tanh(x)
deriv_t = 1 - t**2
print("max tanh derivative:", deriv_t.max())
check_true("tanh derivative <= 1", deriv_t.max() <= 1.00001)
4. Derivative comparison plot
Code cell 12
fig, ax = plt.subplots()
ax.plot(x, s * (1 - s), label="sigmoid'", color=COLORS["primary"])
ax.plot(x, 1 - tanh(x)**2, label="tanh'", color=COLORS["secondary"])
ax.plot(x, (x > 0).astype(float), label="ReLU'", color=COLORS["tertiary"])
ax.plot(x, finite_diff(gelu, x), label="GELU'", color=COLORS["highlight"])
ax.set_title("Activation derivatives")
ax.set_xlabel("Input $x$")
ax.set_ylabel("Derivative")
ax.legend()
fig.tight_layout()
plt.show()
print("Derivative comparison complete.")
5. Saturation
Code cell 14
for value in [-10, -5, 0, 5, 10]:
    sig_grad = sigmoid(value) * (1 - sigmoid(value))
    tanh_grad = 1 - np.tanh(value) ** 2
    print(f"x={value:>3}: sigmoid'={sig_grad:.6f}, tanh'={tanh_grad:.6f}")
6. Affine collapse
Code cell 16
W1 = np.array([[1.0, 2.0], [0.0, 1.0]])
W2 = np.array([[2.0, -1.0], [1.0, 0.5]])
b1 = np.array([0.5, -1.0])
b2 = np.array([1.0, 2.0])
A = W2 @ W1
c = W2 @ b1 + b2
x0 = np.array([3.0, -2.0])
stacked = W2 @ (W1 @ x0 + b1) + b2
collapsed = A @ x0 + c
check_close("stacked affine equals collapsed affine", stacked, collapsed)
7. ReLU sparsity
Code cell 18
z = np.random.normal(size=(1000,))
a = relu(z)
print("Fraction zeros after ReLU:", np.mean(a == 0))
check_true("roughly half zero for standard normal", 0.4 < np.mean(a == 0) < 0.6)
8. Leaky ReLU keeps negative gradient
Code cell 20
vals = np.array([-2.0, -0.5, 0.0, 1.0])
print("ReLU:", relu(vals))
print("Leaky ReLU:", leaky_relu(vals, alpha=0.1))
check_true("negative values are not zeroed by Leaky ReLU", leaky_relu(vals)[0] < 0)
9. Softplus approximates ReLU
Code cell 22
sp = softplus(x)
err_pos = abs(softplus(8.0) - relu(8.0))
err_neg = abs(softplus(-8.0) - relu(-8.0))
print("softplus(8)-relu(8):", err_pos)
print("softplus(-8)-relu(-8):", err_neg)
check_true("softplus close to ReLU in tails", err_pos < 1e-3 and err_neg < 1e-3)
10. GELU and SiLU negative tails
Code cell 24
values = np.array([-3.0, -1.0, 0.0, 1.0, 3.0])
print("GELU:", np.round(gelu(values), 4))
print("SiLU:", np.round(silu(values), 4))
check_true("GELU allows small negative outputs", gelu(np.array([-1.0]))[0] < 0)
11. Smooth activation derivatives
Code cell 26
for name, f in [("GELU", gelu), ("SiLU", silu), ("Mish", mish)]:
d0 = finite_diff(f, np.array([0.0]))[0]
print(f"{name} derivative near 0: {d0:.4f}")
12. GLU
Code cell 28
a = np.array([1.0, -2.0, 3.0])
b = np.array([-2.0, 0.0, 2.0])
glu = a * sigmoid(b)
print("gate:", np.round(sigmoid(b), 4))
print("GLU output:", np.round(glu, 4))
check_true("gate values lie in (0,1)", np.all((sigmoid(b) > 0) & (sigmoid(b) < 1)))
13. GEGLU and SwiGLU
Code cell 30
geglu = a * gelu(b)
swiglu = a * silu(b)
print("GEGLU:", np.round(geglu, 4))
print("SwiGLU:", np.round(swiglu, 4))
check_true("gated outputs finite", np.isfinite(geglu).all() and np.isfinite(swiglu).all())
14. Bilinear gate gradients
Code cell 32
a = np.array([2.0, -1.0])
g = np.array([0.25, 0.75])
y = a * g
print("dy/da = gate:", g)
print("dy/dg = content:", a)
check_close("output", y, np.array([0.5, -0.75]))
15. Softmax probabilities
Code cell 34
z = np.array([2.0, 1.0, -1.0])
s = softmax(z)
print("softmax:", np.round(s, 6))
check_close("softmax sums to 1", s.sum(), 1.0)
16. Stable softmax shift invariance
Code cell 36
shifted = softmax(z + 1000)
check_close("softmax shift invariance", shifted, s)
17. Temperature softmax
Code cell 38
for tau in [0.5, 1.0, 2.0]:
    st = softmax(z / tau)
    print(f"tau={tau}: probs={np.round(st, 4)}, entropy={-np.sum(st*np.log(st)):.4f}")
18. Softmax Jacobian
Code cell 40
J = np.diag(s) - np.outer(s, s)
print("Jacobian:\n", np.round(J, 5))
check_close("Jacobian row sums zero", J.sum(axis=1), np.zeros_like(s))
check_true("Jacobian PSD", np.linalg.eigvalsh(J).min() > -1e-10)
19. Softmax saturation
Code cell 42
sharp = softmax(np.array([20.0, 0.0, -20.0]))
print("sharp softmax:", sharp)
check_true("largest class nearly one", sharp[0] > 0.999999)
20. Activation variance
Code cell 44
X = np.random.normal(size=(5000, 128))
for name, f in [("tanh", tanh), ("ReLU", relu), ("GELU", gelu), ("SiLU", silu)]:
A = f(X)
print(f"{name:>5}: mean={A.mean(): .4f}, var={A.var():.4f}")
21. Xavier vs He scale
Code cell 46
n_in, n_out = 256, 128
var_xavier = 2 / (n_in + n_out)
var_he = 2 / n_in
print("Xavier variance:", var_xavier)
print("He variance:", var_he)
check_true("He variance larger for ReLU", var_he > var_xavier)
22. One-layer signal propagation
Code cell 48
X = np.random.normal(size=(4096, 256))
W_xavier = np.random.normal(scale=np.sqrt(var_xavier), size=(256, 128))
W_he = np.random.normal(scale=np.sqrt(var_he), size=(256, 128))
A_tanh = tanh(X @ W_xavier)
A_relu = relu(X @ W_he)
print("tanh with Xavier var:", A_tanh.var())
print("ReLU with He var:", A_relu.var())
check_true("activation variances finite", np.isfinite(A_tanh.var()) and np.isfinite(A_relu.var()))
23. Vanishing gradient toy product
Code cell 50
slopes = np.full(40, 0.25)
product = np.prod(slopes)
print("Product of 40 sigmoid max slopes:", product)
check_true("deep sigmoid slope product tiny", product < 1e-20)
24. ReLU active path product
Code cell 52
relu_slopes = np.random.binomial(1, 0.5, size=40)
print("Random ReLU slopes:", relu_slopes)
print("Path survives:", bool(np.prod(relu_slopes)))
25. Dead neuron diagnostic
Code cell 54
preacts = np.random.normal(loc=-3.0, scale=0.5, size=(1000, 32))
active_fraction = np.mean(preacts > 0, axis=0)
dead = active_fraction < 0.01
print("Dead-like units:", dead.sum(), "out of", dead.size)
check_true("negative-biased layer has dead-like units", dead.sum() > 0)