Linear Models: Theory Notebook

Converted from theory.ipynb for web reading.

This notebook makes linear model math executable: least squares, gradients, ridge, conditioning, logistic regression, softmax, calibration, and linear probes.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

1. Affine prediction

Code cell 4

X = np.array([[1.0, 2.0], [3.0, -1.0], [0.0, 4.0]])
w = np.array([0.5, -1.0])
b = 2.0
y_hat = X @ w + b
print("predictions:", y_hat)

2. Least-squares closed form

Code cell 6

X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
y = np.array([1.2, 1.9, 3.2, 3.9])
w_hat = np.linalg.solve(X.T @ X, X.T @ y)
print("intercept and slope:", w_hat)
print("predictions:", np.round(X @ w_hat, 3))

3. Projection view

Code cell 8

P = X @ np.linalg.inv(X.T @ X) @ X.T  # hat matrix: orthogonal projection onto col(X)
y_proj = P @ y
residual = y - y_proj
print("projection idempotent error:", np.linalg.norm(P @ P - P))
print("X^T residual:", np.round(X.T @ residual, 10))

4. Gradient descent

Code cell 10

X = np.c_[np.ones(50), np.linspace(-1, 1, 50)]
true_w = np.array([0.3, 2.0])
y = X @ true_w + 0.1 * np.random.normal(size=50)
w = np.zeros(2)
lr = 0.1
losses = []
for step in range(60):
    r = X @ w - y
    losses.append(0.5 * np.mean(r**2))
    grad = X.T @ r / len(y)
    w -= lr * grad
plt.plot(losses)
plt.title("Gradient descent on least squares")
plt.xlabel("step")
plt.ylabel("loss")
plt.tight_layout()
plt.show()
print("learned w:", np.round(w, 3))

5. Gradient check

Code cell 12

X = np.array([[1.0, 2.0], [3.0, 4.0]])
y = np.array([1.0, 2.0])
w = np.array([0.2, -0.3])
def loss(wv):
    r = X @ wv - y
    return 0.5 * np.sum(r**2)
analytic = X.T @ (X @ w - y)
eps = 1e-6
numeric = np.zeros_like(w)
for i in range(len(w)):
    e = np.zeros_like(w)
    e[i] = eps
    numeric[i] = (loss(w + e) - loss(w - e)) / (2 * eps)
print("analytic:", analytic)
print("numeric:", numeric)
print("max error:", np.max(np.abs(analytic - numeric)))

6. Ridge shrinkage

Code cell 14

X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
y = np.array([1.2, 1.9, 3.2, 3.9])
for lam in [0.0, 0.1, 1.0, 10.0]:
    w = np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)
    print(f"lambda={lam:>4}: w={np.round(w, 3)}")

7. Conditioning and feature scaling

Code cell 16

rng = np.random.default_rng(4)
x1 = rng.normal(size=200)
x2 = 1000 * rng.normal(size=200)
X_bad = np.c_[x1, x2]
X_good = (X_bad - X_bad.mean(axis=0)) / X_bad.std(axis=0)
cond_bad = np.linalg.cond(X_bad.T @ X_bad)
cond_good = np.linalg.cond(X_good.T @ X_good)
print("condition before scaling:", cond_bad)
print("condition after scaling:", cond_good)

8. Logistic regression probability

Code cell 18

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

x = np.array([1.0, -2.0])
w = np.array([0.8, -0.5])
b = -0.2
p = sigmoid(w @ x + b)
y = 1
ce = -(y * np.log(p) + (1 - y) * np.log(1 - p))
print("probability:", p)
print("binary cross entropy:", ce)

9. Softmax regression

Code cell 20

logits = np.array([2.0, 1.0, -1.0])
exp_logits = np.exp(logits - logits.max())  # subtract max for stability; softmax is shift-invariant
probs = exp_logits / exp_logits.sum()
target = 0
loss = -np.log(probs[target])
print("probs:", np.round(probs, 3))
print("loss:", loss)

10. Calibration bins

Code cell 22

confidence = np.array([0.9, 0.8, 0.7, 0.6, 0.4])
correct = np.array([1, 1, 0, 1, 0])
bins = [(0.0, 0.5), (0.5, 0.75), (0.75, 1.0)]
for lo, hi in bins:
    m = (confidence > lo) & (confidence <= hi)
    if m.any():
        print((lo, hi), "acc", correct[m].mean(), "conf", confidence[m].mean())
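
The per-bin gaps can be rolled up into a single expected calibration error (ECE), weighting each bin by the fraction of examples it holds; this aggregation is a sketch added on top of the cell above.

ece = 0.0
for lo, hi in bins:
    m = (confidence > lo) & (confidence <= hi)
    if m.any():
        ece += m.mean() * abs(correct[m].mean() - confidence[m].mean())
print("expected calibration error:", round(float(ece), 3))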

11. Linear probe

Code cell 24

rng = np.random.default_rng(5)
H = rng.normal(size=(100, 6))
probe_w_true = rng.normal(size=6)
labels = (H @ probe_w_true > 0).astype(float)
X_aug = np.c_[H, np.ones(len(H))]  # append a bias column to the hidden states
w_probe = np.linalg.lstsq(X_aug, labels, rcond=None)[0]
pred = (X_aug @ w_probe > 0.5).astype(float)
print("linear probe accuracy:", (pred == labels).mean())

12. Residual diagnostics

Code cell 26

x = np.linspace(-2, 2, 80)
y = x**2 + 0.1 * np.random.normal(size=len(x))
X_lin = np.c_[np.ones_like(x), x]
w = np.linalg.lstsq(X_lin, y, rcond=None)[0]
resid = y - X_lin @ w
plt.scatter(x, resid, s=16)
plt.axhline(0, color="black", linewidth=1)
plt.title("Residual plot reveals nonlinearity")
plt.xlabel("x")
plt.ylabel("residual")
plt.tight_layout()
plt.show()
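
To close the loop (an added sketch on the same data), adding a quadratic feature removes the structure in the residuals, confirming that curvature rather than noise produced the pattern above.

X_quad = np.c_[np.ones_like(x), x, x**2]
w_quad = np.linalg.lstsq(X_quad, y, rcond=None)[0]
resid_quad = y - X_quad @ w_quad
print("linear fit residual std:   ", round(float(resid.std()), 3))
print("quadratic fit residual std:", round(float(resid_quad.std()), 3))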

13. Final checklist

Code cell 28

checks = [
    "X, y, and w shapes match",
    "features are scaled using train-only statistics",
    "rank and condition number are inspected",
    "regularization strength is validated",
    "residuals and calibration are checked",
    "linear baseline is compared with nonlinear alternatives",
]
for i, check in enumerate(checks, 1):
    print(f"{i}. {check}")