Converted from theory.ipynb for web reading.
Linear Models: Theory Notebook
This notebook makes linear-model math executable, with small runnable cells covering least squares, gradients, ridge, conditioning, logistic regression, softmax, calibration, and linear probes.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Prefer seaborn styling when available; fall back to matplotlib's built-in theme.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
1. Affine prediction
Code cell 4
X = np.array([[1.0, 2.0], [3.0, -1.0], [0.0, 4.0]])
w = np.array([0.5, -1.0])
b = 2.0
y_hat = X @ w + b
print("predictions:", y_hat)
2. Least-squares closed form
Code cell 6
X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
y = np.array([1.2, 1.9, 3.2, 3.9])
w_hat = np.linalg.solve(X.T @ X, X.T @ y)  # normal equations: (X^T X) w = X^T y
print("intercept and slope:", w_hat)
print("predictions:", np.round(X @ w_hat, 3))
3. Projection view
Code cell 8
P = X @ np.linalg.inv(X.T @ X) @ X.T   # hat matrix: projects onto col(X)
y_proj = P @ y
residual = y - y_proj
print("projection idempotent error:", np.linalg.norm(P @ P - P))   # P @ P should equal P
print("X^T residual:", np.round(X.T @ residual, 10))               # residual is orthogonal to col(X)
4. Gradient descent
Code cell 10
X = np.c_[np.ones(50), np.linspace(-1, 1, 50)]
true_w = np.array([0.3, 2.0])
y = X @ true_w + 0.1 * np.random.normal(size=50)
w = np.zeros(2)
lr = 0.1
losses = []
for step in range(60):
    r = X @ w - y                        # residuals at the current w
    losses.append(0.5 * np.mean(r**2))   # track loss per step
    grad = X.T @ r / len(y)              # gradient of the mean squared loss
    w -= lr * grad
plt.plot(losses)
plt.title("Gradient descent on least squares")
plt.xlabel("step")
plt.ylabel("loss")
plt.tight_layout()
plt.show()
print("learned w:", np.round(w, 3))
5. Gradient check
Code cell 12
X = np.array([[1.0, 2.0], [3.0, 4.0]])
y = np.array([1.0, 2.0])
w = np.array([0.2, -0.3])
def loss(wv):
    r = X @ wv - y
    return 0.5 * np.sum(r**2)

analytic = X.T @ (X @ w - y)   # closed-form gradient of the loss
eps = 1e-6
numeric = np.zeros_like(w)
for i in range(len(w)):
    e = np.zeros_like(w)
    e[i] = eps
    numeric[i] = (loss(w + e) - loss(w - e)) / (2 * eps)   # central difference
print("analytic:", analytic)
print("numeric:", numeric)
print("max error:", np.max(np.abs(analytic - numeric)))
6. Ridge shrinkage
Code cell 14
X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0], [1.0, 4.0]])
y = np.array([1.2, 1.9, 3.2, 3.9])
for lam in [0.0, 0.1, 1.0, 10.0]:
    # Ridge normal equations: (X^T X + lam I) w = X^T y.
    # Note this also shrinks the intercept, since column 0 is the all-ones feature.
    w = np.linalg.solve(X.T @ X + lam * np.eye(X.shape[1]), X.T @ y)
    print(f"lambda={lam:>4}: w={np.round(w, 3)}")
7. Conditioning and feature scaling
Code cell 16
rng = np.random.default_rng(4)
x1 = rng.normal(size=200)
x2 = 1000 * rng.normal(size=200)       # three orders of magnitude larger scale
X_bad = np.c_[x1, x2]
X_good = (X_bad - X_bad.mean(axis=0)) / X_bad.std(axis=0)   # standardize each column
cond_bad = np.linalg.cond(X_bad.T @ X_bad)
cond_good = np.linalg.cond(X_good.T @ X_good)
print("condition before scaling:", cond_bad)
print("condition after scaling:", cond_good)
8. Logistic regression probability
Code cell 18
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
x = np.array([1.0, -2.0])
w = np.array([0.8, -0.5])
b = -0.2
p = sigmoid(w @ x + b)
y = 1
ce = -(y * np.log(p) + (1 - y) * np.log(1 - p))
print("probability:", p)
print("binary cross entropy:", ce)
9. Softmax regression
Code cell 20
logits = np.array([2.0, 1.0, -1.0])
exp_logits = np.exp(logits - logits.max())   # subtract max for numerical stability
probs = exp_logits / exp_logits.sum()
target = 0
loss = -np.log(probs[target])
print("probs:", np.round(probs, 3))
print("loss:", loss)
10. Calibration bins
Code cell 22
confidence = np.array([0.9, 0.8, 0.7, 0.6, 0.4])
correct = np.array([1, 1, 0, 1, 0])
bins = [(0.0, 0.5), (0.5, 0.75), (0.75, 1.0)]
for lo, hi in bins:
    m = (confidence > lo) & (confidence <= hi)   # half-open bin (lo, hi]
    if m.any():
        print((lo, hi), "acc", correct[m].mean(), "conf", confidence[m].mean())
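The per-bin gaps above aggregate into the expected calibration error (ECE): a weighted average of |accuracy - confidence|, with each bin weighted by its share of samples. A sketch on the same toy arrays:

import numpy as np

confidence = np.array([0.9, 0.8, 0.7, 0.6, 0.4])
correct = np.array([1, 1, 0, 1, 0])
bins = [(0.0, 0.5), (0.5, 0.75), (0.75, 1.0)]

ece = 0.0
for lo, hi in bins:
    m = (confidence > lo) & (confidence <= hi)
    if m.any():
        ece += m.mean() * abs(correct[m].mean() - confidence[m].mean())
print("ECE:", round(float(ece), 4))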
11. Linear probe
Code cell 24
rng = np.random.default_rng(5)
H = rng.normal(size=(100, 6))                    # stand-in for hidden activations
probe_w_true = rng.normal(size=6)
labels = (H @ probe_w_true > 0).astype(float)    # linearly separable by construction
X_aug = np.c_[H, np.ones(len(H))]
w_probe = np.linalg.lstsq(X_aug, labels, rcond=None)[0]
pred = (X_aug @ w_probe > 0.5).astype(float)
print("linear probe accuracy:", (pred == labels).mean())
12. Residual diagnostics
Code cell 26
x = np.linspace(-2, 2, 80)
y = x**2 + 0.1 * np.random.normal(size=len(x))   # quadratic ground truth, fit with a line
X_lin = np.c_[np.ones_like(x), x]
w = np.linalg.lstsq(X_lin, y, rcond=None)[0]
resid = y - X_lin @ w
plt.scatter(x, resid, s=16)
plt.axhline(0, color="black", linewidth=1)
plt.title("Residual plot reveals nonlinearity")
plt.xlabel("x")
plt.ylabel("residual")
plt.tight_layout()
plt.show()
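The U shape in the residuals points at the missing feature. Adding x^2 to the design matrix and refitting should shrink the residual spread down to the noise level (a sketch with the same data-generating process, using a fresh seed):

import numpy as np

rng = np.random.default_rng(0)
x = np.linspace(-2, 2, 80)
y = x**2 + 0.1 * rng.normal(size=len(x))

X_quad = np.c_[np.ones_like(x), x, x**2]           # add the quadratic feature
w = np.linalg.lstsq(X_quad, y, rcond=None)[0]
resid = y - X_quad @ w
print("residual std with x^2 feature:", round(float(resid.std()), 3))   # near the 0.1 noise level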
13. Final checklist
Code cell 28
checks = [
    "X, y, and w shapes match",
    "features are scaled using train-only statistics",
    "rank and condition number are inspected",
    "regularization strength is validated",
    "residuals and calibration are checked",
    "linear baseline is compared with nonlinear alternatives",
]
for i, check in enumerate(checks, 1):
    print(f"{i}. {check}")