Converted from theory.ipynb for web reading.
Hilbert Spaces: Theory Notebook
Hilbert spaces add angle, projection, orthogonal coordinates, and self-duality to normed spaces. This notebook is the interactive companion to notes.md.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=4, suppress=True)
rng = np.random.default_rng(7)
def inner(x, y):
    return np.vdot(y, x).real
def norm(x):
    return np.sqrt(inner(x, x))
print("Hilbert theory helpers ready.")
1. Inner Products
In a real Euclidean space, the standard inner product is $\langle x, y \rangle = x^\top y = \sum_i x_i y_i$.
Code cell 5
x = np.array([2.0, -1.0, 3.0])
y = np.array([-4.0, 0.5, 1.0])
print("inner(x, y) =", inner(x, y))
print("inner(y, x) =", inner(y, x))
print("norm(x) =", norm(x))
assert np.isclose(inner(x, y), inner(y, x))
assert norm(x) > 0
A weighted inner product uses a positive definite matrix $W$: $\langle a, b \rangle_W = a^\top W b$.
Code cell 7
W = np.array([[3.0, 0.5], [0.5, 1.5]])
eigvals = np.linalg.eigvalsh(W)
assert np.all(eigvals > 0)
def inner_W(a, b):
    return float(a.T @ W @ b)
a = np.array([1.0, -2.0])
b = np.array([0.25, 3.0])
print("eigenvalues(W) =", eigvals)
print("weighted inner(a, b) =", inner_W(a, b))
print("weighted norm(a) =", np.sqrt(inner_W(a, a)))
2. Induced Norms and the Parallelogram Law
A norm comes from an inner product exactly when it satisfies the parallelogram law $\|u + v\|^2 + \|u - v\|^2 = 2\bigl(\|u\|^2 + \|v\|^2\bigr)$.
Code cell 9
def l2(v):
    return np.linalg.norm(v, 2)
def l1(v):
    return np.linalg.norm(v, 1)
u = np.array([1.0, 0.0])
v = np.array([0.0, 1.0])
lhs_l2 = l2(u + v) ** 2 + l2(u - v) ** 2
rhs_l2 = 2 * (l2(u) ** 2 + l2(v) ** 2)
lhs_l1 = l1(u + v) ** 2 + l1(u - v) ** 2
rhs_l1 = 2 * (l1(u) ** 2 + l1(v) ** 2)
print("L2 parallelogram:", lhs_l2, rhs_l2)
print("L1 parallelogram:", lhs_l1, rhs_l1)
assert np.isclose(lhs_l2, rhs_l2)
assert not np.isclose(lhs_l1, rhs_l1)
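The converse direction is also checkable: when the parallelogram law holds, the polarization identity $\langle u, v \rangle = \tfrac{1}{4}\bigl(\|u + v\|^2 - \|u - v\|^2\bigr)$ recovers the inner product from the norm alone. A minimal sketch with fresh random vectors:
u2 = rng.normal(size=5)
v2 = rng.normal(size=5)
recovered = 0.25 * (l2(u2 + v2) ** 2 - l2(u2 - v2) ** 2)  # polarization identity
print("direct inner product:", inner(u2, v2))
print("recovered via polarization:", recovered)
assert np.isclose(inner(u2, v2), recovered)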
3. Cauchy-Schwarz
The Cauchy-Schwarz inequality $|\langle x, y \rangle| \le \|x\| \, \|y\|$ bounds alignment: the ratio $|\langle x, y \rangle| / (\|x\| \, \|y\|)$ never exceeds 1.
Code cell 11
ratios = []
for _ in range(1000):
    x = rng.normal(size=12)
    y = rng.normal(size=12)
    ratios.append(abs(inner(x, y)) / (norm(x) * norm(y)))
print("largest observed alignment ratio:", max(ratios))
assert max(ratios) <= 1.0 + 1e-12
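Equality holds exactly when one vector is a scalar multiple of the other; a quick sketch:
x = rng.normal(size=12)
y_parallel = -2.5 * x  # any scalar multiple makes the bound tight
ratio = abs(inner(x, y_parallel)) / (norm(x) * norm(y_parallel))
print("alignment ratio for parallel vectors:", ratio)
assert np.isclose(ratio, 1.0)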
4. Cosine Similarity and Attention Scores
Dot-product attention computes scaled inner products between queries and keys, i.e. Hilbert-space alignment, before applying softmax.
Code cell 13
queries = rng.normal(size=(3, 5))
keys = rng.normal(size=(4, 5))
scale = np.sqrt(keys.shape[1])
scores = queries @ keys.T / scale
shifted = scores - scores.max(axis=1, keepdims=True)
weights = np.exp(shifted) / np.exp(shifted).sum(axis=1, keepdims=True)
print("attention score shape:", scores.shape)
print("row sums after softmax:", weights.sum(axis=1))
assert np.allclose(weights.sum(axis=1), 1.0)
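Cosine similarity is the same alignment ratio, arranged as a matrix of normalized inner products; a minimal sketch on the same queries and keys (the name cosine_sim is ours):
q_norms = np.linalg.norm(queries, axis=1, keepdims=True)
k_norms = np.linalg.norm(keys, axis=1, keepdims=True)
cosine_sim = (queries @ keys.T) / (q_norms * k_norms.T)  # Cauchy-Schwarz caps every entry at 1
print("cosine similarity shape:", cosine_sim.shape)
assert np.all(np.abs(cosine_sim) <= 1.0 + 1e-12)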
5. Projection Onto a Line
Projection onto a subspace keeps the component inside it and removes the orthogonal remainder; onto the line spanned by a unit vector $e$, the projection of $x$ is $\langle x, e \rangle \, e$.
Code cell 15
x = np.array([2.5, 2.0])
direction = np.array([1.0, 0.4])
e = direction / np.linalg.norm(direction)
projection = inner(x, e) * e
residual_line = x - projection
print("projection =", projection)
print("residual dot direction =", residual_line @ direction)
assert abs(residual_line @ direction) < 1e-12
fig, ax = plt.subplots(figsize=(4, 4))
ax.axhline(0, color="black", linewidth=0.8)
ax.axvline(0, color="black", linewidth=0.8)
t = np.linspace(-1, 4, 100)
ax.plot(t * direction[0], t * direction[1], label="subspace")
ax.arrow(0, 0, x[0], x[1], head_width=0.08, length_includes_head=True, color="C1")
ax.arrow(0, 0, projection[0], projection[1], head_width=0.08, length_includes_head=True, color="C2")
ax.plot([x[0], projection[0]], [x[1], projection[1]], "--", color="C3")
ax.set_aspect("equal")
ax.legend()
plt.close(fig)
6. Least Squares as Projection
The residual of a least-squares fit is orthogonal to the column space of the design matrix.
Code cell 17
A = np.array([[1.0, 0.0], [1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])
b = np.array([1.0, 1.8, 3.2, 3.9])
coef, *_ = np.linalg.lstsq(A, b, rcond=None)
fitted = A @ coef
residual_ls = b - fitted
print("least-squares coefficients:", coef)
print("A^T residual:", A.T @ residual_ls)
assert np.allclose(A.T @ residual_ls, 0.0)
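The orthogonality condition $A^\top (b - A w) = 0$ is exactly the normal equations $A^\top A \, w = A^\top b$; a quick cross-check against lstsq, assuming $A^\top A$ is invertible (it is here, since A has full column rank):
coef_normal = np.linalg.solve(A.T @ A, A.T @ b)  # normal-equation solution
print("normal-equation coefficients:", coef_normal)
assert np.allclose(coef_normal, coef)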
7. Projection Matrices
A full-column-rank design matrix $A$ gives the projection matrix $P = A (A^\top A)^{-1} A^\top$, which is idempotent ($P^2 = P$) and symmetric ($P^\top = P$).
Code cell 19
P = A @ np.linalg.inv(A.T @ A) @ A.T
print("P^2 - P norm:", np.linalg.norm(P @ P - P))
print("P^T - P norm:", np.linalg.norm(P.T - P))
assert np.allclose(P @ P, P)
assert np.allclose(P.T, P)
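As a sanity check tying this back to the least-squares cell, applying $P$ to $b$ should reproduce the fitted values, and $I - P$ should produce the residual:
print("P @ b matches fitted values:", np.allclose(P @ b, fitted))
print("(I - P) @ b matches residual:", np.allclose(b - P @ b, residual_ls))
assert np.allclose(P @ b, fitted)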
8. Gram-Schmidt
Gram-Schmidt constructs an orthonormal basis for a span.
Code cell 21
def gram_schmidt(columns, tol=1e-12):
    basis = []
    for v in columns.T:
        u = v.astype(float).copy()
        for e in basis:
            u -= (e @ u) * e
        nrm = np.linalg.norm(u)
        if nrm > tol:
            basis.append(u / nrm)
    return np.column_stack(basis)
V = np.array([[1.0, 1.0, 0.0], [1.0, 0.0, 1.0], [0.0, 1.0, 1.0]])
Q = gram_schmidt(V)
print(Q)
print("Q^T Q =")
print(Q.T @ Q)
assert np.allclose(Q.T @ Q, np.eye(Q.shape[1]))
9. QR Comparison
Numerical linear algebra usually relies on stable QR routines rather than hand-written Gram-Schmidt; np.linalg.qr is backed by LAPACK's Householder reflections. The sketch after this comparison shows the classical variant breaking down where Householder QR does not.
Code cell 23
Q_np, R_np = np.linalg.qr(V)
print("reconstruction error:", np.linalg.norm(Q_np @ R_np - V))
print("orthogonality error:", np.linalg.norm(Q_np.T @ Q_np - np.eye(3)))
assert np.allclose(Q_np @ R_np, V)
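The gram_schmidt above already uses the modified update (it subtracts each projection from the running u), which is the stabler hand-written variant. Here is a minimal sketch of the truly classical variant, which projects the original column and loses orthogonality on a nearly dependent Lauchli-style matrix; V_bad and the helper name are ours:
def classical_gram_schmidt(columns):
    basis = []
    for v in columns.T:
        u = v.astype(float).copy()
        for e in basis:
            u -= (e @ v) * e  # projects the ORIGINAL v, letting rounding errors pile up
        basis.append(u / np.linalg.norm(u))
    return np.column_stack(basis)
eps = 1e-8
V_bad = np.array([[1.0, 1.0, 1.0], [eps, 0.0, 0.0], [0.0, eps, 0.0], [0.0, 0.0, eps]])
Q_cgs = classical_gram_schmidt(V_bad)
Q_house, _ = np.linalg.qr(V_bad)
print("classical GS orthogonality error:", np.linalg.norm(Q_cgs.T @ Q_cgs - np.eye(3)))
print("Householder orthogonality error:", np.linalg.norm(Q_house.T @ Q_house - np.eye(3)))
assert np.linalg.norm(Q_cgs.T @ Q_cgs - np.eye(3)) > np.linalg.norm(Q_house.T @ Q_house - np.eye(3))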
10. Bessel Inequality
Coordinates in an orthonormal set cannot carry more energy than the vector itself: Bessel's inequality says $\sum_k \langle x, e_k \rangle^2 \le \|x\|^2$.
Code cell 25
x = rng.normal(size=6)
Q6, _ = np.linalg.qr(rng.normal(size=(6, 6)))
partial = Q6[:, :3]
coeffs = partial.T @ x
energy_in_coeffs = np.sum(coeffs ** 2)
total_energy = np.sum(x ** 2)
print("partial coordinate energy:", energy_in_coeffs)
print("total energy:", total_energy)
assert energy_in_coeffs <= total_energy + 1e-12
11. Parseval Identity
A complete orthonormal basis preserves squared norm exactly: $\sum_k \langle x, e_k \rangle^2 = \|x\|^2$.
Code cell 27
coeffs_full = Q6.T @ x
print("energy in full coordinates:", np.sum(coeffs_full ** 2))
print("energy in x:", total_energy)
assert np.allclose(np.sum(coeffs_full ** 2), total_energy)
12. Fourier-Bessel Expansion by Sampling
A finite sampled Fourier basis gives a numerical shadow of Fourier coordinates.
Code cell 29
n = 256
t = np.linspace(0, 2 * np.pi, n, endpoint=False)
signal = np.sin(3 * t) + 0.5 * np.cos(5 * t)
fft_coeffs = np.fft.rfft(signal) / n
energy_time = np.mean(signal ** 2)
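# with n even, the doubling below also doubles the Nyquist bin; it is numerically zero for this signal, so Parseval still checks out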
energy_freq = abs(fft_coeffs[0]) ** 2 + 2 * np.sum(abs(fft_coeffs[1:]) ** 2)
print("time-domain mean energy:", energy_time)
print("frequency-domain energy:", energy_freq)
assert np.isclose(energy_time, energy_freq)
13. Riesz Representation in Euclidean Geometry
A linear functional $L(x) = a^\top x$ is represented by the vector $a$: $L(x) = \langle x, a \rangle$.
Code cell 31
a = np.array([2.0, -3.0, 0.5])
def L(x):
return float(a @ x)
x = rng.normal(size=3)
print("L(x):", L(x))
print("<x, a>:", x @ a)
assert np.isclose(L(x), x @ a)
14. Riesz Representation with a Weighted Inner Product
Under the weighted inner product $\langle x, y \rangle_G = x^\top G y$, the same functional $L(x) = a^\top x$ has representative $G^{-1} a$, since $\langle x, G^{-1} a \rangle_G = x^\top G G^{-1} a = a^\top x$.
Code cell 33
G = np.array([[4.0, 1.0], [1.0, 2.0]])
a = np.array([3.0, -1.0])
rep = np.linalg.solve(G, a)
x = rng.normal(size=2)
euclidean_value = a @ x
weighted_rep_value = x @ G @ rep
print("G^{-1} a =", rep)
print(euclidean_value, weighted_rep_value)
assert np.isclose(euclidean_value, weighted_rep_value)
15. Gradients Depend on Geometry
The differential of the loss is a fixed linear functional; the gradient is its Riesz representative, so it changes with the inner product. Under $\langle x, y \rangle_G = x^\top G y$ the gradient becomes $G^{-1}$ times the Euclidean gradient.
Code cell 35
X = rng.normal(size=(30, 2))
true_w = np.array([1.5, -0.5])
y = X @ true_w + 0.05 * rng.normal(size=30)
w = np.array([0.0, 0.0])
grad_euclidean = X.T @ (X @ w - y) / len(y)
G = np.array([[5.0, 0.0], [0.0, 1.0]])
grad_weighted = np.linalg.solve(G, grad_euclidean)
print("Euclidean gradient:", grad_euclidean)
print("Weighted-geometry gradient:", grad_weighted)
assert np.linalg.norm(grad_weighted) != 0
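A quick check that the weighted gradient is the Riesz representative of the same differential: pairing grad_weighted with any direction under the $G$-inner product must match the Euclidean pairing of grad_euclidean with that direction (v_dir is an arbitrary probe):
v_dir = rng.normal(size=2)
directional = grad_euclidean @ v_dir      # the differential applied to v_dir
via_weighted = grad_weighted @ G @ v_dir  # <grad_G, v_dir>_G = grad_G^T G v_dir
print(directional, via_weighted)
assert np.isclose(directional, via_weighted)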
16. Adjoints
The adjoint $T^*$ is defined by $\langle T x, y \rangle = \langle x, T^* y \rangle$; with the Euclidean inner product on both sides, $T^* = T^\top$.
Code cell 37
T = rng.normal(size=(4, 3))
x = rng.normal(size=3)
y = rng.normal(size=4)
left = (T @ x) @ y
right = x @ (T.T @ y)
print(left, right)
assert np.isclose(left, right)
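Over complex vectors the adjoint is the conjugate transpose; a minimal sketch using np.vdot directly (which conjugates the second slot of the pairing, matching the convention of the inner helper):
Tc = rng.normal(size=(4, 3)) + 1j * rng.normal(size=(4, 3))
xc = rng.normal(size=3) + 1j * rng.normal(size=3)
yc = rng.normal(size=4) + 1j * rng.normal(size=4)
left_c = np.vdot(yc, Tc @ xc)            # <T x, y>
right_c = np.vdot(Tc.conj().T @ yc, xc)  # <x, T* y> with T* the conjugate transpose
print(left_c, right_c)
assert np.isclose(left_c, right_c)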
17. Self-Adjoint Positive Operators
Covariance matrices are finite-dimensional positive self-adjoint operators.
Code cell 39
Z = rng.normal(size=(100, 4))
C = Z.T @ Z / len(Z)
eigenvalues = np.linalg.eigvalsh(C)
print("covariance eigenvalues:", eigenvalues)
assert np.allclose(C.T, C)
assert np.all(eigenvalues >= -1e-12)
18. Compact-Operator Intuition
Compact operators in infinite dimensions often look like diagonal maps with singular values tending to zero.
Code cell 41
singular_values = 1 / np.arange(1, 31)
coeffs = rng.normal(size=30)
transformed = singular_values * coeffs
tail_norms = [np.linalg.norm(transformed[k:]) for k in [1, 3, 5, 10, 20]]
print("tail norms:", tail_norms)
assert tail_norms[-1] < tail_norms[0]
19. Spectral Theorem and PCA
PCA diagonalizes a positive self-adjoint covariance operator.
Code cell 43
data = rng.normal(size=(150, 2)) @ np.array([[2.0, 0.0], [0.8, 0.4]])
centered = data - data.mean(axis=0)
C = centered.T @ centered / len(centered)
vals, vecs = np.linalg.eigh(C)
order = np.argsort(vals)[::-1]
vals = vals[order]
vecs = vecs[:, order]
one_dim = centered @ vecs[:, :1] @ vecs[:, :1].T
print("PCA eigenvalues:", vals)
print("rank-1 reconstruction mse:", np.mean((centered - one_dim) ** 2))
assert vals[0] >= vals[1]
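The same directions come from the SVD of the centered data: since $C = \frac{1}{n} X^\top X$, the eigenvalues of $C$ are the squared singular values of $X$ divided by $n$. A quick cross-check:
U_svd, s_svd, Vt_svd = np.linalg.svd(centered, full_matrices=False)
print("eigenvalues recovered from SVD:", s_svd ** 2 / len(centered))
assert np.allclose(s_svd ** 2 / len(centered), vals)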
20. Polynomial Kernel as an Explicit Feature Inner Product
For degree 2 in two variables, a finite feature map reproduces the polynomial kernel: $(p^\top q)^2 = \phi(p)^\top \phi(q)$ with $\phi(z) = (z_1^2, \sqrt{2}\, z_1 z_2, z_2^2)$.
Code cell 45
def phi2(z):
    x1, x2 = z
    return np.array([x1**2, np.sqrt(2) * x1 * x2, x2**2])
p = np.array([1.0, 2.0])
q = np.array([-0.5, 1.5])
explicit = phi2(p) @ phi2(q)
kernel = (p @ q) ** 2
print("explicit feature inner product:", explicit)
print("kernel value:", kernel)
assert np.isclose(explicit, kernel)
21. Kernel Gram Matrices are PSD
A valid kernel produces positive semidefinite Gram matrices on finite samples.
Code cell 47
points = np.linspace(-2, 2, 15)[:, None]
def rbf_kernel(X, Y, sigma=0.8):
    sq = (X - Y.T) ** 2
    return np.exp(-sq / (2 * sigma**2))
K = rbf_kernel(points, points)
evals = np.linalg.eigvalsh(K)
print("smallest RBF Gram eigenvalue:", evals[0])
assert evals[0] > -1e-10
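One payoff of a PSD Gram matrix: kernel ridge regression, which solves $(K + \lambda I)\alpha = y$ and predicts with $K \alpha$. A minimal sketch on the same points; the target $\sin(2x)$ and $\lambda$ are our choices:
y_target = np.sin(2 * points[:, 0])
lam = 1e-3
alpha = np.linalg.solve(K + lam * np.eye(len(K)), y_target)  # PSD Gram + ridge => well-posed solve
fit = K @ alpha
print("kernel ridge training mse:", np.mean((fit - y_target) ** 2))
assert np.mean((fit - y_target) ** 2) < 1e-2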
22. A Tiny NTK-Style Feature Kernel
Parameter-gradient inner products induce a kernel over inputs: $K(x, x') = \langle \nabla_\theta f(x; \theta), \nabla_\theta f(x'; \theta) \rangle$.
Code cell 49
def grad_features(x):
    # f(x; a, b) = a * tanh(b x), evaluated at a=1 and b=0.7
    a = 1.0
    b = 0.7
    return np.array([np.tanh(b * x), a * x * (1 - np.tanh(b * x) ** 2)])
xs = np.linspace(-2, 2, 7)
Phi = np.vstack([grad_features(x) for x in xs])
K_ntk_toy = Phi @ Phi.T
print("toy NTK Gram shape:", K_ntk_toy.shape)
assert np.all(np.linalg.eigvalsh(K_ntk_toy) > -1e-10)
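A finite-difference check that grad_features really is the parameter gradient of $f(x; a, b) = a \tanh(b x)$ at $(a, b) = (1, 0.7)$; the probe point x0, the step h, and the helper name f_toy are ours:
def f_toy(x, a, b):
    return a * np.tanh(b * x)
h = 1e-6
x0 = 0.9
num_grad = np.array([
    (f_toy(x0, 1.0 + h, 0.7) - f_toy(x0, 1.0 - h, 0.7)) / (2 * h),  # d f / d a
    (f_toy(x0, 1.0, 0.7 + h) - f_toy(x0, 1.0, 0.7 - h)) / (2 * h),  # d f / d b
])
print("analytic gradient:", grad_features(x0))
print("numeric gradient: ", num_grad)
assert np.allclose(grad_features(x0), num_grad, atol=1e-6)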
23. Weak Convergence Intuition
In $\ell^2$, the standard basis vectors $e_n$ do not converge strongly to zero (each has norm 1), but their inner products against any fixed vector tend to zero: weak convergence.
Code cell 51
dimension = 20
fixed_probe = np.zeros(dimension)
fixed_probe[:5] = [1, -2, 0.5, 0.25, -1]
inner_values = []
norms = []
for n_idx in range(dimension):
    e = np.zeros(dimension)
    e[n_idx] = 1.0
    inner_values.append(e @ fixed_probe)
    norms.append(np.linalg.norm(e))
print("last probe inner products:", inner_values[-5:])
print("all norms equal one:", set(np.round(norms, 6)))
assert np.allclose(norms, 1.0)
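The same pattern shows up in a sampled $L^2$ setting: the functions $\sin(k t)$ keep unit-order mean-square norm, yet their pairings with a fixed smooth probe decay as $k$ grows, a Riemann-Lebesgue-style effect. The probe g is our choice:
tt = np.linspace(0, 2 * np.pi, 4096, endpoint=False)
g = np.exp(-((tt - 2.0) ** 2))  # fixed smooth probe function
pairings = [np.mean(np.sin(k * tt) * g) for k in [1, 5, 25, 125]]
print("pairings with sin(k t):", np.round(pairings, 6))
assert abs(pairings[-1]) < 1e-3 < abs(pairings[0])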
24. Concept Map
The same few objects keep reappearing: inner products give norms, norms give projection, projection gives least squares, bases give coordinates, Riesz turns functionals into vectors, and operators give spectral learning tools.
Code cell 53
concepts = [
    "inner product -> norm",
    "orthogonality -> projection",
    "projection -> least squares",
    "orthonormal basis -> coordinates",
    "Riesz representative -> gradient vector",
    "self-adjoint operator -> spectral decomposition",
    "kernel -> implicit Hilbert inner product",
]
for item in concepts:
    print(item)
assert len(concepts) == 7
25. Final Checks
The notebook has touched the operational pieces from the note: inner products, projections, bases, Riesz representation, operators, PCA, kernels, and weak convergence.
Code cell 55
checks = {
    "projection_residual_orthogonal": abs(residual_line @ direction) < 1e-12,
    "least_squares_residual_orthogonal": np.allclose(A.T @ residual_ls, 0.0),
    "gram_schmidt_orthonormal": np.allclose(Q.T @ Q, np.eye(Q.shape[1])),
    "rbf_psd": np.linalg.eigvalsh(K)[0] > -1e-10,
}
print(checks)
assert all(checks.values())