Exercises Notebook

Converted from exercises.ipynb for web reading.

Vectors and Spaces - Exercises

10 graded exercises covering the full linear algebra basics arc, from computation to ML-facing matrix workflows.

| Format | Description |
| --- | --- |
| Problem | Markdown cell with task description |
| Your Solution | Code cell for learner work |
| Solution | Reference solution with checks |

Difficulty: straightforward -> moderate -> challenging.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Use seaborn's theme when seaborn is importable; otherwise fall back to a
# matplotlib style sheet. HAS_SNS records which branch ran.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Global matplotlib defaults (figure size/DPI, font sizes, hidden top/right
# spines, save options) for figures created after this cell runs.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Seed NumPy's global RNG so random examples are repeatable.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
import scipy.linalg as sla
from scipy import stats

# Role name -> hex color string (presumably for plots; not read in the
# visible cells).
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
# NOTE(review): flag is set unconditionally and not read in the visible code.
HAS_MPL = True
# Print arrays with 8 decimals and suppress scientific notation for small values.
np.set_printoptions(precision=8, suppress=True)
# Re-seed NumPy's global RNG so this cell is reproducible on its own.
np.random.seed(42)

def header(title):
    """Print ``title`` framed by rules of ``=`` characters.

    A blank line is emitted first; each rule is exactly as long as the title.
    """
    rule = "=" * len(title)
    print(f"\n{rule}")
    print(title)
    print(rule)

def check_true(name, cond):
    """Print ``PASS - name`` or ``FAIL - name`` and return the truth of ``cond``.

    ``cond`` is coerced with ``bool`` so the return value is always a plain bool.
    """
    passed = bool(cond)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}")
    return passed

def check_close(name, got, expected, tol=1e-8):
    """Compare ``got`` with ``expected`` numerically and report the outcome.

    ``tol`` is used as both the absolute and the relative tolerance of
    ``np.allclose``. A PASS/FAIL line including both values is printed and the
    comparison result is returned.
    """
    matches = np.allclose(got, expected, rtol=tol, atol=tol)
    verdict = "PASS" if matches else "FAIL"
    print(f"{verdict} - {name}: got {got}, expected {expected}")
    return matches

def check(name, got, expected, tol=1e-8):
    """Short alias that forwards every argument to ``check_close``."""
    return check_close(name=name, got=got, expected=expected, tol=tol)

def softmax(z, axis=-1, tau=1.0):
    """Softmax of ``z / tau`` along ``axis``.

    The per-slice maximum is subtracted before exponentiating so large logits
    do not overflow; this shift does not change the result.
    """
    scaled = np.asarray(z, dtype=float) / float(tau)
    shifted = scaled - scaled.max(axis=axis, keepdims=True)
    exps = np.exp(shifted)
    return exps / exps.sum(axis=axis, keepdims=True)

def cosine_similarity(a, b):
    """Cosine of the angle between ``a`` and ``b`` as a Python float.

    A small constant (1e-12) is added to the product of norms so a zero
    vector yields 0.0 instead of a division by zero.
    """
    a_vec = np.asarray(a, dtype=float)
    b_vec = np.asarray(b, dtype=float)
    numerator = a_vec @ b_vec
    denominator = la.norm(a_vec) * la.norm(b_vec) + 1e-12
    return float(numerator / denominator)

def numerical_rank(A, tol=1e-10):
    """Number of singular values of ``A`` strictly greater than ``tol``."""
    singular_values = la.svd(A, compute_uv=False)
    return int(np.count_nonzero(singular_values > tol))

def orthonormal_basis(A, tol=1e-10):
    """Return an orthonormal basis of the column space of ``A``.

    Parameters
    ----------
    A : array_like, shape (m, n)
        Matrix whose columns span the subspace.
    tol : float
        Absolute threshold; singular values at or below it count as zero.

    Returns
    -------
    ndarray, shape (m, r)
        Columns are orthonormal and span col(A); ``r`` is the number of
        singular values above ``tol`` (``r == 0`` gives an ``(m, 0)`` array).
    """
    A = np.asarray(A, dtype=float)
    # Use the SVD rather than the diagonal of an unpivoted QR: without column
    # pivoting a zero on R's diagonal does not imply the later columns add no
    # new direction, so filtering on diag(R) can drop genuine basis vectors.
    U, S, _ = la.svd(A, full_matrices=False)
    return U[:, S > tol]

def null_space(A, tol=1e-10):
    """Return an orthonormal basis of the null space of ``A``.

    Parameters
    ----------
    A : array_like, shape (m, n)
    tol : float
        Absolute threshold; singular values at or below it count as zero.

    Returns
    -------
    ndarray, shape (n, n - rank)
        Columns ``x`` satisfy ``A @ x ~= 0``.
    """
    A = np.asarray(A, dtype=float)
    _, S, Vt = la.svd(A, full_matrices=True)
    # Singular values are sorted in descending order, so the null space is
    # spanned by every right singular vector after the first `rank` ones.
    # Counting the rank (instead of slicing at S.size) also catches wide
    # matrices that are rank-deficient.
    rank = int(np.sum(S > tol))
    return Vt[rank:].T



# Compatibility helpers used by the Chapter 02 theory and exercise cells.
def null_space(A, tol=1e-10):
    """Orthonormal null-space basis of ``A`` from its full SVD.

    Rows of ``Vt`` beyond the numerical rank (singular values > ``tol``)
    span the null space; they are returned as columns.
    """
    matrix = np.asarray(A, dtype=float)
    _, sing_vals, right_vecs_t = la.svd(matrix, full_matrices=True)
    n_nonzero = int((sing_vals > tol).sum())
    kernel_rows = right_vecs_t[n_nonzero:]
    return kernel_rows.T

svd_null_space = null_space

def gram_schmidt(vectors, tol=1e-10):
    """Orthonormalize the rows of ``vectors`` with Gram-Schmidt.

    A 1-D input is treated as a single row. Each row has its components
    along the already accepted directions removed one at a time; rows whose
    remainder has norm not above ``tol`` are dropped. The accepted unit
    vectors are returned stacked as rows.
    """
    rows = np.asarray(vectors, dtype=float)
    if rows.ndim == 1:
        rows = rows[np.newaxis, :]
    accepted = []
    for row in rows:
        remainder = np.array(row, dtype=float)
        for direction in accepted:
            remainder = remainder - np.dot(remainder, direction) * direction
        length = la.norm(remainder)
        if not length > tol:
            continue
        accepted.append(remainder / length)
    return np.array(accepted)

def projection_matrix_from_columns(A, tol=1e-10):
    """Build ``Q @ Q.T`` where ``Q`` is ``orthonormal_basis`` of ``A``'s columns."""
    columns = np.asarray(A, dtype=float)
    basis = orthonormal_basis(columns, tol=tol)
    return basis @ basis.T


def random_unit_vectors(n, d):
    """Draw an ``(n, d)`` standard-normal matrix and scale each row to unit length.

    Row norms are floored at 1e-12 before dividing. Uses NumPy's global RNG.
    """
    samples = np.random.randn(n, d)
    lengths = la.norm(samples, axis=1, keepdims=True)
    safe_lengths = np.maximum(lengths, 1e-12)
    return samples / safe_lengths

def pairwise_distances(X):
    """Euclidean distance between every pair of rows of ``X``.

    Broadcasting forms all row differences; the norm is taken over the last axis.
    """
    points = np.asarray(X, dtype=float)
    deltas = points[:, np.newaxis, :] - points[np.newaxis, :, :]
    return la.norm(deltas, axis=-1)


def normalize(x, axis=None, tol=1e-12):
    """Divide ``x`` by its norm along ``axis`` (whole array when ``axis`` is None).

    The norm is floored at ``tol`` so an all-zero input stays zero instead of
    producing a division by zero.
    """
    arr = np.asarray(x, dtype=float)
    magnitude = la.norm(arr, axis=axis, keepdims=True)
    safe_magnitude = np.maximum(magnitude, tol)
    return arr / safe_magnitude

def frobenius_inner(A, B):
    """Sum of the elementwise product of ``A`` and ``B`` as a Python float."""
    left = np.asarray(A, dtype=float)
    right = np.asarray(B, dtype=float)
    return float((left * right).sum())

def outer_sum_product(A, B):
    """Accumulate the outer products of column ``k`` of ``A`` with row ``k`` of ``B``.

    The accumulator starts at integer 0, so that is what comes back when
    ``A`` has no columns.
    """
    left = np.asarray(A, dtype=float)
    right = np.asarray(B, dtype=float)
    total = 0
    for k in range(left.shape[1]):
        total = total + np.outer(left[:, k], right[k, :])
    return total

def softmax_rows(X):
    """Apply ``softmax`` along axis 1 of ``X``."""
    row_probabilities = softmax(X, axis=1)
    return row_probabilities

def col_space(A, tol=1e-10):
    """Return ``orthonormal_basis`` of ``A`` (converted to a float array)."""
    matrix = np.asarray(A, dtype=float)
    return orthonormal_basis(matrix, tol=tol)

def row_space(A, tol=1e-10):
    """Return ``orthonormal_basis`` of ``A.T``, transposed back so vectors are rows."""
    transposed = np.asarray(A, dtype=float).T
    basis_columns = orthonormal_basis(transposed, tol=tol)
    return basis_columns.T

def rref(A, tol=1e-10):
    """Reduce a 2-D matrix to reduced row echelon form by Gauss-Jordan elimination.

    Works on a float copy of ``A``. In each column the entry of largest
    magnitude at or below the current row is chosen as pivot; columns whose
    best candidate has magnitude <= ``tol`` are skipped. Entries with
    magnitude below ``tol`` are zeroed at the end.

    Returns ``(R, pivots)``: the reduced matrix and the list of pivot column
    indices.
    """
    M = np.array(A, dtype=float, copy=True)
    n_rows, n_cols = M.shape
    pivot_cols = []
    r = 0
    for c in range(n_cols):
        # Every row already holds a pivot (or there are no rows): done.
        if r >= n_rows:
            break
        p = r + int(np.argmax(np.abs(M[r:, c])))
        if abs(M[p, c]) <= tol:
            continue
        if p != r:
            M[[r, p]] = M[[p, r]]
        # Scale the pivot row so the pivot entry becomes 1.
        M[r] = M[r] / M[r, c]
        # Clear the pivot column in every other row.
        for i in range(n_rows):
            if i == r:
                continue
            M[i] = M[i] - M[i, c] * M[r]
        pivot_cols.append(c)
        r += 1
    M[np.abs(M) < tol] = 0.0
    return M, pivot_cols

def nullspace_basis(A, tol=1e-10):
    """Return ``(basis, rank)`` for ``A`` from its full SVD.

    ``rank`` counts singular values above ``tol``; ``basis`` holds the
    remaining right singular vectors as columns.
    """
    matrix = np.asarray(A, dtype=float)
    _, sing_vals, right_vecs_t = la.svd(matrix, full_matrices=True)
    n_nonzero = int((sing_vals > tol).sum())
    kernel = right_vecs_t[n_nonzero:].T
    return kernel, n_nonzero

print("Chapter helper setup complete.")

Exercise 1: Dot Product and p-Norms

Task: Implement two of the most basic primitives in linear algebra.

dot_product(u, v) using elementwise multiplication and a sum
p_norm(v, p) for p = 1, 2, 3, ...

Requirements:

Do not use np.dot inside dot_product
Do not use np.linalg.norm inside p_norm
Verify Cauchy-Schwarz numerically on one example

Written part:

Explain why p < 1 does not define a norm in general.

Code cell 5

# Your Solution
# Exercise 1 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 1.")

Code cell 6

# Solution
# Exercise 1 - reference solution

def dot_product(u, v):
    """Dot product computed as the sum of elementwise products (no ``np.dot``)."""
    left = np.asarray(u, dtype=float)
    right = np.asarray(v, dtype=float)
    products = left * right
    return float(products.sum())


def p_norm(v, p=2):
    """Return the p-norm ``(sum_i |v_i|**p) ** (1/p)`` of ``v`` as a float.

    Parameters
    ----------
    v : array_like
        Input vector.
    p : int or float
        Order of the norm. ``p = inf`` gives the max norm ``max_i |v_i|``
        (0.0 for an empty vector). Values ``p < 1`` are still evaluated with
        the power formula, although the result is not a norm.

    Raises
    ------
    ZeroDivisionError
        If ``p == 0``.
    """
    magnitudes = np.abs(np.asarray(v, dtype=float))
    if p == np.inf:
        # The power formula degenerates here (x**inf followed by **0 gives 1.0),
        # so return the limiting value directly.
        return float(np.max(magnitudes)) if magnitudes.size else 0.0
    return float(np.sum(magnitudes ** p) ** (1.0 / p))


# Example vectors for the Exercise 1 checks.
u = np.array([1.0, 2.0, -1.0])
v = np.array([3.0, 0.0, 1.0])

print("u.v =", dot_product(u, v))
# Show the 1-, 2- and 3-norm of u to six decimals.
for p in [1, 2, 3]:
    print(f"||u||_{p} = {p_norm(u, p):.6f}")

# Cauchy-Schwarz: |u.v| <= ||u||_2 * ||v||_2. The 1e-12 slack absorbs
# floating-point rounding in the comparison.
lhs = abs(dot_product(u, v))
rhs = p_norm(u, 2) * p_norm(v, 2)
print("Cauchy-Schwarz check:", lhs, "<=", rhs, "->", lhs <= rhs + 1e-12)

print("Exercise 1 solution complete.")

Exercise 2: Cosine Similarity and Angle

Task: Implement directional similarity and convert it into an angle.

Requirements:

Implement cosine_similarity(u, v)
Implement angle_between(u, v, degrees=True)
Verify for unit vectors that ||u-v||^2 = 2 - 2 cos(u,v)

Written part:

Explain why cosine similarity can stay high even when Euclidean distance is large.

Code cell 8

# Your Solution
# Exercise 2 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 2.")

Code cell 9

# Solution
# Exercise 2 - reference solution

def cosine_similarity(u, v):
    """Return the cosine of the angle between ``u`` and ``v`` as a float.

    Computed as ``sum(u * v) / (||u||_2 * ||v||_2)`` using only elementwise
    operations. If either vector has zero norm the cosine is undefined;
    0.0 is returned in that case instead of dividing by zero.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    denominator = np.sqrt(np.sum(u ** 2)) * np.sqrt(np.sum(v ** 2))
    if denominator == 0.0:
        # A zero vector has no direction; avoid 0/0 -> nan.
        return 0.0
    return float(np.sum(u * v) / denominator)


def angle_between(u, v, degrees=True):
    """Angle between ``u`` and ``v``, in degrees by default, else radians.

    The cosine is clipped to [-1, 1] before ``arccos`` so rounding error
    cannot push it outside the valid domain.
    """
    clipped = np.clip(cosine_similarity(u, v), -1.0, 1.0)
    radians = np.arccos(clipped)
    if degrees:
        return float(np.degrees(radians))
    return float(radians)


# Two example vectors, rescaled to unit length because the identity
# ||u-v||^2 = 2 - 2 cos(u,v) is stated for unit vectors.
u = np.array([1.0, 1.0])
v = np.array([1.0, -1.0])
u = u / np.linalg.norm(u)
v = v / np.linalg.norm(v)

# Left- and right-hand side of the identity.
cos = cosine_similarity(u, v)
dist_sq = np.sum((u - v) ** 2)
rhs = 2 - 2 * cos

print("cos(u,v) =", cos)
print("angle(u,v) =", angle_between(u, v), "degrees")
print("||u-v||^2 =", dist_sq)
print("2 - 2 cos(u,v) =", rhs)
print("identity holds:", np.allclose(dist_sq, rhs))

print("Exercise 2 solution complete.")

Exercise 3: Span, Linear Independence, and Coordinates

Task: Work with span and basis coordinates directly.

Requirements:

Implement are_linearly_independent(vectors) using rank
Implement coordinates_in_basis(B, x) for a square basis matrix B
Implement is_in_span(vectors, target) using rank or least squares

Written part:

Explain why coordinate vectors are unique only when the basis vectors are linearly independent.

Code cell 11

# Your Solution
# Exercise 3 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 3.")

Code cell 12

# Solution
# Exercise 3 - reference solution

def are_linearly_independent(vectors):
    """True when the matrix with ``vectors`` as columns has full column rank."""
    stacked = np.column_stack(vectors).astype(float)
    n_vectors = stacked.shape[1]
    return np.linalg.matrix_rank(stacked) == n_vectors


def coordinates_in_basis(B, x):
    """Solve ``B @ c = x`` for the coefficient vector ``c``.

    ``np.linalg.solve`` requires ``B`` to be square and non-singular.
    """
    basis = np.asarray(B, dtype=float)
    target = np.asarray(x, dtype=float)
    coefficients = np.linalg.solve(basis, target)
    return coefficients


def is_in_span(vectors, target, tol=1e-10):
    """Test by least squares whether ``target`` lies in the span of ``vectors``.

    The vectors are stacked as columns and the least-squares coefficients
    are computed. Returns ``(in_span, coeffs)`` where ``in_span`` is true when
    the residual norm is below ``tol``.
    """
    columns = np.column_stack(vectors).astype(float)
    goal = np.asarray(target, dtype=float)
    solution = np.linalg.lstsq(columns, goal, rcond=None)
    coeffs = solution[0]
    misfit = np.linalg.norm(columns @ coeffs - goal)
    return misfit < tol, coeffs


# Test data: the standard basis of R^2, a pair where the second vector is
# twice the first, and a basis matrix B with a vector x to express in it.
independent = [np.array([1.0, 0.0]), np.array([0.0, 1.0])]
dependent = [np.array([1.0, 2.0]), np.array([2.0, 4.0])]
B = np.array([[1.0, 1.0], [1.0, -1.0]])
x = np.array([4.0, 2.0])

print("independent set?", are_linearly_independent(independent))
print("dependent set?", are_linearly_independent(dependent))
print("coordinates of x in basis B =", coordinates_in_basis(B, x))
# is_in_span returns both the membership flag and the fitted coefficients.
in_span, coeffs = is_in_span(independent, np.array([7.0, 4.0]))
print("[7,4] in span(e1,e2)?", in_span, "with coeffs", coeffs)

print("Exercise 3 solution complete.")

Exercise 4: Projection and Gram-Schmidt

Task: Move from algebra to geometry.

Requirements:

Implement project_onto(u, v) to project u onto the line spanned by v
Implement gram_schmidt(vectors) returning orthonormal vectors
Verify orthonormality numerically

Written part:

Prove or explain why a set of nonzero pairwise-orthogonal vectors must be linearly independent.

Code cell 14

# Your Solution
# Exercise 4 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 4.")

Code cell 15

# Solution
# Exercise 4 - reference solution

def project_onto(u, v):
    """Orthogonally project ``u`` onto the line spanned by ``v``.

    Returns ``(u.v / v.v) * v``. When ``v`` is the zero vector its span is
    just the origin, so the projection is the zero vector (returned with the
    shape of ``v``) rather than the nan produced by dividing by zero.
    """
    u = np.asarray(u, dtype=float)
    v = np.asarray(v, dtype=float)
    squared_length = np.dot(v, v)
    if squared_length == 0.0:
        return np.zeros_like(v)
    return (np.dot(u, v) / squared_length) * v


def gram_schmidt(vectors, tol=1e-10):
    """Gram-Schmidt orthonormalization of an iterable of vectors.

    Each vector has its components along the previously accepted unit
    vectors subtracted one at a time; it is kept (normalized) only if what
    remains has norm above ``tol``. The result stacks the unit vectors as rows.
    """
    accepted = []
    for candidate in vectors:
        remainder = np.array(candidate, dtype=float)
        for direction in accepted:
            remainder = remainder - np.dot(remainder, direction) * direction
        length = np.linalg.norm(remainder)
        if not length > tol:
            continue
        accepted.append(remainder / length)
    return np.array(accepted)


# Project u onto the line through v; resid is the part of u left over.
u = np.array([1.0, 2.0, -1.0])
v = np.array([3.0, 0.0, 1.0])
proj = project_onto(u, v)
resid = u - proj

# Orthonormalize three vectors of R^3; gram_schmidt stacks its output as rows.
Q = gram_schmidt([
    np.array([1.0, 1.0, 0.0]),
    np.array([1.0, 0.0, 1.0]),
    np.array([0.0, 1.0, 1.0])
])

print("projection =", proj)
print("residual orthogonal to v:", np.allclose(np.dot(resid, v), 0.0))
print("\nGram-Schmidt output =\n", Q)
# With orthonormal rows, Q Q^T is the identity up to rounding.
print("Q Q^T =\n", Q @ Q.T)

print("Exercise 4 solution complete.")

Exercise 5: Projection Matrix and Least Squares

Task: Implement projection onto a column space and solve a least-squares problem.

Requirements:

Implement projection_matrix(A) = A (A^T A)^(-1) A^T
Implement least_squares_solution(A, b)
Verify the residual is orthogonal to the column space

Written part:

Explain why the least-squares residual must lie in the orthogonal complement of col(A).

Code cell 17

# Your Solution
# Exercise 5 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 5.")

Code cell 18

# Solution
# Exercise 5 - reference solution

def projection_matrix(A):
    """Return the orthogonal projector ``A (A^T A)^{-1} A^T`` onto col(A).

    Parameters
    ----------
    A : array_like, shape (m, n)
        Matrix with linearly independent columns.

    Returns
    -------
    ndarray, shape (m, m)

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``A^T A`` is singular (the columns of ``A`` are dependent).
    """
    A = np.asarray(A, dtype=float)
    # Solve (A^T A) X = A^T instead of forming the inverse explicitly: same
    # formula, but one factorization and less rounding error.
    return A @ np.linalg.solve(A.T @ A, A.T)


def least_squares_solution(A, b):
    """Solve the normal equations ``(A^T A) x = A^T b`` for ``x``."""
    design = np.asarray(A, dtype=float)
    observations = np.asarray(b, dtype=float)
    gram = design.T @ design
    moment = design.T @ observations
    return np.linalg.solve(gram, moment)


# Overdetermined system: three equations, two unknowns.
A = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
b = np.array([2.0, 1.0, 4.0])
P = projection_matrix(A)
x_star = least_squares_solution(A, b)
# Residual of the fit, with sign convention A x* - b.
residual = A @ x_star - b

print("P =\n", P)
print("P^2 = P?", np.allclose(P @ P, P))
print("x* =", x_star)
print("Ax* =", A @ x_star)
# Orthogonality check: A^T times the residual should be ~0.
print("A^T(Ax*-b) =", A.T @ residual)

print("Exercise 5 solution complete.")

Exercise 6: Rank-Nullity and the Four Fundamental Subspaces

Task: Use SVD to recover rank, nullity, right null space, and left null space.

Requirements:

Implement rank_nullity(A)
Implement null_space_basis(A)
Implement left_null_space_basis(A)
Verify rank + nullity = n

Written part:

Describe geometrically what the null space and image of a linear map mean.

Code cell 20

# Your Solution
# Exercise 6 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 6.")

Code cell 21

# Solution
# Exercise 6 - reference solution

def rank_nullity(A, tol=1e-10):
    """Return ``(rank, nullity)`` of ``A``.

    Rank is the number of singular values above ``tol``; nullity is the
    number of columns minus the rank.
    """
    matrix = np.asarray(A, dtype=float)
    sigma = np.linalg.svd(matrix, compute_uv=False)
    r = int(np.count_nonzero(sigma > tol))
    return r, matrix.shape[1] - r


def null_space_basis(A, tol=1e-10):
    """Right null space of ``A``: delegates to ``svd_null_space``."""
    basis = svd_null_space(A, tol=tol)
    return basis


def left_null_space_basis(A, tol=1e-10):
    """Left null space of ``A``: ``svd_null_space`` applied to ``A`` transposed."""
    transposed = np.asarray(A, dtype=float).T
    return svd_null_space(transposed, tol=tol)


# Example matrix; its second row is twice the first.
A = np.array([
    [1.0, 2.0, 3.0],
    [2.0, 4.0, 6.0],
    [1.0, 1.0, 2.0]
])
rank, nullity = rank_nullity(A)
null_basis = null_space_basis(A)
left_null_basis = left_null_space_basis(A)

print("rank =", rank, "nullity =", nullity)
# Rank-nullity: the sum should equal the number of columns of A.
print("rank + nullity =", rank + nullity, "domain dimension =", A.shape[1])
print("null basis =\n", null_basis)
print("left null basis =\n", left_null_basis)
# Only verify the products when the corresponding basis is non-empty.
if null_basis.size:
    print("A @ null_basis =\n", A @ null_basis)
if left_null_basis.size:
    print("left_null_basis^T @ A =\n", left_null_basis.T @ A)

print("Exercise 6 solution complete.")

Exercise 7: Softmax, the Simplex, and Attention Output

Task: Show explicitly that attention weights form a probability vector and that the output is a convex combination of value vectors.

Requirements:

Implement a numerically stable softmax(logits, tau)
Implement is_in_simplex(p)
Implement attention_output(logits, values, tau)
Compare low temperature and high temperature behavior

Written part:

Explain why single-head attention output must lie in the convex hull of the value vectors.

Code cell 23

# Your Solution
# Exercise 7 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 7.")

Code cell 24

# Solution
# Exercise 7 - reference solution

def softmax(logits, tau=1.0, axis=None):
    """Numerically stable softmax of ``logits / tau``.

    Parameters
    ----------
    logits : array_like
        Unnormalized scores.
    tau : float
        Temperature; smaller values sharpen the distribution.
    axis : int or None
        Axis to normalize over. ``None`` (the default) normalizes over all
        entries, which for a 1-D input is the usual softmax. Accepting
        ``axis`` keeps callers that pass ``axis=`` by keyword (such as
        ``softmax_rows``) working after this cell redefines ``softmax``.

    Returns
    -------
    ndarray
        Same shape as ``logits``; entries are non-negative and sum to 1
        along ``axis``.
    """
    z = np.asarray(logits, dtype=float) / tau
    # Subtracting the maximum leaves the result unchanged but keeps exp()
    # from overflowing on large logits.
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)


def is_in_simplex(p, tol=1e-10):
    """True when all entries of ``p`` are >= ``-tol`` and they sum to 1 within ``tol``."""
    probs = np.asarray(p, dtype=float)
    nonnegative = np.all(probs >= -tol)
    sums_to_one = abs(np.sum(probs) - 1.0) < tol
    return bool(nonnegative and sums_to_one)


def attention_output(logits, values, tau=1.0):
    """Return ``(weights, output)`` for one attention query.

    ``weights`` is ``softmax(logits, tau)`` and ``output`` is
    ``weights @ values``, i.e. the weighted combination of the rows of
    ``values``.
    """
    weights = softmax(logits, tau=tau)
    value_rows = np.asarray(values, dtype=float)
    mixed = weights @ value_rows
    return weights, mixed


# Three scores and one 2-D value vector (row of `values`) per score.
logits = np.array([2.5, 1.0, -0.5])
values = np.array([[2.0, 0.0], [0.0, 2.0], [1.0, 1.0]])

# Compare a low, a neutral and a high temperature.
for tau in [0.2, 1.0, 5.0]:
    weights, out = attention_output(logits, values, tau=tau)
    print(f"tau={tau:>3}: weights={weights}, simplex={is_in_simplex(weights)}, output={out}")

print("Exercise 7 solution complete.")

Exercise 8: High-Dimensional Random Geometry

Task: Empirically verify that random unit vectors become nearly orthogonal as dimension grows.

Requirements:

Implement random_unit_vectors(n, d)
Implement pairwise_cosine_stats(X) returning mean and std of off-diagonal cosines
Compare dimensions d = 4, 32, 256

Written part:

Predict qualitatively how the standard deviation of pairwise cosine similarity should scale with dimension.

Code cell 26

# Your Solution
# Exercise 8 - learner workspace
# Write your solution here, then run the reference solution below to compare.
print("Learner workspace ready for Exercise 8.")

Code cell 27

# Solution
# Exercise 8 - reference solution

def random_unit_vectors(n, d):
    """Sample ``n`` standard-normal vectors in R^d and scale each row to unit norm.

    Draws from NumPy's global RNG.
    """
    samples = np.random.randn(n, d)
    row_norms = np.linalg.norm(samples, axis=1, keepdims=True)
    np.divide(samples, row_norms, out=samples)
    return samples


def pairwise_cosine_stats(X):
    """Mean and standard deviation of the strictly upper-triangular entries of ``X @ X.T``.

    These entries are the pairwise cosines when the rows of ``X`` have unit
    norm (the function itself does not normalize).
    """
    gram = X @ X.T
    rows, cols = np.triu_indices_from(gram, k=1)
    off_diagonal = gram[rows, cols]
    return float(off_diagonal.mean()), float(off_diagonal.std())


# For each dimension, sample 300 random unit vectors and summarize their
# pairwise cosines.
for d in [4, 32, 256]:
    X = random_unit_vectors(300, d)
    mean_cos, std_cos = pairwise_cosine_stats(X)
    print(f"d={d:>3} -> mean cosine = {mean_cos:+.4f}, std = {std_cos:.4f}")

print("Exercise 8 solution complete.")

Exercise 9 (★★★): Embedding Similarity and Nearest Neighbors

Given a query embedding $q$ and candidate embeddings $e_i$, rank candidates by cosine similarity

$$\cos(q,e_i)=\frac{q^\top e_i}{\|q\|\,\|e_i\|}.$$

Explain why scaling an embedding does not change its cosine score.

Code cell 29

# Your Solution
# Exercise 9 - learner workspace
# Compute cosine similarities and rank the candidates.
print("Learner workspace ready for Exercise 9.")

Code cell 30

# Solution
# Exercise 9 - embedding cosine similarity
header("Exercise 9: embedding cosine similarity")

# Query and candidate embeddings (one candidate per row of E).
# E[3] is exactly 3*q and E[0] is a slightly perturbed copy of q.
q = np.array([1.0, 2.0, 0.5])
E = np.array([[1.0, 2.1, 0.4], [-2.0, 0.1, 0.0], [0.5, 1.0, 0.2], [3.0, 6.0, 1.5]])
scores = np.array([cosine_similarity(q, e) for e in E])
# argsort is ascending; reverse it to rank from most to least similar.
order = np.argsort(scores)[::-1]
print("scores:", scores)
print("ranking:", order)
check_true("scaled copy ranks highly", order[0] in {0, 3})
# Rescaling either vector by a positive factor must leave the cosine unchanged.
check_close("scale invariance", cosine_similarity(q, E[0]), cosine_similarity(3*q, 5*E[0]), tol=1e-12)
print("Takeaway: cosine similarity compares direction, which is why embedding normalization matters.")

Exercise 10 (★★★): Projection onto a Feature Subspace

Let the columns of $B$ span a subspace, and assume they are linearly independent so that $B^\top B$ is invertible. The orthogonal projection of $x$ onto $\operatorname{span}(B)$ is

$$\hat{x}=B(B^\top B)^{-1}B^\top x.$$

Compute the projection and verify that the residual is orthogonal to every column of $B$.

Code cell 32

# Your Solution
# Exercise 10 - learner workspace
# Compute projection and residual orthogonality.
print("Learner workspace ready for Exercise 10.")

Code cell 33

# Solution
# Exercise 10 - projection onto subspace
header("Exercise 10: subspace projection")

# Columns of B span the subspace; x is the vector to project.
B = np.array([[1.0, 0.0], [1.0, 1.0], [0.0, 1.0]])
x = np.array([2.0, 1.0, 3.0])
# Projector B (B^T B)^{-1} B^T, applied to x.
P = B @ la.inv(B.T @ B) @ B.T
x_hat = P @ x
resid = x - x_hat
print("projection:", x_hat)
print("residual:", resid)
check_close("idempotent projection", P @ P, P, tol=1e-10)
# B^T resid collects the dot product of the residual with each column of B.
check_close("residual orthogonal to subspace", B.T @ resid, np.zeros(B.shape[1]), tol=1e-10)
print("Takeaway: projection separates represented signal from unexplained residual.")