Matrix Operations - Theory & Implementations

Converted from theory.ipynb for web reading.

This notebook follows the Matrix Operations chapter, but in the order that is easiest to learn by doing:

  1. Matrix shapes, notation, and special matrices
  2. Elementwise operations, trace, and transpose
  3. Matrix multiplication from row, column, and outer-product views
  4. Dense layers and attention as matrix operations
  5. Inverse, condition number, and solving linear systems
  6. Pseudo-inverse, least squares, and projection
  7. LU, QR, and eigendecomposition
  8. SVD, low-rank approximation, and LoRA intuition
  9. Cholesky and symmetric positive definite matrices
  10. GEMM vs GEMV and the hardware view of performance

The notebook is intentionally code-first. The goal is not only to know the formulas, but to make the operations feel operational.

Primary references behind the chapter: Axler, MIT 18.06, LAPACK, cuBLAS, Vaswani et al. (2017), Dao et al. (2022, 2024), Hu et al. (2022), and Halko et al. (2011).

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
import scipy.linalg as sla
from scipy import stats

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok

def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

def softmax(z, axis=-1, tau=1.0):
    z = np.asarray(z, dtype=float) / float(tau)
    z = z - np.max(z, axis=axis, keepdims=True)
    e = np.exp(z)
    return e / np.sum(e, axis=axis, keepdims=True)

def cosine_similarity(a, b):
    a = np.asarray(a, dtype=float); b = np.asarray(b, dtype=float)
    return float(a @ b / (la.norm(a) * la.norm(b) + 1e-12))

def numerical_rank(A, tol=1e-10):
    return int(np.sum(la.svd(A, compute_uv=False) > tol))

def orthonormal_basis(A, tol=1e-10):
    Q, R = la.qr(A)
    keep = np.abs(np.diag(R)) > tol
    return Q[:, keep]


# Compatibility helpers used by the Chapter 02 theory and exercise cells.
def null_space(A, tol=1e-10):
    A = np.asarray(A, dtype=float)
    U, S, Vt = la.svd(A, full_matrices=True)
    rank = int(np.sum(S > tol))
    return Vt[rank:].T

svd_null_space = null_space

def gram_schmidt(vectors, tol=1e-10):
    A = np.asarray(vectors, dtype=float)
    if A.ndim == 1:
        A = A.reshape(1, -1)
    basis = []
    for v in A:
        w = v.astype(float).copy()
        for q in basis:
            w = w - np.dot(w, q) * q
        norm = la.norm(w)
        if norm > tol:
            basis.append(w / norm)
    return np.array(basis)

def projection_matrix_from_columns(A, tol=1e-10):
    Q = orthonormal_basis(np.asarray(A, dtype=float), tol=tol)
    return Q @ Q.T


def random_unit_vectors(n, d):
    X = np.random.randn(n, d)
    return X / np.maximum(la.norm(X, axis=1, keepdims=True), 1e-12)

def pairwise_distances(X):
    X = np.asarray(X, dtype=float)
    diff = X[:, None, :] - X[None, :, :]
    return la.norm(diff, axis=-1)


def normalize(x, axis=None, tol=1e-12):
    x = np.asarray(x, dtype=float)
    norm = la.norm(x, axis=axis, keepdims=True)
    return x / np.maximum(norm, tol)

def frobenius_inner(A, B):
    return float(np.sum(np.asarray(A, dtype=float) * np.asarray(B, dtype=float)))

def outer_sum_product(A, B):
    A = np.asarray(A, dtype=float)
    B = np.asarray(B, dtype=float)
    return sum(np.outer(A[:, k], B[k, :]) for k in range(A.shape[1]))

def softmax_rows(X):
    return softmax(X, axis=1)

def col_space(A, tol=1e-10):
    return orthonormal_basis(np.asarray(A, dtype=float), tol=tol)

def row_space(A, tol=1e-10):
    return orthonormal_basis(np.asarray(A, dtype=float).T, tol=tol).T

def rref(A, tol=1e-10):
    R = np.array(A, dtype=float, copy=True)
    m, n = R.shape
    pivots = []
    row = 0
    for col in range(n):
        pivot = row + int(np.argmax(np.abs(R[row:, col]))) if row < m else row
        if row >= m or abs(R[pivot, col]) <= tol:
            continue
        if pivot != row:
            R[[row, pivot]] = R[[pivot, row]]
        R[row] = R[row] / R[row, col]
        for r in range(m):
            if r != row:
                R[r] = R[r] - R[r, col] * R[row]
        pivots.append(col)
        row += 1
        if row == m:
            break
    R[np.abs(R) < tol] = 0.0
    return R, pivots

def nullspace_basis(A, tol=1e-10):
    A = np.asarray(A, dtype=float)
    U, S, Vt = la.svd(A, full_matrices=True)
    rank = int(np.sum(S > tol))
    return Vt[rank:].T, rank

print("Chapter helper setup complete.")

1. Matrix Basics, Shapes, and Special Structure

Start concrete. A matrix is both a table of numbers and a linear map with a shape. Shape reasoning is the first debugging tool in linear algebra and in deep learning code.

Learning move: inspect rows, columns, shapes, and special matrices before touching matrix multiplication.

Code cell 5

# === 1.1 Basic matrix objects and shape reasoning ===
A = np.array([[1., 2., 3.],
              [4., 5., 6.]])
I3 = np.eye(3)
D = np.diag([2., -1., 4.])
U = np.array([[1., 2., 3.],
              [0., 4., 5.],
              [0., 0., 6.]])
S = np.array([[2., 1., 3.],
              [1., 4., 5.],
              [3., 5., 6.]])

print("A shape:", A.shape)
print("A =\n", A)
print("row 0 =", A[0, :])
print("column 1 =", A[:, 1])
print("I3 =\n", I3)
print("Diagonal matrix D =\n", D)
print("Upper triangular U =\n", U)
print("Symmetric matrix S =\n", S)
print("S.T == S ?", np.allclose(S.T, S))

# Memory layout intuition: A.T is a view with swapped strides; no data is copied.
print("A strides =", A.strides)
print("A.T strides =", A.T.strides)
print("A.T shares memory with A ?", np.shares_memory(A, A.T))

2. Elementwise Operations, Trace, and Transpose

Before the main algebraic operation, matrix multiplication, there is a simpler layer of operations: entrywise arithmetic, scalar scaling, trace, and transpose.

These operations appear constantly in optimizers, masks, residual updates, and gradient formulas.

Code cell 7

# === 2.1 Elementwise operations and transpose identities ===
A = np.array([[1., 2.],
              [3., 4.]])
B = np.array([[5., 6.],
              [7., 8.]])
alpha = -0.5

print("A + B =\n", A + B)
print("A - B =\n", A - B)
print("alpha * A =\n", alpha * A)
print("Hadamard A * B =\n", A * B)
print("trace(A) =", np.trace(A))
print("trace(A.T @ B) =", np.trace(A.T @ B))
print("Frobenius inner(A, B) =", frobenius_inner(A, B))

AB = A @ B
lhs = (AB).T
rhs = B.T @ A.T
print("(AB)^T =\n", lhs)
print("B^T A^T =\n", rhs)
print("(AB)^T == B^T A^T ?", np.allclose(lhs, rhs))

3. Matrix Multiplication from Multiple Views

Matrix multiplication is the computational heart of the chapter. A single formula is not enough intuition. You want to be able to see it in at least three ways:

  1. row of A dot column of B
  2. A transforming each column of B
  3. sum of outer products

When those three views line up in your head, many AI formulas become much easier to read.

Code cell 9

# === 3.1 Row, column, and outer-product views of AB ===
A = np.array([[1., 2., 3.],
              [4., 5., 6.]])
B = np.array([[1., 0.],
              [0., 1.],
              [1., 1.]])

C = A @ B
print("A shape:", A.shape, "B shape:", B.shape, "C shape:", C.shape)
print("C = A @ B =\n", C)

# Row-column dot product view
c00_manual = np.dot(A[0, :], B[:, 0])
c01_manual = np.dot(A[0, :], B[:, 1])
print("manual first row entries via dot products:", c00_manual, c01_manual)

# Column view: each output column is A applied to one column of B
print("A @ B[:, 0] =", A @ B[:, 0])
print("A @ B[:, 1] =", A @ B[:, 1])

# Outer-product view
C_outer = outer_sum_product(A, B)
print("sum of outer products =\n", C_outer)
print("outer-product view matches matmul ?", np.allclose(C, C_outer))

# Rank bottleneck intuition
print("rank(A) =", np.linalg.matrix_rank(A))
print("rank(B) =", np.linalg.matrix_rank(B))
print("rank(C) =", np.linalg.matrix_rank(C))

4. Dense Layers and Attention as Matrix Operations

The cleanest way to see why matrices dominate AI is to implement the core computations directly:

  • batched dense layer: Y = X W^T + b
  • single-head attention: S = Q K^T / sqrt(d_k), A = softmax(S), O = A V

Once you can read the shapes here automatically, transformer code stops feeling mysterious.

Code cell 11

# === 4.1 Dense layer and attention as matrix pipelines ===
X = np.array([[1.0, 0.5, -1.0],
              [0.0, 2.0, 1.0]])          # batch of 2, feature dim 3
W = np.array([[1.0, -1.0, 0.5],
              [0.0, 2.0, 1.0]])         # 2 output units, 3 input units
b = np.array([0.1, -0.2])

Y = X @ W.T + b
print("Dense layer shapes: X", X.shape, "W", W.shape, "b", b.shape, "Y", Y.shape)
print("Y =\n", Y)

Q = np.array([[1.0, 0.0],
              [0.5, 1.0],
              [1.0, 1.0],
              [0.0, 1.0]])
K = np.array([[1.0, 0.0],
              [1.0, 1.0],
              [0.0, 1.0],
              [0.5, 0.5]])
V = np.array([[2.0, 0.0],
              [1.0, 1.0],
              [0.0, 2.0],
              [1.5, 0.5]])

dk = Q.shape[1]
S = (Q @ K.T) / np.sqrt(dk)
A_attn = softmax_rows(S)
O = A_attn @ V

print("\nAttention score matrix S shape:", S.shape)
print(S)
print("\nAttention weights shape:", A_attn.shape)
print(A_attn)
print("row sums:", A_attn.sum(axis=1))
print("\nAttention output O shape:", O.shape)
print(O)

print("\nIf Q = K, score matrix is symmetric?", np.allclose(Q @ Q.T, (Q @ Q.T).T))

5. Inverse, Solving, and Condition Number

Mathematically, A^{-1} is clean. Numerically, solving Ax = b directly is usually better than explicitly forming the inverse.

Learning move: compare a well-conditioned matrix with an ill-conditioned one and watch how the condition number changes the problem.

Code cell 13

# === 5.1 Inverse, solve, and conditioning ===
A_good = np.array([[3.0, 1.0],
                   [1.0, 2.0]])
b = np.array([1.0, 0.0])

x_solve = np.linalg.solve(A_good, b)
x_inv = np.linalg.inv(A_good) @ b
print("A_good =\n", A_good)
print("solve(A_good, b) =", x_solve)
print("inv(A_good) @ b =", x_inv)
print("condition number of A_good =", np.linalg.cond(A_good))

# Ill-conditioned example: a nearly singular 2x2 system
A_bad = np.array([[1.0, 1.0],
                  [1.0, 1.0001]])
b_bad = np.array([2.0, 2.0001])
x_bad = np.linalg.solve(A_bad, b_bad)

print("\nA_bad =\n", A_bad)
print("condition number of A_bad =", np.linalg.cond(A_bad))
print("solution to A_bad x = b_bad =", x_bad)

# Tiny perturbation in b
b_bad_perturbed = b_bad + np.array([0.0, 1e-4])
x_bad_perturbed = np.linalg.solve(A_bad, b_bad_perturbed)
print("perturbed solution =", x_bad_perturbed)
print("solution change =", x_bad_perturbed - x_bad)

6. Pseudo-Inverse, Least Squares, and Projection

The pseudo-inverse is what you use when the ordinary inverse does not exist or is not the right object. In practice, this means least squares, overdetermined systems, and projection onto the column space.

The geometric picture matters more than the formula: A x* is the closest reachable point to b.

Code cell 15

# === 6.1 Least squares and Moore-Penrose pseudo-inverse ===
A = np.array([[1.0, 2.0],
              [3.0, 4.0],
              [5.0, 6.0]])
b = np.array([1.0, 2.0, 2.5])

x_star = np.linalg.pinv(A) @ b
projection = A @ x_star
residual = b - projection

print("least-squares x* =", x_star)
print("A x* =", projection)
print("residual =", residual)
print("residual norm =", np.linalg.norm(residual))

# Residual should be orthogonal to col(A)
print("A^T residual =", A.T @ residual)

# Projection matrix onto col(A)
P_col = A @ np.linalg.pinv(A)
print("\nProjection matrix P = A A^+ =\n", P_col)
print("P^2 == P ?", np.allclose(P_col @ P_col, P_col))
print("P^T == P ?", np.allclose(P_col.T, P_col))
print("P b =", P_col @ b)
print("matches A x* ?", np.allclose(P_col @ b, projection))

8. Matrix Decompositions — Preview

The decompositions below are previewed here with working code — enough to know which one to reach for and how to call it. Full derivations, algorithms, and proofs live in their canonical sections.

Decomposition        When to use                                   Canonical section
LU                   Solve Ax = b, general square A                03-Advanced §08
QR                   Least squares, orthonormalisation             03-Advanced §08
Cholesky             Symmetric positive definite systems           03-Advanced §08
Eigendecomposition   Powers, spectral analysis                     03-Advanced §01
SVD                  Low-rank approx, pseudo-inverse, all cases    03-Advanced §02

Code cell 17

# === 8.1 Decomposition decision: which one for which job? ===

import numpy as np

np.random.seed(42)
n = 4

M = np.random.randn(n, n)
A_spd  = M.T @ M + n * np.eye(n)   # symmetric positive definite
A_gen  = np.random.randn(n, n)      # general square
A_rect = np.random.randn(n+2, n)    # rectangular (overdetermined)
b      = np.random.randn(n)
b_rect = np.random.randn(n+2)

# LU: solve general square system
x_lu = np.linalg.solve(A_gen, b)
print(f'LU solve residual:         {np.linalg.norm(A_gen @ x_lu - b):.2e}')

# Cholesky: solve SPD system
L = np.linalg.cholesky(A_spd)
y = np.linalg.solve(L, b)
x_chol = np.linalg.solve(L.T, y)
print(f'Cholesky solve residual:   {np.linalg.norm(A_spd @ x_chol - b):.2e}')

# QR: least squares overdetermined system
Q, R = np.linalg.qr(A_rect)
x_qr = np.linalg.solve(R, Q.T @ b_rect)
print(f'QR least-squares residual: {np.linalg.norm(A_rect @ x_qr - b_rect):.4f}  (non-zero: overdetermined)')

# SVD: rank, pseudo-inverse, low-rank approximation
U, s, Vt = np.linalg.svd(A_rect, full_matrices=False)
rank  = int(np.sum(s > 1e-10))
x_svd = Vt.T @ np.diag(1/s) @ U.T @ b_rect
print(f'SVD numerical rank:        {rank}')
print(f'SVD pseudo-inverse ||x||:  {np.linalg.norm(x_svd):.4f}')

# Eigendecomposition: works for square diagonalisable matrices
vals, vecs = np.linalg.eig(A_spd)
A4_via_eig = vecs @ np.diag(vals**4) @ np.linalg.inv(vecs)
A4_direct  = np.linalg.matrix_power(A_spd, 4)
print(f'Eig A^4 error vs direct:   {np.linalg.norm(A4_via_eig - A4_direct):.2e}')

print('\nFull treatment: 03-Advanced-Linear-Algebra (eigenvalues, SVD, decompositions).')
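
The chapter outline also promises low-rank approximation and LoRA intuition (Hu et al., 2022), which the preview cell above does not show. The sketch below is an illustrative addition, not one of the original cells: the sizes, the rank r, and the names W0, B, A_lr are arbitrary. It checks the Eckart-Young fact that the truncated SVD gives the best rank-r approximation, then forms a LoRA-style update W0 + B @ A_lr whose trainable parameter count scales with r rather than with m * n:

import numpy as np

rng = np.random.default_rng(42)
m, n, r = 64, 48, 4

W0 = rng.standard_normal((m, n))        # frozen "pretrained" weight
U, s, Vt = np.linalg.svd(W0, full_matrices=False)

# Best rank-r approximation (Eckart-Young): keep the top r singular triplets.
W_r = U[:, :r] @ np.diag(s[:r]) @ Vt[:r, :]
print("rank-r approx error (Frobenius):", np.linalg.norm(W0 - W_r))
print("sqrt of discarded sigma^2 sum  :", np.sqrt(np.sum(s[r:] ** 2)))

# LoRA-style update: the effective weight is W0 + B @ A_lr, rank(B @ A_lr) <= r.
B = rng.standard_normal((m, r)) * 0.01
A_lr = rng.standard_normal((r, n)) * 0.01
W_eff = W0 + B @ A_lr
print("rank of update B @ A_lr        :", np.linalg.matrix_rank(B @ A_lr))
print("trainable numbers: full =", m * n, " low-rank =", r * (m + n))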

11. GEMM vs GEMV and the Hardware View

The same mathematics can produce very different hardware behavior. A large batched matrix-matrix multiply has high data reuse and can keep tensor units busy. A matrix-vector multiply has far less reuse and is often bandwidth-bound.

This is one reason training throughput and single-token decoding latency behave so differently.

Code cell 19

# === 11.1 Arithmetic intensity intuition for GEMV vs GEMM ===
def gemv_intensity(m, n):
    flops = 2 * m * n
    bytes_moved = 8 * (m * n + n + m)  # rough float64 accounting for intuition only
    return flops / bytes_moved


def gemm_intensity(m, p, n):
    flops = 2 * m * p * n
    bytes_moved = 8 * (m * p + p * n + m * n)
    return flops / bytes_moved


m, n, p = 4096, 4096, 4096
print("Approx arithmetic intensity GEMV(4096, 4096) =", gemv_intensity(m, n))
print("Approx arithmetic intensity GEMM(4096, 4096, 4096) =", gemm_intensity(m, p, n))

# Batching converts many GEMV-like workloads toward GEMM.
for batch in [1, 4, 16, 64]:
    intensity = gemm_intensity(batch, 4096, 4096)
    print(f"batch={batch:2d} -> batched matmul intensity ~ {intensity:.4f}")

print("\nInterpretation:")
print("Higher arithmetic intensity usually means better chance of being compute-bound.")
print("Lower arithmetic intensity usually means memory traffic dominates.")

12. What to Notice

The main conceptual moves in this notebook are:

  • matrix multiplication is composition, not just a formula
  • transpose and inverse reverse order because composition is order-sensitive
  • pseudo-inverse is really about projection and least squares
  • QR exposes orthogonality, eigendecomposition exposes invariant directions, SVD exposes full low-rank structure
  • Cholesky is what happens when symmetry and positive definiteness make the problem easier
  • the AI systems view is not separate from the math: GEMM, GEMV, low-rank updates, and numerical conditioning are the math made operational

The next clean step after this notebook is to go deeper on spectral structure: eigenvalues, eigenvectors, singular values, and what they say about dynamics, compression, and optimization.