Theory Notebook
Converted from theory.ipynb for web reading.
Embedding Space Math
This notebook is the executable companion to notes.md. It turns embedding lookup, vector similarity, analogy structure, position encodings, output gradients, and diagnostics into small checked computations.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn theming when it is installed; otherwise fall back to the
# seaborn-flavored style sheet that ships with matplotlib itself.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults so every demo plot in the notebook looks consistent.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Seed the legacy global RNG for reproducibility of any np.random.* calls.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
# Colorblind-safe hex palette shared by every figure in the notebook.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
def header(title):
    """Print *title* framed above and below by a 72-character rule."""
    rule = "=" * 72
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(condition, name):
    """Report PASS/FAIL for a boolean *condition*, then assert it holds."""
    passed = bool(condition)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}")
    assert passed, name
def check_close(value, target, tol=1e-8, name="value"):
    """Report PASS/FAIL for |value - target| <= tol, then assert it."""
    got = float(value)
    want = float(target)
    ok = abs(got - want) <= tol
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}: got {got:.6f}, expected {want:.6f}")
    assert ok, name
def normalize_rows(X):
    """Rescale each row of X to unit L2 norm (near-zero rows pass through)."""
    lengths = np.linalg.norm(X, axis=1, keepdims=True)
    # Clamp the divisor so all-zero rows do not produce NaN.
    safe_lengths = np.maximum(lengths, 1e-12)
    return X / safe_lengths
def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors as a Python float."""
    numerator = np.dot(u, v)
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    return float(numerator / denominator)
def sinusoidal_positions(n_positions, d_model):
    """Return an (n_positions, d_model) sinusoidal positional-encoding table.

    Even columns carry sin, odd columns cos, with frequencies decaying as in
    the original Transformer formulation (base 10000).
    """
    positions = np.arange(n_positions)[:, None]
    dims = np.arange(d_model)[None, :]
    # Each sin/cos pair shares one frequency, hence the dims // 2 pairing.
    rates = 1 / np.power(10000, (2 * (dims // 2)) / d_model)
    phase = positions * rates
    table = np.zeros((n_positions, d_model))
    table[:, 0::2] = np.sin(phase[:, 0::2])
    table[:, 1::2] = np.cos(phase[:, 1::2])
    return table
def rope_rotate(x, position, base=10000.0):
    """Apply RoPE: rotate each (even, odd) pair of x by a position-dependent angle.

    Requires an even-dimensional vector; rotation preserves the L2 norm.
    """
    x = np.asarray(x, dtype=float)
    d = x.shape[0]
    assert d % 2 == 0
    # One angle per 2-D pair; frequency decays with the pair index.
    pair_index = np.arange(0, d, 2)
    theta = position / base ** (pair_index / d)
    c, s = np.cos(theta), np.sin(theta)
    even, odd = x[0::2], x[1::2]
    out = np.empty_like(x)
    out[0::2] = c * even - s * odd
    out[1::2] = s * even + c * odd
    return out
def alibi_bias(n, slope=-0.5):
    """Return an (n, n) ALiBi bias: slope * distance to each past position.

    Entries above the diagonal (future keys) are zero; farther past keys
    receive a proportionally larger (more negative, for slope < 0) bias.
    """
    query_idx = np.arange(n)[:, None]
    key_idx = np.arange(n)[None, :]
    past_distance = np.maximum(query_idx - key_idx, 0)
    return slope * past_distance
def pca2(X):
    """Project the rows of X onto the top two principal axes.

    Returns (coords, S) where coords is (n, 2) and S holds all singular
    values of the centered data, largest first.
    """
    centered = X - X.mean(axis=0, keepdims=True)
    _, S, Vt = np.linalg.svd(centered, full_matrices=False)
    return centered @ Vt[:2].T, S
def softmax(logits, axis=-1):
    """Numerically stable softmax of *logits* along *axis* (default: last).

    Generalized from the original 1-D-only version: the max-shift and the
    normalizing sum are now taken along *axis* with keepdims, so batched
    2-D (or higher) inputs work row-wise while 1-D behavior is unchanged.
    """
    logits = np.asarray(logits, dtype=float)
    # Subtracting the per-slice max prevents overflow in exp.
    shifted = logits - logits.max(axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)
# Confirmation marker that the helper-definition cell executed fully.
print("Embedding helpers ready.")
Demo 1: From token ids to vectors
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 5
# Embedding lookup: integer ids index rows of the table, giving B x T x d.
header("Demo 1: From token ids to vectors - lookup")
emb_table = np.arange(30, dtype=float).reshape(10, 3) / 10
token_ids = np.array([[1, 2, 1], [3, 4, 5]])
gathered = emb_table[token_ids]
print("ids shape:", token_ids.shape, "embedding shape:", gathered.shape)
print("First vector:", gathered[0, 0].tolist())
check_true(gathered.shape == (2, 3, 3), "lookup creates B x T x d tensor")
Demo 2: Why continuous geometry helps
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 7
# A one-hot row vector times the embedding matrix selects the matching row.
header("Demo 2: Why continuous geometry helps - one-hot equivalence")
table = np.arange(20, dtype=float).reshape(5, 4)
row = 3
indicator = np.eye(5)[row]
direct = table[row]
via_matmul = indicator @ table
print("lookup:", direct.tolist())
print("one-hot product:", via_matmul.tolist())
check_true(np.allclose(direct, via_matmul), "one-hot multiplication equals row lookup")
Demo 3: Embedding space as model memory
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 9
# Cosine similarity on unit-normalized rows; nearest neighbor by direction.
header("Demo 3: Embedding space as model memory - cosine neighbors")
tokens = ["cat", "dog", "car"]
vecs = np.array([[1.0, 0.9], [0.9, 1.0], [-1.0, 0.2]])
unit = normalize_rows(vecs)
sims = unit @ unit.T
print("Cosine matrix:", np.round(sims, 3))
# Rank 0 is "cat" itself, so take the second-ranked token.
ranking = np.argsort(-sims[0])
nearest = tokens[int(ranking[1])]
print("Nearest to cat:", nearest)
check_true(nearest == "dog", "nearest neighbor reflects direction")
Demo 4: Static versus contextual meaning
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 11
# Synthetic analogy: king - man + woman lands exactly on queen.
header("Demo 4: Static versus contextual meaning - analogy")
gender_axis = np.array([1.0, 0.0])
royalty_axis = np.array([0.0, 1.0])
man = gender_axis
woman = -gender_axis
king = gender_axis + royalty_axis
queen = royalty_axis - gender_axis
pred = king - man + woman
print("Predicted queen vector:", pred.tolist())
check_true(np.allclose(pred, queen), "linear offset recovers synthetic relation")
Demo 5: Pipeline position after tokenization
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 13
# Anisotropy: a shared mean offset dominates; centering removes it.
header("Demo 5: Pipeline position after tokenization - anisotropy")
rng = np.random.default_rng(42)
offset = np.zeros(8)
offset[0] = 3.0
X = rng.normal(size=(40, 8)) + offset
mean_norm = np.linalg.norm(X.mean(axis=0))
centered = X - X.mean(axis=0)
centered_mean_norm = np.linalg.norm(centered.mean(axis=0))
print("Mean norm before:", round(float(mean_norm), 4))
print("Mean norm after centering:", round(float(centered_mean_norm), 8))
check_true(centered_mean_norm < mean_norm, "centering removes dominant mean direction")
Demo 6: Embedding matrix
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 15
# Cross-entropy gradient on output rows: dL/dW = (p - onehot(y)) outer h.
header("Demo 6: Embedding matrix - softmax gradient")
h = np.array([1.0, -1.0])
W = np.array([[0.2, 0.0], [-0.1, 0.3], [0.4, -0.2]])
p = softmax(W @ h)
y = 2
target_onehot = np.eye(3)[y]
grad = np.outer(p - target_onehot, h)
print("Probabilities:", np.round(p, 4).tolist())
print("Target-row gradient:", np.round(grad[y], 4).tolist())
check_true(grad[y, 0] < 0, "target row moves toward hidden state under gradient descent")
Demo 7: One-hot lookup equivalence
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 17
# Sinusoidal position table: position 0 gives sin(0)=0; plot two dims.
header("Demo 7: One-hot lookup equivalence - sinusoidal positions")
pe = sinusoidal_positions(6, 8)
print("PE shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin at position zero")
fig, ax = plt.subplots()
for dim, color_key in enumerate(("primary", "secondary")):
    ax.plot(pe[:, dim], color=COLORS[color_key], label=f"dim {dim}")
ax.set_title("Sinusoidal positional encoding")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 8: Batch sequence tensor shapes
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 19
# RoPE is a rotation, so the vector's L2 norm is unchanged.
header("Demo 8: Batch sequence tensor shapes - RoPE norm")
vec = np.array([1.0, 0.0, 0.5, -0.5])
rotated = rope_rotate(vec, position=7)
print("Original norm:", round(float(np.linalg.norm(vec)), 6))
print("Rotated norm:", round(float(np.linalg.norm(rotated)), 6))
check_close(np.linalg.norm(rotated), np.linalg.norm(vec), name="rotation preserves norm")
Demo 9: Input output and tied embeddings
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 21
# ALiBi bias grows linearly with distance into the past; visualize as heatmap.
header("Demo 9: Input output and tied embeddings - ALiBi bias")
bias = alibi_bias(5, slope=-0.25)
print("Bias matrix:", np.round(bias, 2))
check_close(bias[4, 0], -1.0, name="farther past receives larger penalty")
fig, ax = plt.subplots()
heat = ax.imshow(bias, cmap="viridis")
ax.set_title("ALiBi causal distance bias")
ax.set_xlabel("Key position")
ax.set_ylabel("Query position")
fig.colorbar(heat, ax=ax)
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 10: Residual stream initialization
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 23
# PCA diagnostic: project 5-D samples to the top-2 principal plane.
header("Demo 10: Residual stream initialization - PCA diagnostic")
rng = np.random.default_rng(0)
sample = rng.normal(size=(20, 5))
coords, S = pca2(sample)
print("PCA coords shape:", coords.shape)
print("Top singular values:", np.round(S[:3], 4).tolist())
check_true(coords.shape == (20, 2), "PCA projection is two-dimensional")
Demo 11: Dot product
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 25
# Embedding lookup: integer ids index rows of the table, giving B x T x d.
header("Demo 11: Dot product - lookup")
emb_table = np.arange(30, dtype=float).reshape(10, 3) / 10
token_ids = np.array([[1, 2, 1], [3, 4, 5]])
gathered = emb_table[token_ids]
print("ids shape:", token_ids.shape, "embedding shape:", gathered.shape)
print("First vector:", gathered[0, 0].tolist())
check_true(gathered.shape == (2, 3, 3), "lookup creates B x T x d tensor")
Demo 12: Cosine similarity
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 27
# A one-hot row vector times the embedding matrix selects the matching row.
header("Demo 12: Cosine similarity - one-hot equivalence")
table = np.arange(20, dtype=float).reshape(5, 4)
row = 3
indicator = np.eye(5)[row]
direct = table[row]
via_matmul = indicator @ table
print("lookup:", direct.tolist())
print("one-hot product:", via_matmul.tolist())
check_true(np.allclose(direct, via_matmul), "one-hot multiplication equals row lookup")
Demo 13: Euclidean distance
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 29
# Cosine similarity on unit-normalized rows; nearest neighbor by direction.
header("Demo 13: Euclidean distance - cosine neighbors")
tokens = ["cat", "dog", "car"]
vecs = np.array([[1.0, 0.9], [0.9, 1.0], [-1.0, 0.2]])
unit = normalize_rows(vecs)
sims = unit @ unit.T
print("Cosine matrix:", np.round(sims, 3))
# Rank 0 is "cat" itself, so take the second-ranked token.
ranking = np.argsort(-sims[0])
nearest = tokens[int(ranking[1])]
print("Nearest to cat:", nearest)
check_true(nearest == "dog", "nearest neighbor reflects direction")
Demo 14: Norms and frequency effects
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 31
# Synthetic analogy: king - man + woman lands exactly on queen.
header("Demo 14: Norms and frequency effects - analogy")
gender_axis = np.array([1.0, 0.0])
royalty_axis = np.array([0.0, 1.0])
man = gender_axis
woman = -gender_axis
king = gender_axis + royalty_axis
queen = royalty_axis - gender_axis
pred = king - man + woman
print("Predicted queen vector:", pred.tolist())
check_true(np.allclose(pred, queen), "linear offset recovers synthetic relation")
Demo 15: Nearest neighbors
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 33
# Anisotropy: a shared mean offset dominates; centering removes it.
header("Demo 15: Nearest neighbors - anisotropy")
rng = np.random.default_rng(42)
offset = np.zeros(8)
offset[0] = 3.0
X = rng.normal(size=(40, 8)) + offset
mean_norm = np.linalg.norm(X.mean(axis=0))
centered = X - X.mean(axis=0)
centered_mean_norm = np.linalg.norm(centered.mean(axis=0))
print("Mean norm before:", round(float(mean_norm), 4))
print("Mean norm after centering:", round(float(centered_mean_norm), 8))
check_true(centered_mean_norm < mean_norm, "centering removes dominant mean direction")
Demo 16: Analogy directions
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 35
# Cross-entropy gradient on output rows: dL/dW = (p - onehot(y)) outer h.
header("Demo 16: Analogy directions - softmax gradient")
h = np.array([1.0, -1.0])
W = np.array([[0.2, 0.0], [-0.1, 0.3], [0.4, -0.2]])
p = softmax(W @ h)
y = 2
target_onehot = np.eye(3)[y]
grad = np.outer(p - target_onehot, h)
print("Probabilities:", np.round(p, 4).tolist())
print("Target-row gradient:", np.round(grad[y], 4).tolist())
check_true(grad[y, 0] < 0, "target row moves toward hidden state under gradient descent")
Demo 17: Subspaces and probes
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 37
# Sinusoidal position table: position 0 gives sin(0)=0; plot two dims.
header("Demo 17: Subspaces and probes - sinusoidal positions")
pe = sinusoidal_positions(6, 8)
print("PE shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin at position zero")
fig, ax = plt.subplots()
for dim, color_key in enumerate(("primary", "secondary")):
    ax.plot(pe[:, dim], color=COLORS[color_key], label=f"dim {dim}")
ax.set_title("Sinusoidal positional encoding")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 18: Isotropy and anisotropy
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 39
# RoPE is a rotation, so the vector's L2 norm is unchanged.
header("Demo 18: Isotropy and anisotropy - RoPE norm")
vec = np.array([1.0, 0.0, 0.5, -0.5])
rotated = rope_rotate(vec, position=7)
print("Original norm:", round(float(np.linalg.norm(vec)), 6))
print("Rotated norm:", round(float(np.linalg.norm(rotated)), 6))
check_close(np.linalg.norm(rotated), np.linalg.norm(vec), name="rotation preserves norm")
Demo 19: Centering and whitening
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 41
# ALiBi bias grows linearly with distance into the past; visualize as heatmap.
header("Demo 19: Centering and whitening - ALiBi bias")
bias = alibi_bias(5, slope=-0.25)
print("Bias matrix:", np.round(bias, 2))
check_close(bias[4, 0], -1.0, name="farther past receives larger penalty")
fig, ax = plt.subplots()
heat = ax.imshow(bias, cmap="viridis")
ax.set_title("ALiBi causal distance bias")
ax.set_xlabel("Key position")
ax.set_ylabel("Query position")
fig.colorbar(heat, ax=ax)
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 20: Bias and representation directions
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 43
# PCA diagnostic: project 5-D samples to the top-2 principal plane.
header("Demo 20: Bias and representation directions - PCA diagnostic")
rng = np.random.default_rng(0)
sample = rng.normal(size=(20, 5))
coords, S = pca2(sample)
print("PCA coords shape:", coords.shape)
print("Top singular values:", np.round(S[:3], 4).tolist())
check_true(coords.shape == (20, 2), "PCA projection is two-dimensional")
Demo 21: Language-model loss gradients
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 45
# Embedding lookup: integer ids index rows of the table, giving B x T x d.
header("Demo 21: Language-model loss gradients - lookup")
emb_table = np.arange(30, dtype=float).reshape(10, 3) / 10
token_ids = np.array([[1, 2, 1], [3, 4, 5]])
gathered = emb_table[token_ids]
print("ids shape:", token_ids.shape, "embedding shape:", gathered.shape)
print("First vector:", gathered[0, 0].tolist())
check_true(gathered.shape == (2, 3, 3), "lookup creates B x T x d tensor")
Demo 22: Word2vec and negative sampling
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 47
# A one-hot row vector times the embedding matrix selects the matching row.
header("Demo 22: Word2vec and negative sampling - one-hot equivalence")
table = np.arange(20, dtype=float).reshape(5, 4)
row = 3
indicator = np.eye(5)[row]
direct = table[row]
via_matmul = indicator @ table
print("lookup:", direct.tolist())
print("one-hot product:", via_matmul.tolist())
check_true(np.allclose(direct, via_matmul), "one-hot multiplication equals row lookup")
Demo 23: GloVe and co-occurrence factorization
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 49
# Cosine similarity on unit-normalized rows; nearest neighbor by direction.
header("Demo 23: GloVe and co-occurrence factorization - cosine neighbors")
tokens = ["cat", "dog", "car"]
vecs = np.array([[1.0, 0.9], [0.9, 1.0], [-1.0, 0.2]])
unit = normalize_rows(vecs)
sims = unit @ unit.T
print("Cosine matrix:", np.round(sims, 3))
# Rank 0 is "cat" itself, so take the second-ranked token.
ranking = np.argsort(-sims[0])
nearest = tokens[int(ranking[1])]
print("Nearest to cat:", nearest)
check_true(nearest == "dog", "nearest neighbor reflects direction")
Demo 24: Fine-tuning drift
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 51
# Synthetic analogy: king - man + woman lands exactly on queen.
header("Demo 24: Fine-tuning drift - analogy")
gender_axis = np.array([1.0, 0.0])
royalty_axis = np.array([0.0, 1.0])
man = gender_axis
woman = -gender_axis
king = gender_axis + royalty_axis
queen = royalty_axis - gender_axis
pred = king - man + woman
print("Predicted queen vector:", pred.tolist())
check_true(np.allclose(pred, queen), "linear offset recovers synthetic relation")
Demo 25: Vocabulary resizing
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 53
# Anisotropy: a shared mean offset dominates; centering removes it.
header("Demo 25: Vocabulary resizing - anisotropy")
rng = np.random.default_rng(42)
offset = np.zeros(8)
offset[0] = 3.0
X = rng.normal(size=(40, 8)) + offset
mean_norm = np.linalg.norm(X.mean(axis=0))
centered = X - X.mean(axis=0)
centered_mean_norm = np.linalg.norm(centered.mean(axis=0))
print("Mean norm before:", round(float(mean_norm), 4))
print("Mean norm after centering:", round(float(centered_mean_norm), 8))
check_true(centered_mean_norm < mean_norm, "centering removes dominant mean direction")