
Exercises: Embedding Space Math

There are 10 exercises: 1-3 cover lookup and similarity, 4-6 cover geometry and gradients, and 7-10 cover positional encodings and systems cost.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def normalize_rows(X):
    # Scale each row to unit L2 norm (guarding against zero rows).
    norms = np.linalg.norm(X, axis=1, keepdims=True)
    return X / np.maximum(norms, 1e-12)

def cosine(u, v):
    # Cosine similarity between two vectors.
    return float(np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)))

def sinusoidal_positions(n_positions, d_model):
    # Classic transformer table: even columns get sin(pos / 10000^(2k/d)),
    # odd columns get cos at the same frequency.
    pos = np.arange(n_positions)[:, None]
    i = np.arange(d_model)[None, :]
    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model)
    angles = pos * angle_rates
    pe = np.zeros((n_positions, d_model))
    pe[:, 0::2] = np.sin(angles[:, 0::2])
    pe[:, 1::2] = np.cos(angles[:, 1::2])
    return pe

def rope_rotate(x, position, base=10000.0):
    # Rotate each (even, odd) coordinate pair of x by a position-dependent angle.
    x = np.asarray(x, dtype=float)
    assert x.shape[0] % 2 == 0
    d = x.shape[0]
    out = x.copy()
    for k in range(0, d, 2):
        theta = position / (base ** (k / d))
        c, s = np.cos(theta), np.sin(theta)
        a, b = x[k], x[k + 1]
        out[k] = c * a - s * b
        out[k + 1] = s * a + c * b
    return out

def alibi_bias(n, slope=-0.5):
    # Causal linear bias: slope * (query index - key index), zero on/above the diagonal.
    i = np.arange(n)[:, None]
    j = np.arange(n)[None, :]
    dist = np.maximum(i - j, 0)
    return slope * dist

def pca2(X):
    # Project rows onto the top two principal components via SVD of the centered data.
    Xc = X - X.mean(axis=0, keepdims=True)
    U, S, Vt = np.linalg.svd(Xc, full_matrices=False)
    return Xc @ Vt[:2].T, S

def softmax(logits):
    # Numerically stable softmax: shift by the max before exponentiating.
    logits = np.asarray(logits, dtype=float)
    exp = np.exp(logits - logits.max())
    return exp / exp.sum()

print("Embedding helpers ready.")

Exercise 1: Embedding lookup (*)

Convert token ids into a batch of vectors. State the shapes, compute the result, and explain the LLM consequence.

Code cell 5

# Your Solution - Exercise 1
answer = None
print("Your answer placeholder:", answer)

Code cell 6

# Solution - Exercise 1
header("Exercise 1: Embedding lookup")
E = np.arange(12, dtype=float).reshape(4, 3)
ids = np.array([[0, 2], [3, 1]])
X = E[ids]
print("Shape:", X.shape)
check_true(X.shape == (2, 2, 3), "B x T ids become B x T x d vectors")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 2: One-hot equivalence (*)

Show lookup equals one-hot matrix multiplication. State the shapes, compute the result, and explain the LLM consequence.

Code cell 8

# Your Solution - Exercise 2
answer = None
print("Your answer placeholder:", answer)

Code cell 9

# Solution - Exercise 2
header("Exercise 2: One-hot equivalence")
E = np.arange(15, dtype=float).reshape(5, 3)
idx = 4
product = np.eye(5)[idx] @ E
print("Product:", product.tolist())
check_true(np.allclose(product, E[idx]), "one-hot lookup matches direct row")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 3: Cosine similarity (*)

Compute a nearest neighbor by angle. State the shapes, compute the result, and explain the LLM consequence.

Code cell 11

# Your Solution - Exercise 3
answer = None
print("Your answer placeholder:", answer)

Code cell 12

# Solution - Exercise 3
header("Exercise 3: Cosine similarity")
u = np.array([1.0, 1.0])
v = np.array([2.0, 2.0])
w = np.array([-1.0, 0.0])
print("cos(u,v):", cosine(u, v), "cos(u,w):", round(cosine(u, w), 4))
check_true(cosine(u, v) > cosine(u, w), "aligned vector is more similar")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 4: Analogy arithmetic (**)

Recover a synthetic relation vector. State the shapes, compute the result, and explain the LLM consequence.

Code cell 14

# Your Solution - Exercise 4
answer = None
print("Your answer placeholder:", answer)

Code cell 15

# Solution - Exercise 4
header("Exercise 4: Analogy arithmetic")
a = np.array([1.0, 0.0])
b = np.array([0.0, 1.0])
c = np.array([2.0, 0.0])
target = c - a + b
print("Target:", target.tolist())
check_true(np.allclose(target, [1.0, 1.0]), "offset arithmetic is consistent")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 5: Centering anisotropy (**)

Remove a dominant mean direction. State the shapes, compute the result, and explain the LLM consequence.

Code cell 17

# Your Solution - Exercise 5
answer = None
print("Your answer placeholder:", answer)

Code cell 18

# Solution - Exercise 5
header("Exercise 5: Centering anisotropy")
X = np.array([[3.0, 1.0], [3.0, -1.0], [4.0, 0.0]])
centered = X - X.mean(axis=0, keepdims=True)
print("Mean after centering:", centered.mean(axis=0).tolist())
check_true(np.allclose(centered.mean(axis=0), 0.0), "centering removes mean vector")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 6: Softmax row gradient (**)

Compute how the output (unembedding) rows move under the cross-entropy gradient. State the shapes, compute the result, and explain the LLM consequence.

Code cell 20

# Your Solution - Exercise 6
answer = None
print("Your answer placeholder:", answer)

Code cell 21

# Solution - Exercise 6
header("Exercise 6: Softmax row gradient")
h = np.array([1.0, 0.0])
p = np.array([0.2, 0.3, 0.5])
y = 1
grad = (p[:, None] - np.eye(3)[y][:, None]) * h[None, :]
print("Gradient rows:", grad.tolist())
check_true(grad[y, 0] < 0, "target row gradient is negative along h")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 7: Sinusoidal encoding (**)

Build a tiny position table. State the shapes, compute the result, and explain the LLM consequence.

Code cell 23

# Your Solution - Exercise 7
answer = None
print("Your answer placeholder:", answer)

Code cell 24

# Solution - Exercise 7
header("Exercise 7: Sinusoidal encoding")
pe = sinusoidal_positions(4, 6)
print("Shape:", pe.shape)
check_close(pe[0, 0], 0.0, name="position-zero sine")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 8: RoPE norm preservation (***)

Apply a rotary position transform. State the shapes, compute the result, and explain the LLM consequence.

Code cell 26

# Your Solution - Exercise 8
answer = None
print("Your answer placeholder:", answer)

Code cell 27

# Solution - Exercise 8
header("Exercise 8: RoPE norm preservation")
x = np.array([1.0, 2.0])
rot = rope_rotate(x, position=3)
print("Rotated:", np.round(rot, 4).tolist())
check_close(np.linalg.norm(rot), np.linalg.norm(x), name="RoPE preserves pair norm")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 9: ALiBi bias (***)

Create a causal distance-bias matrix. State the shapes, compute the result, and explain the LLM consequence.

Code cell 29

# Your Solution - Exercise 9
answer = None
print("Your answer placeholder:", answer)

Code cell 30

# Solution - Exercise 9
header("Exercise 9: ALiBi bias")
bias = alibi_bias(4, slope=-0.5)
print("Bias:", bias)
check_close(bias[3, 0], -1.5, name="distance three penalty")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")

Exercise 10: Parameter count (***)

Compute embedding parameters with and without tying. State the shapes, compute the result, and explain the LLM consequence.

Code cell 32

# Your Solution - Exercise 10
answer = None
print("Your answer placeholder:", answer)

Code cell 33

# Solution - Exercise 10
header("Exercise 10: Parameter count")
vocab, d = 50000, 4096
untied = 2 * vocab * d
tied = vocab * d
print("Untied:", untied, "Tied:", tied)
check_true(untied == 2 * tied, "tying halves input/output embedding table parameters")
print("\nTakeaway: embedding math turns token ids into geometry, and geometry controls similarity, logits, attention, and memory.")