Theory Notebook
Converted from theory.ipynb for web reading.
Embedding Space Math
This notebook is the executable companion to notes.md. It turns embedding lookup, vector similarity, analogy structure, position encodings, output gradients, and diagnostics into small checked computations.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn theming when it is installed; otherwise fall back to the
# seaborn-flavored style sheet that ships with matplotlib itself.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults so every demo plot in the notebook looks consistent.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Seed the legacy global RNG for reproducibility of any np.random.* calls.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
# Colorblind-safe hex palette shared by every figure in the notebook.
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
def header(title):
    """Print *title* framed above and below by a 72-character rule."""
    rule = "=" * 72
    print("\n" + rule)
    print(title)
    print(rule)
def check_true(condition, name):
    """Report PASS/FAIL for a boolean *condition*, then assert it holds."""
    passed = bool(condition)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}")
    assert passed, name
def check_close(value, target, tol=1e-8, name="value"):
    """Report PASS/FAIL for |value - target| <= tol, then assert it."""
    got = float(value)
    want = float(target)
    ok = abs(got - want) <= tol
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}: got {got:.6f}, expected {want:.6f}")
    assert ok, name
def normalize_rows(X):
    """Rescale each row of X to unit L2 norm (near-zero rows pass through)."""
    lengths = np.linalg.norm(X, axis=1, keepdims=True)
    # Clamp the divisor so all-zero rows do not produce NaN.
    safe_lengths = np.maximum(lengths, 1e-12)
    return X / safe_lengths
def cosine(u, v):
    """Return the cosine similarity of two 1-D vectors as a Python float."""
    numerator = np.dot(u, v)
    denominator = np.linalg.norm(u) * np.linalg.norm(v)
    return float(numerator / denominator)
def sinusoidal_positions(n_positions, d_model):
    """Return an (n_positions, d_model) sinusoidal positional-encoding table.

    Even columns carry sin, odd columns cos, with frequencies decaying as in
    the original Transformer formulation (base 10000).
    """
    positions = np.arange(n_positions)[:, None]
    dims = np.arange(d_model)[None, :]
    # Each sin/cos pair shares one frequency, hence the dims // 2 pairing.
    rates = 1 / np.power(10000, (2 * (dims // 2)) / d_model)
    phase = positions * rates
    table = np.zeros((n_positions, d_model))
    table[:, 0::2] = np.sin(phase[:, 0::2])
    table[:, 1::2] = np.cos(phase[:, 1::2])
    return table
def rope_rotate(x, position, base=10000.0):
    """Apply RoPE: rotate each (even, odd) pair of x by a position-dependent angle.

    Requires an even-dimensional vector; rotation preserves the L2 norm.
    """
    x = np.asarray(x, dtype=float)
    d = x.shape[0]
    assert d % 2 == 0
    # One angle per 2-D pair; frequency decays with the pair index.
    pair_index = np.arange(0, d, 2)
    theta = position / base ** (pair_index / d)
    c, s = np.cos(theta), np.sin(theta)
    even, odd = x[0::2], x[1::2]
    out = np.empty_like(x)
    out[0::2] = c * even - s * odd
    out[1::2] = s * even + c * odd
    return out
def alibi_bias(n, slope=-0.5):
    """Return an (n, n) ALiBi bias: slope * distance to each past position.

    Entries above the diagonal (future keys) are zero; farther past keys
    receive a proportionally larger (more negative, for slope < 0) bias.
    """
    query_idx = np.arange(n)[:, None]
    key_idx = np.arange(n)[None, :]
    past_distance = np.maximum(query_idx - key_idx, 0)
    return slope * past_distance
def pca2(X):
    """Project the rows of X onto the top two principal axes.

    Returns (coords, S) where coords is (n, 2) and S holds all singular
    values of the centered data, largest first.
    """
    centered = X - X.mean(axis=0, keepdims=True)
    _, S, Vt = np.linalg.svd(centered, full_matrices=False)
    return centered @ Vt[:2].T, S
def softmax(logits, axis=-1):
    """Numerically stable softmax of *logits* along *axis* (default: last).

    Generalized from the original 1-D-only version: the max-shift and the
    normalizing sum are now taken along *axis* with keepdims, so batched
    2-D (or higher) inputs work row-wise while 1-D behavior is unchanged.
    """
    logits = np.asarray(logits, dtype=float)
    # Subtracting the per-slice max prevents overflow in exp.
    shifted = logits - logits.max(axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)
# Confirmation marker that the helper-definition cell executed fully.
print("Embedding helpers ready.")
Demo 1: From token ids to vectors
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 5
# Embedding lookup: integer ids index rows of the table, giving B x T x d.
header("Demo 1: From token ids to vectors - lookup")
emb_table = np.arange(30, dtype=float).reshape(10, 3) / 10
token_ids = np.array([[1, 2, 1], [3, 4, 5]])
gathered = emb_table[token_ids]
print("ids shape:", token_ids.shape, "embedding shape:", gathered.shape)
print("First vector:", gathered[0, 0].tolist())
check_true(gathered.shape == (2, 3, 3), "lookup creates B x T x d tensor")
Demo 2: Why continuous geometry helps
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 7
# A one-hot row vector times the embedding matrix selects the matching row.
header("Demo 2: Why continuous geometry helps - one-hot equivalence")
table = np.arange(20, dtype=float).reshape(5, 4)
row = 3
indicator = np.eye(5)[row]
direct = table[row]
via_matmul = indicator @ table
print("lookup:", direct.tolist())
print("one-hot product:", via_matmul.tolist())
check_true(np.allclose(direct, via_matmul), "one-hot multiplication equals row lookup")
Demo 3: Embedding space as model memory
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 9
# Cosine similarity on unit-normalized rows; nearest neighbor by direction.
header("Demo 3: Embedding space as model memory - cosine neighbors")
tokens = ["cat", "dog", "car"]
vecs = np.array([[1.0, 0.9], [0.9, 1.0], [-1.0, 0.2]])
unit = normalize_rows(vecs)
sims = unit @ unit.T
print("Cosine matrix:", np.round(sims, 3))
# Rank 0 is "cat" itself, so take the second-ranked token.
ranking = np.argsort(-sims[0])
nearest = tokens[int(ranking[1])]
print("Nearest to cat:", nearest)
check_true(nearest == "dog", "nearest neighbor reflects direction")
Demo 4: Static versus contextual meaning
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 11
# Synthetic analogy: king - man + woman lands exactly on queen.
header("Demo 4: Static versus contextual meaning - analogy")
gender_axis = np.array([1.0, 0.0])
royalty_axis = np.array([0.0, 1.0])
man = gender_axis
woman = -gender_axis
king = gender_axis + royalty_axis
queen = royalty_axis - gender_axis
pred = king - man + woman
print("Predicted queen vector:", pred.tolist())
check_true(np.allclose(pred, queen), "linear offset recovers synthetic relation")
Demo 5: Pipeline position after tokenization
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 13
# Anisotropy: a shared mean offset dominates; centering removes it.
header("Demo 5: Pipeline position after tokenization - anisotropy")
rng = np.random.default_rng(42)
offset = np.zeros(8)
offset[0] = 3.0
X = rng.normal(size=(40, 8)) + offset
mean_norm = np.linalg.norm(X.mean(axis=0))
centered = X - X.mean(axis=0)
centered_mean_norm = np.linalg.norm(centered.mean(axis=0))
print("Mean norm before:", round(float(mean_norm), 4))
print("Mean norm after centering:", round(float(centered_mean_norm), 8))
check_true(centered_mean_norm < mean_norm, "centering removes dominant mean direction")
Demo 6: Embedding matrix
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 15
# Cross-entropy gradient on output rows: dL/dW = (p - onehot(y)) outer h.
header("Demo 6: Embedding matrix - softmax gradient")
h = np.array([1.0, -1.0])
W = np.array([[0.2, 0.0], [-0.1, 0.3], [0.4, -0.2]])
p = softmax(W @ h)
y = 2
target_onehot = np.eye(3)[y]
grad = np.outer(p - target_onehot, h)
print("Probabilities:", np.round(p, 4).tolist())
print("Target-row gradient:", np.round(grad[y], 4).tolist())
check_true(grad[y, 0] < 0, "target row moves toward hidden state under gradient descent")
Demo 7: One-hot lookup equivalence
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 17
# Sinusoidal position table: position 0 gives sin(0)=0; plot two dims.
header("Demo 7: One-hot lookup equivalence - sinusoidal positions")
pe = sinusoidal_positions(6, 8)
print("PE shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin at position zero")
fig, ax = plt.subplots()
for dim, color_key in enumerate(("primary", "secondary")):
    ax.plot(pe[:, dim], color=COLORS[color_key], label=f"dim {dim}")
ax.set_title("Sinusoidal positional encoding")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 8: Batch sequence tensor shapes
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 19
# RoPE is a rotation, so the vector's L2 norm is unchanged.
header("Demo 8: Batch sequence tensor shapes - RoPE norm")
vec = np.array([1.0, 0.0, 0.5, -0.5])
rotated = rope_rotate(vec, position=7)
print("Original norm:", round(float(np.linalg.norm(vec)), 6))
print("Rotated norm:", round(float(np.linalg.norm(rotated)), 6))
check_close(np.linalg.norm(rotated), np.linalg.norm(vec), name="rotation preserves norm")
Demo 9: Input output and tied embeddings
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 21
# ALiBi bias grows linearly with distance into the past; visualize as heatmap.
header("Demo 9: Input output and tied embeddings - ALiBi bias")
bias = alibi_bias(5, slope=-0.25)
print("Bias matrix:", np.round(bias, 2))
check_close(bias[4, 0], -1.0, name="farther past receives larger penalty")
fig, ax = plt.subplots()
heat = ax.imshow(bias, cmap="viridis")
ax.set_title("ALiBi causal distance bias")
ax.set_xlabel("Key position")
ax.set_ylabel("Query position")
fig.colorbar(heat, ax=ax)
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 10: Residual stream initialization
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 23
# PCA diagnostic: project 5-D samples to the top-2 principal plane.
header("Demo 10: Residual stream initialization - PCA diagnostic")
rng = np.random.default_rng(0)
sample = rng.normal(size=(20, 5))
coords, S = pca2(sample)
print("PCA coords shape:", coords.shape)
print("Top singular values:", np.round(S[:3], 4).tolist())
check_true(coords.shape == (20, 2), "PCA projection is two-dimensional")
Demo 11: Dot product
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 25
# Embedding lookup: integer ids index rows of the table, giving B x T x d.
header("Demo 11: Dot product - lookup")
emb_table = np.arange(30, dtype=float).reshape(10, 3) / 10
token_ids = np.array([[1, 2, 1], [3, 4, 5]])
gathered = emb_table[token_ids]
print("ids shape:", token_ids.shape, "embedding shape:", gathered.shape)
print("First vector:", gathered[0, 0].tolist())
check_true(gathered.shape == (2, 3, 3), "lookup creates B x T x d tensor")
Demo 12: Cosine similarity
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 27
# A one-hot row vector times the embedding matrix selects the matching row.
header("Demo 12: Cosine similarity - one-hot equivalence")
table = np.arange(20, dtype=float).reshape(5, 4)
row = 3
indicator = np.eye(5)[row]
direct = table[row]
via_matmul = indicator @ table
print("lookup:", direct.tolist())
print("one-hot product:", via_matmul.tolist())
check_true(np.allclose(direct, via_matmul), "one-hot multiplication equals row lookup")
Demo 13: Euclidean distance
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 29
# Cosine similarity on unit-normalized rows; nearest neighbor by direction.
header("Demo 13: Euclidean distance - cosine neighbors")
tokens = ["cat", "dog", "car"]
vecs = np.array([[1.0, 0.9], [0.9, 1.0], [-1.0, 0.2]])
unit = normalize_rows(vecs)
sims = unit @ unit.T
print("Cosine matrix:", np.round(sims, 3))
# Rank 0 is "cat" itself, so take the second-ranked token.
ranking = np.argsort(-sims[0])
nearest = tokens[int(ranking[1])]
print("Nearest to cat:", nearest)
check_true(nearest == "dog", "nearest neighbor reflects direction")
Demo 14: Norms and frequency effects
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 31
# Synthetic analogy: king - man + woman lands exactly on queen.
header("Demo 14: Norms and frequency effects - analogy")
gender_axis = np.array([1.0, 0.0])
royalty_axis = np.array([0.0, 1.0])
man = gender_axis
woman = -gender_axis
king = gender_axis + royalty_axis
queen = royalty_axis - gender_axis
pred = king - man + woman
print("Predicted queen vector:", pred.tolist())
check_true(np.allclose(pred, queen), "linear offset recovers synthetic relation")
Demo 15: Nearest neighbors
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 33
# Anisotropy: a shared mean offset dominates; centering removes it.
header("Demo 15: Nearest neighbors - anisotropy")
rng = np.random.default_rng(42)
offset = np.zeros(8)
offset[0] = 3.0
X = rng.normal(size=(40, 8)) + offset
mean_norm = np.linalg.norm(X.mean(axis=0))
centered = X - X.mean(axis=0)
centered_mean_norm = np.linalg.norm(centered.mean(axis=0))
print("Mean norm before:", round(float(mean_norm), 4))
print("Mean norm after centering:", round(float(centered_mean_norm), 8))
check_true(centered_mean_norm < mean_norm, "centering removes dominant mean direction")
Demo 16: Analogy directions
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 35
# Cross-entropy gradient on output rows: dL/dW = (p - onehot(y)) outer h.
header("Demo 16: Analogy directions - softmax gradient")
h = np.array([1.0, -1.0])
W = np.array([[0.2, 0.0], [-0.1, 0.3], [0.4, -0.2]])
p = softmax(W @ h)
y = 2
target_onehot = np.eye(3)[y]
grad = np.outer(p - target_onehot, h)
print("Probabilities:", np.round(p, 4).tolist())
print("Target-row gradient:", np.round(grad[y], 4).tolist())
check_true(grad[y, 0] < 0, "target row moves toward hidden state under gradient descent")
Demo 17: Subspaces and probes
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 37
# Sinusoidal position table: position 0 gives sin(0)=0; plot two dims.
header("Demo 17: Subspaces and probes - sinusoidal positions")
pe = sinusoidal_positions(6, 8)
print("PE shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin at position zero")
fig, ax = plt.subplots()
for dim, color_key in enumerate(("primary", "secondary")):
    ax.plot(pe[:, dim], color=COLORS[color_key], label=f"dim {dim}")
ax.set_title("Sinusoidal positional encoding")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 18: Isotropy and anisotropy
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 39
# RoPE is a rotation, so the vector's L2 norm is unchanged.
header("Demo 18: Isotropy and anisotropy - RoPE norm")
vec = np.array([1.0, 0.0, 0.5, -0.5])
rotated = rope_rotate(vec, position=7)
print("Original norm:", round(float(np.linalg.norm(vec)), 6))
print("Rotated norm:", round(float(np.linalg.norm(rotated)), 6))
check_close(np.linalg.norm(rotated), np.linalg.norm(vec), name="rotation preserves norm")
Demo 19: Centering and whitening
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 41
# ALiBi bias grows linearly with distance into the past; visualize as heatmap.
header("Demo 19: Centering and whitening - ALiBi bias")
bias = alibi_bias(5, slope=-0.25)
print("Bias matrix:", np.round(bias, 2))
check_close(bias[4, 0], -1.0, name="farther past receives larger penalty")
fig, ax = plt.subplots()
heat = ax.imshow(bias, cmap="viridis")
ax.set_title("ALiBi causal distance bias")
ax.set_xlabel("Key position")
ax.set_ylabel("Query position")
fig.colorbar(heat, ax=ax)
fig.tight_layout()
plt.show()
plt.close(fig)
Demo 20: Bias and representation directions
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 43
# PCA diagnostic: project 5-D samples to the top-2 principal plane.
header("Demo 20: Bias and representation directions - PCA diagnostic")
rng = np.random.default_rng(0)
sample = rng.normal(size=(20, 5))
coords, S = pca2(sample)
print("PCA coords shape:", coords.shape)
print("Top singular values:", np.round(S[:3], 4).tolist())
check_true(coords.shape == (20, 2), "PCA projection is two-dimensional")
Demo 21: Language-model loss gradients
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 45
# Embedding lookup: integer ids index rows of the table, giving B x T x d.
header("Demo 21: Language-model loss gradients - lookup")
emb_table = np.arange(30, dtype=float).reshape(10, 3) / 10
token_ids = np.array([[1, 2, 1], [3, 4, 5]])
gathered = emb_table[token_ids]
print("ids shape:", token_ids.shape, "embedding shape:", gathered.shape)
print("First vector:", gathered[0, 0].tolist())
check_true(gathered.shape == (2, 3, 3), "lookup creates B x T x d tensor")
Demo 22: Word2vec and negative sampling
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 47
# A one-hot row vector times the embedding matrix selects the matching row.
header("Demo 22: Word2vec and negative sampling - one-hot equivalence")
table = np.arange(20, dtype=float).reshape(5, 4)
row = 3
indicator = np.eye(5)[row]
direct = table[row]
via_matmul = indicator @ table
print("lookup:", direct.tolist())
print("one-hot product:", via_matmul.tolist())
check_true(np.allclose(direct, via_matmul), "one-hot multiplication equals row lookup")
Demo 23: GloVe and co-occurrence factorization
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 49
# Cosine similarity on unit-normalized rows; nearest neighbor by direction.
header("Demo 23: GloVe and co-occurrence factorization - cosine neighbors")
tokens = ["cat", "dog", "car"]
vecs = np.array([[1.0, 0.9], [0.9, 1.0], [-1.0, 0.2]])
unit = normalize_rows(vecs)
sims = unit @ unit.T
print("Cosine matrix:", np.round(sims, 3))
# Rank 0 is "cat" itself, so take the second-ranked token.
ranking = np.argsort(-sims[0])
nearest = tokens[int(ranking[1])]
print("Nearest to cat:", nearest)
check_true(nearest == "dog", "nearest neighbor reflects direction")
Demo 24: Fine-tuning drift
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 51
# Synthetic analogy: king - man + woman lands exactly on queen.
header("Demo 24: Fine-tuning drift - analogy")
gender_axis = np.array([1.0, 0.0])
royalty_axis = np.array([0.0, 1.0])
man = gender_axis
woman = -gender_axis
king = gender_axis + royalty_axis
queen = royalty_axis - gender_axis
pred = king - man + woman
print("Predicted queen vector:", pred.tolist())
check_true(np.allclose(pred, queen), "linear offset recovers synthetic relation")
Demo 25: Vocabulary resizing
This demo connects one embedding-space concept to a concrete numeric check.
Code cell 53
# Anisotropy: a shared mean offset dominates; centering removes it.
header("Demo 25: Vocabulary resizing - anisotropy")
rng = np.random.default_rng(42)
offset = np.zeros(8)
offset[0] = 3.0
X = rng.normal(size=(40, 8)) + offset
mean_norm = np.linalg.norm(X.mean(axis=0))
centered = X - X.mean(axis=0)
centered_mean_norm = np.linalg.norm(centered.mean(axis=0))
print("Mean norm before:", round(float(mean_norm), 4))
print("Mean norm after centering:", round(float(centered_mean_norm), 8))
check_true(centered_mean_norm < mean_norm, "centering removes dominant mean direction")