Theory Notebook — Math for LLMs

Positional Encodings

Math for LLMs / Positional Encodings

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Positional Encodings

This notebook is the executable companion to notes.md. It checks sinusoidal rows, learned table sizes, relative offsets, RoPE rotations, ALiBi bias, and decode position ids.

Code cell 2

# Shared plotting defaults for every demo cell in this notebook.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
except ImportError:
    # No seaborn installed: fall back to matplotlib's bundled look-alike style.
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Readable figure defaults: larger fonts, no top/right spines, tight saves.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)  # reproducible randomness for any sampled demo data
print("Plot setup complete.")

Code cell 3


# Colorblind-safe palette (Paul Tol's bright scheme) used across all demos.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)

def header(title):
    """Print *title* between 72-character rules to separate demo output."""
    rule = "=" * 72
    print(f"\n{rule}\n{title}\n{rule}")

def check_true(condition, name):
    """Print a PASS/FAIL line for *name*, then assert that *condition* holds."""
    passed = bool(condition)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}")
    assert passed, name

def check_close(value, target, tol=1e-8, name="value"):
    """Print a PASS/FAIL line, then assert |value - target| <= tol."""
    value, target = float(value), float(target)
    passed = abs(value - target) <= tol
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}: got {value:.6f}, expected {target:.6f}")
    assert passed, name

def sinusoidal_positions(n, d):
    """Return the (n, d) sinusoidal position table of Vaswani et al. (2017).

    Even column 2k holds sin(pos / 10000^(2k/d)); the odd column 2k+1 holds
    the cosine at the same frequency.
    """
    positions = np.arange(n).reshape(-1, 1)
    dims = np.arange(d).reshape(1, -1)
    # Pair dims (2k, 2k+1) share one frequency, hence the floor-divide by 2.
    rates = 1 / np.power(10000, (2 * (dims // 2)) / d)
    angles = positions * rates
    return np.where(dims % 2 == 0, np.sin(angles), np.cos(angles))

def relative_offsets(T):
    """Return the (T, T) matrix of signed offsets i - j (query minus key)."""
    idx = np.arange(T)
    return idx[:, None] - idx[None, :]

def rope_rotate(x, position, base=10000.0):
    """Apply a RoPE rotation at *position* to the 1-D vector *x*.

    Each consecutive pair (x[2k], x[2k+1]) is rotated in its plane by the
    angle position / base**(2k / d), where d = len(x) — low dims turn fast,
    high dims slowly, mirroring the sinusoidal frequency ladder.

    Args:
        x: 1-D array-like of even length d.
        position: scalar position index (scales every rotation angle).
        base: frequency base; larger values slow the high-dimension angles.

    Returns:
        A new float ndarray of shape (d,); the input is left unmodified.

    Raises:
        ValueError: if len(x) is odd (pairs cannot be formed).
    """
    x = np.asarray(x, dtype=float)
    d = len(x)
    # Validate with a real exception: `assert` disappears under `python -O`.
    if d % 2 != 0:
        raise ValueError(f"rope_rotate needs an even-length vector, got d={d}")
    out = x.copy()
    for k in range(0, d, 2):
        theta = position / (base ** (k / d))
        c, s = np.cos(theta), np.sin(theta)
        a, b = x[k], x[k + 1]
        # Standard 2-D rotation of the pair (a, b) by theta.
        out[k] = c * a - s * b
        out[k + 1] = s * a + c * b
    return out

def alibi_bias(T, slope=-0.25):
    """Return the (T, T) ALiBi score bias: slope times the look-back distance.

    Offsets into the future (j > i) get zero bias; masking them is the job of
    the causal attention mask, not this matrix.
    """
    past_distance = np.maximum(relative_offsets(T), 0)
    return past_distance * slope

print("Positional-encoding helpers ready.")

Demo 1: Why attention needs position

This demo turns one position-encoding idea into a checked numeric example.

Code cell 5

header("Demo 1: Why attention needs position - sinusoidal table")
pe = sinusoidal_positions(6, 8)
print("Shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin position zero")
fig, ax = plt.subplots()
ax.plot(pe[:, 0], color=COLORS["primary"], label="dim 0")
ax.plot(pe[:, 1], color=COLORS["secondary"], label="dim 1")
ax.set_title("Sinusoidal positional features")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)

Demo 2: Absolute versus relative position

This demo turns one position-encoding idea into a checked numeric example.

Code cell 7

header("Demo 2: Absolute versus relative position - token plus position")
x = np.array([1.0, 2.0, 3.0, 4.0])
p = sinusoidal_positions(1, 4)[0]
h = x + p
print("Position vector:", np.round(p, 3).tolist())
print("Hidden init:", np.round(h, 3).tolist())
check_true(h.shape == x.shape, "addition preserves model width")

Demo 3: Additive versus score-based position

This demo turns one position-encoding idea into a checked numeric example.

Code cell 9

header("Demo 3: Additive versus score-based position - relative offsets")
R = relative_offsets(4)
print(R)
check_true(R[3, 0] == 3 and R[0, 3] == -3, "offset matrix is query minus key")

Demo 4: Length extrapolation

This demo turns one position-encoding idea into a checked numeric example.

Code cell 11

header("Demo 4: Length extrapolation - relative bias")
bias = -0.1 * np.abs(relative_offsets(5))
print(np.round(bias, 2))
check_close(bias[0, 4], -0.4, name="distance four penalty")

Demo 5: Position in decoder-only LLMs

This demo turns one position-encoding idea into a checked numeric example.

Code cell 13

header("Demo 5: Position in decoder-only LLMs - RoPE norm")
x = np.array([1.0, 0.0, 0.5, -0.5])
y = rope_rotate(x, position=11)
print("Rotated:", np.round(y, 4).tolist())
check_close(np.linalg.norm(y), np.linalg.norm(x), name="RoPE preserves norm")

Demo 6: Position indices

This demo turns one position-encoding idea into a checked numeric example.

Code cell 15

header("Demo 6: Position indices - RoPE relative dot product")
q = np.array([1.0, 0.0])
k = np.array([0.5, 0.5])
i_pos, j_pos = 7, 4
left = np.dot(rope_rotate(q, i_pos), rope_rotate(k, j_pos))
right = np.dot(q, rope_rotate(k, j_pos - i_pos))
print("Left:", round(float(left), 6), "Right:", round(float(right), 6))
check_close(left, right, tol=1e-8, name="relative rotation identity")

Demo 7: Token plus position representation

This demo turns one position-encoding idea into a checked numeric example.

Code cell 17

header("Demo 7: Token plus position representation - ALiBi")
bias = alibi_bias(5, slope=-0.25)
print(np.round(bias, 2))
check_close(bias[4, 0], -1.0, name="linear past-distance penalty")

Demo 8: Attention score modification

This demo turns one position-encoding idea into a checked numeric example.

Code cell 19

header("Demo 8: Attention score modification - learned table params")
Tmax, d = 4096, 4096
params = Tmax * d
print("Position parameters:", params)
check_true(params == 16777216, "learned absolute table size")

Demo 9: Relative offset notation

This demo turns one position-encoding idea into a checked numeric example.

Code cell 21

header("Demo 9: Relative offset notation - decode position ids")
prefix = 12
generated = np.arange(5)
pos_ids = prefix + generated
print("Decode position ids:", pos_ids.tolist())
check_true(pos_ids[0] == prefix and np.all(np.diff(pos_ids) == 1), "decode positions advance by one")

Demo 10: Position interpolation and scaling

This demo turns one position-encoding idea into a checked numeric example.

Code cell 23

header("Demo 10: Position interpolation and scaling - length scaling")
train_len, target_len = 2048, 8192
scale = train_len / target_len
orig_pos = np.array([0, 2048, 4096, 8191])
scaled_pos = orig_pos * scale
print("Scaled positions:", np.round(scaled_pos, 2).tolist())
check_true(scaled_pos[-1] < train_len, "interpolation maps target positions into training range")

Demo 11: Frequency ladder

This demo turns one position-encoding idea into a checked numeric example.

Code cell 25

header("Demo 11: Frequency ladder - sinusoidal table")
pe = sinusoidal_positions(6, 8)
print("Shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin position zero")
fig, ax = plt.subplots()
ax.plot(pe[:, 0], color=COLORS["primary"], label="dim 0")
ax.plot(pe[:, 1], color=COLORS["secondary"], label="dim 1")
ax.set_title("Sinusoidal positional features")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)

Demo 12: Sine cosine pairs

This demo turns one position-encoding idea into a checked numeric example.

Code cell 27

header("Demo 12: Sine cosine pairs - token plus position")
x = np.array([1.0, 2.0, 3.0, 4.0])
p = sinusoidal_positions(1, 4)[0]
h = x + p
print("Position vector:", np.round(p, 3).tolist())
print("Hidden init:", np.round(h, 3).tolist())
check_true(h.shape == x.shape, "addition preserves model width")

Demo 13: Linear relative-offset intuition

This demo turns one position-encoding idea into a checked numeric example.

Code cell 29

header("Demo 13: Linear relative-offset intuition - relative offsets")
R = relative_offsets(4)
print(R)
check_true(R[3, 0] == 3 and R[0, 3] == -3, "offset matrix is query minus key")

Demo 14: Visualization and aliasing

This demo turns one position-encoding idea into a checked numeric example.

Code cell 31

header("Demo 14: Visualization and aliasing - relative bias")
bias = -0.1 * np.abs(relative_offsets(5))
print(np.round(bias, 2))
check_close(bias[0, 4], -0.4, name="distance four penalty")

Demo 15: Limitations

This demo turns one position-encoding idea into a checked numeric example.

Code cell 33

header("Demo 15: Limitations - RoPE norm")
x = np.array([1.0, 0.0, 0.5, -0.5])
y = rope_rotate(x, position=11)
print("Rotated:", np.round(y, 4).tolist())
check_close(np.linalg.norm(y), np.linalg.norm(x), name="RoPE preserves norm")

Demo 16: Learned position table

This demo turns one position-encoding idea into a checked numeric example.

Code cell 35

header("Demo 16: Learned position table - RoPE relative dot product")
q = np.array([1.0, 0.0])
k = np.array([0.5, 0.5])
i_pos, j_pos = 7, 4
left = np.dot(rope_rotate(q, i_pos), rope_rotate(k, j_pos))
right = np.dot(q, rope_rotate(k, j_pos - i_pos))
print("Left:", round(float(left), 6), "Right:", round(float(right), 6))
check_close(left, right, tol=1e-8, name="relative rotation identity")

Demo 17: Training length limit

This demo turns one position-encoding idea into a checked numeric example.

Code cell 37

header("Demo 17: Training length limit - ALiBi")
bias = alibi_bias(5, slope=-0.25)
print(np.round(bias, 2))
check_close(bias[4, 0], -1.0, name="linear past-distance penalty")

Demo 18: Interpolation resizing

This demo turns one position-encoding idea into a checked numeric example.

Code cell 39

header("Demo 18: Interpolation resizing - learned table params")
Tmax, d = 4096, 4096
params = Tmax * d
print("Position parameters:", params)
check_true(params == 16777216, "learned absolute table size")

Demo 19: BERT GPT-style usage

This demo turns one position-encoding idea into a checked numeric example.

Code cell 41

header("Demo 19: BERT GPT-style usage - decode position ids")
prefix = 12
generated = np.arange(5)
pos_ids = prefix + generated
print("Decode position ids:", pos_ids.tolist())
check_true(pos_ids[0] == prefix and np.all(np.diff(pos_ids) == 1), "decode positions advance by one")

Demo 20: Failure modes

This demo turns one position-encoding idea into a checked numeric example.

Code cell 43

header("Demo 20: Failure modes - length scaling")
train_len, target_len = 2048, 8192
scale = train_len / target_len
orig_pos = np.array([0, 2048, 4096, 8191])
scaled_pos = orig_pos * scale
print("Scaled positions:", np.round(scaled_pos, 2).tolist())
check_true(scaled_pos[-1] < train_len, "interpolation maps target positions into training range")

Demo 21: Relative bias matrices

This demo turns one position-encoding idea into a checked numeric example.

Code cell 45

header("Demo 21: Relative bias matrices - sinusoidal table")
pe = sinusoidal_positions(6, 8)
print("Shape:", pe.shape)
print("Position 0:", np.round(pe[0], 3).tolist())
check_close(pe[0, 0], 0.0, name="sin position zero")
fig, ax = plt.subplots()
ax.plot(pe[:, 0], color=COLORS["primary"], label="dim 0")
ax.plot(pe[:, 1], color=COLORS["secondary"], label="dim 1")
ax.set_title("Sinusoidal positional features")
ax.set_xlabel("Position")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)

Demo 22: Shaw-style relative keys

This demo turns one position-encoding idea into a checked numeric example.

Code cell 47

header("Demo 22: Shaw-style relative keys - token plus position")
x = np.array([1.0, 2.0, 3.0, 4.0])
p = sinusoidal_positions(1, 4)[0]
h = x + p
print("Position vector:", np.round(p, 3).tolist())
print("Hidden init:", np.round(h, 3).tolist())
check_true(h.shape == x.shape, "addition preserves model width")

Demo 23: Transformer-XL intuition

This demo turns one position-encoding idea into a checked numeric example.

Code cell 49

header("Demo 23: Transformer-XL intuition - relative offsets")
R = relative_offsets(4)
print(R)
check_true(R[3, 0] == 3 and R[0, 3] == -3, "offset matrix is query minus key")

Demo 24: Bucketed distances

This demo turns one position-encoding idea into a checked numeric example.

Code cell 51

header("Demo 24: Bucketed distances - relative bias")
bias = -0.1 * np.abs(relative_offsets(5))
print(np.round(bias, 2))
check_close(bias[0, 4], -0.4, name="distance four penalty")

Demo 25: When relative position helps

This demo turns one position-encoding idea into a checked numeric example.

Code cell 53

header("Demo 25: When relative position helps - RoPE norm")
x = np.array([1.0, 0.0, 0.5, -0.5])
y = rope_rotate(x, position=11)
print("Rotated:", np.round(y, 4).tolist())
check_close(np.linalg.norm(y), np.linalg.norm(x), name="RoPE preserves norm")