Theory Notebook
Converted from theory.ipynb for web reading.
Tokenization Math
This notebook is the executable companion to notes.md. It implements tiny BPE, unigram/Viterbi, entropy, fertility, special-token, and context-cost examples from scratch.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def char_tokens(text):
    # Baseline tokenization: one token per character.
    return tuple(text)

def pair_counts(corpus):
    # Count adjacent symbol pairs across a {word: frequency} corpus.
    counts = {}
    for word, freq in corpus.items():
        pieces = list(word)
        for a, b in zip(pieces, pieces[1:]):
            counts[(a, b)] = counts.get((a, b), 0) + freq
    return counts

def merge_word(word, pair):
    # Replace every non-overlapping occurrence of `pair` with the merged symbol.
    out = []
    i = 0
    while i < len(word):
        if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
            out.append(word[i] + word[i + 1])
            i += 2
        else:
            out.append(word[i])
            i += 1
    return tuple(out)

def bpe_train(words, num_merges=3):
    # Greedy BPE: repeatedly merge the most frequent adjacent pair.
    corpus = {tuple(w): f for w, f in words.items()}
    merges = []
    for _ in range(num_merges):
        counts = {}
        for pieces, freq in corpus.items():
            for a, b in zip(pieces, pieces[1:]):
                counts[(a, b)] = counts.get((a, b), 0) + freq
        if not counts:
            break
        pair = max(counts, key=counts.get)
        merges.append(pair)
        corpus = {merge_word(pieces, pair): freq for pieces, freq in corpus.items()}
    return merges, corpus

def bpe_encode(text, merges):
    # Apply the learned merges in training order.
    pieces = tuple(text)
    for pair in merges:
        pieces = merge_word(pieces, pair)
    return list(pieces)

def entropy_from_counts(counts):
    # Shannon entropy (bits) of the empirical token distribution.
    vals = np.array(list(counts.values()), dtype=float)
    probs = vals / vals.sum()
    return float(-(probs * np.log2(probs)).sum())

def viterbi_segment(text, token_logprobs):
    # Best-scoring segmentation of `text` under independent token log-probabilities.
    n = len(text)
    dp = [-1e18] * (n + 1)
    back = [None] * (n + 1)
    dp[0] = 0.0
    for i in range(n):
        if dp[i] < -1e17:
            continue
        for tok, logp in token_logprobs.items():
            if text.startswith(tok, i):
                j = i + len(tok)
                score = dp[i] + logp
                if score > dp[j]:
                    dp[j] = score
                    back[j] = (i, tok)
    if back[n] is None:
        return [], -np.inf
    out = []
    pos = n
    while pos > 0:
        prev, tok = back[pos]
        out.append(tok)
        pos = prev
    return list(reversed(out)), dp[n]

def fertility(text, tokens):
    # Tokens emitted per whitespace-delimited word.
    words = max(1, len(text.split()))
    return len(tokens) / words
print("Tokenization helpers ready.")
Demo 1: Text is not the model input
This demo turns a tokenizer concept into a small checked computation.
Code cell 5
header("Demo 1: Text is not the model input - BPE training")
words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
merges, corpus = bpe_train(words, num_merges=4)
print("Merges:", merges)
print("Corpus:", {" ".join(k): v for k, v in corpus.items()})
check_true(len(merges) == 4, "requested number of merges learned")
Demo 2: The tokenizer as a learned compression map
This demo turns a tokenizer concept into a small checked computation.
Code cell 7
header("Demo 2: The tokenizer as a learned compression map - BPE encode")
merges = [('l', 'o'), ('lo', 'w'), ('e', 'r')]
tokens = bpe_encode("lower", merges)
print("Tokens:", tokens)
check_true("low" in tokens, "learned merge appears in encoding")
Demo 3: Open vocabulary pressure
This demo turns a tokenizer concept into a small checked computation.
Code cell 9
header("Demo 3: Open vocabulary pressure - compression ratio")
text = "tokenization matters"
tokens = ["token", "ization", " matters"]
ratio = len(text) / len(tokens)
print("Characters:", len(text), "tokens:", len(tokens), "chars/token:", round(ratio, 3))
check_true(ratio > 1.0, "subwords compress characters into fewer model positions")
Demo 4: Tokenization tax
This demo turns a tokenizer concept into a small checked computation.
Code cell 11
header("Demo 4: Tokenization tax - token entropy")
counts = {"the": 50, "cat": 10, "sat": 10, "z": 1}
H = entropy_from_counts(counts)
print("Entropy bits:", round(H, 4))
check_true(0 < H < np.log2(len(counts)), "skewed token distribution has submaximal entropy")
Demo 5: Where tokenization affects LLM behavior
This demo turns a tokenizer concept into a small checked computation.
Code cell 13
header("Demo 5: Where tokenization affects LLM behavior - Viterbi segmentation")
logp = {"a": -2.0, "b": -2.0, "ab": -0.3, "aba": -1.0}
seg, score = viterbi_segment("abab", logp)
print("Best segmentation:", seg, "score:", round(score, 4))
check_true(seg == ["ab", "ab"], "Viterbi selects the high-probability pieces")
Demo 6: Alphabet strings and byte sequences
This demo turns a tokenizer concept into a small checked computation.
Code cell 15
header("Demo 6: Alphabet strings and byte sequences - vocabulary parameter cost")
vocab_small, vocab_big, d_model = 32000, 100000, 4096
small = vocab_small * d_model
big = vocab_big * d_model
print("Small embedding params:", small)
print("Big embedding params:", big)
check_true(big > small, "larger vocabulary costs more embedding parameters")
Demo 7: Vocabulary and token ids
This demo turns a tokenizer concept into a small checked computation.
Code cell 17
header("Demo 7: Vocabulary and token ids - attention token cost")
n1, n2 = 1000, 1500
cost_ratio = (n2 / n1) ** 2
print("Cost ratio:", round(cost_ratio, 3))
check_close(cost_ratio, 2.25, name="quadratic attention scaling")
Demo 8: Tokenizer and detokenizer functions
This demo turns a tokenizer concept into a small checked computation.
Code cell 19
header("Demo 8: Tokenizer and detokenizer functions - fertility")
text_a = "the cat sat"
tokens_a = ["the", " cat", " sat"]
text_b = "uncommonword example"
tokens_b = ["un", "common", "word", " example"]
fert_a = fertility(text_a, tokens_a)
fert_b = fertility(text_b, tokens_b)
print("Fertility A:", fert_a, "Fertility B:", fert_b)
check_true(fert_b > fert_a, "more splits create higher fertility")
Demo 9: Lossless versus lossy normalization
This demo turns a tokenizer concept into a small checked computation.
Code cell 21
header("Demo 9: Lossless versus lossy normalization - round trip")
text = " code\nblock"
tokens = list(text.encode("utf-8"))
decoded = bytes(tokens).decode("utf-8")
print("Byte ids:", tokens[:8], "...")
print("Decoded:", repr(decoded))
check_true(decoded == text, "byte representation round-trips exactly")
Demo 10: Pre-tokenization boundaries
This demo turns a tokenizer concept into a small checked computation.
Code cell 23
header("Demo 10: Pre-tokenization boundaries - special token protection")
ordinary = ["<", "tool", ">"]
special = ["<tool>"]
print("Ordinary split:", ordinary)
print("Protected special:", special)
check_true(len(special) < len(ordinary), "special token stays atomic")
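The check above only compares list lengths. As a hedged sketch of what enforces such boundaries in practice, the code below shows an illustrative regex pre-tokenizer plus a hypothetical split_specials helper that carves out special tokens before ordinary splitting; both the pattern and the helper are assumptions for illustration, not any particular library's rules.
import re

def pretokenize(text):
    # Illustrative pre-tokenization pattern (an assumption, not the exact GPT-2 regex):
    # split into word-like chunks that keep their leading space, so merges never cross them.
    return re.findall(r" ?[A-Za-z]+| ?\d+| ?[^\sA-Za-z\d]+", text)

def split_specials(text, specials=("<tool>",)):
    # Hypothetical helper: carve out special tokens first so they stay atomic.
    pattern = "(" + "|".join(re.escape(s) for s in specials) + ")"
    return [piece for piece in re.split(pattern, text) if piece]

print("Pre-tokenized:", pretokenize("Hello, world! <tool> call"))
print("With protection:", split_specials("call <tool> now"))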
Demo 11: BPE merge objective
This demo turns a tokenizer concept into a small checked computation.
Code cell 25
header("Demo 11: BPE merge objective - BPE training")
words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
merges, corpus = bpe_train(words, num_merges=4)
print("Merges:", merges)
print("Corpus:", {" ".join(k): v for k, v in corpus.items()})
check_true(len(merges) == 4, "requested number of merges learned")
Demo 12: Greedy training loop
This demo turns a tokenizer concept into a small checked computation.
Code cell 27
header("Demo 12: Greedy training loop - BPE encode")
merges = [('l', 'o'), ('lo', 'w'), ('e', 'r')]
tokens = bpe_encode("lower", merges)
print("Tokens:", tokens)
check_true("low" in tokens, "learned merge appears in encoding")
Demo 13: Encoding with learned merges
This demo turns a tokenizer concept into a small checked computation.
Code cell 29
header("Demo 13: Encoding with learned merges - compression ratio")
text = "tokenization matters"
tokens = ["token", "ization", " matters"]
ratio = len(text) / len(tokens)
print("Characters:", len(text), "tokens:", len(tokens), "chars/token:", round(ratio, 3))
check_true(ratio > 1.0, "subwords compress characters into fewer model positions")
Demo 14: Byte-level BPE
This demo turns a tokenizer concept into a small checked computation.
Code cell 31
header("Demo 14: Byte-level BPE - token entropy")
counts = {"the": 50, "cat": 10, "sat": 10, "z": 1}
H = entropy_from_counts(counts)
print("Entropy bits:", round(H, 4))
check_true(0 < H < np.log2(len(counts)), "skewed token distribution has submaximal entropy")
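The entropy computation above does not depend on the tokenizer's alphabet. As a hedged sketch of the byte-level idea named in the title, the code below reuses bpe_train from the setup cell on UTF-8 bytes; the latin-1 round trip is only a trick to treat each byte as a single symbol, and the toy words are made up.
def to_byte_symbols(text):
    # Render each UTF-8 byte as one latin-1 character so the string-based helpers apply.
    return text.encode("utf-8").decode("latin-1")

byte_words = {to_byte_symbols(w): f for w, f in {"naïve": 3, "nave": 2}.items()}
byte_merges, byte_corpus = bpe_train(byte_words, num_merges=3)
print("Byte-level merges:", byte_merges)
check_true(len(byte_merges) == 3, "byte-level training learns the requested merges")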
Demo 15: BPE limitations
This demo turns a tokenizer concept into a small checked computation.
Code cell 33
header("Demo 15: BPE limitations - Viterbi segmentation")
logp = {"a": -2.0, "b": -2.0, "ab": -0.3, "aba": -1.0}
seg, score = viterbi_segment("abab", logp)
print("Best segmentation:", seg, "score:", round(score, 4))
check_true(seg == ["ab", "ab"], "Viterbi selects the high-probability pieces")
Demo 16: Unigram token probability model
This demo turns a tokenizer concept into a small checked computation.
Code cell 35
header("Demo 16: Unigram token probability model - vocabulary parameter cost")
vocab_small, vocab_big, d_model = 32000, 100000, 4096
small = vocab_small * d_model
big = vocab_big * d_model
print("Small embedding params:", small)
print("Big embedding params:", big)
check_true(big > small, "larger vocabulary costs more embedding parameters")
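The parameter count above is about vocabulary size; the unigram model named in the title is about piece probabilities. As a hedged sketch, the code below turns made-up piece counts into log-probabilities and feeds them to viterbi_segment from the setup cell.
import math

piece_counts = {"token": 40, "ization": 15, "iz": 5, "ation": 10,
                "t": 2, "o": 2, "k": 2, "e": 2, "n": 2, "i": 2, "z": 2, "a": 2}
total = sum(piece_counts.values())
piece_logp = {t: math.log(c / total) for t, c in piece_counts.items()}
seg, score = viterbi_segment("tokenization", piece_logp)
print("Unigram segmentation:", seg, "log-prob:", round(score, 3))
check_true(seg == ["token", "ization"], "frequent long pieces beat character fallbacks")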
Demo 17: Viterbi segmentation
This demo turns a tokenizer concept into a small checked computation.
Code cell 37
header("Demo 17: Viterbi segmentation - attention token cost")
n1, n2 = 1000, 1500
cost_ratio = (n2 / n1) ** 2
print("Cost ratio:", round(cost_ratio, 3))
check_close(cost_ratio, 2.25, name="quadratic attention scaling")
Demo 18: Forward probabilities
This demo turns a tokenizer concept into a small checked computation.
Code cell 39
header("Demo 18: Forward probabilities - fertility")
text_a = "the cat sat"
tokens_a = ["the", " cat", " sat"]
text_b = "uncommonword example"
tokens_b = ["un", "common", "word", " example"]
fert_a = fertility(text_a, tokens_a)
fert_b = fertility(text_b, tokens_b)
print("Fertility A:", fert_a, "Fertility B:", fert_b)
check_true(fert_b > fert_a, "more splits create higher fertility")
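Fertility averages over one fixed segmentation; the forward pass named in the title sums over all segmentations instead. The hedged sketch below mirrors viterbi_segment but accumulates probability mass rather than keeping only the best path, using made-up piece probabilities.
def forward_prob(text, token_probs):
    # alpha[i] = total probability of all segmentations of text[:i].
    n = len(text)
    alpha = [0.0] * (n + 1)
    alpha[0] = 1.0
    for i in range(n):
        if alpha[i] == 0.0:
            continue
        for tok, p in token_probs.items():
            if text.startswith(tok, i):
                alpha[i + len(tok)] += alpha[i] * p
    return alpha[n]

forward_probs = {"a": 0.1, "b": 0.1, "ab": 0.6, "aba": 0.2}
marginal = forward_prob("abab", forward_probs)
print("Marginal probability of 'abab':", round(marginal, 4))
check_true(marginal >= 0.6 * 0.6, "the sum over paths is at least the best single path")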
Demo 19: EM intuition
This demo turns a tokenizer concept into a small checked computation.
Code cell 41
header("Demo 19: EM intuition - round trip")
text = " code\nblock"
tokens = list(text.encode("utf-8"))
decoded = bytes(tokens).decode("utf-8")
print("Byte ids:", tokens[:8], "...")
print("Decoded:", repr(decoded))
check_true(decoded == text, "byte representation round-trips exactly")
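The round trip above is about losslessness; the EM idea named in the title is about re-estimating piece probabilities. The hedged sketch below runs one hard-EM round on a made-up corpus: segment with the current probabilities via viterbi_segment (E-step), then recompute probabilities from the resulting piece counts (M-step).
import math

corpus_words = ["abab", "aba", "ab"]
em_logp = {"a": math.log(0.15), "b": math.log(0.15), "ab": math.log(0.5), "aba": math.log(0.2)}
em_counts = {}
for w in corpus_words:
    seg, _ = viterbi_segment(w, em_logp)
    for piece in seg:
        em_counts[piece] = em_counts.get(piece, 0) + 1
em_total = sum(em_counts.values())
print("Piece counts after the E-step:", em_counts)
print("Re-estimated probabilities (M-step):", {t: round(c / em_total, 3) for t, c in em_counts.items()})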
Demo 20: Subword regularization
This demo turns a tokenizer concept into a small checked computation.
Code cell 43
header("Demo 20: Subword regularization - special token protection")
ordinary = ["<", "tool", ">"]
special = ["<tool>"]
print("Ordinary split:", ordinary)
print("Protected special:", special)
check_true(len(special) < len(ordinary), "special token stays atomic")
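Protecting special tokens is deterministic; subword regularization, named in the title, deliberately varies segmentations during training. The hedged sketch below enumerates every segmentation of a short string and samples one in proportion to its probability; the vocabulary and probabilities are made up, and full enumeration only works for tiny examples.
def all_segmentations(text, vocab):
    # Recursively enumerate every way to cover `text` with vocabulary pieces.
    if not text:
        return [[]]
    segs = []
    for tok in vocab:
        if text.startswith(tok):
            for rest in all_segmentations(text[len(tok):], vocab):
                segs.append([tok] + rest)
    return segs

sample_probs = {"a": 0.1, "b": 0.1, "ab": 0.6, "aba": 0.2}
candidates = all_segmentations("abab", sample_probs)
weights = np.array([np.prod([sample_probs[t] for t in seg]) for seg in candidates])
weights = weights / weights.sum()
sampled = candidates[np.random.choice(len(candidates), p=weights)]
print("Candidate segmentations:", len(candidates), "sampled:", sampled)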
Demo 21: WordPiece merge score
This demo turns a tokenizer concept into a small checked computation.
Code cell 45
header("Demo 21: WordPiece merge score - BPE training")
words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
merges, corpus = bpe_train(words, num_merges=4)
print("Merges:", merges)
print("Corpus:", {" ".join(k): v for k, v in corpus.items()})
check_true(len(merges) == 4, "requested number of merges learned")
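The cell above ranks merges by raw pair frequency, the BPE objective. The WordPiece score named in the title divides the pair count by the counts of its parts; the hedged sketch below computes both rankings on the same toy corpus, reusing pair_counts from the setup cell.
toy_words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
pairs = pair_counts(toy_words)
unit_counts = {}
for word, freq in toy_words.items():
    for ch in word:
        unit_counts[ch] = unit_counts.get(ch, 0) + freq
# WordPiece-style score: count(ab) / (count(a) * count(b)).
scores = {pair: c / (unit_counts[pair[0]] * unit_counts[pair[1]]) for pair, c in pairs.items()}
print("BPE would merge:", max(pairs, key=pairs.get))
print("WordPiece would merge:", max(scores, key=scores.get))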
Demo 22: Continuation markers
This demo turns a tokenizer concept into a small checked computation.
Code cell 47
header("Demo 22: Continuation markers - BPE encode")
merges = [('l', 'o'), ('lo', 'w'), ('e', 'r')]
tokens = bpe_encode("lower", merges)
print("Tokens:", tokens)
check_true("low" in tokens, "learned merge appears in encoding")
Demo 23: Greedy longest-match encoding
This demo turns a tokenizer concept into a small checked computation.
Code cell 49
header("Demo 23: Greedy longest-match encoding - compression ratio")
text = "tokenization matters"
tokens = ["token", "ization", " matters"]
ratio = len(text) / len(tokens)
print("Characters:", len(text), "tokens:", len(tokens), "chars/token:", round(ratio, 3))
check_true(ratio > 1.0, "subwords compress characters into fewer model positions")
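The compression ratio above takes the token list as given. As a hedged sketch of the greedy longest-match encoder named in the title: at each position take the longest vocabulary entry that matches, and fall back to an unknown marker when nothing does. The vocabulary here is made up.
def longest_match_encode(word, vocab, unk="[UNK]"):
    pieces, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):
            if word[i:j] in vocab:
                pieces.append(word[i:j])
                i = j
                break
        else:
            return [unk]  # no piece covers position i
    return pieces

toy_vocab = {"token", "iz", "ization", "ation", "matter", "s"}
print(longest_match_encode("tokenization", toy_vocab))
print(longest_match_encode("qzx", toy_vocab))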
Demo 24: WordPiece versus BPE
This demo turns a tokenizer concept into a small checked computation.
Code cell 51
header("Demo 24: WordPiece versus BPE - token entropy")
counts = {"the": 50, "cat": 10, "sat": 10, "z": 1}
H = entropy_from_counts(counts)
print("Entropy bits:", round(H, 4))
check_true(0 < H < np.log2(len(counts)), "skewed token distribution has submaximal entropy")
Demo 25: Unknown-token risk
This demo turns a tokenizer concept into a small checked computation.
Code cell 53
header("Demo 25: Unknown-token risk - Viterbi segmentation")
logp = {"a": -2.0, "b": -2.0, "ab": -0.3, "aba": -1.0}
seg, score = viterbi_segment("abab", logp)
print("Best segmentation:", seg, "score:", round(score, 4))
check_true(seg == ["ab", "ab"], "Viterbi selects the high-probability pieces")
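The same segmenter also makes the unknown-token risk in the title concrete: if some character has no vocabulary entry, no path covers the string at all, which is exactly the gap an [UNK] token or a byte fallback has to paper over. This sketch reuses logp from the cell above.
seg_bad, score_bad = viterbi_segment("abxab", logp)
print("Segmentation with an uncovered character:", seg_bad, "score:", score_bad)
check_true(seg_bad == [] and score_bad == -np.inf, "uncovered characters break a closed vocabulary")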