Theory Notebook
Converted from theory.ipynb for web reading.
Tokenization Math
This notebook is the executable companion to notes.md. It implements tiny BPE, unigram/Viterbi, entropy, fertility, special-token, and context-cost examples from scratch.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    value = float(value)
    target = float(target)
    ok = abs(value - target) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {value:.6f}, expected {target:.6f}")
    assert ok, name

def char_tokens(text):
    # Baseline tokenization: one token per character.
    return tuple(text)

def pair_counts(corpus):
    # Count adjacent symbol pairs across a {word: frequency} corpus.
    counts = {}
    for word, freq in corpus.items():
        pieces = list(word)
        for a, b in zip(pieces, pieces[1:]):
            counts[(a, b)] = counts.get((a, b), 0) + freq
    return counts

def merge_word(word, pair):
    # Replace every non-overlapping occurrence of `pair` with the merged symbol.
    out = []
    i = 0
    while i < len(word):
        if i < len(word) - 1 and (word[i], word[i + 1]) == pair:
            out.append(word[i] + word[i + 1])
            i += 2
        else:
            out.append(word[i])
            i += 1
    return tuple(out)

def bpe_train(words, num_merges=3):
    # Greedy BPE: repeatedly merge the most frequent adjacent pair.
    corpus = {tuple(w): f for w, f in words.items()}
    merges = []
    for _ in range(num_merges):
        counts = {}
        for pieces, freq in corpus.items():
            for a, b in zip(pieces, pieces[1:]):
                counts[(a, b)] = counts.get((a, b), 0) + freq
        if not counts:
            break
        pair = max(counts, key=counts.get)
        merges.append(pair)
        corpus = {merge_word(pieces, pair): freq for pieces, freq in corpus.items()}
    return merges, corpus

def bpe_encode(text, merges):
    # Apply the learned merges in training order.
    pieces = tuple(text)
    for pair in merges:
        pieces = merge_word(pieces, pair)
    return list(pieces)

def entropy_from_counts(counts):
    # Shannon entropy (bits) of the empirical token distribution.
    vals = np.array(list(counts.values()), dtype=float)
    probs = vals / vals.sum()
    return float(-(probs * np.log2(probs)).sum())

def viterbi_segment(text, token_logprobs):
    # Best-scoring segmentation of `text` under independent token log-probabilities.
    n = len(text)
    dp = [-1e18] * (n + 1)
    back = [None] * (n + 1)
    dp[0] = 0.0
    for i in range(n):
        if dp[i] < -1e17:
            continue
        for tok, logp in token_logprobs.items():
            if text.startswith(tok, i):
                j = i + len(tok)
                score = dp[i] + logp
                if score > dp[j]:
                    dp[j] = score
                    back[j] = (i, tok)
    if back[n] is None:
        return [], -np.inf
    out = []
    pos = n
    while pos > 0:
        prev, tok = back[pos]
        out.append(tok)
        pos = prev
    return list(reversed(out)), dp[n]

def fertility(text, tokens):
    # Tokens emitted per whitespace-delimited word.
    words = max(1, len(text.split()))
    return len(tokens) / words
print("Tokenization helpers ready.")
Demo 1: Text is not the model input
This demo turns a tokenizer concept into a small checked computation.
Code cell 5
header("Demo 1: Text is not the model input - BPE training")
words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
merges, corpus = bpe_train(words, num_merges=4)
print("Merges:", merges)
print("Corpus:", {" ".join(k): v for k, v in corpus.items()})
check_true(len(merges) == 4, "requested number of merges learned")
Demo 2: The tokenizer as a learned compression map
This demo turns a tokenizer concept into a small checked computation.
Code cell 7
header("Demo 2: The tokenizer as a learned compression map - BPE encode")
merges = [('l', 'o'), ('lo', 'w'), ('e', 'r')]
tokens = bpe_encode("lower", merges)
print("Tokens:", tokens)
check_true("low" in tokens, "learned merge appears in encoding")
Demo 3: Open vocabulary pressure
This demo turns a tokenizer concept into a small checked computation.
Code cell 9
header("Demo 3: Open vocabulary pressure - compression ratio")
text = "tokenization matters"
tokens = ["token", "ization", " matters"]
ratio = len(text) / len(tokens)
print("Characters:", len(text), "tokens:", len(tokens), "chars/token:", round(ratio, 3))
check_true(ratio > 1.0, "subwords compress characters into fewer model positions")
Demo 4: Tokenization tax
This demo turns a tokenizer concept into a small checked computation.
Code cell 11
header("Demo 4: Tokenization tax - token entropy")
counts = {"the": 50, "cat": 10, "sat": 10, "z": 1}
H = entropy_from_counts(counts)
print("Entropy bits:", round(H, 4))
check_true(0 < H < np.log2(len(counts)), "skewed token distribution has submaximal entropy")
Demo 5: Where tokenization affects LLM behavior
This demo turns a tokenizer concept into a small checked computation.
Code cell 13
header("Demo 5: Where tokenization affects LLM behavior - Viterbi segmentation")
logp = {"a": -2.0, "b": -2.0, "ab": -0.3, "aba": -1.0}
seg, score = viterbi_segment("abab", logp)
print("Best segmentation:", seg, "score:", round(score, 4))
check_true(seg == ["ab", "ab"], "Viterbi selects the high-probability pieces")
Demo 6: Alphabet strings and byte sequences
This demo turns a tokenizer concept into a small checked computation.
Code cell 15
header("Demo 6: Alphabet strings and byte sequences - vocabulary parameter cost")
vocab_small, vocab_big, d_model = 32000, 100000, 4096
small = vocab_small * d_model
big = vocab_big * d_model
print("Small embedding params:", small)
print("Big embedding params:", big)
check_true(big > small, "larger vocabulary costs more embedding parameters")
Demo 7: Vocabulary and token ids
This demo turns a tokenizer concept into a small checked computation.
Code cell 17
header("Demo 7: Vocabulary and token ids - attention token cost")
n1, n2 = 1000, 1500
cost_ratio = (n2 / n1) ** 2
print("Cost ratio:", round(cost_ratio, 3))
check_close(cost_ratio, 2.25, name="quadratic attention scaling")
Demo 8: Tokenizer and detokenizer functions
This demo turns a tokenizer concept into a small checked computation.
Code cell 19
header("Demo 8: Tokenizer and detokenizer functions - fertility")
text_a = "the cat sat"
tokens_a = ["the", " cat", " sat"]
text_b = "uncommonword example"
tokens_b = ["un", "common", "word", " example"]
fert_a = fertility(text_a, tokens_a)
fert_b = fertility(text_b, tokens_b)
print("Fertility A:", fert_a, "Fertility B:", fert_b)
check_true(fert_b > fert_a, "more splits create higher fertility")
Demo 9: Lossless versus lossy normalization
This demo turns a tokenizer concept into a small checked computation.
Code cell 21
header("Demo 9: Lossless versus lossy normalization - round trip")
text = " code\nblock"
tokens = list(text.encode("utf-8"))
decoded = bytes(tokens).decode("utf-8")
print("Byte ids:", tokens[:8], "...")
print("Decoded:", repr(decoded))
check_true(decoded == text, "byte representation round-trips exactly")
Demo 10: Pre-tokenization boundaries
This demo turns a tokenizer concept into a small checked computation.
Code cell 23
header("Demo 10: Pre-tokenization boundaries - special token protection")
ordinary = ["<", "tool", ">"]
special = ["<tool>"]
print("Ordinary split:", ordinary)
print("Protected special:", special)
check_true(len(special) < len(ordinary), "special token stays atomic")
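The check above only compares list lengths. As a hedged sketch of what enforces such boundaries in practice, the code below shows an illustrative regex pre-tokenizer plus a hypothetical split_specials helper that carves out special tokens before ordinary splitting; both the pattern and the helper are assumptions for illustration, not any particular library's rules.
import re

def pretokenize(text):
    # Illustrative pre-tokenization pattern (an assumption, not the exact GPT-2 regex):
    # split into word-like chunks that keep their leading space, so merges never cross them.
    return re.findall(r" ?[A-Za-z]+| ?\d+| ?[^\sA-Za-z\d]+", text)

def split_specials(text, specials=("<tool>",)):
    # Hypothetical helper: carve out special tokens first so they stay atomic.
    pattern = "(" + "|".join(re.escape(s) for s in specials) + ")"
    return [piece for piece in re.split(pattern, text) if piece]

print("Pre-tokenized:", pretokenize("Hello, world! <tool> call"))
print("With protection:", split_specials("call <tool> now"))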
Demo 11: BPE merge objective
This demo turns a tokenizer concept into a small checked computation.
Code cell 25
header("Demo 11: BPE merge objective - BPE training")
words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
merges, corpus = bpe_train(words, num_merges=4)
print("Merges:", merges)
print("Corpus:", {" ".join(k): v for k, v in corpus.items()})
check_true(len(merges) == 4, "requested number of merges learned")
Demo 12: Greedy training loop
This demo turns a tokenizer concept into a small checked computation.
Code cell 27
header("Demo 12: Greedy training loop - BPE encode")
merges = [('l', 'o'), ('lo', 'w'), ('e', 'r')]
tokens = bpe_encode("lower", merges)
print("Tokens:", tokens)
check_true("low" in tokens, "learned merge appears in encoding")
Demo 13: Encoding with learned merges
This demo turns a tokenizer concept into a small checked computation.
Code cell 29
header("Demo 13: Encoding with learned merges - compression ratio")
text = "tokenization matters"
tokens = ["token", "ization", " matters"]
ratio = len(text) / len(tokens)
print("Characters:", len(text), "tokens:", len(tokens), "chars/token:", round(ratio, 3))
check_true(ratio > 1.0, "subwords compress characters into fewer model positions")
Demo 14: Byte-level BPE
This demo turns a tokenizer concept into a small checked computation.
Code cell 31
header("Demo 14: Byte-level BPE - token entropy")
counts = {"the": 50, "cat": 10, "sat": 10, "z": 1}
H = entropy_from_counts(counts)
print("Entropy bits:", round(H, 4))
check_true(0 < H < np.log2(len(counts)), "skewed token distribution has submaximal entropy")
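The entropy computation above does not depend on the tokenizer's alphabet. As a hedged sketch of the byte-level idea named in the title, the code below reuses bpe_train from the setup cell on UTF-8 bytes; the latin-1 round trip is only a trick to treat each byte as a single symbol, and the toy words are made up.
def to_byte_symbols(text):
    # Render each UTF-8 byte as one latin-1 character so the string-based helpers apply.
    return text.encode("utf-8").decode("latin-1")

byte_words = {to_byte_symbols(w): f for w, f in {"naïve": 3, "nave": 2}.items()}
byte_merges, byte_corpus = bpe_train(byte_words, num_merges=3)
print("Byte-level merges:", byte_merges)
check_true(len(byte_merges) == 3, "byte-level training learns the requested merges")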
Demo 15: BPE limitations
This demo turns a tokenizer concept into a small checked computation.
Code cell 33
header("Demo 15: BPE limitations - Viterbi segmentation")
logp = {"a": -2.0, "b": -2.0, "ab": -0.3, "aba": -1.0}
seg, score = viterbi_segment("abab", logp)
print("Best segmentation:", seg, "score:", round(score, 4))
check_true(seg == ["ab", "ab"], "Viterbi selects the high-probability pieces")
Demo 16: Unigram token probability model
This demo turns a tokenizer concept into a small checked computation.
Code cell 35
header("Demo 16: Unigram token probability model - vocabulary parameter cost")
vocab_small, vocab_big, d_model = 32000, 100000, 4096
small = vocab_small * d_model
big = vocab_big * d_model
print("Small embedding params:", small)
print("Big embedding params:", big)
check_true(big > small, "larger vocabulary costs more embedding parameters")
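The parameter count above is about vocabulary size; the unigram model named in the title is about piece probabilities. As a hedged sketch, the code below turns made-up piece counts into log-probabilities and feeds them to viterbi_segment from the setup cell.
import math

piece_counts = {"token": 40, "ization": 15, "iz": 5, "ation": 10,
                "t": 2, "o": 2, "k": 2, "e": 2, "n": 2, "i": 2, "z": 2, "a": 2}
total = sum(piece_counts.values())
piece_logp = {t: math.log(c / total) for t, c in piece_counts.items()}
seg, score = viterbi_segment("tokenization", piece_logp)
print("Unigram segmentation:", seg, "log-prob:", round(score, 3))
check_true(seg == ["token", "ization"], "frequent long pieces beat character fallbacks")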
Demo 17: Viterbi segmentation
This demo turns a tokenizer concept into a small checked computation.
Code cell 37
header("Demo 17: Viterbi segmentation - attention token cost")
n1, n2 = 1000, 1500
cost_ratio = (n2 / n1) ** 2
print("Cost ratio:", round(cost_ratio, 3))
check_close(cost_ratio, 2.25, name="quadratic attention scaling")
Demo 18: Forward probabilities
This demo turns a tokenizer concept into a small checked computation.
Code cell 39
header("Demo 18: Forward probabilities - fertility")
text_a = "the cat sat"
tokens_a = ["the", " cat", " sat"]
text_b = "uncommonword example"
tokens_b = ["un", "common", "word", " example"]
fert_a = fertility(text_a, tokens_a)
fert_b = fertility(text_b, tokens_b)
print("Fertility A:", fert_a, "Fertility B:", fert_b)
check_true(fert_b > fert_a, "more splits create higher fertility")
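Fertility averages over one fixed segmentation; the forward pass named in the title sums over all segmentations instead. The hedged sketch below mirrors viterbi_segment but accumulates probability mass rather than keeping only the best path, using made-up piece probabilities.
def forward_prob(text, token_probs):
    # alpha[i] = total probability of all segmentations of text[:i].
    n = len(text)
    alpha = [0.0] * (n + 1)
    alpha[0] = 1.0
    for i in range(n):
        if alpha[i] == 0.0:
            continue
        for tok, p in token_probs.items():
            if text.startswith(tok, i):
                alpha[i + len(tok)] += alpha[i] * p
    return alpha[n]

forward_probs = {"a": 0.1, "b": 0.1, "ab": 0.6, "aba": 0.2}
marginal = forward_prob("abab", forward_probs)
print("Marginal probability of 'abab':", round(marginal, 4))
check_true(marginal >= 0.6 * 0.6, "the sum over paths is at least the best single path")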
Demo 19: EM intuition
This demo turns a tokenizer concept into a small checked computation.
Code cell 41
header("Demo 19: EM intuition - round trip")
text = " code\nblock"
tokens = list(text.encode("utf-8"))
decoded = bytes(tokens).decode("utf-8")
print("Byte ids:", tokens[:8], "...")
print("Decoded:", repr(decoded))
check_true(decoded == text, "byte representation round-trips exactly")
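The round trip above is about losslessness; the EM idea named in the title is about re-estimating piece probabilities. The hedged sketch below runs one hard-EM round on a made-up corpus: segment with the current probabilities via viterbi_segment (E-step), then recompute probabilities from the resulting piece counts (M-step).
import math

corpus_words = ["abab", "aba", "ab"]
em_logp = {"a": math.log(0.15), "b": math.log(0.15), "ab": math.log(0.5), "aba": math.log(0.2)}
em_counts = {}
for w in corpus_words:
    seg, _ = viterbi_segment(w, em_logp)
    for piece in seg:
        em_counts[piece] = em_counts.get(piece, 0) + 1
em_total = sum(em_counts.values())
print("Piece counts after the E-step:", em_counts)
print("Re-estimated probabilities (M-step):", {t: round(c / em_total, 3) for t, c in em_counts.items()})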
Demo 20: Subword regularization
This demo turns a tokenizer concept into a small checked computation.
Code cell 43
header("Demo 20: Subword regularization - special token protection")
ordinary = ["<", "tool", ">"]
special = ["<tool>"]
print("Ordinary split:", ordinary)
print("Protected special:", special)
check_true(len(special) < len(ordinary), "special token stays atomic")
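Protecting special tokens is deterministic; subword regularization, named in the title, deliberately varies segmentations during training. The hedged sketch below enumerates every segmentation of a short string and samples one in proportion to its probability; the vocabulary and probabilities are made up, and full enumeration only works for tiny examples.
def all_segmentations(text, vocab):
    # Recursively enumerate every way to cover `text` with vocabulary pieces.
    if not text:
        return [[]]
    segs = []
    for tok in vocab:
        if text.startswith(tok):
            for rest in all_segmentations(text[len(tok):], vocab):
                segs.append([tok] + rest)
    return segs

sample_probs = {"a": 0.1, "b": 0.1, "ab": 0.6, "aba": 0.2}
candidates = all_segmentations("abab", sample_probs)
weights = np.array([np.prod([sample_probs[t] for t in seg]) for seg in candidates])
weights = weights / weights.sum()
sampled = candidates[np.random.choice(len(candidates), p=weights)]
print("Candidate segmentations:", len(candidates), "sampled:", sampled)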
Demo 21: WordPiece merge score
This demo turns a tokenizer concept into a small checked computation.
Code cell 45
header("Demo 21: WordPiece merge score - BPE training")
words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
merges, corpus = bpe_train(words, num_merges=4)
print("Merges:", merges)
print("Corpus:", {" ".join(k): v for k, v in corpus.items()})
check_true(len(merges) == 4, "requested number of merges learned")
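The cell above ranks merges by raw pair frequency, the BPE objective. The WordPiece score named in the title divides the pair count by the counts of its parts; the hedged sketch below computes both rankings on the same toy corpus, reusing pair_counts from the setup cell.
toy_words = {"low": 5, "lower": 2, "newest": 3, "widest": 2}
pairs = pair_counts(toy_words)
unit_counts = {}
for word, freq in toy_words.items():
    for ch in word:
        unit_counts[ch] = unit_counts.get(ch, 0) + freq
# WordPiece-style score: count(ab) / (count(a) * count(b)).
scores = {pair: c / (unit_counts[pair[0]] * unit_counts[pair[1]]) for pair, c in pairs.items()}
print("BPE would merge:", max(pairs, key=pairs.get))
print("WordPiece would merge:", max(scores, key=scores.get))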
Demo 22: Continuation markers
This demo turns a tokenizer concept into a small checked computation.
Code cell 47
header("Demo 22: Continuation markers - BPE encode")
merges = [('l', 'o'), ('lo', 'w'), ('e', 'r')]
tokens = bpe_encode("lower", merges)
print("Tokens:", tokens)
check_true("low" in tokens, "learned merge appears in encoding")
Demo 23: Greedy longest-match encoding
This demo turns a tokenizer concept into a small checked computation.
Code cell 49
header("Demo 23: Greedy longest-match encoding - compression ratio")
text = "tokenization matters"
tokens = ["token", "ization", " matters"]
ratio = len(text) / len(tokens)
print("Characters:", len(text), "tokens:", len(tokens), "chars/token:", round(ratio, 3))
check_true(ratio > 1.0, "subwords compress characters into fewer model positions")
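The compression ratio above takes the token list as given. As a hedged sketch of the greedy longest-match encoder named in the title: at each position take the longest vocabulary entry that matches, and fall back to an unknown marker when nothing does. The vocabulary here is made up.
def longest_match_encode(word, vocab, unk="[UNK]"):
    pieces, i = [], 0
    while i < len(word):
        for j in range(len(word), i, -1):
            if word[i:j] in vocab:
                pieces.append(word[i:j])
                i = j
                break
        else:
            return [unk]  # no piece covers position i
    return pieces

toy_vocab = {"token", "iz", "ization", "ation", "matter", "s"}
print(longest_match_encode("tokenization", toy_vocab))
print(longest_match_encode("qzx", toy_vocab))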
Demo 24: WordPiece versus BPE
This demo turns a tokenizer concept into a small checked computation.
Code cell 51
header("Demo 24: WordPiece versus BPE - token entropy")
counts = {"the": 50, "cat": 10, "sat": 10, "z": 1}
H = entropy_from_counts(counts)
print("Entropy bits:", round(H, 4))
check_true(0 < H < np.log2(len(counts)), "skewed token distribution has submaximal entropy")
Demo 25: Unknown-token risk
This demo turns a tokenizer concept into a small checked computation.
Code cell 53
header("Demo 25: Unknown-token risk - Viterbi segmentation")
logp = {"a": -2.0, "b": -2.0, "ab": -0.3, "aba": -1.0}
seg, score = viterbi_segment("abab", logp)
print("Best segmentation:", seg, "score:", round(score, 4))
check_true(seg == ["ab", "ab"], "Viterbi selects the high-probability pieces")
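The same segmenter also makes the unknown-token risk in the title concrete: if some character has no vocabulary entry, no path covers the string at all, which is exactly the gap an [UNK] token or a byte fallback has to paper over. This sketch reuses logp from the cell above.
seg_bad, score_bad = viterbi_segment("abxab", logp)
print("Segmentation with an uncovered character:", seg_bad, "score:", score_bad)
check_true(seg_bad == [] and score_bad == -np.inf, "uncovered characters break a closed vocabulary")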