Theory Notebook — Math for LLMs

Full Dataset Assembly

LLM Training Data Pipeline / Full Dataset Assembly

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Full Dataset Assembly - Theory Notebook

Executable companion to notes.md.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Plot styling shared by every figure in this notebook. seaborn is optional:
# when it imports, its theme is applied; otherwise a matplotlib style sheet is
# used instead. HAS_SNS records which path was taken.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # NOTE(review): assumes this style-sheet name exists in the installed
    # matplotlib version — confirm.
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Runs after the theme/style call above, presumably so that these explicit
# values are not overwritten by the theme defaults.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Seed NumPy's global RNG so later np.random calls are repeatable.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import json
import math
import hashlib
import re
from collections import Counter, defaultdict

# Named hex colours used by the plots in this notebook.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)

def stable_hash(value):
    """Return the SHA-256 hex digest of ``value``.

    A ``str`` is hashed as-is. Any other value is first serialised with
    ``json.dumps`` using sorted keys and ``ensure_ascii=False``, so two dicts
    with the same items hash identically regardless of insertion order.
    The text is encoded as UTF-8 before hashing.
    """
    if isinstance(value, str):
        payload = value
    else:
        payload = json.dumps(value, sort_keys=True, ensure_ascii=False)
    hasher = hashlib.sha256()
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()

def token_count(text):
    """Return the number of maximal runs of non-whitespace characters in ``text``."""
    tokens = re.findall(r"\S+", text)
    return len(tokens)

def check_true(name, condition):
    """Print ``PASS - name`` or ``FAIL - name`` and return the outcome.

    ``condition`` is evaluated for truthiness; the returned value is always a
    plain ``bool``.
    """
    ok = True if condition else False
    label = "PASS" if ok else "FAIL"
    print(f"{label} - {name}")
    return ok

def check_close(name, value, expected, tol=1e-6):
    """Print a PASS/FAIL line for a numeric comparison and return the outcome.

    ``tol`` is passed to ``np.allclose`` as both the absolute and the relative
    tolerance. The printed line includes ``value`` and ``expected``.
    """
    ok = np.allclose(value, expected, rtol=tol, atol=tol)
    verdict = "PASS" if ok else "FAIL"
    print(f"{verdict} - {name}: value={value}, expected={expected}")
    return ok

def normalize_text(text):
    """Normalise line endings and horizontal whitespace in ``text``.

    ``\\r\\n`` and lone ``\\r`` become ``\\n``; each run of spaces/tabs becomes
    a single space; leading and trailing whitespace is stripped.
    """
    unix_newlines = re.sub(r"\r\n?", "\n", text)
    single_spaced = re.sub(r"[ \t]+", " ", unix_newlines)
    return single_spaced.strip()

def shingle(text, n=5):
    """Return the set of word ``n``-gram shingles of ``text``.

    The text is lower-cased and split into ``\\w+`` tokens; each shingle is
    ``n`` consecutive tokens joined by single spaces.

    Args:
        text: Input string.
        n: Shingle width in tokens; must be at least 1.

    Returns:
        A set of shingle strings. Text with no tokens yields an empty set.
        Text with fewer than ``n`` tokens yields one shingle containing all of
        its tokens, so that identical short texts still compare as equal
        instead of both collapsing to the empty set.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    toks = re.findall(r"\w+", text.lower())
    if not toks:
        return set()
    if len(toks) < n:
        # Too short for a full window: fall back to one whole-text shingle.
        return {" ".join(toks)}
    return {" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1)}

def jaccard(a, b):
    """Return ``|a & b| / |a | b|`` for two sets.

    When the union is empty the denominator is taken as 1, so the result is
    ``0.0`` rather than a division error.
    """
    shared = len(a & b)
    combined = len(a | b)
    return shared / (combined or 1)

# Marks that the helper definitions in this cell have been executed.
print("Pipeline helpers ready.")

Synthetic records

Code cell 5

# Three toy records; every record carries id, text, source and license.
records = [
    {
        "id": "a",
        "text": "clean educational text about linear algebra",
        "source": "web",
        "license": "cc",
    },
    {
        "id": "b",
        "text": "code example def add(a, b): return a + b",
        "source": "code",
        "license": "mit",
    },
    {
        "id": "c",
        "text": "short",
        "source": "web",
        "license": "unknown",
    },
]
print("record count", len(records))
print("token counts", [token_count(rec["text"]) for rec in records])
# Equivalent to all("text" in rec ...): no record may lack the key.
check_true("all records have text", not any("text" not in rec for rec in records))

Stable hashes

Code cell 7

# Content-derived ids: the first 12 hex characters of the hash of each
# record's text and source.
ids = [
    stable_hash({field: rec[field] for field in ("text", "source")})[:12]
    for rec in records
]
print("stable ids", ids)
check_true("ids are unique", len(ids) == len(set(ids)))

Acceptance rate

Code cell 9

# Token length of every record, in record order.
lengths = np.array([token_count(rec["text"]) for rec in records])
# A record is accepted when it has at least 4 tokens.
accepted = lengths >= 4
# Fraction of documents kept, and fraction of tokens kept.
rate = accepted.mean()
token_rate = lengths[accepted].sum() / lengths.sum()
print("document acceptance", rate)
print("token acceptance", token_rate)
check_true("token and document rates can differ", abs(token_rate - rate) > 0)

Source histogram

Code cell 11

# Bar chart of how many records come from each source.
source_counts = Counter(rec["source"] for rec in records)
fig, ax = plt.subplots()
ax.bar(
    list(source_counts.keys()),
    list(source_counts.values()),
    color=COLORS["primary"],
)
ax.set_xlabel("Source")
ax.set_ylabel("Records")
ax.set_title("Records by source")
fig.tight_layout()
plt.show()
print("source counts", dict(source_counts))

Mixture sampling

Code cell 13

# Draw 1000 domain labels according to the mixture weights `alpha`.
domains = ["web", "code", "books"]
alpha = np.array([0.5, 0.3, 0.2])
# Uses NumPy's global RNG, so the result depends on the seed set earlier.
draws = np.random.choice(domains, size=1000, p=alpha)
# Convert to plain Python strings before counting: the array elements are
# NumPy string scalars, which NumPy 2 prints as np.str_('web') in the repr.
freq = Counter(draws.tolist())
print(freq)
check_true("all domains sampled", set(freq) == set(domains))

Deterministic split

Code cell 15

def split_for_id(identifier):
    """Map ``identifier`` to ``"train"``, ``"validation"`` or ``"test"``.

    The identifier's ``stable_hash`` is read as a hexadecimal integer and
    reduced modulo 100. Buckets 0-89 are train, 90-94 validation, 95-99 test.
    """
    bucket = int(stable_hash(identifier), 16) % 100
    if bucket < 90:
        return "train"
    if bucket < 95:
        return "validation"
    return "test"
# One split label per record, in record order.
splits = list(map(split_for_id, (rec["id"] for rec in records)))
print(splits)
check_true("split assigned for each record", len(records) == len(splits))

Sequence packing

Code cell 17

# Greedy in-order packing: append each length to the open pack, and start a
# new pack whenever the next length would push the open pack past `capacity`.
lengths = [3, 5, 4, 7, 2]
capacity = 8
packs, current = [], []
used = 0
for length in lengths:
    # Only close a pack that holds something. Without the `current` guard, a
    # first item larger than `capacity` would emit an empty pack.
    if current and used + length > capacity:
        packs.append(current)
        current = []
        used = 0
    current.append(length)
    used += length
# Flush the last open pack; skipped when `lengths` is empty.
if current:
    packs.append(current)
print("packs", packs)
# NOTE: a single length larger than `capacity` still gets a pack of its own
# and makes this check fail; this cell does not split or truncate such items.
check_true("each pack within capacity", all(sum(p) <= capacity for p in packs))

Manifest proportions

Code cell 19

# Per-source counts and each source's share of the total.
manifest = dict(web=5000, code=3000, books=2000)
total = sum(manifest.values())
proportions = {source: count / total for source, count in manifest.items()}
print(proportions)
check_close("proportions sum to one", sum(proportions.values()), 1.0)

Token-weighted accounting

Code cell 21

# Each record's share of the total token count.
tok = np.array([token_count(rec["text"]) for rec in records])
weights = tok / tok.sum()
print("token weights", weights.round(3))
check_close("token weights sum", weights.sum(), 1.0)

Deterministic random seed

Code cell 23

# Reseed NumPy's global RNG with the same value before each draw; the check
# below verifies that the two draws come out identical. The seed/draw order
# matters here, so the statements must stay in this sequence.
np.random.seed(42)
a = np.random.choice(10, size=5)
np.random.seed(42)
b = np.random.choice(10, size=5)
print(a, b)
check_true("seed reproduces sample", np.array_equal(a, b))

Rate dashboard

Code cell 25

# Record counts remaining after each pipeline stage, plotted as a line.
stages = ["input", "valid", "accepted", "released"]
counts = np.array([1000, 900, 720, 700])
fig, ax = plt.subplots()
ax.plot(stages, counts, marker="o", color=COLORS["secondary"])
ax.set_title("Pipeline count dashboard")
ax.set_xlabel("Stage")
ax.set_ylabel("Records")
fig.tight_layout()
plt.show()
# tolist() yields plain Python ints, so the dict prints as {'input': 1000, ...}
# instead of showing NumPy scalar reprs such as np.int64(1000) under NumPy 2.
print(dict(zip(stages, counts.tolist())))

Audit statistic 11

Code cell 27

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([11.0, 12.0, 13.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 12

Code cell 29

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([12.0, 13.0, 14.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 13

Code cell 31

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([13.0, 14.0, 15.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 14

Code cell 33

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([14.0, 15.0, 16.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 15

Code cell 35

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([15.0, 16.0, 17.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 16

Code cell 37

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([16.0, 17.0, 18.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 17

Code cell 39

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([17.0, 18.0, 19.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 18

Code cell 41

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([18.0, 19.0, 20.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 19

Code cell 43

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([19.0, 20.0, 21.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 20

Code cell 45

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([20.0, 21.0, 22.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 21

Code cell 47

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([21.0, 22.0, 23.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 22

Code cell 49

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([22.0, 23.0, 24.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 23

Code cell 51

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([23.0, 24.0, 25.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 24

Code cell 53

# Mean of a fixed three-value sample; the check confirms it is not NaN/inf.
values = np.array([24.0, 25.0, 26.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))