Theory Notebook
Theory Notebook
Converted from
theory.ipynb for web reading.
Full Dataset Assembly - Theory Notebook
Executable companion to notes.md.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Plot styling for the whole notebook.
# Use seaborn's theme when seaborn is importable; otherwise fall back to the
# "seaborn-v0_8-whitegrid" Matplotlib style sheet. HAS_SNS records which
# branch was taken.
try:
    import seaborn as sns
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False
else:
    # Only the import is guarded: an error raised while applying the theme
    # should surface instead of being mistaken for "seaborn not installed".
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults, applied after the theme so they take precedence.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Seed NumPy's global RNG so the sampling cells below are reproducible.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import json
import math
import hashlib
import re
from collections import Counter, defaultdict
# Named hex colours shared by every figure in the notebook.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def stable_hash(value: object) -> str:
    """Return the SHA-256 hex digest of *value*.

    Strings are hashed as their UTF-8 bytes. Any other value is first
    serialised with ``json.dumps(..., sort_keys=True, ensure_ascii=False)``
    so that dictionaries hash the same regardless of key insertion order.

    Args:
        value: A string, or any JSON-serialisable object.

    Returns:
        A 64-character lowercase hexadecimal digest.

    Raises:
        TypeError: If a non-string *value* is not JSON-serialisable
            (raised by ``json.dumps``).
    """
    if not isinstance(value, str):
        # Canonical JSON: sorted keys make the digest order-independent.
        value = json.dumps(value, sort_keys=True, ensure_ascii=False)
    return hashlib.sha256(value.encode("utf-8")).hexdigest()
def token_count(text: str) -> int:
    """Return the number of whitespace-delimited tokens in *text*.

    A token is any maximal run of non-whitespace characters (regex ``\\S+``).
    Empty or whitespace-only text has zero tokens.
    """
    # Count matches lazily; the matched strings themselves are not needed.
    return sum(1 for _ in re.finditer(r"\S+", text))
def check_true(name: str, condition: object) -> bool:
    """Print a PASS/FAIL line for *condition* and return its truth value.

    Args:
        name: Label printed after the PASS/FAIL marker.
        condition: Any value; it is coerced with ``bool()``.

    Returns:
        ``True`` when *condition* is truthy, otherwise ``False``.
    """
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok
def check_close(name: str, value, expected, tol: float = 1e-6) -> bool:
    """Print a PASS/FAIL line for a numeric comparison and return the result.

    The comparison is ``np.allclose(value, expected, atol=tol, rtol=tol)``,
    so *tol* serves as both the absolute and the relative tolerance.

    Args:
        name: Label printed after the PASS/FAIL marker.
        value: Scalar or array-like that was computed.
        expected: Scalar or array-like reference value.
        tol: Tolerance used for both ``atol`` and ``rtol``.

    Returns:
        ``True`` when every element is within tolerance, otherwise ``False``.
    """
    # bool() keeps the return type a plain Python bool, matching check_true.
    ok = bool(np.allclose(value, expected, atol=tol, rtol=tol))
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok
def normalize_text(text: str) -> str:
    """Normalise line endings and horizontal whitespace in *text*.

    Steps, in order:
      1. Convert ``\\r\\n`` and bare ``\\r`` line endings to ``\\n``.
      2. Collapse every run of spaces and tabs into a single space.
      3. Strip leading and trailing whitespace from the whole string.

    Newlines inside the text are preserved; a single space directly before
    or after an interior newline is not removed.
    """
    # Handle CRLF first so a "\r\n" pair does not turn into two newlines.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    return text.strip()
def shingle(text: str, n: int = 5) -> set:
    """Return the set of lowercase word n-grams ("shingles") in *text*.

    Words are the ``\\w+`` matches of the lowercased text; each shingle is
    *n* consecutive words joined by single spaces.

    Args:
        text: Input text.
        n: Number of words per shingle; must be at least 1.

    Returns:
        A set of shingle strings. Text with no words gives an empty set.
        Text with fewer than *n* words gives one shingle holding all of
        its words, so short documents can still be compared.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be >= 1, got {n}")
    toks = re.findall(r"\w+", text.lower())
    if not toks:
        return set()
    if len(toks) < n:
        # Too short for a full window: keep the whole text as one shingle
        # instead of silently producing nothing.
        return {" ".join(toks)}
    return {" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1)}
def jaccard(a, b) -> float:
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two collections.

    Args:
        a: Iterable of hashable items (for example a set of shingles).
        b: Iterable of hashable items.

    Returns:
        A float in ``[0.0, 1.0]``. Two empty collections give ``0.0``.
    """
    # Accept any iterable, not only sets.
    a, b = set(a), set(b)
    union = a | b
    if not union:
        # Nothing to compare; avoid dividing by zero.
        return 0.0
    return len(a & b) / len(union)


print("Pipeline helpers ready.")
Synthetic records
Code cell 5
# Three toy records reused by the later cells. Each carries an id, its text,
# the source it came from and a license tag.
records = [
    {
        "id": "a",
        "text": "clean educational text about linear algebra",
        "source": "web",
        "license": "cc",
    },
    {
        "id": "b",
        "text": "code example def add(a, b): return a + b",
        "source": "code",
        "license": "mit",
    },
    {
        "id": "c",
        "text": "short",
        "source": "web",
        "license": "unknown",
    },
]
print("record count", len(records))
print("token counts", list(map(token_count, (rec["text"] for rec in records))))
check_true("all records have text", all("text" in rec for rec in records))
Stable hashes
Code cell 7
# Content-derived ids: hash only the text and source fields, then keep the
# first 12 hex characters of the digest.
ids = [
    stable_hash({field: rec[field] for field in ("text", "source")})[:12]
    for rec in records
]
print("stable ids", ids)
check_true("ids are unique", len(ids) == len(set(ids)))
Acceptance rate
Code cell 9
# Accept documents with at least 4 tokens, then report the accepted share
# twice: as a fraction of documents and as a fraction of tokens.
lengths = np.array(list(map(token_count, (rec["text"] for rec in records))))
accepted = lengths >= 4
rate = np.mean(accepted)
token_rate = np.sum(lengths[accepted]) / np.sum(lengths)
print("document acceptance", rate)
print("token acceptance", token_rate)
check_true("token and document rates can differ", abs(rate - token_rate) > 0)
Source histogram
Code cell 11
# Bar chart of how many records each source contributed.
source_counts = Counter(rec["source"] for rec in records)
fig, ax = plt.subplots()
ax.bar(
    list(source_counts.keys()),
    [source_counts[name] for name in source_counts],
    color=COLORS["primary"],
)
ax.set_title("Records by source")
ax.set_xlabel("Source")
ax.set_ylabel("Records")
fig.tight_layout()
plt.show()
print("source counts", dict(source_counts))
Mixture sampling
Code cell 13
# Draw 1000 domain labels with mixture weights alpha and tally them.
domains = ["web", "code", "books"]
alpha = np.array([0.5, 0.3, 0.2])
draws = np.random.choice(domains, size=1000, p=alpha)
freq = Counter()
freq.update(draws)
print(freq)
check_true("all domains sampled", freq.keys() == set(domains))
Deterministic split
Code cell 15
def split_for_id(identifier):
    """Assign *identifier* to a dataset split based on its hash.

    The SHA-256 digest from ``stable_hash`` is read as an integer and reduced
    modulo 100 to a bucket in ``0..99``. Buckets 0-89 map to ``"train"``,
    90-94 to ``"validation"`` and 95-99 to ``"test"``, so the same identifier
    always gets the same split.

    Args:
        identifier: Any value accepted by ``stable_hash``.

    Returns:
        One of ``"train"``, ``"validation"`` or ``"test"``.
    """
    bucket = int(stable_hash(identifier), 16) % 100
    if bucket < 90:
        return "train"
    if bucket < 95:
        return "validation"
    return "test"
# Assign every record to a split by its id.
splits = list(map(split_for_id, (rec["id"] for rec in records)))
print(splits)
check_true("split assigned for each record", len(records) == len(splits))
Sequence packing
Code cell 17
# Greedy sequential packing: append each sequence to the current pack and
# start a new pack as soon as the next sequence would exceed the capacity.
lengths = [3, 5, 4, 7, 2]
capacity = 8
packs, current = [], []
used = 0
for length in lengths:
    # Close the pack only when it holds something; otherwise a sequence longer
    # than the capacity would leave an empty pack behind.
    if current and used + length > capacity:
        packs.append(current)
        current = []
        used = 0
    current.append(length)
    used += length
# Flush the last pack, skipping it when there was no input at all.
if current:
    packs.append(current)
print("packs", packs)
check_true("each pack within capacity", all(sum(p) <= capacity for p in packs))
Manifest proportions
Code cell 19
# Turn per-domain counts into fractions of the manifest total.
manifest = dict(web=5000, code=3000, books=2000)
total = sum(manifest[name] for name in manifest)
proportions = {name: manifest[name] / total for name in manifest}
print(proportions)
check_close("proportions sum to one", sum(proportions.values()), 1.0)
Token-weighted accounting
Code cell 21
# Each record's share of the total token count.
tok = np.array(list(map(token_count, (rec["text"] for rec in records))))
weights = np.divide(tok, np.sum(tok))
print("token weights", weights.round(3))
check_close("token weights sum", np.sum(weights), 1.0)
Deterministic random seed
Code cell 23
# Reseed with the same value before each draw; both samples must match.
samples = []
for _ in range(2):
    np.random.seed(42)
    samples.append(np.random.choice(10, size=5))
a, b = samples
print(a, b)
check_true("seed reproduces sample", np.array_equal(a, b))
Rate dashboard
Code cell 25
# Line plot of how many records survive each pipeline stage.
stages = ["input", "valid", "accepted", "released"]
counts = np.array([1000, 900, 720, 700])
fig, ax = plt.subplots()
ax.plot(stages, counts, marker="o", color=COLORS["secondary"])
ax.set(title="Pipeline count dashboard", xlabel="Stage", ylabel="Records")
fig.tight_layout()
plt.show()
print({stage: count for stage, count in zip(stages, counts)})
Audit statistic 11
Code cell 27
# Audit sample 11, 12, 13: the mean must be a finite number.
values = np.arange(11, 14, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 12
Code cell 29
# Audit sample 12, 13, 14: the mean must be a finite number.
values = np.arange(12, 15, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 13
Code cell 31
# Audit sample 13, 14, 15: the mean must be a finite number.
values = np.arange(13, 16, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 14
Code cell 33
# Audit sample 14, 15, 16: the mean must be a finite number.
values = np.arange(14, 17, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 15
Code cell 35
# Audit sample 15, 16, 17: the mean must be a finite number.
values = np.arange(15, 18, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 16
Code cell 37
# Audit sample 16, 17, 18: the mean must be a finite number.
values = np.arange(16, 19, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 17
Code cell 39
# Audit sample 17, 18, 19: the mean must be a finite number.
values = np.arange(17, 20, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 18
Code cell 41
# Audit sample 18, 19, 20: the mean must be a finite number.
values = np.arange(18, 21, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 19
Code cell 43
# Audit sample 19, 20, 21: the mean must be a finite number.
values = np.arange(19, 22, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 20
Code cell 45
# Audit sample 20, 21, 22: the mean must be a finite number.
values = np.arange(20, 23, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 21
Code cell 47
# Audit sample 21, 22, 23: the mean must be a finite number.
values = np.arange(21, 24, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 22
Code cell 49
# Audit sample 22, 23, 24: the mean must be a finite number.
values = np.arange(22, 25, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 23
Code cell 51
# Audit sample 23, 24, 25: the mean must be a finite number.
values = np.arange(23, 26, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 24
Code cell 53
# Audit sample 24, 25, 26: the mean must be a finite number.
values = np.arange(24, 27, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))