Theory Notebook · Math for LLMs

Quality Checks

LLM Training Data Pipeline / Quality Checks

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Quality Checks - Theory Notebook

Executable companion to notes.md.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer seaborn's theme when it is installed; otherwise fall back to a
# Matplotlib built-in style sheet with a similar white-grid look.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    HAS_SNS = False
    # Matplotlib 3.6 renamed its bundled seaborn style sheets with a
    # "seaborn-v0_8-" prefix, while older releases only ship the unprefixed
    # name. Use whichever one is available so the fallback cannot raise;
    # if neither exists the default Matplotlib style stays in effect.
    for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
        if _style_name in plt.style.available:
            plt.style.use(_style_name)
            break

# Figure defaults applied to every plot created after this cell.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Seed NumPy's global random generator so random draws repeat across runs.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import json
import math
import hashlib
import re
from collections import Counter, defaultdict

# Named hex colours looked up by the plotting cells (e.g. COLORS["primary"]).
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)

def stable_hash(value):
    """Return the SHA-256 hex digest of ``value``.

    Strings are hashed as-is (UTF-8 encoded). Any other value is first
    serialised with ``json.dumps(..., sort_keys=True, ensure_ascii=False)``
    so that dictionaries with the same content hash identically regardless
    of key insertion order.
    """
    if isinstance(value, str):
        payload = value
    else:
        payload = json.dumps(value, sort_keys=True, ensure_ascii=False)
    hasher = hashlib.sha256()
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()

def token_count(text):
    """Count the whitespace-delimited tokens (runs of non-space characters) in ``text``."""
    return sum(1 for _ in re.finditer(r"\S+", text))

def check_true(name, condition):
    """Print a PASS/FAIL line for ``name`` and return the truthiness of ``condition``."""
    passed = True if condition else False
    label = "PASS" if passed else "FAIL"
    print(f"{label} - {name}")
    return passed

def check_close(name, value, expected, tol=1e-6):
    """Print a PASS/FAIL line reporting whether ``value`` is close to ``expected``.

    ``tol`` is passed to ``np.allclose`` as both the absolute and the relative
    tolerance. The printed line includes both values; the comparison result
    is returned.
    """
    ok = np.allclose(value, expected, rtol=tol, atol=tol)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return ok

def normalize_text(text):
    """Normalise line endings and horizontal whitespace in ``text``.

    Windows (CRLF) and old-Mac (CR) line endings become ``\\n``, every run of
    spaces/tabs collapses to a single space, and leading/trailing whitespace
    is stripped.
    """
    # CRLF must be handled before lone CR so one line break is not doubled.
    for line_ending in ("\r\n", "\r"):
        text = text.replace(line_ending, "\n")
    squeezed = re.sub(r"[ \t]+", " ", text)
    return squeezed.strip()

def shingle(text, n=5):
    """Return the set of lowercase word n-gram shingles found in ``text``.

    Words are the ``\\w+`` matches of the lower-cased text, and each shingle is
    ``n`` consecutive words joined by single spaces.

    A text with at least one word but fewer than ``n`` words yields a single
    shingle containing all of its words. Without this, every short document
    would map to the empty set and two identical short documents would have
    a Jaccard similarity of 0. A text with no words yields an empty set.

    Raises:
        ValueError: if ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    toks = re.findall(r"\w+", text.lower())
    if not toks:
        return set()
    if len(toks) < n:
        # Too short for a full window: keep the whole text as one shingle.
        return {" ".join(toks)}
    return {" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1)}

def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    When both sets are empty the denominator is clamped to 1, so the result
    is 0.0 instead of a division-by-zero error.
    """
    overlap = len(a & b)
    combined = len(a | b)
    return overlap / (combined if combined > 1 else 1)

print("Pipeline helpers ready.")

Synthetic records

Code cell 5

# Three toy documents used by the cells below; the last one is deliberately
# very short so that the length-based filters have something to reject.
records = [
    {
        "id": "a",
        "text": "clean educational text about linear algebra",
        "source": "web",
        "license": "cc",
    },
    {
        "id": "b",
        "text": "code example def add(a, b): return a + b",
        "source": "code",
        "license": "mit",
    },
    {
        "id": "c",
        "text": "short",
        "source": "web",
        "license": "unknown",
    },
]
print("record count", len(records))
print("token counts", list(map(token_count, (rec["text"] for rec in records))))
check_true("all records have text", all("text" in rec for rec in records))

Stable hashes

Code cell 7

# Derive a short content-based id from each record's text and source:
# the first 12 hex characters of the stable hash.
ids = [
    stable_hash({field: rec[field] for field in ("text", "source")})[:12]
    for rec in records
]
print("stable ids", ids)
check_true("ids are unique", len(ids) == len(set(ids)))

Acceptance rate

Code cell 9

# Compare the fraction of documents kept with the fraction of tokens kept
# under a minimum-length rule of 4 tokens.
lengths = np.array(list(map(token_count, (rec["text"] for rec in records))))
accepted = lengths >= 4
rate = np.mean(accepted)
token_rate = np.sum(lengths[accepted]) / np.sum(lengths)
print("document acceptance", rate)
print("token acceptance", token_rate)
check_true("token and document rates can differ", abs(token_rate - rate) > 0)

Source histogram

Code cell 11

# Bar chart of how many records come from each source.
source_counts = Counter(rec["source"] for rec in records)
fig, ax = plt.subplots()
ax.bar(
    list(source_counts.keys()),
    list(source_counts.values()),
    color=COLORS["primary"],
)
ax.set_ylabel("Records")
ax.set_xlabel("Source")
ax.set_title("Records by source")
fig.tight_layout()
plt.show()
print("source counts", dict(source_counts))

Rule filters

Code cell 13

def quality_rule(r):
    """Return True when record ``r`` passes the simple rule-based quality filter.

    A record passes when its text has at least 4 tokens and more than half
    of those tokens are distinct.
    """
    text = r["text"]
    n_tokens = token_count(text)
    if n_tokens < 4:
        return False
    distinct_ratio = len(set(text.split())) / max(1, n_tokens)
    return distinct_ratio > 0.5


flags = list(map(quality_rule, records))
print(flags)
check_true("short record rejected", flags[-1] is False)

PII-like regex

Code cell 15

# Flag texts containing something shaped like an e-mail address.
texts = [
    "email me at name@example.com",
    "ordinary educational sentence",
]
_email_like = re.compile(r"[\w.-]+@[\w.-]+")
pii = [_email_like.search(t) is not None for t in texts]
print(pii)
check_true("email detected", pii[0] and not pii[1])

Threshold curve

Code cell 17

# Plot the fraction of evenly spaced scores accepted at each of 10 thresholds.
scores = np.linspace(0, 1, 50)
thresholds = np.linspace(0, 1, 10)
accept_rates = [np.mean(scores >= cutoff) for cutoff in thresholds]
fig, ax = plt.subplots()
ax.plot(thresholds, accept_rates, marker="o", color=COLORS["primary"])
ax.set_ylabel("Acceptance rate")
ax.set_xlabel("Threshold")
ax.set_title("Acceptance rate by threshold")
fig.tight_layout()
plt.show()
print("acceptance rates", np.round(accept_rates, 2))

Slice audit

Code cell 19

# Mean score per data slice; float() turns NumPy scalars into plain floats
# before they are stored and printed.
slice_scores = {
    "web": [0.2, 0.7, 0.8],
    "code": [0.9, 0.6],
    "books": [0.75, 0.8],
}
means = {name: float(np.mean(vals)) for name, vals in slice_scores.items()}
print(means)
check_true("all slices summarized", means.keys() == slice_scores.keys())

Token-weighted accounting

Code cell 21

# Each record's share of the total token count.
tok = np.array([token_count(rec["text"]) for rec in records])
weights = tok / np.sum(tok)
print("token weights", weights.round(3))
check_close("token weights sum", np.sum(weights), 1.0)

Deterministic random seed

Code cell 23

def _draw_with_seed(seed):
    """Reseed NumPy's global generator and draw 5 integers from range(10)."""
    np.random.seed(seed)
    return np.random.choice(10, size=5)


# Two draws taken right after reseeding with the same value must match.
a = _draw_with_seed(42)
b = _draw_with_seed(42)
print(a, b)
check_true("seed reproduces sample", np.array_equal(a, b))

Rate dashboard

Code cell 25

# Record counts remaining after each pipeline stage, plotted as a line.
stages = ["input", "valid", "accepted", "released"]
counts = np.array([1000, 900, 720, 700])
fig, ax = plt.subplots()
ax.plot(stages, counts, marker="o", color=COLORS["secondary"])
ax.set_title("Pipeline count dashboard")
ax.set_xlabel("Stage")
ax.set_ylabel("Records")
fig.tight_layout()
plt.show()
# tolist() converts the NumPy integers to plain Python ints. Otherwise the
# dict would hold NumPy scalars, whose repr on NumPy >= 2 is
# "np.int64(1000)", which clutters the printed summary.
print(dict(zip(stages, counts.tolist())))

Audit statistic 11

Code cell 27

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([11.0, 12.0, 13.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 12

Code cell 29

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([12.0, 13.0, 14.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 13

Code cell 31

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([13.0, 14.0, 15.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 14

Code cell 33

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([14.0, 15.0, 16.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 15

Code cell 35

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([15.0, 16.0, 17.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 16

Code cell 37

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([16.0, 17.0, 18.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 17

Code cell 39

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([17.0, 18.0, 19.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 18

Code cell 41

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([18.0, 19.0, 20.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 19

Code cell 43

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([19.0, 20.0, 21.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 20

Code cell 45

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([20.0, 21.0, 22.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 21

Code cell 47

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([21.0, 22.0, 23.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 22

Code cell 49

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([22.0, 23.0, 24.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 23

Code cell 51

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([23.0, 24.0, 25.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 24

Code cell 53

# Toy audit sample: compute the mean and confirm it is a finite number.
values = np.asarray([24.0, 25.0, 26.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))