Theory Notebook
Theory Notebook
Converted from
theory.ipynb for web reading.
Quality Checks - Theory Notebook
Executable companion to notes.md.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Plot styling: prefer seaborn's theme when it is installed, otherwise fall
# back to a Matplotlib built-in style sheet with a similar look.
try:
    import seaborn as sns
except ImportError:
    HAS_SNS = False
    # Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*";
    # older releases only know the unprefixed name. Use whichever exists and
    # keep the default style if neither does (styling is cosmetic only).
    for _style in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
        if _style in plt.style.available:
            plt.style.use(_style)
            break
else:
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True

# Shared figure defaults for every plot in the notebook.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Fix NumPy's global RNG so the cells below are reproducible.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import json
import math
import hashlib
import re
from collections import Counter, defaultdict
# Hex colour palette shared by the plots, keyed by semantic role.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def stable_hash(value):
    """Return the SHA-256 hex digest of *value*.

    Strings are hashed as given. Any other value is first serialized to JSON
    with sorted keys and non-ASCII characters left unescaped, so that equal
    dicts hash identically regardless of key order.
    """
    if isinstance(value, str):
        payload = value
    else:
        payload = json.dumps(value, sort_keys=True, ensure_ascii=False)
    hasher = hashlib.sha256()
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()
def token_count(text):
    """Return the number of whitespace-delimited tokens in *text*."""
    # Each maximal run of non-whitespace characters counts as one token.
    return sum(1 for _ in re.finditer(r"\S+", text))
def check_true(name, condition):
    """Print a PASS/FAIL line for *name* and return the check result.

    *condition* is reduced to a plain bool by truthiness; that bool is both
    reported and returned.
    """
    ok = True if condition else False
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok
def check_close(name, value, expected, tol=1e-6):
    """Print a PASS/FAIL line for a numeric closeness check and return it.

    *value* and *expected* are compared with ``np.allclose``; *tol* is used
    as both the absolute and the relative tolerance.
    """
    tolerances = {"rtol": tol, "atol": tol}
    ok = np.allclose(value, expected, **tolerances)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: value={value}, expected={expected}")
    return ok
def normalize_text(text):
    """Return *text* with unified newlines and collapsed horizontal whitespace.

    ``\\r\\n`` and lone ``\\r`` become ``\\n``, every run of spaces/tabs becomes
    a single space, and leading/trailing whitespace is stripped.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    collapsed = re.sub(r"[ \t]+", " ", unified)
    return collapsed.strip()
def shingle(text, n=5):
    """Return the set of word-level *n*-gram shingles of *text*.

    The text is lower-cased and split into ``\\w+`` tokens; each shingle is
    *n* consecutive tokens joined by single spaces.

    Args:
        text: Input string.
        n: Shingle width in tokens; must be at least 1.

    Returns:
        A set of shingle strings. Text with no tokens yields an empty set.
        Text with fewer than *n* tokens yields one shingle holding all of its
        tokens, so that short documents can still be compared to each other
        instead of always looking dissimilar.

    Raises:
        ValueError: If *n* is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")
    toks = re.findall(r"\w+", text.lower())
    if not toks:
        return set()
    if len(toks) < n:
        # Too short for a full window: keep the whole token sequence.
        return {" ".join(toks)}
    return {" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1)}
def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    The denominator is clamped to at least 1, so two empty sets give 0.
    """
    shared = len(a & b)
    union_size = len(a | b)
    return shared / max(1, union_size)
# Status message marking the end of the helper-definition cell.
print("Pipeline helpers ready.")
Synthetic records
Code cell 5
# Three synthetic records: a clean web document, a code snippet, and a
# one-word web document with unknown licensing.
records = [
    {
        "id": "a",
        "text": "clean educational text about linear algebra",
        "source": "web",
        "license": "cc",
    },
    {
        "id": "b",
        "text": "code example def add(a, b): return a + b",
        "source": "code",
        "license": "mit",
    },
    {
        "id": "c",
        "text": "short",
        "source": "web",
        "license": "unknown",
    },
]
print("record count", len(records))
print("token counts", [token_count(rec["text"]) for rec in records])
# Passes only when no record lacks a "text" key.
check_true("all records have text", not any("text" not in rec for rec in records))
Stable hashes
Code cell 7
# Content-derived ids: the first 12 hex characters of the hash of each
# record's text and source.
ids = [
    stable_hash({key: rec[key] for key in ("text", "source")})[:12]
    for rec in records
]
print("stable ids", ids)
# De-duplicating through a set must not shrink the list.
check_true("ids are unique", len(ids) == len(set(ids)))
Acceptance rate
Code cell 9
# Token length of every record.
lengths = np.array([token_count(rec["text"]) for rec in records])
# A record is accepted when it has at least 4 tokens.
accepted = lengths >= 4
# Fraction of documents accepted vs. fraction of tokens accepted.
rate = np.mean(accepted)
token_rate = np.sum(lengths[accepted]) / np.sum(lengths)
print("document acceptance", rate)
print("token acceptance", token_rate)
check_true("token and document rates can differ", abs(rate - token_rate) > 0)
Source histogram
Code cell 11
# Number of records per source label.
source_counts = Counter(rec["source"] for rec in records)

# Bar chart with one bar per source.
fig, ax = plt.subplots()
ax.bar(
    list(source_counts.keys()),
    list(source_counts.values()),
    color=COLORS["primary"],
)
ax.set(title="Records by source", xlabel="Source", ylabel="Records")
fig.tight_layout()
plt.show()
print("source counts", dict(source_counts))
Rule filters
Code cell 13
def quality_rule(r):
    """Return True when record *r* passes the length and diversity filter.

    The record's ``"text"`` must have at least 4 tokens, and the ratio of
    distinct whitespace-separated words to tokens must exceed 0.5.
    """
    text = r["text"]
    n_tokens = token_count(text)
    if n_tokens < 4:
        return False
    distinct_ratio = len(set(text.split())) / max(1, n_tokens)
    return distinct_ratio > 0.5
# Apply the rule filter to every record, preserving record order.
flags = list(map(quality_rule, records))
print(flags)
# The last record is the one-word text, which the rule must reject.
check_true("short record rejected", flags[-1] is False)
PII-like regex
Code cell 15
# One sample containing an email-like string and one without.
texts = [
    "email me at name@example.com",
    "ordinary educational sentence",
]
# Flag a text when it contains an "<word chars>@<word chars>" pattern.
pii = [re.search(r"[\w.-]+@[\w.-]+", sample) is not None for sample in texts]
print(pii)
check_true("email detected", pii[0] and not pii[1])
Threshold curve
Code cell 17
# 50 evenly spaced scores in [0, 1].
scores = np.linspace(0, 1, 50)
# Share of scores at or above each of 10 evenly spaced thresholds.
accept_rates = [np.mean(scores >= cut) for cut in np.linspace(0, 1, 10)]

fig, ax = plt.subplots()
ax.plot(np.linspace(0, 1, 10), accept_rates, marker="o", color=COLORS["primary"])
ax.set(
    title="Acceptance rate by threshold",
    xlabel="Threshold",
    ylabel="Acceptance rate",
)
fig.tight_layout()
plt.show()
print("acceptance rates", np.round(accept_rates, 2))
Slice audit
Code cell 19
# Example scores grouped by data slice.
slice_scores = {
    "web": [0.2, 0.7, 0.8],
    "code": [0.9, 0.6],
    "books": [0.75, 0.8],
}
# Mean score per slice, as plain Python floats.
means = {name: float(np.mean(vals)) for name, vals in slice_scores.items()}
print(means)
# Every slice must appear in the summary, with no extras.
check_true("all slices summarized", means.keys() == slice_scores.keys())
Token-weighted accounting
Code cell 21
# Token count per record, then each record's share of all tokens.
tok = np.array([token_count(rec["text"]) for rec in records])
weights = np.divide(tok, np.sum(tok))
print("token weights", np.round(weights, 3))
check_close("token weights sum", np.sum(weights), 1.0)
Deterministic random seed
Code cell 23
# Draw the same sample twice, re-seeding the global RNG before each draw.
np.random.seed(42)
a = np.random.choice(10, 5)
np.random.seed(42)
b = np.random.choice(10, 5)
print(a, b)
# Identical seeds must give identical draws.
check_true("seed reproduces sample", np.array_equal(b, a))
Rate dashboard
Code cell 25
# Record counts remaining after each pipeline stage, in processing order.
stages = ["input", "valid", "accepted", "released"]
counts = np.array([1000, 900, 720, 700])

# Line chart of the count at each stage.
fig, ax = plt.subplots()
ax.plot(stages, counts, marker="o", color=COLORS["secondary"])
ax.set_title("Pipeline count dashboard")
ax.set_xlabel("Stage")
ax.set_ylabel("Records")
fig.tight_layout()
plt.show()

# tolist() converts the NumPy integers to plain ints so the printed dict reads
# {'input': 1000, ...} instead of {'input': np.int64(1000), ...} on NumPy >= 2.
print(dict(zip(stages, counts.tolist())))
Audit statistic 11
Code cell 27
# Audit sample 11: the mean of three consecutive values must be finite.
values = np.array([11.0, 12.0, 13.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 12
Code cell 29
# Audit sample 12: the mean of three consecutive values must be finite.
values = np.array([12.0, 13.0, 14.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 13
Code cell 31
# Audit sample 13: the mean of three consecutive values must be finite.
values = np.array([13.0, 14.0, 15.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 14
Code cell 33
# Audit sample 14: the mean of three consecutive values must be finite.
values = np.array([14.0, 15.0, 16.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 15
Code cell 35
# Audit sample 15: the mean of three consecutive values must be finite.
values = np.array([15.0, 16.0, 17.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 16
Code cell 37
# Audit sample 16: the mean of three consecutive values must be finite.
values = np.array([16.0, 17.0, 18.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 17
Code cell 39
# Audit sample 17: the mean of three consecutive values must be finite.
values = np.array([17.0, 18.0, 19.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 18
Code cell 41
# Audit sample 18: the mean of three consecutive values must be finite.
values = np.array([18.0, 19.0, 20.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 19
Code cell 43
# Audit sample 19: the mean of three consecutive values must be finite.
values = np.array([19.0, 20.0, 21.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 20
Code cell 45
# Audit sample 20: the mean of three consecutive values must be finite.
values = np.array([20.0, 21.0, 22.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 21
Code cell 47
# Audit sample 21: the mean of three consecutive values must be finite.
values = np.array([21.0, 22.0, 23.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 22
Code cell 49
# Audit sample 22: the mean of three consecutive values must be finite.
values = np.array([22.0, 23.0, 24.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 23
Code cell 51
# Audit sample 23: the mean of three consecutive values must be finite.
values = np.array([23.0, 24.0, 25.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 24
Code cell 53
# Audit sample 24: the mean of three consecutive values must be finite.
values = np.array([24.0, 25.0, 26.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))