Theory Notebook
Theory Notebook
Converted from
theory.ipynb for web reading.
JSONL Generation - Theory Notebook
Executable companion to notes.md.
Code cell 2
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
# Seed NumPy's global RNG so repeated runs draw the same numbers.
np.random.seed(42)

# Use seaborn's theme when the package is importable; otherwise fall back to
# a matplotlib style sheet. HAS_SNS records which path was taken.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Applied after the theme so these overrides win over the theme's defaults.
mpl.rcParams.update({
    # Figure and export geometry.
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "savefig.dpi": 150,
    "savefig.bbox": "tight",
    # Text sizes.
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    # Line, legend and frame appearance.
    "lines.linewidth": 2.0,
    "legend.framealpha": 0.85,
    "axes.spines.top": False,
    "axes.spines.right": False,
})

print("Plot setup complete.")
Code cell 3
import json
import math
import hashlib
import re
from collections import Counter, defaultdict
# Named hex colours looked up by role in the plotting cells.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)
def stable_hash(value):
    """Return the SHA-256 hex digest of *value*.

    Strings are hashed as-is (UTF-8 encoded). Any other value is first
    serialised to JSON with sorted keys and non-ASCII characters kept
    verbatim, so equal mappings hash the same regardless of key order.
    """
    if isinstance(value, str):
        payload = value
    else:
        payload = json.dumps(value, sort_keys=True, ensure_ascii=False)
    digest = hashlib.sha256()
    digest.update(payload.encode("utf-8"))
    return digest.hexdigest()
def token_count(text):
    """Count the whitespace-delimited tokens in *text*."""
    # Each maximal run of non-whitespace characters is one token.
    return sum(1 for _ in re.finditer(r"\S+", text))
def check_true(name, condition):
    """Print a PASS/FAIL line for *name* and return the outcome as a bool.

    *condition* is judged by its truthiness.
    """
    passed = True if condition else False
    label = "PASS" if passed else "FAIL"
    print(f"{label} - {name}")
    return passed
def check_close(name, value, expected, tol=1e-6):
    """Print a PASS/FAIL line comparing *value* with *expected*.

    The comparison is ``np.allclose`` with *tol* used as both the absolute
    and the relative tolerance. Returns the comparison result.
    """
    matches = np.allclose(value, expected, rtol=tol, atol=tol)
    status = "PASS" if matches else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return matches
def normalize_text(text):
    """Return *text* with unified newlines and collapsed horizontal spacing.

    ``\\r\\n`` and bare ``\\r`` become ``\\n``, each run of spaces and tabs
    becomes a single space, and leading/trailing whitespace is removed.
    """
    # Handle CRLF first so it does not turn into two newlines.
    unix_newlines = text.replace("\r\n", "\n")
    unix_newlines = unix_newlines.replace("\r", "\n")
    collapsed = re.sub(r"[ \t]+", " ", unix_newlines)
    return collapsed.strip()
def shingle(text, n=5):
    """Return the set of word *n*-gram shingles of *text*.

    The text is lower-cased and split into ``\\w+`` tokens; each shingle is
    *n* consecutive tokens joined by single spaces.

    A text with at least one token but fewer than *n* tokens yields a single
    shingle containing all of its tokens. Without this, every short text
    would map to the empty set and two identical short texts would have zero
    overlap. A text with no tokens yields an empty set.
    """
    toks = re.findall(r"\w+", text.lower())
    if not toks:
        return set()
    if len(toks) < n:
        # Too short for a full window: use the whole text as one shingle.
        return {" ".join(toks)}
    return {" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1)}
def jaccard(a, b):
    """Return the Jaccard similarity ``|a & b| / |a | b|`` of two sets.

    The denominator is floored at 1, so two empty sets give 0 rather than
    raising ``ZeroDivisionError``.
    """
    shared = len(a & b)
    combined = len(a | b)
    return shared / max(1, combined)
# Status line printed after the helper definitions in this cell have run.
print("Pipeline helpers ready.")
Synthetic records
Code cell 5
# Three toy documents, each with an id, text, source and licence tag.
records = [
    dict(id="a", text="clean educational text about linear algebra", source="web", license="cc"),
    dict(id="b", text="code example def add(a, b): return a + b", source="code", license="mit"),
    dict(id="c", text="short", source="web", license="unknown"),
]

print("record count", len(records))
print("token counts", list(map(token_count, (rec["text"] for rec in records))))
# Equivalent to "every record has a text key", phrased as "none lacks one".
check_true("all records have text", not any("text" not in rec for rec in records))
Stable hashes
Code cell 7
# Content-derived ids: hash only the text and source fields, then keep the
# first 12 hex characters of the digest.
ids = [
    stable_hash({field: rec[field] for field in ("text", "source")})[:12]
    for rec in records
]
print("stable ids", ids)
check_true("ids are unique", len(ids) == len(set(ids)))
Acceptance rate
Code cell 9
# Per-record token counts and a length filter (keep records with >= 4 tokens).
lengths = np.array(list(map(token_count, (rec["text"] for rec in records))))
accepted = np.greater_equal(lengths, 4)

# Fraction of documents kept versus fraction of tokens kept.
rate = np.mean(accepted)
token_rate = np.sum(lengths[accepted]) / np.sum(lengths)

print("document acceptance", rate)
print("token acceptance", token_rate)
check_true("token and document rates can differ", 0 < abs(token_rate - rate))
Source histogram
Code cell 11
# Tally records per source and draw the tally as a bar chart.
source_counts = Counter([rec["source"] for rec in records])

fig, ax = plt.subplots()
ax.bar(
    list(source_counts.keys()),
    list(source_counts.values()),
    color=COLORS["primary"],
)
ax.set_xlabel("Source")
ax.set_ylabel("Records")
ax.set_title("Records by source")
fig.tight_layout()
plt.show()

print("source counts", dict(source_counts))
Generator function
Code cell 13
def generate(rows):
    """Lazily yield one record dict per string in *rows*.

    Each record has an ``id`` (first 12 hex characters of the hash of the
    raw row), the normalised ``text``, and a ``tokens`` count taken from the
    raw row.
    """
    for raw in rows:
        yield dict(
            id=stable_hash(raw)[:12],
            text=normalize_text(raw),
            tokens=token_count(raw),
        )
# Drain the generator over three sample rows and confirm it is one-to-one.
rows = [
    " first row ",
    "second row with more words",
    "third row",
]
generated = [record for record in generate(rows)]
print(generated)
check_true("generator emits one record per row", len(rows) == len(generated))
Shard rotation
Code cell 15
# Assign records to shards in order: a new shard starts every
# max_per_shard records.
max_per_shard = 2
shards = defaultdict(list)
for position, record in enumerate(generated):
    shard_index = position // max_per_shard
    shards[shard_index].append(record)

print({index: len(members) for index, members in shards.items()})
check_true("multiple shards created", 2 == len(shards))
Quarantine invalid rows
Code cell 17
# Route rows with at least one token to `good`; everything else goes to `bad`.
raw_rows = ["valid text", "", "also valid"]
good = []
bad = []
for row in raw_rows:
    if token_count(row) > 0:
        good.append(row)
    else:
        bad.append(row)

print("good", good, "bad", bad)
check_true("empty row quarantined", 1 == len(bad))
Throughput metric
Code cell 19
# Records processed per batch and the wall-clock seconds each batch took.
processed = np.array([1000, 1200, 900, 1100])
seconds = np.array([1.0, 1.2, 0.95, 1.05])

# Element-wise rate for each batch.
throughput = np.divide(processed, seconds)
print("records/sec", throughput.round(2))
check_true("throughput finite", np.all(np.isfinite(throughput)))
Token-weighted accounting
Code cell 21
# Each record's share of the total token count; the shares should sum to 1.
tok = np.array(list(map(token_count, (rec["text"] for rec in records))))
weights = np.divide(tok, np.sum(tok))
print("token weights", weights.round(3))
check_close("token weights sum", np.sum(weights), 1.0)
Deterministic random seed
Code cell 23
# Draw the same sample twice, re-seeding the global RNG before each draw.
samples = []
for _ in range(2):
    np.random.seed(42)
    samples.append(np.random.choice(10, size=5))
a, b = samples

print(a, b)
check_true("seed reproduces sample", np.array_equal(b, a))
Rate dashboard
Code cell 25
# Record counts surviving each pipeline stage, plotted as a line chart.
stages = ["input", "valid", "accepted", "released"]
counts = np.array([1000, 900, 720, 700])

fig, ax = plt.subplots()
ax.plot(stages, counts, marker="o", color=COLORS["secondary"])
ax.set_title("Pipeline count dashboard")
ax.set_xlabel("Stage")
ax.set_ylabel("Records")
fig.tight_layout()
plt.show()

# Convert to plain Python ints before printing: inside a dict the values are
# shown with repr(), which for NumPy scalars is "np.int64(1000)" on NumPy 2+.
print(dict(zip(stages, counts.tolist())))
Audit statistic 11
Code cell 27
# Mean of the sample 11, 12, 13, checked for finiteness.
values = np.arange(11, 14, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 12
Code cell 29
# Mean of the sample 12, 13, 14, checked for finiteness.
values = np.arange(12, 15, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 13
Code cell 31
# Mean of the sample 13, 14, 15, checked for finiteness.
values = np.arange(13, 16, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 14
Code cell 33
# Mean of the sample 14, 15, 16, checked for finiteness.
values = np.arange(14, 17, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 15
Code cell 35
# Mean of the sample 15, 16, 17, checked for finiteness.
values = np.arange(15, 18, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 16
Code cell 37
# Mean of the sample 16, 17, 18, checked for finiteness.
values = np.arange(16, 19, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 17
Code cell 39
# Mean of the sample 17, 18, 19, checked for finiteness.
values = np.arange(17, 20, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 18
Code cell 41
# Mean of the sample 18, 19, 20, checked for finiteness.
values = np.arange(18, 21, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 19
Code cell 43
# Mean of the sample 19, 20, 21, checked for finiteness.
values = np.arange(19, 22, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 20
Code cell 45
# Mean of the sample 20, 21, 22, checked for finiteness.
values = np.arange(20, 23, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 21
Code cell 47
# Mean of the sample 21, 22, 23, checked for finiteness.
values = np.arange(21, 24, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 22
Code cell 49
# Mean of the sample 22, 23, 24, checked for finiteness.
values = np.arange(22, 25, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 23
Code cell 51
# Mean of the sample 23, 24, 25, checked for finiteness.
values = np.arange(23, 26, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))
Audit statistic 24
Code cell 53
# Mean of the sample 24, 25, 26, checked for finiteness.
values = np.arange(24, 27, dtype=float)
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))