Theory Notebook

Converted from theory.ipynb for web reading.

JSONL Generation - Theory Notebook

Executable companion to notes.md.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Use seaborn's theme when seaborn is importable; otherwise fall back to a
# matplotlib style sheet so the notebook still runs without it.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# These overrides run after the theme/style call above, so they are the
# last values written for each key.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
# Seed NumPy's global RNG so later random draws are repeatable.
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import json
import math
import hashlib
import re
from collections import Counter, defaultdict

# Named hex colours looked up by role when drawing the figures.
COLORS = dict(
    primary="#0077BB",
    secondary="#EE7733",
    tertiary="#009988",
    error="#CC3311",
    neutral="#555555",
    highlight="#EE3377",
)

def stable_hash(value):
    """Return the SHA-256 hex digest of *value*.

    A string is hashed directly from its UTF-8 bytes. Any other value is
    first serialised with ``json.dumps`` using sorted keys, so two dicts
    with the same items hash identically whatever their insertion order.
    """
    if isinstance(value, str):
        payload = value
    else:
        payload = json.dumps(value, sort_keys=True, ensure_ascii=False)
    hasher = hashlib.sha256()
    hasher.update(payload.encode("utf-8"))
    return hasher.hexdigest()

def token_count(text):
    """Count the maximal runs of non-whitespace characters in *text*."""
    return sum(1 for _ in re.finditer(r"\S+", text))

def check_true(name, condition):
    """Print ``PASS - name`` or ``FAIL - name`` and return the outcome.

    *condition* is coerced with ``bool`` first, so any truthy or falsy
    value is accepted and the return value is always a plain ``bool``.
    """
    passed = bool(condition)
    status = "PASS" if passed else "FAIL"
    print(f"{status} - {name}")
    return passed

def check_close(name, value, expected, tol=1e-6):
    """Print a PASS/FAIL line comparing *value* with *expected*.

    The comparison is ``np.allclose`` with *tol* used as both the absolute
    and the relative tolerance. The printed line includes both values, and
    the comparison result is returned.
    """
    within_tol = np.allclose(value, expected, rtol=tol, atol=tol)
    status = "PASS" if within_tol else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return within_tol

def normalize_text(text):
    """Return *text* with normalised line endings and collapsed blanks.

    CRLF and lone CR become LF, each run of spaces and tabs becomes a
    single space, and leading/trailing whitespace is stripped.
    """
    unix_newlines = text.replace("\r\n", "\n").replace("\r", "\n")
    single_spaced = re.sub(r"[ \t]+", " ", unix_newlines)
    return single_spaced.strip()

def shingle(text, n=5):
    """Return the set of word-level n-gram shingles of *text*.

    The text is lower-cased and split into ``\\w+`` tokens; every window of
    *n* consecutive tokens is joined with single spaces to form one shingle.

    Args:
        text: Input string.
        n: Number of tokens per shingle; must be at least 1.

    Returns:
        A set of shingle strings. It is empty when the text has fewer than
        *n* tokens.

    Raises:
        ValueError: If *n* is less than 1. Without this guard ``n=0``
            produced ``{""}`` and negative sizes produced arbitrary slices.
    """
    if n < 1:
        raise ValueError(f"shingle size n must be >= 1, got {n}")
    toks = re.findall(r"\w+", text.lower())
    # range() of a non-positive count is already empty, so short texts need
    # no extra clamp.
    return {" ".join(toks[i:i + n]) for i in range(len(toks) - n + 1)}

def jaccard(a, b):
    """Return the size of ``a & b`` divided by the size of ``a | b``.

    When the union is empty the divisor is taken as 1, so two empty sets
    give 0.0 rather than a division error.
    """
    overlap = a & b
    combined = a | b
    divisor = len(combined) or 1
    return len(overlap) / divisor

# Status line printed once the helper definitions above have been executed.
print("Pipeline helpers ready.")

Synthetic records

Code cell 5

# Three toy records, each carrying an id, a text body, a source tag and a
# licence tag.
records = [
    dict(id="a", text="clean educational text about linear algebra", source="web", license="cc"),
    dict(id="b", text="code example def add(a, b): return a + b", source="code", license="mit"),
    dict(id="c", text="short", source="web", license="unknown"),
]
print("record count", len(records))
print("token counts", list(map(token_count, (rec["text"] for rec in records))))
check_true("all records have text", all("text" in rec for rec in records))

Stable hashes

Code cell 7

# Derive a 12-hex-character id for each record from its text and source.
ids = [
    stable_hash({field: rec[field] for field in ("text", "source")})[:12]
    for rec in records
]
print("stable ids", ids)
check_true("ids are unique", len(ids) == len(set(ids)))

Acceptance rate

Code cell 9

# Per-record token counts; a record is accepted when it has at least 4 tokens.
lengths = np.array(list(map(token_count, (rec["text"] for rec in records))))
accepted = np.greater_equal(lengths, 4)
# Fraction of records kept versus fraction of tokens kept.
rate = np.mean(accepted)
token_rate = np.sum(lengths[accepted]) / np.sum(lengths)
print("document acceptance", rate)
print("token acceptance", token_rate)
check_true("token and document rates can differ", abs(rate - token_rate) > 0)

Source histogram

Code cell 11

# Tally the records per source tag and draw the tallies as a bar chart.
source_counts = Counter()
source_counts.update(rec["source"] for rec in records)
fig, ax = plt.subplots()
ax.bar(
    list(source_counts.keys()),
    list(source_counts.values()),
    color=COLORS["primary"],
)
ax.set_xlabel("Source")
ax.set_ylabel("Records")
ax.set_title("Records by source")
fig.tight_layout()
plt.show()
print("source counts", dict(source_counts))

Generator function

Code cell 13

def generate(rows):
    """Lazily yield one record dict per entry of *rows*.

    Each record holds ``id`` (first 12 hex characters of ``stable_hash`` of
    the raw row), ``text`` (the row passed through ``normalize_text``) and
    ``tokens`` (``token_count`` of the raw row).
    """
    for raw in rows:
        record = {}
        record["id"] = stable_hash(raw)[:12]
        record["text"] = normalize_text(raw)
        record["tokens"] = token_count(raw)
        yield record
# Drain the generator over three sample rows and confirm the counts match.
rows = [
    " first row ",
    "second row with more words",
    "third row",
]
generated = [*generate(rows)]
print(generated)
check_true("generator emits one record per row", len(rows) == len(generated))

Shard rotation

Code cell 15

# Assign records to shards in order; integer division of the running index
# starts a new shard every `max_per_shard` records.
max_per_shard = 2
shards = defaultdict(list)
for i, rec in enumerate(generated):
    bucket = shards[i // max_per_shard]
    bucket.append(rec)
print({shard_id: len(members) for shard_id, members in shards.items()})
check_true("multiple shards created", len(shards) == 2)

Quarantine invalid rows

Code cell 17

# Route each row to `good` when it has at least one token, else to `bad`.
raw_rows = ["valid text", "", "also valid"]
good = []
bad = []
for row in raw_rows:
    if token_count(row) > 0:
        good.append(row)
    else:
        bad.append(row)
print("good", good, "bad", bad)
check_true("empty row quarantined", len(bad) == 1)

Throughput metric

Code cell 19

# Element-wise records-per-second for four sample batches.
processed = np.array([1000, 1200, 900, 1100])
seconds = np.array([1.0, 1.2, 0.95, 1.05])
throughput = np.divide(processed, seconds)
print("records/sec", throughput.round(2))
check_true("throughput finite", np.all(np.isfinite(throughput)))

Token-weighted accounting

Code cell 21

# Each record's share of the total token count; the shares should sum to 1.
tok = np.array(list(map(token_count, (rec["text"] for rec in records))))
weights = np.true_divide(tok, np.sum(tok))
print("token weights", weights.round(3))
check_close("token weights sum", np.sum(weights), 1.0)

Deterministic random seed

Code cell 23

# Draw the same sample twice, re-seeding the global RNG before each draw.
np.random.seed(42)
a = np.random.choice(10, 5)
np.random.seed(42)
b = np.random.choice(10, 5)
print(a, b)
check_true("seed reproduces sample", (a == b).all())

Rate dashboard

Code cell 25

# Record counts remaining after each pipeline stage, drawn as a line chart.
stages = ["input", "valid", "accepted", "released"]
counts = np.array([1000, 900, 720, 700])
fig, ax = plt.subplots()
ax.plot(stages, counts, color=COLORS["secondary"], marker="o")
ax.set_xlabel("Stage")
ax.set_ylabel("Records")
ax.set_title("Pipeline count dashboard")
fig.tight_layout()
plt.show()
print({stage: count for stage, count in zip(stages, counts)})

Audit statistic 11

Code cell 27

# Mean of three sample values, checked to be a finite number.
values = np.array([11.0, 12.0, 13.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 12

Code cell 29

# Mean of three sample values, checked to be a finite number.
values = np.array([12.0, 13.0, 14.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 13

Code cell 31

# Mean of three sample values, checked to be a finite number.
values = np.array([13.0, 14.0, 15.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 14

Code cell 33

# Mean of three sample values, checked to be a finite number.
values = np.array([14.0, 15.0, 16.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 15

Code cell 35

# Mean of three sample values, checked to be a finite number.
values = np.array([15.0, 16.0, 17.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 16

Code cell 37

# Mean of three sample values, checked to be a finite number.
values = np.array([16.0, 17.0, 18.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 17

Code cell 39

# Mean of three sample values, checked to be a finite number.
values = np.array([17.0, 18.0, 19.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 18

Code cell 41

# Mean of three sample values, checked to be a finite number.
values = np.array([18.0, 19.0, 20.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 19

Code cell 43

# Mean of three sample values, checked to be a finite number.
values = np.array([19.0, 20.0, 21.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 20

Code cell 45

# Mean of three sample values, checked to be a finite number.
values = np.array([20.0, 21.0, 22.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 21

Code cell 47

# Mean of three sample values, checked to be a finite number.
values = np.array([21.0, 22.0, 23.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 22

Code cell 49

# Mean of three sample values, checked to be a finite number.
values = np.array([22.0, 23.0, 24.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 23

Code cell 51

# Mean of three sample values, checked to be a finite number.
values = np.array([23.0, 24.0, 25.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))

Audit statistic 24

Code cell 53

# Mean of three sample values, checked to be a finite number.
values = np.array([24.0, 25.0, 26.0])
mean = np.mean(values)
print("audit values", values, "mean", mean)
check_true("mean finite", np.isfinite(mean))