Theory NotebookMath for LLMs

Data Versioning and Lineage

Production ML and MLOps / Data Versioning and Lineage

Run notebook
Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Data Versioning and Lineage

Data versioning and lineage turn training data from an invisible dependency into an auditable mathematical artifact.

This notebook is the executable companion to notes.md. It uses synthetic production signals so every cell runs without external services or data files.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    ok = abs(float(value) - float(target)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
    assert ok, name

def softmax(z):
    z = np.asarray(z, dtype=float)
    z = z - np.max(z)
    e = np.exp(z)
    return e / e.sum()

def psi(ref, cur, eps=1e-8):
    ref = np.asarray(ref, dtype=float) + eps
    cur = np.asarray(cur, dtype=float) + eps
    ref = ref / ref.sum()
    cur = cur / cur.sum()
    return float(np.sum((cur - ref) * np.log(cur / ref)))

def js_divergence(p, q, eps=1e-8):
    p = np.asarray(p, dtype=float) + eps
    q = np.asarray(q, dtype=float) + eps
    p = p / p.sum()
    q = q / q.sum()
    m = 0.5 * (p + q)
    return float(0.5 * np.sum(p * np.log(p / m)) + 0.5 * np.sum(q * np.log(q / m)))

def percentile(values, q):
    return float(np.percentile(np.asarray(values, dtype=float), q))

print("Helper functions ready.")

Demo 1: production ML as an artifact graph

This demo makes the production idea concrete with a small numerical object.

Code cell 5

header("Demo 1 - production ML as an artifact graph: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 2: why data changes break models

This demo makes the production idea concrete with a small numerical object.

Code cell 7

header("Demo 2 - why data changes break models: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 3: versioning versus backups

This demo makes the production idea concrete with a small numerical object.

Code cell 9

header("Demo 3 - versioning versus backups: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 4: lineage as rollback memory

This demo makes the production idea concrete with a small numerical object.

Code cell 11

header("Demo 4 - lineage as rollback memory: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 5: technical debt from hidden dependencies

This demo makes the production idea concrete with a small numerical object.

Code cell 13

header("Demo 5 - technical debt from hidden dependencies: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 6: dataset snapshot Dv\mathcal{D}_v

This demo makes the production idea concrete with a small numerical object.

Code cell 15

header("Demo 6 - dataset snapshot $\\mathcal{D}_v$: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 7: artifact hash h(a)h(a)

This demo makes the production idea concrete with a small numerical object.

Code cell 17

header("Demo 7 - artifact hash $h(a)$: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 8: lineage DAG G=(V,E)G=(V,E)

This demo makes the production idea concrete with a small numerical object.

Code cell 19

header("Demo 8 - lineage DAG $G=(V,E)$: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 9: provenance metadata

This demo makes the production idea concrete with a small numerical object.

Code cell 21

header("Demo 9 - provenance metadata: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 10: reproducibility predicate

This demo makes the production idea concrete with a small numerical object.

Code cell 23

header("Demo 10 - reproducibility predicate: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 11: immutable snapshots

This demo makes the production idea concrete with a small numerical object.

Code cell 25

header("Demo 11 - immutable snapshots: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 12: Git versus object store versioning

This demo makes the production idea concrete with a small numerical object.

Code cell 27

header("Demo 12 - Git versus object store versioning: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 13: DVC-style pointer files

This demo makes the production idea concrete with a small numerical object.

Code cell 29

header("Demo 13 - DVC-style pointer files: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 14: semantic dataset versions

This demo makes the production idea concrete with a small numerical object.

Code cell 31

header("Demo 14 - semantic dataset versions: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 15: data diffs and rollback

This demo makes the production idea concrete with a small numerical object.

Code cell 33

header("Demo 15 - data diffs and rollback: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 16: runs, jobs, and datasets

This demo makes the production idea concrete with a small numerical object.

Code cell 35

header("Demo 16 - runs, jobs, and datasets: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 17: pipeline metadata

This demo makes the production idea concrete with a small numerical object.

Code cell 37

header("Demo 17 - pipeline metadata: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 18: dependency closure

This demo makes the production idea concrete with a small numerical object.

Code cell 39

header("Demo 18 - dependency closure: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 19: model-to-data linkage

This demo makes the production idea concrete with a small numerical object.

Code cell 41

header("Demo 19 - model-to-data linkage: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")

Demo 20: audit trails

This demo makes the production idea concrete with a small numerical object.

Code cell 43

header("Demo 20 - audit trails: hash-style version check")
values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
fingerprint = int(np.sum(values * np.arange(1, len(values) + 1)))
same_values = np.array([17, 23, 42, 99, 101], dtype=np.int64)
same_fingerprint = int(np.sum(same_values * np.arange(1, len(same_values) + 1)))
print("Fingerprint:", fingerprint)
check_close(fingerprint, same_fingerprint, name="recomputed fingerprint")
print("Production lesson: deterministic fingerprints make equality auditable.")

Demo 21: schema checks

This demo makes the production idea concrete with a small numerical object.

Code cell 45

header("Demo 21 - schema checks: release metric comparison")
baseline = np.array([0.72, 0.73, 0.71, 0.74, 0.72])
candidate = np.array([0.74, 0.75, 0.73, 0.76, 0.75])
delta = candidate - baseline
mean_delta = float(delta.mean())
stderr = float(delta.std(ddof=1) / np.sqrt(len(delta)))
print("Mean delta:", round(mean_delta, 4))
print("Standard error:", round(stderr, 4))
check_true(mean_delta > 0, "candidate improves average metric")
print("Production lesson: promotion should record uncertainty, not only a point estimate.")

Demo 22: row count and hash checks

This demo makes the production idea concrete with a small numerical object.

Code cell 47

header("Demo 22 - row count and hash checks: drift statistic")
ref = np.array([0.20, 0.30, 0.25, 0.25])
cur = np.array([0.10, 0.25, 0.30, 0.35])
score = psi(ref, cur)
print("PSI:", round(score, 6))
check_true(score >= 0, "PSI is nonnegative for positive bins")
fig, ax = plt.subplots()
idx = np.arange(len(ref))
ax.bar(idx - 0.18, ref, width=0.36, color=COLORS["primary"], label="Reference")
ax.bar(idx + 0.18, cur, width=0.36, color=COLORS["secondary"], label="Current")
ax.set_title("Reference versus current production distribution")
ax.set_xlabel("Bin")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Production lesson: drift is a distance between reference and current behavior.")

Demo 23: freshness service-level objectives

This demo makes the production idea concrete with a small numerical object.

Code cell 49

header("Demo 23 - freshness service-level objectives: latency and tail risk")
latency_ms = np.array([42, 45, 47, 50, 52, 55, 61, 75, 110, 180], dtype=float)
p50 = percentile(latency_ms, 50)
p95 = percentile(latency_ms, 95)
print("p50 latency:", round(p50, 2), "ms")
print("p95 latency:", round(p95, 2), "ms")
check_true(p95 > p50, "tail latency exceeds median latency")
print("Production lesson: users experience tail latency, not average latency.")

Demo 24: privacy and license gates

This demo makes the production idea concrete with a small numerical object.

Code cell 51

header("Demo 24 - privacy and license gates: guardrail decision table")
scores = np.array([0.05, 0.20, 0.45, 0.80, 0.95])
threshold = 0.70
actions = np.where(scores >= threshold, "escalate", "allow")
print("Scores:", scores)
print("Actions:", actions.tolist())
check_true(np.sum(actions == "escalate") == 2, "two requests cross the guardrail threshold")
print("Production lesson: runtime policies are decision functions with thresholds.")

Demo 25: continuous integration validation

This demo makes the production idea concrete with a small numerical object.

Code cell 53

header("Demo 25 - continuous integration validation: artifact dependency graph")
nodes = ["raw", "clean", "features", "model", "endpoint"]
edges = [("raw", "clean"), ("clean", "features"), ("features", "model"), ("model", "endpoint")]
adjacency = {node: [] for node in nodes}
for src, dst in edges:
    adjacency[src].append(dst)
print("Nodes:", nodes)
print("Edges:", edges)
check_true(len(edges) == len(nodes) - 1, "pipeline has one forward dependency chain")
check_true("model" in adjacency["features"], "features point to model artifact")
print("Production lesson: lineage is a graph, not a folder name.")