Exercises Notebook
Converted from
exercises.ipynb for web reading.
Data Mixture Optimization - Exercises
Ten graded exercises for the approved Chapter 16 TOC.
Code cell 2
# Plot setup: configure Matplotlib (and Seaborn when available) for all exercises.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer Seaborn's colorblind-friendly whitegrid theme; fall back to the
# equivalent built-in Matplotlib style sheet when Seaborn is not installed.
# HAS_SNS records which branch ran so later cells can check it.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults, applied after the theme selection above.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Fixed seed so any random draws in the exercises are reproducible.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import json
import hashlib
import re
import numpy as np
from collections import Counter, defaultdict
def header(title):
    """Print *title* framed above and below by 72-character rules."""
    rule = "=" * 72
    print("\n" + rule)
    print(title)
    print(rule)
def stable_hash(value):
    """Return the SHA-256 hex digest of *value*.

    Non-string values are first serialized to canonical JSON
    (sorted keys) so equal structures always hash identically.
    """
    text = value if isinstance(value, str) else json.dumps(value, sort_keys=True)
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
def token_count(text):
    """Count whitespace-separated tokens in *text*."""
    # str.split() with no arguments splits on runs of whitespace and
    # discards empty strings, matching re.findall(r"\S+", text).
    return len(text.split())
def check_true(name, condition):
    """Print a PASS/FAIL line for *condition* and return it as a bool."""
    outcome = bool(condition)
    label = "PASS" if outcome else "FAIL"
    print(f"{label} - {name}")
    return outcome
def check_close(name, value, expected, tol=1e-6):
    """Print PASS/FAIL for numeric closeness of *value* to *expected*.

    *tol* is used as both the absolute and relative tolerance.
    Returns the comparison result.
    """
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return ok

print("Exercise helpers ready.")
Exercise 1: Mixture Check (*)
Build a synthetic mixture example, compute a validation statistic, and explain the pipeline implication.
Code cell 5
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed mixture statistic
print("result:", result)
Code cell 6
# Solution
header("Exercise 1: Mixture Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: mixture checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 2: Simplex Check (*)
Build a synthetic simplex example, compute a validation statistic, and explain the pipeline implication.
Code cell 8
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed simplex statistic
print("result:", result)
Code cell 9
# Solution
header("Exercise 2: Simplex Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: simplex checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 3: Domain Check (*)
Build a synthetic domain example, compute a validation statistic, and explain the pipeline implication.
Code cell 11
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed domain statistic
print("result:", result)
Code cell 12
# Solution
header("Exercise 3: Domain Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: domain checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 4: Proxy Model Check (**)
Build a synthetic proxy model example, compute a validation statistic, and explain the pipeline implication.
Code cell 14
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed proxy-model statistic
print("result:", result)
Code cell 15
# Solution
header("Exercise 4: Proxy Model Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: proxy model checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 5: DRO Check (**)
Build a synthetic DRO example, compute a validation statistic, and explain the pipeline implication.
Code cell 17
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed DRO statistic
print("result:", result)
Code cell 18
# Solution
# NOTE: header title corrected from "Dro" to "DRO" so the printed banner
# matches the exercise description ("a synthetic DRO example") and the
# takeaway line below, which both use the acronym.
header("Exercise 5: DRO Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(r["text"]) for r in records])
ids = [stable_hash(r)[:10] for r in records]
source_counts = Counter(r["source"] for r in records)

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(set(ids)) == len(records))
check_true("all token counts positive", np.all(lengths > 0))
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: DRO checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 6: Effective Tokens Check (**)
Build a synthetic effective tokens example, compute a validation statistic, and explain the pipeline implication.
Code cell 20
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed effective-tokens statistic
print("result:", result)
Code cell 21
# Solution
header("Exercise 6: Effective Tokens Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: effective tokens checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 7: Validation Loss Check (**)
Build a synthetic validation loss example, compute a validation statistic, and explain the pipeline implication.
Code cell 23
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed validation-loss statistic
print("result:", result)
Code cell 24
# Solution
header("Exercise 7: Validation Loss Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: validation loss checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 8: Mixture Check (***)
Build a synthetic mixture example, compute a validation statistic, and explain the pipeline implication.
Code cell 26
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed mixture statistic
print("result:", result)
Code cell 27
# Solution
header("Exercise 8: Mixture Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: mixture checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 9: Simplex Check (***)
Build a synthetic simplex example, compute a validation statistic, and explain the pipeline implication.
Code cell 29
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed simplex statistic
print("result:", result)
Code cell 30
# Solution
header("Exercise 9: Simplex Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: simplex checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 10: Domain Check (***)
Build a synthetic domain example, compute a validation statistic, and explain the pipeline implication.
Code cell 32
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed domain statistic
print("result:", result)
Code cell 33
# Solution
header("Exercise 10: Domain Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: domain checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")