Exercises Notebook
Converted from
exercises.ipynb for web reading.
Data Mixture Optimization - Exercises
Ten graded exercises for the approved Chapter 16 TOC.
Code cell 2
# Plot setup: configure Matplotlib (and Seaborn when available) for all exercises.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

# Prefer Seaborn's colorblind-friendly whitegrid theme; fall back to the
# equivalent built-in Matplotlib style sheet when Seaborn is not installed.
# HAS_SNS records which branch ran so later cells can check it.
try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults, applied after the theme selection above.
mpl.rcParams.update({
    "figure.figsize": (10, 6),
    "figure.dpi": 120,
    "font.size": 13,
    "axes.titlesize": 15,
    "axes.labelsize": 13,
    "xtick.labelsize": 11,
    "ytick.labelsize": 11,
    "legend.fontsize": 11,
    "legend.framealpha": 0.85,
    "lines.linewidth": 2.0,
    "axes.spines.top": False,
    "axes.spines.right": False,
    "savefig.bbox": "tight",
    "savefig.dpi": 150,
})

# Fixed seed so any random draws in the exercises are reproducible.
np.random.seed(42)
print("Plot setup complete.")
Code cell 3
import json
import hashlib
import re
import numpy as np
from collections import Counter, defaultdict
def header(title):
    """Print *title* framed above and below by 72-character rules."""
    rule = "=" * 72
    print("\n" + rule)
    print(title)
    print(rule)
def stable_hash(value):
    """Return the SHA-256 hex digest of *value*.

    Non-string values are first serialized to canonical JSON
    (sorted keys) so equal structures always hash identically.
    """
    text = value if isinstance(value, str) else json.dumps(value, sort_keys=True)
    return hashlib.sha256(text.encode("utf-8")).hexdigest()
def token_count(text):
    """Count whitespace-separated tokens in *text*."""
    # str.split() with no arguments splits on runs of whitespace and
    # discards empty strings, matching re.findall(r"\S+", text).
    return len(text.split())
def check_true(name, condition):
    """Print a PASS/FAIL line for *condition* and return it as a bool."""
    outcome = bool(condition)
    label = "PASS" if outcome else "FAIL"
    print(f"{label} - {name}")
    return outcome
def check_close(name, value, expected, tol=1e-6):
    """Print PASS/FAIL for numeric closeness of *value* to *expected*.

    *tol* is used as both the absolute and relative tolerance.
    Returns the comparison result.
    """
    ok = np.allclose(value, expected, atol=tol, rtol=tol)
    status = "PASS" if ok else "FAIL"
    print(f"{status} - {name}: value={value}, expected={expected}")
    return ok

print("Exercise helpers ready.")
Exercise 1: Mixture Check (*)
Build a synthetic mixture example, compute a validation statistic, and explain the pipeline implication.
Code cell 5
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed mixture statistic
print("result:", result)
Code cell 6
# Solution
header("Exercise 1: Mixture Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: mixture checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 2: Simplex Check (*)
Build a synthetic simplex example, compute a validation statistic, and explain the pipeline implication.
Code cell 8
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed simplex statistic
print("result:", result)
Code cell 9
# Solution
header("Exercise 2: Simplex Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: simplex checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 3: Domain Check (*)
Build a synthetic domain example, compute a validation statistic, and explain the pipeline implication.
Code cell 11
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed domain statistic
print("result:", result)
Code cell 12
# Solution
header("Exercise 3: Domain Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: domain checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 4: Proxy Model Check (**)
Build a synthetic proxy model example, compute a validation statistic, and explain the pipeline implication.
Code cell 14
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed proxy-model statistic
print("result:", result)
Code cell 15
# Solution
header("Exercise 4: Proxy Model Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: proxy model checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 5: DRO Check (**)
Build a synthetic DRO example, compute a validation statistic, and explain the pipeline implication.
Code cell 17
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed DRO statistic
print("result:", result)
Code cell 18
# Solution
# NOTE: header title corrected from "Dro" to "DRO" so the printed banner
# matches the exercise description ("a synthetic DRO example") and the
# takeaway line below, which both use the acronym.
header("Exercise 5: DRO Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(r["text"]) for r in records])
ids = [stable_hash(r)[:10] for r in records]
source_counts = Counter(r["source"] for r in records)

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(set(ids)) == len(records))
check_true("all token counts positive", np.all(lengths > 0))
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: DRO checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 6: Effective Tokens Check (**)
Build a synthetic effective tokens example, compute a validation statistic, and explain the pipeline implication.
Code cell 20
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed effective-tokens statistic
print("result:", result)
Code cell 21
# Solution
header("Exercise 6: Effective Tokens Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: effective tokens checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 7: Validation Loss Check (**)
Build a synthetic validation loss example, compute a validation statistic, and explain the pipeline implication.
Code cell 23
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed validation-loss statistic
print("result:", result)
Code cell 24
# Solution
header("Exercise 7: Validation Loss Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: validation loss checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 8: Mixture Check (***)
Build a synthetic mixture example, compute a validation statistic, and explain the pipeline implication.
Code cell 26
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed mixture statistic
print("result:", result)
Code cell 27
# Solution
header("Exercise 8: Mixture Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: mixture checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 9: Simplex Check (***)
Build a synthetic simplex example, compute a validation statistic, and explain the pipeline implication.
Code cell 29
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed simplex statistic
print("result:", result)
Code cell 30
# Solution
header("Exercise 9: Simplex Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: simplex checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")
Exercise 10: Domain Check (***)
Build a synthetic domain example, compute a validation statistic, and explain the pipeline implication.
Code cell 32
# Your Solution
records = [
    {"id": rid, "text": txt, "source": src}
    for rid, txt, src in (
        ("a", "alpha beta gamma", "web"),
        ("b", "alpha beta", "code"),
    )
]

result = None  # TODO: replace with the computed domain statistic
print("result:", result)
Code cell 33
# Solution
header("Exercise 10: Domain Check")

records = [
    {"id": "a", "text": "alpha beta gamma", "source": "web"},
    {"id": "b", "text": "alpha beta", "source": "code"},
    {"id": "c", "text": "delta epsilon zeta eta", "source": "books"},
]

# Per-record token counts, truncated content hashes, and a source histogram.
lengths = np.array([token_count(rec["text"]) for rec in records])
ids = [stable_hash(rec)[:10] for rec in records]
source_counts = Counter([rec["source"] for rec in records])

# Explicit validation signals before the next pipeline stage.
check_true("ids are unique", len(ids) == len(set(ids)))
check_true("all token counts positive", (lengths > 0).all())
check_close("source count total", sum(source_counts.values()), len(records))

print("ids:", ids)
print("lengths:", lengths)
print("source_counts:", dict(source_counts))
print("\nTakeaway: domain checks should produce explicit counts, hashes, and validation signals before the next pipeline stage.")