Theory NotebookMath for LLMs

Structural Causal Models

Causal Inference / Structural Causal Models

Notesnotes.md Theory Notebooktheory.ipynb Exercises Notebookexercises.ipynb

Theory Notebook

Theory Notebook

Converted from theory.ipynb for web reading.

Structural Causal Models

Structural causal models encode how variables are generated so interventions can be represented as changes to mechanisms, not merely changes to observations.

This notebook is the executable companion to notes.md. It uses small synthetic causal systems so every cell runs without external data.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


import itertools
import math

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    ok = abs(float(value) - float(target)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
    assert ok, name

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def simulate_confounding(n=1000):
    z = np.random.binomial(1, 0.5, size=n)
    x = np.random.binomial(1, 0.2 + 0.6 * z)
    y = 1.0 + 2.0 * x + 3.0 * z + np.random.normal(0, 0.2, size=n)
    return z, x, y

def backdoor_adjustment(z, x, y):
    effect = 0.0
    for val in [0, 1]:
        mask_z = z == val
        p_z = np.mean(mask_z)
        y1 = np.mean(y[mask_z & (x == 1)])
        y0 = np.mean(y[mask_z & (x == 0)])
        effect += (y1 - y0) * p_z
    return float(effect)

def ate(y1, y0):
    return float(np.mean(y1 - y0))

def shd(adj_true, adj_hat):
    return int(np.sum(np.asarray(adj_true) != np.asarray(adj_hat)))

def acyclicity_value(w):
    eigvals = np.linalg.eigvals(np.asarray(w) * np.asarray(w))
    return float(np.sum(np.exp(eigvals)).real - w.shape[0])

print("Helper functions ready.")

Demo 1: correlation vs causation

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 5

header("Demo 1 - correlation vs causation: confounding versus adjustment")
z, x, y = simulate_confounding(n=2000)
naive = float(np.mean(y[x == 1]) - np.mean(y[x == 0]))
adjusted = backdoor_adjustment(z, x, y)
print("Naive difference:", round(naive, 3))
print("Backdoor-adjusted effect:", round(adjusted, 3))
check_true(abs(adjusted - 2.0) < abs(naive - 2.0), "adjustment moves closer to true effect")
print("Causal lesson: conditioning on the right confounder can recover an interventional contrast.")

Demo 2: mechanisms as stable assignments

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 7

header("Demo 2 - mechanisms as stable assignments: do is not condition")
z, x, y = simulate_confounding(n=3000)
observed = float(np.mean(y[x == 1]))
interventional = float(sum(np.mean(y[(x == 1) & (z == val)]) * np.mean(z == val) for val in [0, 1]))
print("Observed E[Y | X=1]:", round(observed, 3))
print("Adjusted E[Y | do(X=1)]:", round(interventional, 3))
check_true(abs(observed - interventional) > 0.1, "conditioning differs from intervention under confounding")
print("Causal lesson: intervention changes a mechanism, not merely a conditioning event.")

Demo 3: DAGs as causal assumptions

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 9

header("Demo 3 - DAGs as causal assumptions: potential outcomes")
n = 500
baseline = np.random.normal(0.0, 1.0, size=n)
y0 = baseline + np.random.normal(0, 0.1, size=n)
y1 = y0 + 1.5
effect = ate(y1, y0)
print("ATE:", round(effect, 3))
check_close(effect, 1.5, tol=0.05, name="average treatment effect")
print("Causal lesson: the causal effect compares two potential outcomes for the same units.")

Demo 4: interventions as model surgery

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 11

header("Demo 4 - interventions as model surgery: graph adjacency and SHD")
true_adj = np.array([[0, 1, 1], [0, 0, 1], [0, 0, 0]])
pred_adj = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
distance = shd(true_adj, pred_adj)
print("Structural Hamming distance:", distance)
check_true(distance == 1, "one edge differs")
fig, ax = plt.subplots()
ax.imshow(true_adj - pred_adj, cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_title("Difference between true and estimated adjacency")
ax.set_xlabel("Child index")
ax.set_ylabel("Parent index")
fig.tight_layout()
plt.show()
plt.close(fig)
print("Causal lesson: discovery is evaluated as graph structure, not just prediction loss.")

Demo 5: why SCMs matter for ML distribution shift

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 13

header("Demo 5 - why SCMs matter for ML distribution shift: NOTEARS acyclicity")
w_dag = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.0, 0.0, 0.0]])
w_cycle = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.4, 0.0, 0.0]])
h_dag = acyclicity_value(w_dag)
h_cycle = acyclicity_value(w_cycle)
print("h(W) for DAG:", round(h_dag, 8))
print("h(W) for cyclic graph:", round(h_cycle, 8))
check_true(h_cycle > h_dag, "cycle has larger acyclicity penalty")
print("Causal lesson: some discovery methods encode DAG structure as a smooth constraint.")

Demo 6: exogenous variables $\mathbf{U}$

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 15

header("Demo 6 - exogenous variables $\\mathbf{U}$: collider warning")
n = 3000
x = np.random.binomial(1, 0.5, size=n)
y = np.random.binomial(1, 0.5, size=n)
c = ((x + y + np.random.binomial(1, 0.15, size=n)) >= 1).astype(int)
unconditional = float(np.corrcoef(x, y)[0, 1])
conditioned = float(np.corrcoef(x[c == 1], y[c == 1])[0, 1])
print("Unconditional correlation:", round(unconditional, 3))
print("Correlation after conditioning on collider:", round(conditioned, 3))
check_true(abs(conditioned) > abs(unconditional), "conditioning on collider creates association")
print("Causal lesson: controlling for more variables can create bias.")

Demo 7: endogenous variables $\mathbf{V}$

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 17

header("Demo 7 - endogenous variables $\\mathbf{V}$: confounding versus adjustment")
z, x, y = simulate_confounding(n=2000)
naive = float(np.mean(y[x == 1]) - np.mean(y[x == 0]))
adjusted = backdoor_adjustment(z, x, y)
print("Naive difference:", round(naive, 3))
print("Backdoor-adjusted effect:", round(adjusted, 3))
check_true(abs(adjusted - 2.0) < abs(naive - 2.0), "adjustment moves closer to true effect")
print("Causal lesson: conditioning on the right confounder can recover an interventional contrast.")

Demo 8: structural assignments $V_i=f_i(\operatorname{pa}_i,U_i)$

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 19

header("Demo 8 - structural assignments $V_i=f_i(\\operatorname{pa}_i,U_i)$: do is not condition")
z, x, y = simulate_confounding(n=3000)
observed = float(np.mean(y[x == 1]))
interventional = float(sum(np.mean(y[(x == 1) & (z == val)]) * np.mean(z == val) for val in [0, 1]))
print("Observed E[Y | X=1]:", round(observed, 3))
print("Adjusted E[Y | do(X=1)]:", round(interventional, 3))
check_true(abs(observed - interventional) > 0.1, "conditioning differs from intervention under confounding")
print("Causal lesson: intervention changes a mechanism, not merely a conditioning event.")

Demo 9: causal graph $G$

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 21

header("Demo 9 - causal graph $G$: potential outcomes")
n = 500
baseline = np.random.normal(0.0, 1.0, size=n)
y0 = baseline + np.random.normal(0, 0.1, size=n)
y1 = y0 + 1.5
effect = ate(y1, y0)
print("ATE:", round(effect, 3))
check_close(effect, 1.5, tol=0.05, name="average treatment effect")
print("Causal lesson: the causal effect compares two potential outcomes for the same units.")

Demo 10: observational interventional and counterfactual distributions

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 23

header("Demo 10 - observational interventional and counterfactual distributions: graph adjacency and SHD")
true_adj = np.array([[0, 1, 1], [0, 0, 1], [0, 0, 0]])
pred_adj = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
distance = shd(true_adj, pred_adj)
print("Structural Hamming distance:", distance)
check_true(distance == 1, "one edge differs")
fig, ax = plt.subplots()
ax.imshow(true_adj - pred_adj, cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_title("Difference between true and estimated adjacency")
ax.set_xlabel("Child index")
ax.set_ylabel("Parent index")
fig.tight_layout()
plt.show()
plt.close(fig)
print("Causal lesson: discovery is evaluated as graph structure, not just prediction loss.")

Demo 11: parents descendants and ancestors

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 25

header("Demo 11 - parents descendants and ancestors: NOTEARS acyclicity")
w_dag = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.0, 0.0, 0.0]])
w_cycle = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.4, 0.0, 0.0]])
h_dag = acyclicity_value(w_dag)
h_cycle = acyclicity_value(w_cycle)
print("h(W) for DAG:", round(h_dag, 8))
print("h(W) for cyclic graph:", round(h_cycle, 8))
check_true(h_cycle > h_dag, "cycle has larger acyclicity penalty")
print("Causal lesson: some discovery methods encode DAG structure as a smooth constraint.")

Demo 12: paths and blocked paths

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 27

header("Demo 12 - paths and blocked paths: collider warning")
n = 3000
x = np.random.binomial(1, 0.5, size=n)
y = np.random.binomial(1, 0.5, size=n)
c = ((x + y + np.random.binomial(1, 0.15, size=n)) >= 1).astype(int)
unconditional = float(np.corrcoef(x, y)[0, 1])
conditioned = float(np.corrcoef(x[c == 1], y[c == 1])[0, 1])
print("Unconditional correlation:", round(unconditional, 3))
print("Correlation after conditioning on collider:", round(conditioned, 3))
check_true(abs(conditioned) > abs(unconditional), "conditioning on collider creates association")
print("Causal lesson: controlling for more variables can create bias.")

Demo 13: d-separation

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 29

header("Demo 13 - d-separation: confounding versus adjustment")
z, x, y = simulate_confounding(n=2000)
naive = float(np.mean(y[x == 1]) - np.mean(y[x == 0]))
adjusted = backdoor_adjustment(z, x, y)
print("Naive difference:", round(naive, 3))
print("Backdoor-adjusted effect:", round(adjusted, 3))
check_true(abs(adjusted - 2.0) < abs(naive - 2.0), "adjustment moves closer to true effect")
print("Causal lesson: conditioning on the right confounder can recover an interventional contrast.")

Demo 14: causal Markov property

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 31

header("Demo 14 - causal Markov property: do is not condition")
z, x, y = simulate_confounding(n=3000)
observed = float(np.mean(y[x == 1]))
interventional = float(sum(np.mean(y[(x == 1) & (z == val)]) * np.mean(z == val) for val in [0, 1]))
print("Observed E[Y | X=1]:", round(observed, 3))
print("Adjusted E[Y | do(X=1)]:", round(interventional, 3))
check_true(abs(observed - interventional) > 0.1, "conditioning differs from intervention under confounding")
print("Causal lesson: intervention changes a mechanism, not merely a conditioning event.")

Demo 15: latent confounding and bidirected edges

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 33

header("Demo 15 - latent confounding and bidirected edges: potential outcomes")
n = 500
baseline = np.random.normal(0.0, 1.0, size=n)
y0 = baseline + np.random.normal(0, 0.1, size=n)
y1 = y0 + 1.5
effect = ate(y1, y0)
print("ATE:", round(effect, 3))
check_close(effect, 1.5, tol=0.05, name="average treatment effect")
print("Causal lesson: the causal effect compares two potential outcomes for the same units.")

Demo 16: linear SCMs

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 35

header("Demo 16 - linear SCMs: graph adjacency and SHD")
true_adj = np.array([[0, 1, 1], [0, 0, 1], [0, 0, 0]])
pred_adj = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
distance = shd(true_adj, pred_adj)
print("Structural Hamming distance:", distance)
check_true(distance == 1, "one edge differs")
fig, ax = plt.subplots()
ax.imshow(true_adj - pred_adj, cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_title("Difference between true and estimated adjacency")
ax.set_xlabel("Child index")
ax.set_ylabel("Parent index")
fig.tight_layout()
plt.show()
plt.close(fig)
print("Causal lesson: discovery is evaluated as graph structure, not just prediction loss.")

Demo 17: nonlinear SCMs

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 37

header("Demo 17 - nonlinear SCMs: NOTEARS acyclicity")
w_dag = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.0, 0.0, 0.0]])
w_cycle = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.4, 0.0, 0.0]])
h_dag = acyclicity_value(w_dag)
h_cycle = acyclicity_value(w_cycle)
print("h(W) for DAG:", round(h_dag, 8))
print("h(W) for cyclic graph:", round(h_cycle, 8))
check_true(h_cycle > h_dag, "cycle has larger acyclicity penalty")
print("Causal lesson: some discovery methods encode DAG structure as a smooth constraint.")

Demo 18: independent noise terms

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 39

header("Demo 18 - independent noise terms: collider warning")
n = 3000
x = np.random.binomial(1, 0.5, size=n)
y = np.random.binomial(1, 0.5, size=n)
c = ((x + y + np.random.binomial(1, 0.15, size=n)) >= 1).astype(int)
unconditional = float(np.corrcoef(x, y)[0, 1])
conditioned = float(np.corrcoef(x[c == 1], y[c == 1])[0, 1])
print("Unconditional correlation:", round(unconditional, 3))
print("Correlation after conditioning on collider:", round(conditioned, 3))
check_true(abs(conditioned) > abs(unconditional), "conditioning on collider creates association")
print("Causal lesson: controlling for more variables can create bias.")

Demo 19: modularity and autonomy

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 41

header("Demo 19 - modularity and autonomy: confounding versus adjustment")
z, x, y = simulate_confounding(n=2000)
naive = float(np.mean(y[x == 1]) - np.mean(y[x == 0]))
adjusted = backdoor_adjustment(z, x, y)
print("Naive difference:", round(naive, 3))
print("Backdoor-adjusted effect:", round(adjusted, 3))
check_true(abs(adjusted - 2.0) < abs(naive - 2.0), "adjustment moves closer to true effect")
print("Causal lesson: conditioning on the right confounder can recover an interventional contrast.")

Demo 20: Markovian vs semi-Markovian models

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 43

header("Demo 20 - Markovian vs semi-Markovian models: do is not condition")
z, x, y = simulate_confounding(n=3000)
observed = float(np.mean(y[x == 1]))
interventional = float(sum(np.mean(y[(x == 1) & (z == val)]) * np.mean(z == val) for val in [0, 1]))
print("Observed E[Y | X=1]:", round(observed, 3))
print("Adjusted E[Y | do(X=1)]:", round(interventional, 3))
check_true(abs(observed - interventional) > 0.1, "conditioning differs from intervention under confounding")
print("Causal lesson: intervention changes a mechanism, not merely a conditioning event.")

Demo 21: total causal effect

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 45

header("Demo 21 - total causal effect: potential outcomes")
n = 500
baseline = np.random.normal(0.0, 1.0, size=n)
y0 = baseline + np.random.normal(0, 0.1, size=n)
y1 = y0 + 1.5
effect = ate(y1, y0)
print("ATE:", round(effect, 3))
check_close(effect, 1.5, tol=0.05, name="average treatment effect")
print("Causal lesson: the causal effect compares two potential outcomes for the same units.")

Demo 22: intervention distribution $P(Y \mid \operatorname{do}(X=x))$

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 47

header("Demo 22 - intervention distribution $P(Y \\mid \\operatorname{do}(X=x))$: graph adjacency and SHD")
true_adj = np.array([[0, 1, 1], [0, 0, 1], [0, 0, 0]])
pred_adj = np.array([[0, 1, 0], [0, 0, 1], [0, 0, 0]])
distance = shd(true_adj, pred_adj)
print("Structural Hamming distance:", distance)
check_true(distance == 1, "one edge differs")
fig, ax = plt.subplots()
ax.imshow(true_adj - pred_adj, cmap="RdBu_r", vmin=-1, vmax=1)
ax.set_title("Difference between true and estimated adjacency")
ax.set_xlabel("Child index")
ax.set_ylabel("Parent index")
fig.tight_layout()
plt.show()
plt.close(fig)
print("Causal lesson: discovery is evaluated as graph structure, not just prediction loss.")

Demo 23: backdoor adjustment preview

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 49

header("Demo 23 - backdoor adjustment preview: NOTEARS acyclicity")
w_dag = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.0, 0.0, 0.0]])
w_cycle = np.array([[0.0, 0.8, 0.0], [0.0, 0.0, 0.5], [0.4, 0.0, 0.0]])
h_dag = acyclicity_value(w_dag)
h_cycle = acyclicity_value(w_cycle)
print("h(W) for DAG:", round(h_dag, 8))
print("h(W) for cyclic graph:", round(h_cycle, 8))
check_true(h_cycle > h_dag, "cycle has larger acyclicity penalty")
print("Causal lesson: some discovery methods encode DAG structure as a smooth constraint.")

Demo 24: frontdoor preview

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 51

header("Demo 24 - frontdoor preview: collider warning")
n = 3000
x = np.random.binomial(1, 0.5, size=n)
y = np.random.binomial(1, 0.5, size=n)
c = ((x + y + np.random.binomial(1, 0.15, size=n)) >= 1).astype(int)
unconditional = float(np.corrcoef(x, y)[0, 1])
conditioned = float(np.corrcoef(x[c == 1], y[c == 1])[0, 1])
print("Unconditional correlation:", round(unconditional, 3))
print("Correlation after conditioning on collider:", round(conditioned, 3))
check_true(abs(conditioned) > abs(unconditional), "conditioning on collider creates association")
print("Causal lesson: controlling for more variables can create bias.")

Demo 25: estimand vs estimator

This demo turns the approved TOC concept into a small causal object or calculation.

Code cell 53

header("Demo 25 - estimand vs estimator: confounding versus adjustment")
z, x, y = simulate_confounding(n=2000)
naive = float(np.mean(y[x == 1]) - np.mean(y[x == 0]))
adjusted = backdoor_adjustment(z, x, y)
print("Naive difference:", round(naive, 3))
print("Backdoor-adjusted effect:", round(adjusted, 3))
check_true(abs(adjusted - 2.0) < abs(naive - 2.0), "adjustment moves closer to true effect")
print("Causal lesson: conditioning on the right confounder can recover an interventional contrast.")

Previous lesson Next lesson