Theory Notebook

Converted from theory.ipynb for web reading.

Multi-Agent Systems

Multi-agent systems study strategic learning: multiple agents act, adapt, communicate, and optimize in a shared environment, so each agent's outcome depends on what the others do.

This notebook is the executable companion to notes.md. It uses small payoff matrices and synthetic games so every cell runs without external data.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3


COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

def header(title):
    print("\n" + "=" * 72)
    print(title)
    print("=" * 72)

def check_true(condition, name):
    ok = bool(condition)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    assert ok, name

def check_close(value, target, tol=1e-8, name="value"):
    ok = abs(float(value) - float(target)) <= tol
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {float(value):.6f}, expected {float(target):.6f}")
    assert ok, name

def pure_nash(payoff_a, payoff_b):
    """Enumerate pure-strategy Nash equilibria of a bimatrix game.

    A cell (i, j) qualifies when i is a best response to column j and
    j is a best response to row i (ties allowed via a small tolerance).
    """
    payoff_a = np.asarray(payoff_a, dtype=float)
    payoff_b = np.asarray(payoff_b, dtype=float)
    equilibria = []
    for i in range(payoff_a.shape[0]):
        for j in range(payoff_a.shape[1]):
            row_best = payoff_a[i, j] >= np.max(payoff_a[:, j]) - 1e-12
            col_best = payoff_b[i, j] >= np.max(payoff_b[i, :]) - 1e-12
            if row_best and col_best:
                equilibria.append((i, j))
    return equilibria

def expected_payoff(payoff, p, q):
    return float(np.asarray(p) @ np.asarray(payoff, dtype=float) @ np.asarray(q))

def grid_zero_sum_value(payoff, grid=101):
    """Approximate a 2x2 zero-sum game's value by grid search.

    Returns (max_p min_q, min_q max_p); the two coincide at the game value.
    """
    payoff = np.asarray(payoff, dtype=float)
    ps = np.linspace(0, 1, grid)
    qs = np.linspace(0, 1, grid)
    row_values = []
    for p0 in ps:
        p = np.array([p0, 1 - p0])
        row_values.append(min(expected_payoff(payoff, p, np.array([q0, 1 - q0])) for q0 in qs))
    col_values = []
    for q0 in qs:
        q = np.array([q0, 1 - q0])
        col_values.append(max(expected_payoff(payoff, np.array([p0, 1 - p0]), q) for p0 in ps))
    return float(max(row_values)), float(min(col_values))

def fictitious_play_rps(steps=200):
    """Fictitious play in zero-sum rock-paper-scissors.

    Each player best-responds to the opponent's empirical action frequencies;
    the returned history tracks the row player's empirical strategy.
    """
    payoff = np.array([[0, -1, 1], [1, 0, -1], [-1, 1, 0]], dtype=float)
    counts_a = np.ones(3)  # smoothed action counts (start uniform)
    counts_b = np.ones(3)
    history = []
    for _ in range(steps):
        q = counts_b / counts_b.sum()
        p = counts_a / counts_a.sum()
        a = int(np.argmax(payoff @ q))   # row player maximizes
        b = int(np.argmin(p @ payoff))   # column player minimizes
        counts_a[a] += 1
        counts_b[b] += 1
        history.append(counts_a / counts_a.sum())
    return np.array(history)

def pgd_1d(theta=1.0, x=0.25, y=1.0, eps=0.5, steps=20, alpha=0.05):
    """Projected gradient ascent on the squared loss over delta in [-eps, eps]."""
    delta = 0.0
    for _ in range(steps):
        pred = theta * (x + delta)
        grad = 2 * (pred - y) * theta  # d(loss)/d(delta)
        delta = np.clip(delta + alpha * np.sign(grad), -eps, eps)  # ascend, then project
    robust_loss = (theta * (x + delta) - y) ** 2
    return float(delta), float(robust_loss)

print("Helper functions ready.")

Demo 1: many learners sharing one environment

This demo makes the core setting concrete: two learners share one environment, and each one's best action depends on what the other does.

Code cell 5

header("Demo 1 - many learners sharing one environment: pure Nash equilibria")
A = np.array([[3, 0], [5, 1]], dtype=float)
B = np.array([[3, 5], [0, 1]], dtype=float)
eq = pure_nash(A, B)
print("Pure Nash equilibria:", eq)
check_true((1, 1) in eq, "defect-defect is a pure equilibrium in this payoff table")
print("Game-theory lesson: equilibrium checks unilateral deviations, not social optimality.")

Demo 2: nonstationarity

Matching pennies shows why multi-agent learning is nonstationary: any fixed pure strategy invites exploitation, which pushes both players toward randomization.

Code cell 7

header("Demo 2 - nonstationarity: matching pennies mixed strategy")
A = np.array([[1, -1], [-1, 1]], dtype=float)
p = np.array([0.5, 0.5])
q = np.array([0.5, 0.5])
value = expected_payoff(A, p, q)
print("Expected payoff at mixed equilibrium:", value)
check_close(value, 0.0, name="matching pennies value")
print("Game-theory lesson: randomization can remove profitable pure deviations.")

Demo 3: cooperation vs competition

A zero-sum matrix game sits at the competitive end of the cooperation-competition spectrum: one player's gain is exactly the other's loss.

Code cell 9

header("Demo 3 - cooperation vs competition: minimax grid value")
A = np.array([[1, -1], [-1, 1]], dtype=float)
lower, upper = grid_zero_sum_value(A, grid=101)
print("Grid maximin:", round(lower, 4))
print("Grid minimax:", round(upper, 4))
check_true(abs(lower - upper) < 0.05, "grid approximation nearly matches minimax value")
print("Game-theory lesson: zero-sum optimal attack and defense meet at one value.")

Demo 4: communication and coordination

Fictitious play shows coordination emerging without explicit communication: each player simply best-responds to the other's empirical behavior.

Code cell 11

header("Demo 4 - communication and coordination: fictitious play in rock-paper-scissors")
hist = fictitious_play_rps(steps=180)
final_policy = hist[-1]
print("Final empirical row policy:", np.round(final_policy, 3).tolist())
check_true(np.all(np.abs(final_policy - 1/3) < 0.15), "empirical policy approaches the mixed equilibrium region")
fig, ax = plt.subplots()
ax.plot(hist[:, 0], color=COLORS["primary"], label="Rock")
ax.plot(hist[:, 1], color=COLORS["secondary"], label="Paper")
ax.plot(hist[:, 2], color=COLORS["tertiary"], label="Scissors")
ax.set_title("Fictitious play empirical strategy")
ax.set_xlabel("Iteration")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Game-theory lesson: repeated adaptation can approximate equilibrium behavior.")

Demo 5: emergent behavior in AI systems

Adversarial training casts learning as a two-player game: behavior emerges from the interaction between a learner and an adaptive perturbation.

Code cell 13

header("Demo 5 - emergent behavior in AI systems: adversarial inner maximization")
delta, loss = pgd_1d(theta=1.0, x=0.25, y=1.0, eps=0.5)
clean_loss = (0.25 - 1.0) ** 2
print("Adversarial delta:", delta)
print("Clean loss:", round(clean_loss, 4))
print("Adversarial loss:", round(loss, 4))
check_true(loss >= clean_loss, "inner maximization does not reduce loss")
print("Game-theory lesson: robust learning optimizes against an adaptive perturbation.")

Demo 6: agent set $N$

A heatmap over the joint actions available to the agent set $N$ shows how payoff geometry fixes each player's best responses.

Code cell 15

header("Demo 6 - agent set $N$: payoff heatmap")
A = np.array([[2, -1, 0], [0, 1, -2], [-1, 3, 1]], dtype=float)
fig, ax = plt.subplots()
im = ax.imshow(A, cmap="RdBu_r")
ax.set_title("Row-player payoff matrix")
ax.set_xlabel("Column action")
ax.set_ylabel("Row action")
fig.colorbar(im, ax=ax, label="Payoff")
fig.tight_layout()
plt.show()
plt.close(fig)
best_rows = np.argmax(A, axis=0)
print("Best row responses by column action:", best_rows.tolist())
check_true(len(best_rows) == A.shape[1], "one row best response per column action")
print("Game-theory lesson: payoff geometry determines best responses.")

Demo 7: joint action $\mathbf{a}$

Each cell of a payoff table corresponds to one joint action $\mathbf{a}$; an equilibrium is a joint action that is stable against unilateral deviation.

Code cell 17

header("Demo 7 - joint action $\\mathbf{a}$: pure Nash equilibria")
A = np.array([[3, 0], [5, 1]], dtype=float)
B = np.array([[3, 5], [0, 1]], dtype=float)
eq = pure_nash(A, B)
print("Pure Nash equilibria:", eq)
check_true((1, 1) in eq, "defect-defect is a pure equilibrium in this payoff table")
print("Game-theory lesson: equilibrium checks unilateral deviations, not social optimality.")

Demo 8: reward vector $\mathbf{r}$

Each outcome yields a reward vector $\mathbf{r}$ with one entry per agent; in matching pennies the two entries always sum to zero.

Code cell 19

header("Demo 8 - reward vector $\\mathbf{r}$: matching pennies mixed strategy")
A = np.array([[1, -1], [-1, 1]], dtype=float)
p = np.array([0.5, 0.5])
q = np.array([0.5, 0.5])
value = expected_payoff(A, p, q)
print("Expected payoff at mixed equilibrium:", value)
check_close(value, 0.0, name="matching pennies value")
print("Game-theory lesson: randomization can remove profitable pure deviations.")

Demo 9: Markov game

A one-shot zero-sum matrix game is the stage game inside a Markov game, and its minimax value is the basic building block of value iteration there; a hedged Markov-game sketch follows this cell.

Code cell 21

header("Demo 9 - Markov game: minimax grid value")
A = np.array([[1, -1], [-1, 1]], dtype=float)
lower, upper = grid_zero_sum_value(A, grid=101)
print("Grid maximin:", round(lower, 4))
print("Grid minimax:", round(upper, 4))
check_true(abs(lower - upper) < 0.05, "grid approximation nearly matches minimax value")
print("Game-theory lesson: zero-sum optimal attack and defense meet at one value.")

Demo 10: joint policy $\boldsymbol{\pi}$

Fictitious play tracks a joint policy $\boldsymbol{\pi}$ via the empirical frequencies of both players' actions.

Code cell 23

header("Demo 10 - joint policy $\\boldsymbol{\\pi}$: fictitious play in rock-paper-scissors")
hist = fictitious_play_rps(steps=180)
final_policy = hist[-1]
print("Final empirical row policy:", np.round(final_policy, 3).tolist())
check_true(np.all(np.abs(final_policy - 1/3) < 0.15), "empirical policy approaches the mixed equilibrium region")
fig, ax = plt.subplots()
ax.plot(hist[:, 0], color=COLORS["primary"], label="Rock")
ax.plot(hist[:, 1], color=COLORS["secondary"], label="Paper")
ax.plot(hist[:, 2], color=COLORS["tertiary"], label="Scissors")
ax.set_title("Fictitious play empirical strategy")
ax.set_xlabel("Iteration")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Game-theory lesson: repeated adaptation can approximate equilibrium behavior.")

Demo 11: state transitions

The inner maximization plays the role of an adversarial state transition: the environment's response depends on the learner's current parameters.

Code cell 25

header("Demo 11 - state transitions: adversarial inner maximization")
delta, loss = pgd_1d(theta=1.0, x=0.25, y=1.0, eps=0.5)
clean_loss = (0.25 - 1.0) ** 2
print("Adversarial delta:", delta)
print("Clean loss:", round(clean_loss, 4))
print("Adversarial loss:", round(loss, 4))
check_true(loss >= clean_loss, "inner maximization does not reduce loss")
print("Game-theory lesson: robust learning optimizes against an adaptive perturbation.")

Demo 12: value functions for agents

The heatmap can be read as the row player's value function over joint actions, taken directly from the payoff matrix.

Code cell 27

header("Demo 12 - value functions for agents: payoff heatmap")
A = np.array([[2, -1, 0], [0, 1, -2], [-1, 3, 1]], dtype=float)
fig, ax = plt.subplots()
im = ax.imshow(A, cmap="RdBu_r")
ax.set_title("Row-player payoff matrix")
ax.set_xlabel("Column action")
ax.set_ylabel("Row action")
fig.colorbar(im, ax=ax, label="Payoff")
fig.tight_layout()
plt.show()
plt.close(fig)
best_rows = np.argmax(A, axis=0)
print("Best row responses by column action:", best_rows.tolist())
check_true(len(best_rows) == A.shape[1], "one row best response per column action")
print("Game-theory lesson: payoff geometry determines best responses.")

Demo 13: Nash policies

Nash policies are mutual best responses; the pure-equilibrium search below checks exactly that condition cell by cell.

Code cell 29

header("Demo 13 - Nash policies: pure Nash equilibria")
A = np.array([[3, 0], [5, 1]], dtype=float)
B = np.array([[3, 5], [0, 1]], dtype=float)
eq = pure_nash(A, B)
print("Pure Nash equilibria:", eq)
check_true((1, 1) in eq, "defect-defect is a pure equilibrium in this payoff table")
print("Game-theory lesson: equilibrium checks unilateral deviations, not social optimality.")

Demo 14: cooperative team games

Matching pennies is the antithesis of a cooperative team game: with zero-sum payoffs, no outcome improves both players at once.

Code cell 31

header("Demo 14 - cooperative team games: matching pennies mixed strategy")
A = np.array([[1, -1], [-1, 1]], dtype=float)
p = np.array([0.5, 0.5])
q = np.array([0.5, 0.5])
value = expected_payoff(A, p, q)
print("Expected payoff at mixed equilibrium:", value)
check_close(value, 0.0, name="matching pennies value")
print("Game-theory lesson: randomization can remove profitable pure deviations.")

Demo 15: partially observed settings preview

The grid search uses only payoff evaluations, a preview of partially observed settings where values must be estimated rather than read from a known table.

Code cell 33

header("Demo 15 - partially observed settings preview: minimax grid value")
A = np.array([[1, -1], [-1, 1]], dtype=float)
lower, upper = grid_zero_sum_value(A, grid=101)
print("Grid maximin:", round(lower, 4))
print("Grid minimax:", round(upper, 4))
check_true(abs(lower - upper) < 0.05, "grid approximation nearly matches minimax value")
print("Game-theory lesson: zero-sum optimal attack and defense meet at one value.")

Demo 16: independent learners

Fictitious play runs both sides as independent learners: each adapts to the other's empirical play with no shared controller.

Code cell 35

header("Demo 16 - independent learners: fictitious play in rock-paper-scissors")
hist = fictitious_play_rps(steps=180)
final_policy = hist[-1]
print("Final empirical row policy:", np.round(final_policy, 3).tolist())
check_true(np.all(np.abs(final_policy - 1/3) < 0.15), "empirical policy approaches the mixed equilibrium region")
fig, ax = plt.subplots()
ax.plot(hist[:, 0], color=COLORS["primary"], label="Rock")
ax.plot(hist[:, 1], color=COLORS["secondary"], label="Paper")
ax.plot(hist[:, 2], color=COLORS["tertiary"], label="Scissors")
ax.set_title("Fictitious play empirical strategy")
ax.set_xlabel("Iteration")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Game-theory lesson: repeated adaptation can approximate equilibrium behavior.")

Demo 17: fictitious play

The inner maximizer repeatedly best-responds to the learner's fixed parameters, the same best-response principle that drives fictitious play.

Code cell 37

header("Demo 17 - fictitious play: adversarial inner maximization")
delta, loss = pgd_1d(theta=1.0, x=0.25, y=1.0, eps=0.5)
clean_loss = (0.25 - 1.0) ** 2
print("Adversarial delta:", delta)
print("Clean loss:", round(clean_loss, 4))
print("Adversarial loss:", round(loss, 4))
check_true(loss >= clean_loss, "inner maximization does not reduce loss")
print("Game-theory lesson: robust learning optimizes against an adaptive perturbation.")

Demo 18: no-regret learning

Best responses computed in hindsight from the payoff matrix are exactly the comparator that no-regret learning competes against; a hedged Hedge sketch follows this demo.

Code cell 39

header("Demo 18 - no-regret learning: payoff heatmap")
A = np.array([[2, -1, 0], [0, 1, -2], [-1, 3, 1]], dtype=float)
fig, ax = plt.subplots()
im = ax.imshow(A, cmap="RdBu_r")
ax.set_title("Row-player payoff matrix")
ax.set_xlabel("Column action")
ax.set_ylabel("Row action")
fig.colorbar(im, ax=ax, label="Payoff")
fig.tight_layout()
plt.show()
plt.close(fig)
best_rows = np.argmax(A, axis=0)
print("Best row responses by column action:", best_rows.tolist())
check_true(len(best_rows) == A.shape[1], "one row best response per column action")
print("Game-theory lesson: payoff geometry determines best responses.")

Demo 19: policy gradients in games

Policy gradients in games have each player ascend its own payoff; pure equilibria are joint actions at which no single-player ascent direction remains.

Code cell 41

header("Demo 19 - policy gradients in games: pure Nash equilibria")
A = np.array([[3, 0], [5, 1]], dtype=float)
B = np.array([[3, 5], [0, 1]], dtype=float)
eq = pure_nash(A, B)
print("Pure Nash equilibria:", eq)
check_true((1, 1) in eq, "defect-defect is a pure equilibrium in this payoff table")
print("Game-theory lesson: equilibrium checks unilateral deviations, not social optimality.")

Demo 20: equilibrium selection

Matching pennies has a unique mixed equilibrium, so it sidesteps the equilibrium-selection problem that appears when several equilibria coexist.

Code cell 43

header("Demo 20 - equilibrium selection: matching pennies mixed strategy")
A = np.array([[1, -1], [-1, 1]], dtype=float)
p = np.array([0.5, 0.5])
q = np.array([0.5, 0.5])
value = expected_payoff(A, p, q)
print("Expected payoff at mixed equilibrium:", value)
check_close(value, 0.0, name="matching pennies value")
print("Game-theory lesson: randomization can remove profitable pure deviations.")

Demo 21: common-payoff games

The zero-sum value computed here is the opposite extreme of a common-payoff game, where both players share a single payoff table; a hedged common-payoff sketch follows this demo.

Code cell 45

header("Demo 21 - common-payoff games: minimax grid value")
A = np.array([[1, -1], [-1, 1]], dtype=float)
lower, upper = grid_zero_sum_value(A, grid=101)
print("Grid maximin:", round(lower, 4))
print("Grid minimax:", round(upper, 4))
check_true(abs(lower - upper) < 0.05, "grid approximation nearly matches minimax value")
print("Game-theory lesson: zero-sum optimal attack and defense meet at one value.")

Demo 22: conventions

Conventions are self-reinforcing patterns of play; fictitious play shows one route by which a shared pattern, here the mixed equilibrium, can emerge.

Code cell 47

header("Demo 22 - conventions: fictitious play in rock-paper-scissors")
hist = fictitious_play_rps(steps=180)
final_policy = hist[-1]
print("Final empirical row policy:", np.round(final_policy, 3).tolist())
check_true(np.all(np.abs(final_policy - 1/3) < 0.15), "empirical policy approaches the mixed equilibrium region")
fig, ax = plt.subplots()
ax.plot(hist[:, 0], color=COLORS["primary"], label="Rock")
ax.plot(hist[:, 1], color=COLORS["secondary"], label="Paper")
ax.plot(hist[:, 2], color=COLORS["tertiary"], label="Scissors")
ax.set_title("Fictitious play empirical strategy")
ax.set_xlabel("Iteration")
ax.set_ylabel("Probability")
ax.legend()
fig.tight_layout()
plt.show()
plt.close(fig)
print("Game-theory lesson: repeated adaptation can approximate equilibrium behavior.")

Demo 23: communication protocols

The learner-adversary loop is a minimal interaction protocol: one side commits to parameters and the other replies with a perturbation.

Code cell 49

header("Demo 23 - communication protocols: adversarial inner maximization")
delta, loss = pgd_1d(theta=1.0, x=0.25, y=1.0, eps=0.5)
clean_loss = (0.25 - 1.0) ** 2
print("Adversarial delta:", delta)
print("Clean loss:", round(clean_loss, 4))
print("Adversarial loss:", round(loss, 4))
check_true(loss >= clean_loss, "inner maximization does not reduce loss")
print("Game-theory lesson: robust learning optimizes against an adaptive perturbation.")

Demo 24: credit assignment

Reading the heatmap column by column attributes payoff to individual actions, a toy version of the credit-assignment question.

Code cell 51

header("Demo 24 - credit assignment: payoff heatmap")
A = np.array([[2, -1, 0], [0, 1, -2], [-1, 3, 1]], dtype=float)
fig, ax = plt.subplots()
im = ax.imshow(A, cmap="RdBu_r")
ax.set_title("Row-player payoff matrix")
ax.set_xlabel("Column action")
ax.set_ylabel("Row action")
fig.colorbar(im, ax=ax, label="Payoff")
fig.tight_layout()
plt.show()
plt.close(fig)
best_rows = np.argmax(A, axis=0)
print("Best row responses by column action:", best_rows.tolist())
check_true(len(best_rows) == A.shape[1], "one row best response per column action")
print("Game-theory lesson: payoff geometry determines best responses.")

Demo 25: social welfare and fairness

The prisoner's dilemma separates stability from social welfare: the equilibrium outcome is neither the fairest nor the welfare-maximizing one.

Code cell 53

header("Demo 25 - social welfare and fairness: pure Nash equilibria")
A = np.array([[3, 0], [5, 1]], dtype=float)
B = np.array([[3, 5], [0, 1]], dtype=float)
eq = pure_nash(A, B)
print("Pure Nash equilibria:", eq)
check_true((1, 1) in eq, "defect-defect is a pure equilibrium in this payoff table")
print("Game-theory lesson: equilibrium checks unilateral deviations, not social optimality.")