Time Series — Theory Notebook

Converted from theory.ipynb for web reading.

Time order changes what counts as signal, what can be predicted, and how uncertainty should evolve with horizon.

This notebook is the interactive companion to notes.md. It moves from dependence diagnostics and linear models to seasonality, state-space reasoning, spectral views, and modern AI links.

| Coverage | What you will do |
| --- | --- |
| Dependence | Compute ACF/PACF, compare stationary and nonstationary series |
| Linear models | Simulate AR, MA, ARMA, and ARIMA behavior |
| Forecasting | Build one-step and multi-step forecasts with uncertainty |
| State space | Simulate hidden-state systems and run a Kalman filter |
| Spectral view | Detect periodicity with FFT and periodograms |
| AI links | Connect classical forecasting and temporal encoding ideas to modern sequence models |

Code cell 3

import numpy as np

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams.update({
        "figure.figsize":    (10, 6),
        "figure.dpi":         120,
        "font.size":           13,
        "axes.titlesize":      15,
        "axes.labelsize":      13,
        "xtick.labelsize":     11,
        "ytick.labelsize":     11,
        "legend.fontsize":     11,
        "legend.framealpha":   0.85,
        "lines.linewidth":      2.0,
        "axes.spines.top":     False,
        "axes.spines.right":   False,
        "savefig.bbox":       "tight",
        "savefig.dpi":         150,
    })
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

try:
    import seaborn as sns
    if HAS_MPL:
        sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    if HAS_MPL:
        plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

np.random.seed(42)
print("Plot setup complete.")

Code cell 4

import numpy.linalg as la

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

rng = np.random.default_rng(42)
np.set_printoptions(precision=4, suppress=True)
if HAS_MPL:
    mpl.rcParams["figure.max_open_warning"] = 0

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def report(name, cond):
    print(f"{'PASS' if cond else 'FAIL'} - {name}")
    return cond

def sample_acf(x, max_lag):
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    denom = np.dot(x, x)
    vals = [1.0]
    for h in range(1, max_lag + 1):
        vals.append(np.dot(x[:-h], x[h:]) / denom)
    return np.array(vals)

def sample_pacf(x, max_lag):
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    pacf = [1.0]
    for h in range(1, max_lag + 1):
        y = x[h:]
        X = np.column_stack([x[h-k:-k] for k in range(1, h + 1)])
        beta = la.lstsq(X, y, rcond=None)[0]
        pacf.append(beta[-1])
    return np.array(pacf)

def rolling_mean(x, window):
    x = np.asarray(x, dtype=float)
    kernel = np.ones(window) / window
    return np.convolve(x, kernel, mode="valid")

def rolling_var(x, window):
    x = np.asarray(x, dtype=float)
    mu = rolling_mean(x, window)
    sq = rolling_mean(x**2, window)
    return sq - mu**2

def simulate_ar(phi, n, sigma=1.0, c=0.0, burn=250):
    phi = np.asarray(phi, dtype=float)
    p = len(phi)
    e = rng.normal(scale=sigma, size=n + burn + p)
    x = np.zeros(n + burn + p)
    mu = c / (1 - phi.sum()) if abs(1 - phi.sum()) > 1e-8 else 0.0
    x[:p] = mu
    for t in range(p, n + burn + p):
        x[t] = c + phi @ x[t-p:t][::-1] + e[t]
    return x[burn + p:]

def simulate_ma(theta, n, sigma=1.0, mu=0.0, burn=50):
    theta = np.asarray(theta, dtype=float)
    q = len(theta)
    e = rng.normal(scale=sigma, size=n + burn + q + 1)
    x = np.zeros(n + burn + q)
    for t in range(q, n + burn + q):
        x[t] = mu + e[t]
        for j in range(1, q + 1):
            x[t] += theta[j-1] * e[t-j]
    return x[burn + q:]

def simulate_arma(phi, theta, n, sigma=1.0, c=0.0, burn=300):
    phi = np.asarray(phi, dtype=float)
    theta = np.asarray(theta, dtype=float)
    p, q = len(phi), len(theta)
    e = rng.normal(scale=sigma, size=n + burn + max(p, q) + 1)
    x = np.zeros(n + burn + max(p, q) + 1)
    mu = c / (1 - phi.sum()) if p > 0 and abs(1 - phi.sum()) > 1e-8 else 0.0
    x[:max(p, q)+1] = mu
    for t in range(max(p, q) + 1, len(x)):
        ar_part = sum(phi[j] * x[t-j-1] for j in range(p))
        ma_part = e[t] + sum(theta[j] * e[t-j-1] for j in range(q))
        x[t] = c + ar_part + ma_part
    return x[burn + max(p, q) + 1:]

def periodogram(x):
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    freqs = np.fft.rfftfreq(len(x), d=1.0)
    power = np.abs(np.fft.rfft(x))**2 / len(x)
    return freqs, power

def kalman_filter_1d(obs, q, r, m0=0.0, p0=1.0):
    m, p = m0, p0
    means, vars_ = [], []
    pred_means, pred_vars = [], []
    for y in obs:
        m_pred = m
        p_pred = p + q
        pred_means.append(m_pred)
        pred_vars.append(p_pred)
        if np.isnan(y):
            m, p = m_pred, p_pred
        else:
            k = p_pred / (p_pred + r)
            m = m_pred + k * (y - m_pred)
            p = (1 - k) * p_pred
        means.append(m)
        vars_.append(p)
    return np.array(means), np.array(vars_), np.array(pred_means), np.array(pred_vars)

print("Extra imports and helper functions loaded.")

1. Intuition

1.1 Why Time Order Matters

The same values can tell very different stories depending on their temporal arrangement.

Code cell 6

ordered = np.array([79, 80, 81, 82, 300], dtype=float)
shuffled = np.array([82, 300, 79, 81, 80], dtype=float)

def lag1_corr(x):
    return np.corrcoef(x[:-1], x[1:])[0, 1]

header("Why order matters")
print(f"Ordered mean/var  : {ordered.mean():.2f}, {ordered.var():.2f}")
print(f"Shuffled mean/var : {shuffled.mean():.2f}, {shuffled.var():.2f}")
print(f"Ordered lag-1 corr: {lag1_corr(ordered):.3f}")
print(f"Shuffled lag-1 corr: {lag1_corr(shuffled):.3f}")
report("Means match after shuffling", np.isclose(ordered.mean(), shuffled.mean()))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].plot(ordered, marker="o", color=COLORS["primary"])
    axes[0].set_title("Ordered observations")
    axes[0].set_xlabel("Time")
    axes[0].set_ylabel("Value")
    axes[1].plot(shuffled, marker="o", color=COLORS["secondary"])
    axes[1].set_title("Same values, shuffled")
    axes[1].set_xlabel("Index")
    axes[1].set_ylabel("Value")
    fig.tight_layout()
    plt.show()

1.2 From Snapshots to Sequences

Marginal summaries alone do not reveal serial dependence.

Code cell 8

iid_like = rng.normal(loc=0.0, scale=1.0, size=240)
ar_like = simulate_ar([0.85], n=240)
ar_like = ar_like / ar_like.std()

header("Snapshots vs sequences")
print(f"IID sample mean/var : {iid_like.mean():.3f}, {iid_like.var():.3f}")
print(f"AR sample mean/var  : {ar_like.mean():.3f}, {ar_like.var():.3f}")
print(f"IID lag-1 ACF       : {sample_acf(iid_like, 1)[1]:.3f}")
print(f"AR lag-1 ACF        : {sample_acf(ar_like, 1)[1]:.3f}")
report("AR sequence is more persistent than IID", abs(sample_acf(ar_like, 1)[1]) > abs(sample_acf(iid_like, 1)[1]))

if HAS_MPL:
    fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
    axes[0].plot(iid_like[:120], color=COLORS["secondary"])
    axes[0].set_title("IID-like sequence")
    axes[0].set_ylabel("Value")
    axes[1].plot(ar_like[:120], color=COLORS["primary"])
    axes[1].set_title("AR-like sequence")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Value")
    fig.tight_layout()
    plt.show()

1.3 Historical Timeline

Time series analysis developed over a century, with the computational breakthroughs arriving late and enabling modern ML applications.

| Year | Contributor | Contribution |
| --- | --- | --- |
| 1927 | Yule | Autoregressive models for sunspot series |
| 1938 | Slutsky, Walker | Moving-average models and oscillatory behavior |
| 1960 | Kalman | State-space filtering, later applied widely in engineering and econometrics |
| 1970 | Box & Jenkins | Unified ARIMA framework and systematic identification |
| 1982 | Engle | ARCH models for volatility clustering |
| 1986 | Holt-Winters extended | Seasonal exponential smoothing systematized |
| 1990s | Hamilton, others | Regime switching and cointegration become mainstream in economics |
| 2017 | Vaswani et al. | Transformer attention patterns encode sequence dependencies |
| 2017 | Taylor & Letham | Prophet: scalable additive forecasting at Facebook (now Meta) |
| 2020s | LLM era | Long-range sequence modeling drives NLP, audio, and time-series foundation models |

Pattern: Classical statistics gave the language (stationarity, ACF, ARIMA); machine learning gave the scalable nonlinear approximators.

1.4 Why Time Series Matters for AI

Forecasting and uncertainty horizons show up immediately in operations and monitoring.

Code cell 11

phi = 0.8
mu = 0.0
current = 2.0
horizons = np.arange(1, 11)
forecast_means = mu + phi**horizons * (current - mu)
forecast_vars = (1 - phi**(2 * horizons)) / (1 - phi**2)

header("Forecast memory and uncertainty growth")
print("Horizon  ForecastMean  ForecastSD")
for h, m, v in zip(horizons, forecast_means, forecast_vars):
    print(f"{h:>7d}  {m:>12.4f}  {np.sqrt(v):>10.4f}")
report("Forecast uncertainty grows with horizon", np.all(np.diff(forecast_vars) > 0))

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(horizons, forecast_means, marker="o", color=COLORS["primary"], label="Forecast mean")
    ax.fill_between(horizons,
                    forecast_means - 1.96 * np.sqrt(forecast_vars),
                    forecast_means + 1.96 * np.sqrt(forecast_vars),
                    color=COLORS["primary"], alpha=0.2, label="Approx. 95% band")
    ax.set_title("AR(1) forecast memory vs uncertainty growth")
    ax.set_xlabel("Forecast horizon")
    ax.set_ylabel("Predicted value")
    ax.legend()
    fig.tight_layout()
    plt.show()

2. Formal Definitions

2.1 Time Series as Observed Stochastic Processes

One observed path should be distinguished from the ensemble process that generated it.

Code cell 13

paths = np.stack([simulate_ar([0.7], n=120) for _ in range(200)])
one_path = paths[0]
ensemble_mean = paths.mean(axis=0)

header("One realization vs ensemble process")
print(f"Single-path sample mean : {one_path.mean():.4f}")
print(f"Ensemble time-mean mean : {ensemble_mean.mean():.4f}")
print(f"Single-path lag-1 ACF   : {sample_acf(one_path, 1)[1]:.4f}")
report("Ensemble and single path are different objects", np.abs(one_path.mean() - ensemble_mean.mean()) > 1e-3)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(one_path, color=COLORS["primary"], alpha=0.8, label="One realization")
    ax.plot(ensemble_mean, color=COLORS["secondary"], linestyle="--", label="Ensemble mean at each time")
    ax.set_title("Observed realization vs ensemble behavior")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()

2.2 Lag Operator, Backshift Notation, and Difference Operators

Lag notation compresses temporal algebra into polynomial form.

Code cell 15

x = np.array([10, 12, 15, 14, 19, 21, 25], dtype=float)
lag1 = x[:-1]
first_diff = np.diff(x)
seasonal_diff = x[3:] - x[:-3]

header("Lag and difference operators")
print("Original series          :", x)
print("L x_t (lag-1 aligned)    :", lag1)
print("First difference         :", first_diff)
print("Seasonal difference s=3  :", seasonal_diff)
report("First difference length is T-1", len(first_diff) == len(x) - 1)

2.3 Mean Function, Autocovariance, and Autocorrelation

The ACF summarizes lag dependence on a normalized scale.

Code cell 17

ar_sample = simulate_ar([0.6], n=400)
acf_est = sample_acf(ar_sample, 12)
acf_theory = 0.6 ** np.arange(13)

header("Sample ACF vs theoretical ACF for AR(1)")
print("lag  sample   theory")
for h in range(6):
    print(f"{h:>3d}  {acf_est[h]:>7.4f}  {acf_theory[h]:>7.4f}")
report("Sample ACF at lag 1 is positive", acf_est[1] > 0)

if HAS_MPL:
    fig, ax = plt.subplots()
    lags = np.arange(13)
    ax.bar(lags - 0.15, acf_est, width=0.3, color=COLORS["primary"], label="Sample")
    ax.bar(lags + 0.15, acf_theory, width=0.3, color=COLORS["secondary"], alpha=0.8, label="Theory")
    ax.set_title("Autocorrelation of an AR(1) process")
    ax.set_xlabel("Lag")
    ax.set_ylabel("ACF")
    ax.legend()
    fig.tight_layout()
    plt.show()

2.4 White Noise, Random Walks, and Trend-Plus-Noise Models

Similar-looking lines can imply very different dependence structures.

Code cell 19

n = 240
white = rng.normal(scale=1.0, size=n)
walk = np.cumsum(rng.normal(scale=1.0, size=n))
trend = 0.05 * np.arange(n) + rng.normal(scale=1.0, size=n)

header("White noise vs random walk vs trend-plus-noise")
print(f"Lag-1 ACF white noise : {sample_acf(white, 1)[1]:.4f}")
print(f"Lag-1 ACF random walk : {sample_acf(walk, 1)[1]:.4f}")
print(f"Lag-1 ACF trend+noise : {sample_acf(trend, 1)[1]:.4f}")
report("Random walk is more persistent than white noise", sample_acf(walk, 1)[1] > sample_acf(white, 1)[1])

if HAS_MPL:
    fig, axes = plt.subplots(3, 2, figsize=(12, 9))
    series = [white, walk, trend]
    names = ["White noise", "Random walk", "Trend plus noise"]
    colors = [COLORS["secondary"], COLORS["primary"], COLORS["tertiary"]]
    for i, (s, name, color) in enumerate(zip(series, names, colors)):
        axes[i, 0].plot(s, color=color)
        axes[i, 0].set_title(name)
        axes[i, 0].set_xlabel("Time")
        axes[i, 0].set_ylabel("Value")
        acf_vals = sample_acf(s, 20)
        axes[i, 1].bar(np.arange(21), acf_vals, color=color)
        axes[i, 1].set_title(f"{name} ACF")
        axes[i, 1].set_xlabel("Lag")
        axes[i, 1].set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

2.5 Stationarity: Strict vs Weak

Weak stationarity shows up as stable mean and variance across windows.

Code cell 21

stationary = simulate_ar([0.5], n=320)
nonstationary = np.cumsum(rng.normal(size=320))
win = 40
stat_mean = rolling_mean(stationary, win)
stat_var = rolling_var(stationary, win)
non_mean = rolling_mean(nonstationary, win)
non_var = rolling_var(nonstationary, win)

header("Rolling moments as a stationarity diagnostic")
print(f"Stationary rolling-mean range    : {stat_mean.min():.3f} to {stat_mean.max():.3f}")
print(f"Nonstationary rolling-mean range : {non_mean.min():.3f} to {non_mean.max():.3f}")
print(f"Stationary rolling-var range     : {stat_var.min():.3f} to {stat_var.max():.3f}")
print(f"Nonstationary rolling-var range  : {non_var.min():.3f} to {non_var.max():.3f}")
report("Nonstationary series has less stable rolling mean", np.ptp(non_mean) > np.ptp(stat_mean))

if HAS_MPL:
    fig, axes = plt.subplots(2, 2, figsize=(12, 8), sharex="col")
    axes[0, 0].plot(stationary, color=COLORS["primary"])
    axes[0, 0].set_title("Stationary series")
    axes[0, 1].plot(nonstationary, color=COLORS["error"])
    axes[0, 1].set_title("Nonstationary series")
    axes[1, 0].plot(stat_mean, label="Rolling mean", color=COLORS["secondary"])
    axes[1, 0].plot(stat_var, label="Rolling variance", color=COLORS["tertiary"])
    axes[1, 0].legend()
    axes[1, 1].plot(non_mean, label="Rolling mean", color=COLORS["secondary"])
    axes[1, 1].plot(non_var, label="Rolling variance", color=COLORS["tertiary"])
    axes[1, 1].legend()
    for ax in axes.ravel():
        ax.set_xlabel("Time")
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

3. Core Theory I: Dependence, Stationarity, and Diagnostics

3.1 Why Stationarity Matters

Stable lag relationships make historical estimation useful for future forecasting.

Code cell 23

stationary = simulate_ar([0.7], n=400)
drifted = 0.03 * np.arange(400) + rng.normal(scale=1.0, size=400)

def half_summary(x):
    half = len(x) // 2
    return x[:half].mean(), x[half:].mean()

s1, s2 = half_summary(stationary)
d1, d2 = half_summary(drifted)

header("Why stationarity matters for estimation")
print(f"Stationary half means: {s1:.3f}, {s2:.3f}")
print(f"Drifted half means   : {d1:.3f}, {d2:.3f}")
report("Drifted series changes mean more than stationary series", abs(d2 - d1) > abs(s2 - s1))

3.2 Autocorrelation Function (ACF)

Different process families leave distinct ACF signatures.

Code cell 25

white = rng.normal(size=500)
ar = simulate_ar([0.75], n=500)
t = np.arange(500)
seasonal = np.sin(2 * np.pi * t / 24) + 0.4 * rng.normal(size=500)

acfs = {
    "White noise": sample_acf(white, 36),
    "AR(1)": sample_acf(ar, 36),
    "Seasonal": sample_acf(seasonal, 36),
}

header("ACF signatures")
print(f"Seasonal lag-24 ACF: {acfs['Seasonal'][24]:.4f}")
print(f"AR lag-1 ACF      : {acfs['AR(1)'][1]:.4f}")
report("Seasonal series spikes at lag 24", abs(acfs["Seasonal"][24]) > 0.5)

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
    for ax, (name, vals), color in zip(axes, acfs.items(), [COLORS["secondary"], COLORS["primary"], COLORS["tertiary"]]):
        ax.bar(np.arange(len(vals)), vals, color=color)
        ax.set_title(name)
        ax.set_xlabel("Lag")
        ax.set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

3.3 Partial Autocorrelation Function (PACF)

PACF highlights direct lag effects after controlling for shorter lags.

Code cell 27

ar2 = simulate_ar([0.8, -0.25], n=600)
ma1 = simulate_ma([0.7], n=600)
pacf_ar2 = sample_pacf(ar2, 12)
pacf_ma1 = sample_pacf(ma1, 12)

header("PACF signatures")
print("AR(2) PACF first lags:", pacf_ar2[:5])
print("MA(1) PACF first lags:", pacf_ma1[:5])
report("AR(2) PACF has strong lag-2 component", abs(pacf_ar2[2]) > 0.1)

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
    axes[0].bar(np.arange(len(pacf_ar2)), pacf_ar2, color=COLORS["primary"])
    axes[0].set_title("PACF of AR(2)")
    axes[1].bar(np.arange(len(pacf_ma1)), pacf_ma1, color=COLORS["secondary"])
    axes[1].set_title("PACF of MA(1)")
    for ax in axes:
        ax.set_xlabel("Lag")
        ax.set_ylabel("PACF")
    fig.tight_layout()
    plt.show()

3.4 Differencing, Unit Roots, and Random Walk Behavior

Differencing can stabilize an integrated level process.

Code cell 29

walk = np.cumsum(rng.normal(size=500))
diff_walk = np.diff(walk)
acf_walk = sample_acf(walk, 20)
acf_diff = sample_acf(diff_walk, 20)

header("Differencing a random walk")
print(f"Lag-1 ACF in levels     : {acf_walk[1]:.4f}")
print(f"Lag-1 ACF after diff    : {acf_diff[1]:.4f}")
report("Differencing reduces persistence", abs(acf_diff[1]) < abs(acf_walk[1]))

if HAS_MPL:
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    axes[0, 0].plot(walk, color=COLORS["error"])
    axes[0, 0].set_title("Random walk levels")
    axes[0, 1].bar(np.arange(21), acf_walk, color=COLORS["error"])
    axes[0, 1].set_title("ACF of levels")
    axes[1, 0].plot(diff_walk, color=COLORS["primary"])
    axes[1, 0].set_title("First difference")
    axes[1, 1].bar(np.arange(21), acf_diff, color=COLORS["primary"])
    axes[1, 1].set_title("ACF after differencing")
    for ax in axes.ravel():
        ax.set_xlabel("Time / Lag")
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

3.5 Wold Decomposition Intuition

A stationary AR process can be viewed as a filtered innovation sequence.

Code cell 31

phi = 0.6
psi = phi ** np.arange(12)
innovations = rng.normal(size=400)
x = np.zeros(400)
for t in range(1, 400):
    x[t] = phi * x[t-1] + innovations[t]

x_recon = np.zeros_like(x)
for t in range(400):
    s = 0.0
    for j in range(min(t + 1, len(psi))):
        s += psi[j] * innovations[t-j]
    x_recon[t] = s

header("Wold-style innovation representation")
print("First six impulse-response weights:", psi[:6])
print(f"Truncated reconstruction RMSE: {np.sqrt(np.mean((x[50:] - x_recon[50:])**2)):.4f}")
report("Impulse response decays for |phi|<1", np.all(np.diff(psi) < 0))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].stem(np.arange(len(psi)), psi, linefmt=COLORS["primary"], markerfmt="o", basefmt=" ")
    axes[0].set_title("AR(1) impulse response weights")
    axes[0].set_xlabel("Lag j")
    axes[0].set_ylabel(r"$\psi_j$")
    axes[1].plot(x[50:150], color=COLORS["secondary"], label="AR(1) path")
    axes[1].plot(x_recon[50:150], color=COLORS["primary"], linestyle="--", label="Truncated innovation sum")
    axes[1].set_title("Filtered-shock view of the process")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Value")
    axes[1].legend()
    fig.tight_layout()
    plt.show()

4. Core Theory II: Linear Time-Series Models

4.1 Autoregressive Models AR(p)

Root conditions determine whether the recursion is stable.

Code cell 33

phi = np.array([1.2, -0.32])
roots = np.roots(np.array([-phi[1], -phi[0], 1.0]))
ar2 = simulate_ar(phi, n=400)

header("AR(2) stability via characteristic roots")
print("AR coefficients:", phi)
print("Characteristic roots:", roots)
print("Root magnitudes:", np.abs(roots))
report("Roots lie outside the unit circle", np.all(np.abs(roots) > 1))

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(ar2[:150], color=COLORS["primary"])
    ax.set_title("Simulated AR(2) process")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

4.2 Moving-Average Models MA(q)

MA models cut off in the ACF because only a finite number of shocks echo forward.

Code cell 35

ma = simulate_ma([0.8], n=500)
ma_acf = sample_acf(ma, 12)

header("MA(1) autocorrelation cutoff")
print("First six ACF values:", ma_acf[:6])
report("Lag-2 ACF is small for MA(1)", abs(ma_acf[2]) < 0.15)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.bar(np.arange(len(ma_acf)), ma_acf, color=COLORS["secondary"])
    ax.set_title("ACF of an MA(1) process")
    ax.set_xlabel("Lag")
    ax.set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

4.3 ARMA Models

When both level persistence and shock echo matter, both ACF and PACF usually decay.

Code cell 37

arma = simulate_arma([0.6], [0.5], n=600)
arma_acf = sample_acf(arma, 12)
arma_pacf = sample_pacf(arma, 12)

header("ARMA(1,1) signatures")
print("ACF first five values :", arma_acf[:5])
print("PACF first five values:", arma_pacf[:5])
report("Both ACF and PACF show nontrivial early lags", np.any(np.abs(arma_acf[1:4]) > 0.1) and np.any(np.abs(arma_pacf[1:4]) > 0.1))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
    axes[0].bar(np.arange(len(arma_acf)), arma_acf, color=COLORS["primary"])
    axes[0].set_title("ACF of ARMA(1,1)")
    axes[1].bar(np.arange(len(arma_pacf)), arma_pacf, color=COLORS["secondary"])
    axes[1].set_title("PACF of ARMA(1,1)")
    for ax in axes:
        ax.set_xlabel("Lag")
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

4.4 ARIMA Models

Differencing can expose stationary ARMA-style structure in an integrated series.

Code cell 39

innovations = simulate_arma([0.5], [0.3], n=500)
arima_series = np.cumsum(innovations)
arima_diff = np.diff(arima_series)

header("ARIMA intuition through differencing")
print(f"Lag-1 ACF of levels     : {sample_acf(arima_series, 1)[1]:.4f}")
print(f"Lag-1 ACF of differences: {sample_acf(arima_diff, 1)[1]:.4f}")
report("Differencing exposes more stationary dependence", abs(sample_acf(arima_diff, 1)[1]) < abs(sample_acf(arima_series, 1)[1]))

if HAS_MPL:
    fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
    axes[0].plot(arima_series, color=COLORS["error"])
    axes[0].set_title("Integrated series")
    axes[0].set_ylabel("Level")
    axes[1].plot(arima_diff, color=COLORS["primary"])
    axes[1].set_title("First difference")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Change")
    fig.tight_layout()
    plt.show()

4.5 Model Identification with ACF/PACF and Information Criteria

Information criteria help arbitrate among plausible lag orders.

Code cell 41

x = simulate_ar([0.75, -0.18], n=420)

def fit_ar_conditional(x, p):
    y = x[p:]
    X = np.column_stack([x[p-k-1:-k-1] for k in range(p)])
    beta = la.lstsq(X, y, rcond=None)[0]
    resid = y - X @ beta
    n = len(resid)
    rss = np.dot(resid, resid)
    aic = n * np.log(rss / n) + 2 * p
    bic = n * np.log(rss / n) + p * np.log(n)
    return beta, resid, aic, bic

header("Approximate AIC/BIC across AR orders")
results = []
for p in range(1, 6):
    beta, resid, aic, bic = fit_ar_conditional(x, p)
    results.append((p, aic, bic))
    print(f"AR({p}) -> AIC={aic:8.3f}, BIC={bic:8.3f}")
best_aic = min(results, key=lambda row: row[1])[0]
best_bic = min(results, key=lambda row: row[2])[0]
print(f"Best AIC order: AR({best_aic})")
print(f"Best BIC order: AR({best_bic})")
report("Information criteria choose a low-order model", best_aic <= 3 and best_bic <= 3)

5. Core Theory III: Forecasting, Seasonality, and State Space

5.1 One-Step and Multi-Step Forecasting

Multi-step uncertainty grows because more unobserved shocks enter the forecast.

Code cell 43

phi = 0.82
sigma = 1.0
history = simulate_ar([phi], n=180)
last = history[-1]
horizons = np.arange(1, 13)
forecast = phi**horizons * last
fvar = sigma**2 * (1 - phi**(2 * horizons)) / (1 - phi**2)

header("AR(1) one-step and multi-step forecasts")
print("h   mean     sd")
for h, m, s in zip(horizons, forecast, np.sqrt(fvar)):
    print(f"{h:>2d}  {m:>7.3f}  {s:>7.3f}")
report("Forecast intervals widen with horizon", np.all(np.diff(fvar) > 0))

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(history)), history, color=COLORS["neutral"], label="Observed history")
    future_index = np.arange(len(history), len(history) + len(horizons))
    ax.plot(future_index, forecast, marker="o", color=COLORS["primary"], label="Forecast mean")
    ax.fill_between(future_index,
                    forecast - 1.96 * np.sqrt(fvar),
                    forecast + 1.96 * np.sqrt(fvar),
                    color=COLORS["primary"], alpha=0.2, label="Approx. 95% band")
    ax.set_title("Forecast path with horizon-dependent uncertainty")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()
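As a sanity check on the closed-form variance sigma^2 (1 - phi^(2h)) / (1 - phi^2), a small Monte Carlo sketch simulates many continuations from the same last value and compares the empirical spread at each horizon (the path count of 4000 and the 15% tolerance are arbitrary choices):

n_paths, H = 4000, len(horizons)
sim = np.zeros((n_paths, H))
prev = np.full(n_paths, last)
for h in range(H):
    # Step every simulated path forward by one AR(1) transition.
    prev = phi * prev + rng.normal(scale=sigma, size=n_paths)
    sim[:, h] = prev
mc_var = sim.var(axis=0)
print("h   formula-var  MC-var")
for h in range(H):
    print(f"{h+1:>2d}  {fvar[h]:>11.4f}  {mc_var[h]:>7.4f}")
report("Monte Carlo variance tracks the closed-form formula", np.allclose(mc_var, fvar, rtol=0.15))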

5.2 Seasonality and Seasonal Decomposition

Additive decomposition separates level, periodic structure, and irregular remainder.

Code cell 45

n = 24 * 14
t = np.arange(n)
true_trend = 30 + 0.03 * t
true_seasonal = 3.5 * np.sin(2 * np.pi * t / 24) + 1.5 * np.cos(2 * np.pi * t / 24)
noise = rng.normal(scale=0.9, size=n)
series = true_trend + true_seasonal + noise

trend_est = rolling_mean(series, 24)
aligned = series[11:-12]
seasonal_raw = aligned - trend_est
seasonal_template = np.array([seasonal_raw[np.arange(i, len(seasonal_raw), 24)].mean() for i in range(24)])
seasonal_template -= seasonal_template.mean()
# Template index i corresponds to time 11 + i, so align the tiled estimate
# with the trimmed sample rather than with t = 0.
seasonal_est = np.resize(seasonal_template, len(seasonal_raw))
resid = seasonal_raw - seasonal_est

header("Simple seasonal decomposition")
print(f"Estimated seasonal template mean: {seasonal_template.mean():.6f}")
print(f"Residual mean (trimmed sample)  : {resid.mean():.6f}")
report("Seasonal template is centered", abs(seasonal_template.mean()) < 1e-8)

if HAS_MPL:
    fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)
    axes[0].plot(series, color=COLORS["primary"])
    axes[0].set_title("Observed seasonal series")
    axes[1].plot(true_trend, color=COLORS["secondary"], label="True trend")
    axes[1].plot(np.arange(11, n-12), trend_est, color=COLORS["highlight"], linestyle="--", label="Moving-average trend")
    axes[1].legend()
    axes[1].set_title("Trend component")
    axes[2].plot(seasonal_template, marker="o", color=COLORS["tertiary"])
    axes[2].set_title("Estimated seasonal template (24-step period)")
    axes[2].set_xlabel("Seasonal index")
    for ax in axes:
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

5.3 SARIMA and Seasonal Structure

Seasonal differencing targets repeated structure at lag s.

Code cell 47

n = 24 * 20
t = np.arange(n)
seasonal_series = 2.8 * np.sin(2 * np.pi * t / 24) + simulate_ar([0.4], n=n, sigma=0.7)
seasonal_diff = seasonal_series[24:] - seasonal_series[:-24]
acf_before = sample_acf(seasonal_series, 30)
acf_after = sample_acf(seasonal_diff, 30)

header("Seasonal differencing")
print(f"Lag-24 ACF before differencing: {acf_before[24]:.4f}")
print(f"Lag-24 ACF after differencing : {acf_after[24]:.4f}")
report("Seasonal differencing reduces lag-24 correlation", abs(acf_after[24]) < abs(acf_before[24]))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
    axes[0].bar(np.arange(31), acf_before, color=COLORS["secondary"])
    axes[0].set_title("ACF before seasonal differencing")
    axes[1].bar(np.arange(31), acf_after, color=COLORS["primary"])
    axes[1].set_title("ACF after seasonal differencing")
    for ax in axes:
        ax.set_xlabel("Lag")
        ax.set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

5.4 State-Space Models

Hidden-state dynamics separate signal evolution from noisy observations.

Code cell 49

n = 180
latent = np.zeros(n)
obs = np.zeros(n)
q, r = 0.12, 0.9
latent[0] = 0.0
obs[0] = latent[0] + rng.normal(scale=np.sqrt(r))
for t in range(1, n):
    latent[t] = latent[t-1] + rng.normal(scale=np.sqrt(q))
    obs[t] = latent[t] + rng.normal(scale=np.sqrt(r))

header("Linear-Gaussian state-space simulation")
print(f"Process noise variance    q: {q:.3f}")
print(f"Observation noise variance r: {r:.3f}")
print(f"Observation RMSE vs state : {np.sqrt(np.mean((obs - latent)**2)):.4f}")
report("Observations are noisier than the latent state", np.std(obs - latent) > 0.5)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(latent, color=COLORS["primary"], label="Latent state")
    ax.plot(obs, color=COLORS["secondary"], alpha=0.6, label="Observed measurement")
    ax.set_title("State-space model: hidden state and noisy observations")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()

5.5 Kalman Filter

Recursive Gaussian updating improves state estimates from noisy measurements.

Code cell 51

filtered_mean, filtered_var, pred_mean, pred_var = kalman_filter_1d(obs, q=q, r=r, m0=0.0, p0=1.0)
obs_rmse = np.sqrt(np.mean((obs - latent)**2))
filt_rmse = np.sqrt(np.mean((filtered_mean - latent)**2))

header("Kalman filtering a local-level model")
print(f"Observation RMSE: {obs_rmse:.4f}")
print(f"Filtered RMSE   : {filt_rmse:.4f}")
print(f"Final filtered variance: {filtered_var[-1]:.4f}")
report("Kalman filter improves over raw observations", filt_rmse < obs_rmse)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(latent, color=COLORS["neutral"], label="True state")
    ax.plot(obs, color=COLORS["secondary"], alpha=0.35, label="Observations")
    ax.plot(filtered_mean, color=COLORS["primary"], label="Filtered mean")
    ax.set_title("Kalman filter tracking a hidden state")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()

6. Advanced Topics

6.1 Spectral Analysis and the Frequency-Domain View

Frequency space reveals temporal scale directly.

Code cell 53

n = 512
t = np.arange(n)
low_freq = np.sin(2 * np.pi * t / 64) + 0.2 * rng.normal(size=n)
high_freq = np.sin(2 * np.pi * t / 8) + 0.2 * rng.normal(size=n)
f_low, p_low = periodogram(low_freq)
f_high, p_high = periodogram(high_freq)

header("Frequency-domain view")
print(f"Dominant low-frequency component : {f_low[np.argmax(p_low[1:]) + 1]:.4f}")
print(f"Dominant high-frequency component: {f_high[np.argmax(p_high[1:]) + 1]:.4f}")
report("Low-frequency signal peaks at lower frequency than high-frequency signal",
       f_low[np.argmax(p_low[1:]) + 1] < f_high[np.argmax(p_high[1:]) + 1])

if HAS_MPL:
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    axes[0, 0].plot(low_freq, color=COLORS["primary"])
    axes[0, 0].set_title("Low-frequency signal")
    axes[0, 1].plot(f_low[1:], p_low[1:], color=COLORS["primary"])
    axes[0, 1].set_title("Spectrum of low-frequency signal")
    axes[1, 0].plot(high_freq, color=COLORS["secondary"])
    axes[1, 0].set_title("High-frequency signal")
    axes[1, 1].plot(f_high[1:], p_high[1:], color=COLORS["secondary"])
    axes[1, 1].set_title("Spectrum of high-frequency signal")
    for ax in axes.ravel():
        ax.set_xlabel("Time / Frequency")
        ax.set_ylabel("Value / Power")
    fig.tight_layout()
    plt.show()

6.2 Periodogram and Fourier Decomposition

The periodogram is a practical frequency detector for finite samples.

Code cell 55

n = 480
t = np.arange(n)
true_period = 30
mixed = 1.8 * np.sin(2 * np.pi * t / true_period) + 0.8 * np.sin(2 * np.pi * t / 10) + rng.normal(scale=0.6, size=n)
freqs, power = periodogram(mixed)
dom_idx = np.argmax(power[1:]) + 1
dom_freq = freqs[dom_idx]
est_period = 1 / dom_freq

header("Periodogram-based frequency detection")
print(f"True dominant period: {true_period}")
print(f"Estimated dominant period: {est_period:.2f}")
report("Estimated period is close to the true dominant period", abs(est_period - true_period) < 3.0)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(freqs[1:], power[1:], color=COLORS["highlight"])
    ax.axvline(dom_freq, color=COLORS["neutral"], linestyle="--", label=f"Estimated period {est_period:.1f}")
    ax.set_title("Periodogram of a noisy periodic signal")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Power")
    ax.legend()
    fig.tight_layout()
    plt.show()
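If SciPy is installed, scipy.signal.periodogram should agree on the peak location; its default one-sided density scaling differs from the raw |FFT|^2 / n used above, so this hedged cross-check compares only the dominant frequency:

try:
    from scipy import signal
    f_sp, p_sp = signal.periodogram(mixed, fs=1.0, detrend="constant")
    dom_sp = f_sp[np.argmax(p_sp[1:]) + 1]  # skip the zero-frequency bin
    print(f"SciPy dominant frequency: {dom_sp:.4f}")
    print(f"Our dominant frequency  : {dom_freq:.4f}")
    report("Both periodograms find the same dominant frequency",
           np.isclose(dom_sp, dom_freq, atol=1.0 / len(mixed)))
except ImportError:
    print("SciPy not available; skipping the cross-check.")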

6.3 Kalman Smoothing, Forecasting, and Missing Data

State-space models handle missing observations gracefully by propagating uncertainty forward.

Code cell 57

obs_missing = obs.copy()
missing_idx = np.arange(20, len(obs_missing), 17)
obs_missing[missing_idx] = np.nan
filt_missing, var_missing, _, _ = kalman_filter_1d(obs_missing, q=q, r=r, m0=0.0, p0=1.0)

header("Filtering with missing data")
print(f"Number of missing observations: {np.isnan(obs_missing).sum()}")
print(f"RMSE at missing locations      : {np.sqrt(np.mean((filt_missing[missing_idx] - latent[missing_idx])**2)):.4f}")
report("Filter produces finite imputations at missing points", np.isfinite(filt_missing[missing_idx]).all())

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(latent, color=COLORS["neutral"], label="True state")
    ax.plot(obs_missing, linestyle="None", marker="o", markersize=4, color=COLORS["secondary"], alpha=0.6, label="Observed / missing")
    ax.plot(filt_missing, color=COLORS["primary"], label="Filtered state")
    ax.set_title("Kalman filtering with missing observations")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()
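The smoothing half of this section's title can be covered with a Rauch-Tung-Striebel backward pass. The sketch below is a minimal 1-D smoother for the local-level model (transition coefficient 1), not part of the helper set above; it reuses the predicted moments that kalman_filter_1d already returns:

def rts_smoother_1d(filt_mean, filt_var, pred_mean, pred_var):
    # Backward pass: refine each filtered estimate using the smoothed
    # estimate one step ahead. For the local-level model the gain is
    # C_t = P_{t|t} / P_{t+1|t}.
    sm_mean = filt_mean.copy()
    sm_var = filt_var.copy()
    for t in range(len(filt_mean) - 2, -1, -1):
        gain = filt_var[t] / pred_var[t + 1]
        sm_mean[t] = filt_mean[t] + gain * (sm_mean[t + 1] - pred_mean[t + 1])
        sm_var[t] = filt_var[t] + gain**2 * (sm_var[t + 1] - pred_var[t + 1])
    return sm_mean, sm_var

f_m, f_v, p_m, p_v = kalman_filter_1d(obs_missing, q=q, r=r, m0=0.0, p0=1.0)
sm_mean, sm_var = rts_smoother_1d(f_m, f_v, p_m, p_v)
print(f"Filtered RMSE: {np.sqrt(np.mean((f_m - latent)**2)):.4f}")
print(f"Smoothed RMSE: {np.sqrt(np.mean((sm_mean - latent)**2)):.4f}")
report("Smoothing improves on filtering",
       np.mean((sm_mean - latent)**2) < np.mean((f_m - latent)**2))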

6.4 Multivariate Time Series Preview

In multiple dimensions, lag dependence becomes matrix-valued and cross-series effects matter.

Code cell 59

A = np.array([[0.7, 0.15], [0.05, 0.6]])
x = np.zeros((250, 2))
eps = rng.normal(scale=0.5, size=(250, 2))
for t in range(1, len(x)):
    x[t] = A @ x[t-1] + eps[t]

header("Multivariate lag dependence preview")
cross_lag = np.corrcoef(x[:-1, 0], x[1:, 1])[0, 1]
print(f"Cross-lag correlation corr(X1_t, X2_t+1): {cross_lag:.4f}")
report("Cross-series dependence is present", abs(cross_lag) > 0.05)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(x[:, 0], color=COLORS["primary"], label="Series 1")
    ax.plot(x[:, 1], color=COLORS["secondary"], label="Series 2")
    ax.set_title("Vector autoregressive preview")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()
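Because the transition is matrix-valued, it can be recovered by multivariate least squares: regressing x_t on x_{t-1} estimates A row by row. A minimal sketch with the simulation above (the 0.15 tolerance is a loose, illustrative bound):

# Each row of X_next satisfies x_t^T = x_{t-1}^T A^T + noise,
# so lstsq returns an estimate of A^T.
X_prev, X_next = x[:-1], x[1:]
A_hat = la.lstsq(X_prev, X_next, rcond=None)[0].T
print("True A:\n", A)
print("Estimated A:\n", A_hat)
report("Least squares recovers the transition matrix", np.max(np.abs(A_hat - A)) < 0.15)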

6.5 Change Points, Regime Shifts, and Structural Breaks

When the data-generating rules change, a stationary model can fail for the right reason.

Code cell 61

x = np.r_[rng.normal(loc=0.0, scale=1.0, size=120),
          rng.normal(loc=2.5, scale=1.0, size=120)]
left = np.array([x[max(0, i-20):i].mean() if i > 0 else np.nan for i in range(len(x))])
right = np.array([x[i:min(len(x), i+20)].mean() if i < len(x) - 1 else np.nan for i in range(len(x))])
score = np.abs(right - left)
est_break = np.nanargmax(score)

header("Detecting a structural break")
print(f"True break location     : 120")
print(f"Estimated break location: {est_break}")
report("Estimated break is close to truth", abs(est_break - 120) <= 10)

if HAS_MPL:
    fig, axes = plt.subplots(2, 1, figsize=(10, 7), sharex=True)
    axes[0].plot(x, color=COLORS["primary"])
    axes[0].axvline(120, color=COLORS["neutral"], linestyle="--", label="True break")
    axes[0].axvline(est_break, color=COLORS["error"], linestyle=":", label="Estimated break")
    axes[0].set_title("Series with a mean shift")
    axes[0].legend()
    axes[1].plot(score, color=COLORS["error"])
    axes[1].set_title("Rolling mean-shift score")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Score")
    fig.tight_layout()
    plt.show()

7. Applications in Machine Learning

7.1 Forecasting for Demand, Traffic, and Monitoring

Classical seasonal baselines still matter in real operational forecasting.

Code cell 63

n = 24 * 28
t = np.arange(n)
demand = 50 + 8 * np.sin(2 * np.pi * t / 24) + 0.03 * t + rng.normal(scale=1.2, size=n)
train, test = demand[:-24], demand[-24:]
naive_forecast = np.repeat(train[-1], 24)
seasonal_forecast = train[-24:]

naive_rmse = np.sqrt(np.mean((test - naive_forecast)**2))
seasonal_rmse = np.sqrt(np.mean((test - seasonal_forecast)**2))

header("Operational forecasting baseline")
print(f"Naive last-value RMSE : {naive_rmse:.4f}")
print(f"Seasonal baseline RMSE: {seasonal_rmse:.4f}")
report("Seasonal baseline beats naive forecast", seasonal_rmse < naive_rmse)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(train)), train, color=COLORS["neutral"], label="Train")
    test_idx = np.arange(len(train), len(train) + len(test))
    ax.plot(test_idx, test, color=COLORS["primary"], label="Test truth")
    ax.plot(test_idx, naive_forecast, color=COLORS["error"], linestyle="--", label="Naive")
    ax.plot(test_idx, seasonal_forecast, color=COLORS["secondary"], linestyle="--", label="Seasonal baseline")
    ax.set_title("Forecasting demand with classical baselines")
    ax.set_xlabel("Time")
    ax.set_ylabel("Demand")
    ax.legend()
    fig.tight_layout()
    plt.show()
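RMSE is scale-dependent; the mean absolute scaled error (MASE) divides test MAE by the in-sample MAE of a seasonal-naive forecaster, so values below 1 beat that benchmark. A minimal sketch with the arrays above (the period of 24 matches the simulated daily cycle):

# In-sample MAE of the seasonal-naive forecaster sets the scale.
scale = np.mean(np.abs(train[24:] - train[:-24]))
mase_naive = np.mean(np.abs(test - naive_forecast)) / scale
mase_seasonal = np.mean(np.abs(test - seasonal_forecast)) / scale
print(f"Naive MASE            : {mase_naive:.3f}")
print(f"Seasonal baseline MASE: {mase_seasonal:.3f}")
report("Seasonal baseline scores better on MASE", mase_seasonal < mase_naive)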

7.2-7.5 Modern Sequence Modeling Links

Temporal encodings, hidden-state recursions, and pretrained forecasters still depend on the same notions of scale, memory, and seasonality.

Code cell 65

positions = np.arange(48)
freqs = np.array([1/48, 1/24, 1/12, 1/6])
encoding = np.column_stack([
    np.sin(2 * np.pi * positions[:, None] * freqs),
    np.cos(2 * np.pi * positions[:, None] * freqs),
])

A = np.array([[0.92, 0.0], [0.0, 0.55]])
state = np.zeros((80, 2))
state[0] = np.array([1.0, 1.0])
for t in range(1, len(state)):
    state[t] = A @ state[t-1]

header("Modern sequence-model links")
print(f"Temporal encoding matrix shape: {encoding.shape}")
print(f"Slow state half-life indicator : {np.where(state[:,0] < 0.5)[0][0]}")
print(f"Fast state half-life indicator : {np.where(state[:,1] < 0.5)[0][0]}")
report("Different state dimensions remember information at different scales",
       np.where(state[:,0] < 0.5)[0][0] > np.where(state[:,1] < 0.5)[0][0])

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    im = axes[0].imshow(encoding.T, aspect="auto", cmap="viridis")
    axes[0].set_title("Fourier-style temporal features")
    axes[0].set_xlabel("Position")
    axes[0].set_ylabel("Feature index")
    fig.colorbar(im, ax=axes[0], label="Value")
    axes[1].plot(state[:, 0], color=COLORS["primary"], label="Slow mode")
    axes[1].plot(state[:, 1], color=COLORS["secondary"], label="Fast mode")
    axes[1].set_title("State-space memory at multiple timescales")
    axes[1].set_xlabel("Step")
    axes[1].set_ylabel("State value")
    axes[1].legend()
    fig.tight_layout()
    plt.show()

8. Common Mistakes

| # | Mistake | Why it is wrong | Fix |
| --- | --- | --- | --- |
| 1 | Applying OLS without checking stationarity | Nonstationary regressors produce spurious regressions | Difference or detrend first; check with ADF/KPSS tests |
| 2 | Identifying AR order from the ACF alone | The ACF of an AR process decays gradually and does not reveal p | Use the PACF, which cuts off after lag p for AR(p) |
| 3 | Seasonal differencing without regular differencing | Can leave a non-seasonal unit root in place | Apply both ∇ and ∇_s as needed |
| 4 | Using MSE as the only forecast metric | MSE heavily penalizes outliers and ignores calibration | Report MAE and MASE; use CRPS for probabilistic forecasts |
| 5 | Treating a train/test split like i.i.d. data | A random split leaks the future into the past | Always split by time; use expanding or rolling windows |
| 6 | Ignoring Kalman filter initialization | A poor prior covariance P0 distorts early estimates | Use a diffuse prior or steady-state initialization |
| 7 | Assuming Transformer positional encoding captures periodicity | Sinusoidal PE does not explicitly encode seasonal patterns | Add Fourier features or seasonal embeddings |
| 8 | Confusing autocorrelation with cross-correlation | The ACF is a single-series property | Use the CCF for lead-lag relationships between two series (see the sketch after this table) |
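For mistake 8, a minimal numpy sketch (reusing rng): the cross-correlation function exposes a lead-lag link between two series that neither series's own ACF can show. Here y_follow copies x_lead with a delay of five steps:

n = 400
x_lead = rng.normal(size=n)
y_follow = np.r_[np.zeros(5), 0.9 * x_lead[:-5]] + 0.3 * rng.normal(size=n)

def ccf_at(x, y, lag):
    # Correlation between x_t and y_{t+lag}; positive lag means x leads y.
    if lag > 0:
        return np.corrcoef(x[:-lag], y[lag:])[0, 1]
    return np.corrcoef(x, y)[0, 1]

print(f"CCF at lag 0: {ccf_at(x_lead, y_follow, 0):.3f}")
print(f"CCF at lag 5: {ccf_at(x_lead, y_follow, 5):.3f}")
report("CCF peaks at the true lead of five steps",
       ccf_at(x_lead, y_follow, 5) > abs(ccf_at(x_lead, y_follow, 0)) + 0.3)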

Recap

The classical toolkit already contains the essential questions modern sequence models must answer: what depends on what, what repeats, what is hidden, how forecast uncertainty grows, and when the data-generating rules have changed.