Time Series — Theory Notebook

Converted from theory.ipynb for web reading.

Time order changes what counts as signal, what can be predicted, and how uncertainty should evolve with horizon.

This notebook is the interactive companion to notes.md. It moves from dependence diagnostics and linear models to seasonality, state-space reasoning, spectral views, and modern AI links.

| Coverage | What you will do |
| --- | --- |
| Dependence | Compute ACF/PACF, compare stationary and nonstationary series |
| Linear models | Simulate AR, MA, ARMA, and ARIMA behavior |
| Forecasting | Build one-step and multi-step forecasts with uncertainty |
| State space | Simulate hidden-state systems and run a Kalman filter |
| Spectral view | Detect periodicity with FFT and periodograms |
| AI links | Connect classical forecasting and temporal encoding ideas to modern sequence models |

Code cell 3

import numpy as np

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams.update({
        "figure.figsize":    (10, 6),
        "figure.dpi":         120,
        "font.size":           13,
        "axes.titlesize":      15,
        "axes.labelsize":      13,
        "xtick.labelsize":     11,
        "ytick.labelsize":     11,
        "legend.fontsize":     11,
        "legend.framealpha":   0.85,
        "lines.linewidth":      2.0,
        "axes.spines.top":     False,
        "axes.spines.right":   False,
        "savefig.bbox":       "tight",
        "savefig.dpi":         150,
    })
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

try:
    import seaborn as sns
    if HAS_MPL:
        sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    if HAS_MPL:
        plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

np.random.seed(42)
print("Plot setup complete.")

Code cell 4

import numpy.linalg as la

COLORS = {
    "primary":   "#0077BB",
    "secondary": "#EE7733",
    "tertiary":  "#009988",
    "error":     "#CC3311",
    "neutral":   "#555555",
    "highlight": "#EE3377",
}

rng = np.random.default_rng(42)
np.set_printoptions(precision=4, suppress=True)
if HAS_MPL:
    mpl.rcParams["figure.max_open_warning"] = 0

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def report(name, cond):
    print(f"{'PASS' if cond else 'FAIL'} - {name}")
    return cond

def sample_acf(x, max_lag):
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    denom = np.dot(x, x)
    vals = [1.0]
    for h in range(1, max_lag + 1):
        vals.append(np.dot(x[:-h], x[h:]) / denom)
    return np.array(vals)

def sample_pacf(x, max_lag):
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    pacf = [1.0]
    for h in range(1, max_lag + 1):
        y = x[h:]
        X = np.column_stack([x[h-k:-k] for k in range(1, h + 1)])
        beta = la.lstsq(X, y, rcond=None)[0]
        pacf.append(beta[-1])
    return np.array(pacf)

def rolling_mean(x, window):
    x = np.asarray(x, dtype=float)
    kernel = np.ones(window) / window
    return np.convolve(x, kernel, mode="valid")

def rolling_var(x, window):
    x = np.asarray(x, dtype=float)
    mu = rolling_mean(x, window)
    sq = rolling_mean(x**2, window)
    return sq - mu**2

def simulate_ar(phi, n, sigma=1.0, c=0.0, burn=250):
    phi = np.asarray(phi, dtype=float)
    p = len(phi)
    e = rng.normal(scale=sigma, size=n + burn + p)
    x = np.zeros(n + burn + p)
    mu = c / (1 - phi.sum()) if abs(1 - phi.sum()) > 1e-8 else 0.0
    x[:p] = mu
    for t in range(p, n + burn + p):
        x[t] = c + phi @ x[t-p:t][::-1] + e[t]
    return x[burn + p:]

def simulate_ma(theta, n, sigma=1.0, mu=0.0, burn=50):
    theta = np.asarray(theta, dtype=float)
    q = len(theta)
    e = rng.normal(scale=sigma, size=n + burn + q + 1)
    x = np.zeros(n + burn + q)
    for t in range(q, n + burn + q):
        x[t] = mu + e[t]
        for j in range(1, q + 1):
            x[t] += theta[j-1] * e[t-j]
    return x[burn + q:]

def simulate_arma(phi, theta, n, sigma=1.0, c=0.0, burn=300):
    phi = np.asarray(phi, dtype=float)
    theta = np.asarray(theta, dtype=float)
    p, q = len(phi), len(theta)
    e = rng.normal(scale=sigma, size=n + burn + max(p, q) + 1)
    x = np.zeros(n + burn + max(p, q) + 1)
    mu = c / (1 - phi.sum()) if p > 0 and abs(1 - phi.sum()) > 1e-8 else 0.0
    x[:max(p, q)+1] = mu
    for t in range(max(p, q) + 1, len(x)):
        ar_part = sum(phi[j] * x[t-j-1] for j in range(p))
        ma_part = e[t] + sum(theta[j] * e[t-j-1] for j in range(q))
        x[t] = c + ar_part + ma_part
    return x[burn + max(p, q) + 1:]

def periodogram(x):
    x = np.asarray(x, dtype=float)
    x = x - x.mean()
    freqs = np.fft.rfftfreq(len(x), d=1.0)
    power = np.abs(np.fft.rfft(x))**2 / len(x)
    return freqs, power

def kalman_filter_1d(obs, q, r, m0=0.0, p0=1.0):
    m, p = m0, p0
    means, vars_ = [], []
    pred_means, pred_vars = [], []
    for y in obs:
        m_pred = m
        p_pred = p + q
        pred_means.append(m_pred)
        pred_vars.append(p_pred)
        if np.isnan(y):
            m, p = m_pred, p_pred
        else:
            k = p_pred / (p_pred + r)
            m = m_pred + k * (y - m_pred)
            p = (1 - k) * p_pred
        means.append(m)
        vars_.append(p)
    return np.array(means), np.array(vars_), np.array(pred_means), np.array(pred_vars)

print("Extra imports and helper functions loaded.")

1. Intuition

1.1 Why Time Order Matters

The same values can tell very different stories depending on their temporal arrangement.

Code cell 6

ordered = np.array([79, 80, 81, 82, 300], dtype=float)
shuffled = np.array([82, 300, 79, 81, 80], dtype=float)

def lag1_corr(x):
    return np.corrcoef(x[:-1], x[1:])[0, 1]

header("Why order matters")
print(f"Ordered mean/var  : {ordered.mean():.2f}, {ordered.var():.2f}")
print(f"Shuffled mean/var : {shuffled.mean():.2f}, {shuffled.var():.2f}")
print(f"Ordered lag-1 corr: {lag1_corr(ordered):.3f}")
print(f"Shuffled lag-1 corr: {lag1_corr(shuffled):.3f}")
report("Means match after shuffling", np.isclose(ordered.mean(), shuffled.mean()))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].plot(ordered, marker="o", color=COLORS["primary"])
    axes[0].set_title("Ordered observations")
    axes[0].set_xlabel("Time")
    axes[0].set_ylabel("Value")
    axes[1].plot(shuffled, marker="o", color=COLORS["secondary"])
    axes[1].set_title("Same values, shuffled")
    axes[1].set_xlabel("Index")
    axes[1].set_ylabel("Value")
    fig.tight_layout()
    plt.show()

1.2 From Snapshots to Sequences

Marginal summaries alone do not reveal serial dependence.

Code cell 8

iid_like = rng.normal(loc=0.0, scale=1.0, size=240)
ar_like = simulate_ar([0.85], n=240)
ar_like = ar_like / ar_like.std()

header("Snapshots vs sequences")
print(f"IID sample mean/var : {iid_like.mean():.3f}, {iid_like.var():.3f}")
print(f"AR sample mean/var  : {ar_like.mean():.3f}, {ar_like.var():.3f}")
print(f"IID lag-1 ACF       : {sample_acf(iid_like, 1)[1]:.3f}")
print(f"AR lag-1 ACF        : {sample_acf(ar_like, 1)[1]:.3f}")
report("AR sequence is more persistent than IID", abs(sample_acf(ar_like, 1)[1]) > abs(sample_acf(iid_like, 1)[1]))

if HAS_MPL:
    fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
    axes[0].plot(iid_like[:120], color=COLORS["secondary"])
    axes[0].set_title("IID-like sequence")
    axes[0].set_ylabel("Value")
    axes[1].plot(ar_like[:120], color=COLORS["primary"])
    axes[1].set_title("AR-like sequence")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Value")
    fig.tight_layout()
    plt.show()

1.3 Historical Timeline

Time series analysis developed over a century, with the computational breakthroughs arriving late and enabling modern ML applications.

| Year | Contributor | Contribution |
| --- | --- | --- |
| 1927 | Yule | Autoregressive models for sunspot series |
| 1938 | Slutsky, Walker | Moving-average models and oscillatory behavior |
| 1960 | Kalman | State-space filtering, later applied widely in engineering and econometrics |
| 1970 | Box & Jenkins | Unified ARIMA framework and systematic identification |
| 1982 | Engle | ARCH models for volatility clustering |
| 1986 | Holt-Winters extended | Seasonal exponential smoothing systematized |
| 1990s | Hamilton, others | Regime switching and cointegration become mainstream in economics |
| 2017 | Vaswani et al. | Transformer attention patterns encode sequence dependencies |
| 2017 | Taylor & Letham | Prophet: scalable additive forecasting at Facebook (now Meta) |
| 2020s | LLM era | Long-range sequence modeling drives NLP, audio, and time-series foundation models |

Pattern: Classical statistics gave the language (stationarity, ACF, ARIMA); machine learning gave the scalable nonlinear approximators.

1.4 Why Time Series Matters for AI

Forecasting and uncertainty horizons show up immediately in operations and monitoring.

Code cell 11

phi = 0.8
mu = 0.0
current = 2.0
horizons = np.arange(1, 11)
forecast_means = mu + phi**horizons * (current - mu)
forecast_vars = (1 - phi**(2 * horizons)) / (1 - phi**2)

header("Forecast memory and uncertainty growth")
print("Horizon  ForecastMean  ForecastSD")
for h, m, v in zip(horizons, forecast_means, forecast_vars):
    print(f"{h:>7d}  {m:>12.4f}  {np.sqrt(v):>10.4f}")
report("Forecast uncertainty grows with horizon", np.all(np.diff(forecast_vars) > 0))

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(horizons, forecast_means, marker="o", color=COLORS["primary"], label="Forecast mean")
    ax.fill_between(horizons,
                    forecast_means - 1.96 * np.sqrt(forecast_vars),
                    forecast_means + 1.96 * np.sqrt(forecast_vars),
                    color=COLORS["primary"], alpha=0.2, label="Approx. 95% band")
    ax.set_title("AR(1) forecast memory vs uncertainty growth")
    ax.set_xlabel("Forecast horizon")
    ax.set_ylabel("Predicted value")
    ax.legend()
    fig.tight_layout()
    plt.show()

2. Formal Definitions

2.1 Time Series as Observed Stochastic Processes

One observed path should be distinguished from the ensemble process that generated it.

Code cell 13

paths = np.stack([simulate_ar([0.7], n=120) for _ in range(200)])
one_path = paths[0]
ensemble_mean = paths.mean(axis=0)

header("One realization vs ensemble process")
print(f"Single-path sample mean : {one_path.mean():.4f}")
print(f"Ensemble time-mean mean : {ensemble_mean.mean():.4f}")
print(f"Single-path lag-1 ACF   : {sample_acf(one_path, 1)[1]:.4f}")
report("Ensemble and single path are different objects", np.abs(one_path.mean() - ensemble_mean.mean()) > 1e-3)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(one_path, color=COLORS["primary"], alpha=0.8, label="One realization")
    ax.plot(ensemble_mean, color=COLORS["secondary"], linestyle="--", label="Ensemble mean at each time")
    ax.set_title("Observed realization vs ensemble behavior")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()

2.2 Lag Operator, Backshift Notation, and Difference Operators

Lag notation compresses temporal algebra into polynomial form.

Code cell 15

x = np.array([10, 12, 15, 14, 19, 21, 25], dtype=float)
lag1 = x[:-1]
first_diff = np.diff(x)
seasonal_diff = x[3:] - x[:-3]

header("Lag and difference operators")
print("Original series          :", x)
print("L x_t (lag-1 aligned)    :", lag1)
print("First difference         :", first_diff)
print("Seasonal difference s=3  :", seasonal_diff)
report("First difference length is T-1", len(first_diff) == len(x) - 1)

2.3 Mean Function, Autocovariance, and Autocorrelation

The ACF summarizes lag dependence on a normalized scale.

Code cell 17

ar_sample = simulate_ar([0.6], n=400)
acf_est = sample_acf(ar_sample, 12)
acf_theory = 0.6 ** np.arange(13)

header("Sample ACF vs theoretical ACF for AR(1)")
print("lag  sample   theory")
for h in range(6):
    print(f"{h:>3d}  {acf_est[h]:>7.4f}  {acf_theory[h]:>7.4f}")
report("Sample ACF at lag 1 is positive", acf_est[1] > 0)

if HAS_MPL:
    fig, ax = plt.subplots()
    lags = np.arange(13)
    ax.bar(lags - 0.15, acf_est, width=0.3, color=COLORS["primary"], label="Sample")
    ax.bar(lags + 0.15, acf_theory, width=0.3, color=COLORS["secondary"], alpha=0.8, label="Theory")
    ax.set_title("Autocorrelation of an AR(1) process")
    ax.set_xlabel("Lag")
    ax.set_ylabel("ACF")
    ax.legend()
    fig.tight_layout()
    plt.show()

2.4 White Noise, Random Walks, and Trend-Plus-Noise Models

Similar-looking lines can imply very different dependence structures.

Code cell 19

n = 240
white = rng.normal(scale=1.0, size=n)
walk = np.cumsum(rng.normal(scale=1.0, size=n))
trend = 0.05 * np.arange(n) + rng.normal(scale=1.0, size=n)

header("White noise vs random walk vs trend-plus-noise")
print(f"Lag-1 ACF white noise : {sample_acf(white, 1)[1]:.4f}")
print(f"Lag-1 ACF random walk : {sample_acf(walk, 1)[1]:.4f}")
print(f"Lag-1 ACF trend+noise : {sample_acf(trend, 1)[1]:.4f}")
report("Random walk is more persistent than white noise", sample_acf(walk, 1)[1] > sample_acf(white, 1)[1])

if HAS_MPL:
    fig, axes = plt.subplots(3, 2, figsize=(12, 9))
    series = [white, walk, trend]
    names = ["White noise", "Random walk", "Trend plus noise"]
    colors = [COLORS["secondary"], COLORS["primary"], COLORS["tertiary"]]
    for i, (s, name, color) in enumerate(zip(series, names, colors)):
        axes[i, 0].plot(s, color=color)
        axes[i, 0].set_title(name)
        axes[i, 0].set_xlabel("Time")
        axes[i, 0].set_ylabel("Value")
        acf_vals = sample_acf(s, 20)
        axes[i, 1].bar(np.arange(21), acf_vals, color=color)
        axes[i, 1].set_title(f"{name} ACF")
        axes[i, 1].set_xlabel("Lag")
        axes[i, 1].set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

2.5 Stationarity: Strict vs Weak

Weak stationarity shows up as stable mean and variance across windows.

Code cell 21

stationary = simulate_ar([0.5], n=320)
nonstationary = np.cumsum(rng.normal(size=320))
win = 40
stat_mean = rolling_mean(stationary, win)
stat_var = rolling_var(stationary, win)
non_mean = rolling_mean(nonstationary, win)
non_var = rolling_var(nonstationary, win)

header("Rolling moments as a stationarity diagnostic")
print(f"Stationary rolling-mean range    : {stat_mean.min():.3f} to {stat_mean.max():.3f}")
print(f"Nonstationary rolling-mean range : {non_mean.min():.3f} to {non_mean.max():.3f}")
print(f"Stationary rolling-var range     : {stat_var.min():.3f} to {stat_var.max():.3f}")
print(f"Nonstationary rolling-var range  : {non_var.min():.3f} to {non_var.max():.3f}")
report("Nonstationary series has less stable rolling mean", np.ptp(non_mean) > np.ptp(stat_mean))

if HAS_MPL:
    fig, axes = plt.subplots(2, 2, figsize=(12, 8), sharex="col")
    axes[0, 0].plot(stationary, color=COLORS["primary"])
    axes[0, 0].set_title("Stationary series")
    axes[0, 1].plot(nonstationary, color=COLORS["error"])
    axes[0, 1].set_title("Nonstationary series")
    axes[1, 0].plot(stat_mean, label="Rolling mean", color=COLORS["secondary"])
    axes[1, 0].plot(stat_var, label="Rolling variance", color=COLORS["tertiary"])
    axes[1, 0].legend()
    axes[1, 1].plot(non_mean, label="Rolling mean", color=COLORS["secondary"])
    axes[1, 1].plot(non_var, label="Rolling variance", color=COLORS["tertiary"])
    axes[1, 1].legend()
    for ax in axes.ravel():
        ax.set_xlabel("Time")
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

3. Core Theory I: Dependence, Stationarity, and Diagnostics

3.1 Why Stationarity Matters

Stable lag relationships make historical estimation useful for future forecasting.

Code cell 23

stationary = simulate_ar([0.7], n=400)
drifted = 0.03 * np.arange(400) + rng.normal(scale=1.0, size=400)

def half_summary(x):
    half = len(x) // 2
    return x[:half].mean(), x[half:].mean()

s1, s2 = half_summary(stationary)
d1, d2 = half_summary(drifted)

header("Why stationarity matters for estimation")
print(f"Stationary half means: {s1:.3f}, {s2:.3f}")
print(f"Drifted half means   : {d1:.3f}, {d2:.3f}")
report("Drifted series changes mean more than stationary series", abs(d2 - d1) > abs(s2 - s1))

3.2 Autocorrelation Function (ACF)

Different process families leave distinct ACF signatures.

Code cell 25

white = rng.normal(size=500)
ar = simulate_ar([0.75], n=500)
t = np.arange(500)
seasonal = np.sin(2 * np.pi * t / 24) + 0.4 * rng.normal(size=500)

acfs = {
    "White noise": sample_acf(white, 36),
    "AR(1)": sample_acf(ar, 36),
    "Seasonal": sample_acf(seasonal, 36),
}

header("ACF signatures")
print(f"Seasonal lag-24 ACF: {acfs['Seasonal'][24]:.4f}")
print(f"AR lag-1 ACF      : {acfs['AR(1)'][1]:.4f}")
report("Seasonal series spikes at lag 24", abs(acfs["Seasonal"][24]) > 0.5)

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
    for ax, (name, vals), color in zip(axes, acfs.items(), [COLORS["secondary"], COLORS["primary"], COLORS["tertiary"]]):
        ax.bar(np.arange(len(vals)), vals, color=color)
        ax.set_title(name)
        ax.set_xlabel("Lag")
        ax.set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

3.3 Partial Autocorrelation Function (PACF)

PACF highlights direct lag effects after controlling for shorter lags.

Code cell 27

ar2 = simulate_ar([0.8, -0.25], n=600)
ma1 = simulate_ma([0.7], n=600)
pacf_ar2 = sample_pacf(ar2, 12)
pacf_ma1 = sample_pacf(ma1, 12)

header("PACF signatures")
print("AR(2) PACF first lags:", pacf_ar2[:5])
print("MA(1) PACF first lags:", pacf_ma1[:5])
report("AR(2) PACF has strong lag-2 component", abs(pacf_ar2[2]) > 0.1)

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
    axes[0].bar(np.arange(len(pacf_ar2)), pacf_ar2, color=COLORS["primary"])
    axes[0].set_title("PACF of AR(2)")
    axes[1].bar(np.arange(len(pacf_ma1)), pacf_ma1, color=COLORS["secondary"])
    axes[1].set_title("PACF of MA(1)")
    for ax in axes:
        ax.set_xlabel("Lag")
        ax.set_ylabel("PACF")
    fig.tight_layout()
    plt.show()

3.4 Differencing, Unit Roots, and Random Walk Behavior

Differencing can stabilize an integrated level process.

Code cell 29

walk = np.cumsum(rng.normal(size=500))
diff_walk = np.diff(walk)
acf_walk = sample_acf(walk, 20)
acf_diff = sample_acf(diff_walk, 20)

header("Differencing a random walk")
print(f"Lag-1 ACF in levels     : {acf_walk[1]:.4f}")
print(f"Lag-1 ACF after diff    : {acf_diff[1]:.4f}")
report("Differencing reduces persistence", abs(acf_diff[1]) < abs(acf_walk[1]))

if HAS_MPL:
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    axes[0, 0].plot(walk, color=COLORS["error"])
    axes[0, 0].set_title("Random walk levels")
    axes[0, 1].bar(np.arange(21), acf_walk, color=COLORS["error"])
    axes[0, 1].set_title("ACF of levels")
    axes[1, 0].plot(diff_walk, color=COLORS["primary"])
    axes[1, 0].set_title("First difference")
    axes[1, 1].bar(np.arange(21), acf_diff, color=COLORS["primary"])
    axes[1, 1].set_title("ACF after differencing")
    for ax in axes.ravel():
        ax.set_xlabel("Time / Lag")
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

3.5 Wold Decomposition Intuition

A stationary AR process can be viewed as a filtered innovation sequence.

Code cell 31

phi = 0.6
psi = phi ** np.arange(12)
innovations = rng.normal(size=400)
x = np.zeros(400)
for t in range(1, 400):
    x[t] = phi * x[t-1] + innovations[t]

x_recon = np.zeros_like(x)
for t in range(400):
    s = 0.0
    for j in range(min(t + 1, len(psi))):
        s += psi[j] * innovations[t-j]
    x_recon[t] = s

header("Wold-style innovation representation")
print("First six impulse-response weights:", psi[:6])
print(f"Truncated reconstruction RMSE: {np.sqrt(np.mean((x[50:] - x_recon[50:])**2)):.4f}")
report("Impulse response decays for |phi|<1", np.all(np.diff(psi) < 0))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].stem(np.arange(len(psi)), psi, linefmt=COLORS["primary"], markerfmt="o", basefmt=" ")
    axes[0].set_title("AR(1) impulse response weights")
    axes[0].set_xlabel("Lag j")
    axes[0].set_ylabel(r"$\psi_j$")
    axes[1].plot(x[50:150], color=COLORS["secondary"], label="AR(1) path")
    axes[1].plot(x_recon[50:150], color=COLORS["primary"], linestyle="--", label="Truncated innovation sum")
    axes[1].set_title("Filtered-shock view of the process")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Value")
    axes[1].legend()
    fig.tight_layout()
    plt.show()

4. Core Theory II: Linear Time-Series Models

4.1 Autoregressive Models AR(p)

Root conditions determine whether the recursion is stable.

Code cell 33

phi = np.array([1.2, -0.32])
roots = np.roots(np.array([-phi[1], -phi[0], 1.0]))
ar2 = simulate_ar(phi, n=400)

header("AR(2) stability via characteristic roots")
print("AR coefficients:", phi)
print("Characteristic roots:", roots)
print("Root magnitudes:", np.abs(roots))
report("Roots lie outside the unit circle", np.all(np.abs(roots) > 1))

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(ar2[:150], color=COLORS["primary"])
    ax.set_title("Simulated AR(2) process")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

4.2 Moving-Average Models MA(q)

MA models cut off in the ACF because only a finite number of shocks echo forward.

Code cell 35

ma = simulate_ma([0.8], n=500)
ma_acf = sample_acf(ma, 12)

header("MA(1) autocorrelation cutoff")
print("First six ACF values:", ma_acf[:6])
report("Lag-2 ACF is small for MA(1)", abs(ma_acf[2]) < 0.15)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.bar(np.arange(len(ma_acf)), ma_acf, color=COLORS["secondary"])
    ax.set_title("ACF of an MA(1) process")
    ax.set_xlabel("Lag")
    ax.set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

4.3 ARMA Models

When both level persistence and shock echo matter, both ACF and PACF usually decay.

Code cell 37

arma = simulate_arma([0.6], [0.5], n=600)
arma_acf = sample_acf(arma, 12)
arma_pacf = sample_pacf(arma, 12)

header("ARMA(1,1) signatures")
print("ACF first five values :", arma_acf[:5])
print("PACF first five values:", arma_pacf[:5])
report("Both ACF and PACF show nontrivial early lags", np.any(np.abs(arma_acf[1:4]) > 0.1) and np.any(np.abs(arma_pacf[1:4]) > 0.1))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
    axes[0].bar(np.arange(len(arma_acf)), arma_acf, color=COLORS["primary"])
    axes[0].set_title("ACF of ARMA(1,1)")
    axes[1].bar(np.arange(len(arma_pacf)), arma_pacf, color=COLORS["secondary"])
    axes[1].set_title("PACF of ARMA(1,1)")
    for ax in axes:
        ax.set_xlabel("Lag")
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

4.4 ARIMA Models

Differencing can expose stationary ARMA-style structure in an integrated series.

Code cell 39

innovations = simulate_arma([0.5], [0.3], n=500)
arima_series = np.cumsum(innovations)
arima_diff = np.diff(arima_series)

header("ARIMA intuition through differencing")
print(f"Lag-1 ACF of levels     : {sample_acf(arima_series, 1)[1]:.4f}")
print(f"Lag-1 ACF of differences: {sample_acf(arima_diff, 1)[1]:.4f}")
report("Differencing exposes more stationary dependence", abs(sample_acf(arima_diff, 1)[1]) < abs(sample_acf(arima_series, 1)[1]))

if HAS_MPL:
    fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
    axes[0].plot(arima_series, color=COLORS["error"])
    axes[0].set_title("Integrated series")
    axes[0].set_ylabel("Level")
    axes[1].plot(arima_diff, color=COLORS["primary"])
    axes[1].set_title("First difference")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Change")
    fig.tight_layout()
    plt.show()

4.5 Model Identification with ACF/PACF and Information Criteria

Information criteria help arbitrate among plausible lag orders.

Code cell 41

x = simulate_ar([0.75, -0.18], n=420)

def fit_ar_conditional(x, p):
    y = x[p:]
    X = np.column_stack([x[p-k-1:-k-1] for k in range(p)])
    beta = la.lstsq(X, y, rcond=None)[0]
    resid = y - X @ beta
    n = len(resid)
    rss = np.dot(resid, resid)
    aic = n * np.log(rss / n) + 2 * p
    bic = n * np.log(rss / n) + p * np.log(n)
    return beta, resid, aic, bic

header("Approximate AIC/BIC across AR orders")
results = []
for p in range(1, 6):
    beta, resid, aic, bic = fit_ar_conditional(x, p)
    results.append((p, aic, bic))
    print(f"AR({p}) -> AIC={aic:8.3f}, BIC={bic:8.3f}")
best_aic = min(results, key=lambda row: row[1])[0]
best_bic = min(results, key=lambda row: row[2])[0]
print(f"Best AIC order: AR({best_aic})")
print(f"Best BIC order: AR({best_bic})")
report("Information criteria choose a low-order model", best_aic <= 3 and best_bic <= 3)

5. Core Theory III: Forecasting, Seasonality, and State Space

5.1 One-Step and Multi-Step Forecasting

Multi-step uncertainty grows because more unobserved shocks enter the forecast.

Code cell 43

phi = 0.82
sigma = 1.0
history = simulate_ar([phi], n=180)
last = history[-1]
horizons = np.arange(1, 13)
forecast = phi**horizons * last
fvar = sigma**2 * (1 - phi**(2 * horizons)) / (1 - phi**2)

header("AR(1) one-step and multi-step forecasts")
print("h   mean     sd")
for h, m, s in zip(horizons, forecast, np.sqrt(fvar)):
    print(f"{h:>2d}  {m:>7.3f}  {s:>7.3f}")
report("Forecast intervals widen with horizon", np.all(np.diff(fvar) > 0))

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(history)), history, color=COLORS["neutral"], label="Observed history")
    future_index = np.arange(len(history), len(history) + len(horizons))
    ax.plot(future_index, forecast, marker="o", color=COLORS["primary"], label="Forecast mean")
    ax.fill_between(future_index,
                    forecast - 1.96 * np.sqrt(fvar),
                    forecast + 1.96 * np.sqrt(fvar),
                    color=COLORS["primary"], alpha=0.2, label="Approx. 95% band")
    ax.set_title("Forecast path with horizon-dependent uncertainty")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()
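As a sanity check on the closed-form variance sigma^2 (1 - phi^(2h)) / (1 - phi^2), a small Monte Carlo sketch simulates many continuations from the same last value and compares the empirical spread at each horizon (the path count of 4000 and the 15% tolerance are arbitrary choices):

n_paths, H = 4000, len(horizons)
sim = np.zeros((n_paths, H))
prev = np.full(n_paths, last)
for h in range(H):
    # Step every simulated path forward by one AR(1) transition.
    prev = phi * prev + rng.normal(scale=sigma, size=n_paths)
    sim[:, h] = prev
mc_var = sim.var(axis=0)
print("h   formula-var  MC-var")
for h in range(H):
    print(f"{h+1:>2d}  {fvar[h]:>11.4f}  {mc_var[h]:>7.4f}")
report("Monte Carlo variance tracks the closed-form formula", np.allclose(mc_var, fvar, rtol=0.15))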

5.2 Seasonality and Seasonal Decomposition

Additive decomposition separates level, periodic structure, and irregular remainder.

Code cell 45

n = 24 * 14
t = np.arange(n)
true_trend = 30 + 0.03 * t
true_seasonal = 3.5 * np.sin(2 * np.pi * t / 24) + 1.5 * np.cos(2 * np.pi * t / 24)
noise = rng.normal(scale=0.9, size=n)
series = true_trend + true_seasonal + noise

trend_est = rolling_mean(series, 24)
aligned = series[11:-12]
seasonal_raw = aligned - trend_est
seasonal_template = np.array([seasonal_raw[np.arange(i, len(seasonal_raw), 24)].mean() for i in range(24)])
seasonal_template -= seasonal_template.mean()
# Template index i corresponds to time 11 + i, so align the tiled estimate
# with the trimmed sample rather than with t = 0.
seasonal_est = np.resize(seasonal_template, len(seasonal_raw))
resid = seasonal_raw - seasonal_est

header("Simple seasonal decomposition")
print(f"Estimated seasonal template mean: {seasonal_template.mean():.6f}")
print(f"Residual mean (trimmed sample)  : {resid.mean():.6f}")
report("Seasonal template is centered", abs(seasonal_template.mean()) < 1e-8)

if HAS_MPL:
    fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)
    axes[0].plot(series, color=COLORS["primary"])
    axes[0].set_title("Observed seasonal series")
    axes[1].plot(true_trend, color=COLORS["secondary"], label="True trend")
    axes[1].plot(np.arange(11, n-12), trend_est, color=COLORS["highlight"], linestyle="--", label="Moving-average trend")
    axes[1].legend()
    axes[1].set_title("Trend component")
    axes[2].plot(seasonal_template, marker="o", color=COLORS["tertiary"])
    axes[2].set_title("Estimated seasonal template (24-step period)")
    axes[2].set_xlabel("Seasonal index")
    for ax in axes:
        ax.set_ylabel("Value")
    fig.tight_layout()
    plt.show()

5.3 SARIMA and Seasonal Structure

Seasonal differencing targets repeated structure at lag s.

Code cell 47

n = 24 * 20
t = np.arange(n)
seasonal_series = 2.8 * np.sin(2 * np.pi * t / 24) + simulate_ar([0.4], n=n, sigma=0.7)
seasonal_diff = seasonal_series[24:] - seasonal_series[:-24]
acf_before = sample_acf(seasonal_series, 30)
acf_after = sample_acf(seasonal_diff, 30)

header("Seasonal differencing")
print(f"Lag-24 ACF before differencing: {acf_before[24]:.4f}")
print(f"Lag-24 ACF after differencing : {acf_after[24]:.4f}")
report("Seasonal differencing reduces lag-24 correlation", abs(acf_after[24]) < abs(acf_before[24]))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
    axes[0].bar(np.arange(31), acf_before, color=COLORS["secondary"])
    axes[0].set_title("ACF before seasonal differencing")
    axes[1].bar(np.arange(31), acf_after, color=COLORS["primary"])
    axes[1].set_title("ACF after seasonal differencing")
    for ax in axes:
        ax.set_xlabel("Lag")
        ax.set_ylabel("ACF")
    fig.tight_layout()
    plt.show()

5.4 State-Space Models

Hidden-state dynamics separate signal evolution from noisy observations.

Code cell 49

n = 180
latent = np.zeros(n)
obs = np.zeros(n)
q, r = 0.12, 0.9
latent[0] = 0.0
obs[0] = latent[0] + rng.normal(scale=np.sqrt(r))
for t in range(1, n):
    latent[t] = latent[t-1] + rng.normal(scale=np.sqrt(q))
    obs[t] = latent[t] + rng.normal(scale=np.sqrt(r))

header("Linear-Gaussian state-space simulation")
print(f"Process noise variance    q: {q:.3f}")
print(f"Observation noise variance r: {r:.3f}")
print(f"Observation RMSE vs state : {np.sqrt(np.mean((obs - latent)**2)):.4f}")
report("Observations are noisier than the latent state", np.std(obs - latent) > 0.5)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(latent, color=COLORS["primary"], label="Latent state")
    ax.plot(obs, color=COLORS["secondary"], alpha=0.6, label="Observed measurement")
    ax.set_title("State-space model: hidden state and noisy observations")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()

5.5 Kalman Filter

Recursive Gaussian updating improves state estimates from noisy measurements.

Code cell 51

filtered_mean, filtered_var, pred_mean, pred_var = kalman_filter_1d(obs, q=q, r=r, m0=0.0, p0=1.0)
obs_rmse = np.sqrt(np.mean((obs - latent)**2))
filt_rmse = np.sqrt(np.mean((filtered_mean - latent)**2))

header("Kalman filtering a local-level model")
print(f"Observation RMSE: {obs_rmse:.4f}")
print(f"Filtered RMSE   : {filt_rmse:.4f}")
print(f"Final filtered variance: {filtered_var[-1]:.4f}")
report("Kalman filter improves over raw observations", filt_rmse < obs_rmse)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(latent, color=COLORS["neutral"], label="True state")
    ax.plot(obs, color=COLORS["secondary"], alpha=0.35, label="Observations")
    ax.plot(filtered_mean, color=COLORS["primary"], label="Filtered mean")
    ax.set_title("Kalman filter tracking a hidden state")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()

6. Advanced Topics

6.1 Spectral Analysis and the Frequency-Domain View

Frequency space reveals temporal scale directly.

Code cell 53

n = 512
t = np.arange(n)
low_freq = np.sin(2 * np.pi * t / 64) + 0.2 * rng.normal(size=n)
high_freq = np.sin(2 * np.pi * t / 8) + 0.2 * rng.normal(size=n)
f_low, p_low = periodogram(low_freq)
f_high, p_high = periodogram(high_freq)

header("Frequency-domain view")
print(f"Dominant low-frequency component : {f_low[np.argmax(p_low[1:]) + 1]:.4f}")
print(f"Dominant high-frequency component: {f_high[np.argmax(p_high[1:]) + 1]:.4f}")
report("Low-frequency signal peaks at lower frequency than high-frequency signal",
       f_low[np.argmax(p_low[1:]) + 1] < f_high[np.argmax(p_high[1:]) + 1])

if HAS_MPL:
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    axes[0, 0].plot(low_freq, color=COLORS["primary"])
    axes[0, 0].set_title("Low-frequency signal")
    axes[0, 1].plot(f_low[1:], p_low[1:], color=COLORS["primary"])
    axes[0, 1].set_title("Spectrum of low-frequency signal")
    axes[1, 0].plot(high_freq, color=COLORS["secondary"])
    axes[1, 0].set_title("High-frequency signal")
    axes[1, 1].plot(f_high[1:], p_high[1:], color=COLORS["secondary"])
    axes[1, 1].set_title("Spectrum of high-frequency signal")
    for ax in axes.ravel():
        ax.set_xlabel("Time / Frequency")
        ax.set_ylabel("Value / Power")
    fig.tight_layout()
    plt.show()

6.2 Periodogram and Fourier Decomposition

The periodogram is a practical frequency detector for finite samples.

Code cell 55

n = 480
t = np.arange(n)
true_period = 30
mixed = 1.8 * np.sin(2 * np.pi * t / true_period) + 0.8 * np.sin(2 * np.pi * t / 10) + rng.normal(scale=0.6, size=n)
freqs, power = periodogram(mixed)
dom_idx = np.argmax(power[1:]) + 1
dom_freq = freqs[dom_idx]
est_period = 1 / dom_freq

header("Periodogram-based frequency detection")
print(f"True dominant period: {true_period}")
print(f"Estimated dominant period: {est_period:.2f}")
report("Estimated period is close to the true dominant period", abs(est_period - true_period) < 3.0)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(freqs[1:], power[1:], color=COLORS["highlight"])
    ax.axvline(dom_freq, color=COLORS["neutral"], linestyle="--", label=f"Estimated period {est_period:.1f}")
    ax.set_title("Periodogram of a noisy periodic signal")
    ax.set_xlabel("Frequency")
    ax.set_ylabel("Power")
    ax.legend()
    fig.tight_layout()
    plt.show()
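If SciPy is installed, scipy.signal.periodogram should agree on the peak location; its default one-sided density scaling differs from the raw |FFT|^2 / n used above, so this hedged cross-check compares only the dominant frequency:

try:
    from scipy import signal
    f_sp, p_sp = signal.periodogram(mixed, fs=1.0, detrend="constant")
    dom_sp = f_sp[np.argmax(p_sp[1:]) + 1]  # skip the zero-frequency bin
    print(f"SciPy dominant frequency: {dom_sp:.4f}")
    print(f"Our dominant frequency  : {dom_freq:.4f}")
    report("Both periodograms find the same dominant frequency",
           np.isclose(dom_sp, dom_freq, atol=1.0 / len(mixed)))
except ImportError:
    print("SciPy not available; skipping the cross-check.")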

6.3 Kalman Smoothing, Forecasting, and Missing Data

State-space models handle missing observations gracefully by propagating uncertainty forward.

Code cell 57

obs_missing = obs.copy()
missing_idx = np.arange(20, len(obs_missing), 17)
obs_missing[missing_idx] = np.nan
filt_missing, var_missing, _, _ = kalman_filter_1d(obs_missing, q=q, r=r, m0=0.0, p0=1.0)

header("Filtering with missing data")
print(f"Number of missing observations: {np.isnan(obs_missing).sum()}")
print(f"RMSE at missing locations      : {np.sqrt(np.mean((filt_missing[missing_idx] - latent[missing_idx])**2)):.4f}")
report("Filter produces finite imputations at missing points", np.isfinite(filt_missing[missing_idx]).all())

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(latent, color=COLORS["neutral"], label="True state")
    ax.plot(obs_missing, linestyle="None", marker="o", markersize=4, color=COLORS["secondary"], alpha=0.6, label="Observed / missing")
    ax.plot(filt_missing, color=COLORS["primary"], label="Filtered state")
    ax.set_title("Kalman filtering with missing observations")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()
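The smoothing half of this section's title can be covered with a Rauch-Tung-Striebel backward pass. The sketch below is a minimal 1-D smoother for the local-level model (transition coefficient 1), not part of the helper set above; it reuses the predicted moments that kalman_filter_1d already returns:

def rts_smoother_1d(filt_mean, filt_var, pred_mean, pred_var):
    # Backward pass: refine each filtered estimate using the smoothed
    # estimate one step ahead. For the local-level model the gain is
    # C_t = P_{t|t} / P_{t+1|t}.
    sm_mean = filt_mean.copy()
    sm_var = filt_var.copy()
    for t in range(len(filt_mean) - 2, -1, -1):
        gain = filt_var[t] / pred_var[t + 1]
        sm_mean[t] = filt_mean[t] + gain * (sm_mean[t + 1] - pred_mean[t + 1])
        sm_var[t] = filt_var[t] + gain**2 * (sm_var[t + 1] - pred_var[t + 1])
    return sm_mean, sm_var

f_m, f_v, p_m, p_v = kalman_filter_1d(obs_missing, q=q, r=r, m0=0.0, p0=1.0)
sm_mean, sm_var = rts_smoother_1d(f_m, f_v, p_m, p_v)
print(f"Filtered RMSE: {np.sqrt(np.mean((f_m - latent)**2)):.4f}")
print(f"Smoothed RMSE: {np.sqrt(np.mean((sm_mean - latent)**2)):.4f}")
report("Smoothing improves on filtering",
       np.mean((sm_mean - latent)**2) < np.mean((f_m - latent)**2))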

6.4 Multivariate Time Series Preview

In multiple dimensions, lag dependence becomes matrix-valued and cross-series effects matter.

Code cell 59

A = np.array([[0.7, 0.15], [0.05, 0.6]])
x = np.zeros((250, 2))
eps = rng.normal(scale=0.5, size=(250, 2))
for t in range(1, len(x)):
    x[t] = A @ x[t-1] + eps[t]

header("Multivariate lag dependence preview")
cross_lag = np.corrcoef(x[:-1, 0], x[1:, 1])[0, 1]
print(f"Cross-lag correlation corr(X1_t, X2_t+1): {cross_lag:.4f}")
report("Cross-series dependence is present", abs(cross_lag) > 0.05)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(x[:, 0], color=COLORS["primary"], label="Series 1")
    ax.plot(x[:, 1], color=COLORS["secondary"], label="Series 2")
    ax.set_title("Vector autoregressive preview")
    ax.set_xlabel("Time")
    ax.set_ylabel("Value")
    ax.legend()
    fig.tight_layout()
    plt.show()
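Because the transition is matrix-valued, it can be recovered by multivariate least squares: regressing x_t on x_{t-1} estimates A row by row. A minimal sketch with the simulation above (the 0.15 tolerance is a loose, illustrative bound):

# Each row of X_next satisfies x_t^T = x_{t-1}^T A^T + noise,
# so lstsq returns an estimate of A^T.
X_prev, X_next = x[:-1], x[1:]
A_hat = la.lstsq(X_prev, X_next, rcond=None)[0].T
print("True A:\n", A)
print("Estimated A:\n", A_hat)
report("Least squares recovers the transition matrix", np.max(np.abs(A_hat - A)) < 0.15)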

6.5 Change Points, Regime Shifts, and Structural Breaks

When the data-generating rules change, a stationary model can fail for the right reason.

Code cell 61

x = np.r_[rng.normal(loc=0.0, scale=1.0, size=120),
          rng.normal(loc=2.5, scale=1.0, size=120)]
left = np.array([x[max(0, i-20):i].mean() if i > 0 else np.nan for i in range(len(x))])
right = np.array([x[i:min(len(x), i+20)].mean() if i < len(x) - 1 else np.nan for i in range(len(x))])
score = np.abs(right - left)
est_break = np.nanargmax(score)

header("Detecting a structural break")
print(f"True break location     : 120")
print(f"Estimated break location: {est_break}")
report("Estimated break is close to truth", abs(est_break - 120) <= 10)

if HAS_MPL:
    fig, axes = plt.subplots(2, 1, figsize=(10, 7), sharex=True)
    axes[0].plot(x, color=COLORS["primary"])
    axes[0].axvline(120, color=COLORS["neutral"], linestyle="--", label="True break")
    axes[0].axvline(est_break, color=COLORS["error"], linestyle=":", label="Estimated break")
    axes[0].set_title("Series with a mean shift")
    axes[0].legend()
    axes[1].plot(score, color=COLORS["error"])
    axes[1].set_title("Rolling mean-shift score")
    axes[1].set_xlabel("Time")
    axes[1].set_ylabel("Score")
    fig.tight_layout()
    plt.show()

7. Applications in Machine Learning

7.1 Forecasting for Demand, Traffic, and Monitoring

Classical seasonal baselines still matter in real operational forecasting.

Code cell 63

n = 24 * 28
t = np.arange(n)
demand = 50 + 8 * np.sin(2 * np.pi * t / 24) + 0.03 * t + rng.normal(scale=1.2, size=n)
train, test = demand[:-24], demand[-24:]
naive_forecast = np.repeat(train[-1], 24)
seasonal_forecast = train[-24:]

naive_rmse = np.sqrt(np.mean((test - naive_forecast)**2))
seasonal_rmse = np.sqrt(np.mean((test - seasonal_forecast)**2))

header("Operational forecasting baseline")
print(f"Naive last-value RMSE : {naive_rmse:.4f}")
print(f"Seasonal baseline RMSE: {seasonal_rmse:.4f}")
report("Seasonal baseline beats naive forecast", seasonal_rmse < naive_rmse)

if HAS_MPL:
    fig, ax = plt.subplots()
    ax.plot(np.arange(len(train)), train, color=COLORS["neutral"], label="Train")
    test_idx = np.arange(len(train), len(train) + len(test))
    ax.plot(test_idx, test, color=COLORS["primary"], label="Test truth")
    ax.plot(test_idx, naive_forecast, color=COLORS["error"], linestyle="--", label="Naive")
    ax.plot(test_idx, seasonal_forecast, color=COLORS["secondary"], linestyle="--", label="Seasonal baseline")
    ax.set_title("Forecasting demand with classical baselines")
    ax.set_xlabel("Time")
    ax.set_ylabel("Demand")
    ax.legend()
    fig.tight_layout()
    plt.show()
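RMSE is scale-dependent; the mean absolute scaled error (MASE) divides test MAE by the in-sample MAE of a seasonal-naive forecaster, so values below 1 beat that benchmark. A minimal sketch with the arrays above (the period of 24 matches the simulated daily cycle):

# In-sample MAE of the seasonal-naive forecaster sets the scale.
scale = np.mean(np.abs(train[24:] - train[:-24]))
mase_naive = np.mean(np.abs(test - naive_forecast)) / scale
mase_seasonal = np.mean(np.abs(test - seasonal_forecast)) / scale
print(f"Naive MASE            : {mase_naive:.3f}")
print(f"Seasonal baseline MASE: {mase_seasonal:.3f}")
report("Seasonal baseline scores better on MASE", mase_seasonal < mase_naive)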

7.2-7.5 Modern Sequence Modeling Links

Temporal encodings, hidden-state recursions, and pretrained forecasters still depend on the same notions of scale, memory, and seasonality.

Code cell 65

positions = np.arange(48)
freqs = np.array([1/48, 1/24, 1/12, 1/6])
encoding = np.column_stack([
    np.sin(2 * np.pi * positions[:, None] * freqs),
    np.cos(2 * np.pi * positions[:, None] * freqs),
])

A = np.array([[0.92, 0.0], [0.0, 0.55]])
state = np.zeros((80, 2))
state[0] = np.array([1.0, 1.0])
for t in range(1, len(state)):
    state[t] = A @ state[t-1]

header("Modern sequence-model links")
print(f"Temporal encoding matrix shape: {encoding.shape}")
print(f"Slow state half-life indicator : {np.where(state[:,0] < 0.5)[0][0]}")
print(f"Fast state half-life indicator : {np.where(state[:,1] < 0.5)[0][0]}")
report("Different state dimensions remember information at different scales",
       np.where(state[:,0] < 0.5)[0][0] > np.where(state[:,1] < 0.5)[0][0])

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    im = axes[0].imshow(encoding.T, aspect="auto", cmap="viridis")
    axes[0].set_title("Fourier-style temporal features")
    axes[0].set_xlabel("Position")
    axes[0].set_ylabel("Feature index")
    fig.colorbar(im, ax=axes[0], label="Value")
    axes[1].plot(state[:, 0], color=COLORS["primary"], label="Slow mode")
    axes[1].plot(state[:, 1], color=COLORS["secondary"], label="Fast mode")
    axes[1].set_title("State-space memory at multiple timescales")
    axes[1].set_xlabel("Step")
    axes[1].set_ylabel("State value")
    axes[1].legend()
    fig.tight_layout()
    plt.show()

8. Common Mistakes

| # | Mistake | Why it is wrong | Fix |
| --- | --- | --- | --- |
| 1 | Applying OLS without checking stationarity | Nonstationary regressors produce spurious regressions | Difference or detrend first; check with ADF/KPSS tests |
| 2 | Identifying AR order from the ACF alone | The ACF of an AR process decays gradually and does not reveal p | Use the PACF, which cuts off after lag p for AR(p) |
| 3 | Seasonal differencing without regular differencing | Can leave a non-seasonal unit root in place | Apply both ∇ and ∇_s as needed |
| 4 | Using MSE as the only forecast metric | MSE heavily penalizes outliers and ignores calibration | Report MAE and MASE; use CRPS for probabilistic forecasts |
| 5 | Treating a train/test split like i.i.d. data | A random split leaks the future into the past | Always split by time; use expanding or rolling windows |
| 6 | Ignoring Kalman filter initialization | A poor prior covariance P0 distorts early estimates | Use a diffuse prior or steady-state initialization |
| 7 | Assuming Transformer positional encoding captures periodicity | Sinusoidal PE does not explicitly encode seasonal patterns | Add Fourier features or seasonal embeddings |
| 8 | Confusing autocorrelation with cross-correlation | The ACF is a single-series property | Use the CCF for lead-lag relationships between two series (see the sketch after this table) |
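For mistake 8, a minimal numpy sketch (reusing rng): the cross-correlation function exposes a lead-lag link between two series that neither series's own ACF can show. Here y_follow copies x_lead with a delay of five steps:

n = 400
x_lead = rng.normal(size=n)
y_follow = np.r_[np.zeros(5), 0.9 * x_lead[:-5]] + 0.3 * rng.normal(size=n)

def ccf_at(x, y, lag):
    # Correlation between x_t and y_{t+lag}; positive lag means x leads y.
    if lag > 0:
        return np.corrcoef(x[:-lag], y[lag:])[0, 1]
    return np.corrcoef(x, y)[0, 1]

print(f"CCF at lag 0: {ccf_at(x_lead, y_follow, 0):.3f}")
print(f"CCF at lag 5: {ccf_at(x_lead, y_follow, 5):.3f}")
report("CCF peaks at the true lead of five steps",
       ccf_at(x_lead, y_follow, 5) > abs(ccf_at(x_lead, y_follow, 0)) + 0.3)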

Recap

The classical toolkit already contains the essential questions modern sequence models must answer: what depends on what, what repeats, what is hidden, how forecast uncertainty grows, and when the data-generating rules have changed.