Theory Notebook
Converted from theory.ipynb for web reading.
Time Series — Theory Notebook
Time order changes what counts as signal, what can be predicted, and how uncertainty should evolve with horizon.
This notebook is the interactive companion to notes.md. It moves from dependence diagnostics and linear models to seasonality, state-space reasoning, spectral views, and modern AI links.
| Coverage | What You Will Do |
|---|---|
| Dependence | Compute ACF/PACF, compare stationary and nonstationary series |
| Linear models | Simulate AR, MA, ARMA, and ARIMA behavior |
| Forecasting | Build one-step and multi-step forecasts with uncertainty |
| State space | Simulate hidden-state systems and run a Kalman filter |
| Spectral view | Detect periodicity with FFT and periodograms |
| AI links | Connect classical forecasting and temporal encoding ideas to modern sequence models |
Code cell 3
import numpy as np
try:
import matplotlib.pyplot as plt
import matplotlib as mpl
mpl.rcParams.update({
"figure.figsize": (10, 6),
"figure.dpi": 120,
"font.size": 13,
"axes.titlesize": 15,
"axes.labelsize": 13,
"xtick.labelsize": 11,
"ytick.labelsize": 11,
"legend.fontsize": 11,
"legend.framealpha": 0.85,
"lines.linewidth": 2.0,
"axes.spines.top": False,
"axes.spines.right": False,
"savefig.bbox": "tight",
"savefig.dpi": 150,
})
HAS_MPL = True
except ImportError:
HAS_MPL = False
try:
import seaborn as sns
if HAS_MPL:
sns.set_theme(style="whitegrid", palette="colorblind")
HAS_SNS = True
except ImportError:
if HAS_MPL:
plt.style.use("seaborn-v0_8-whitegrid")
HAS_SNS = False
np.random.seed(42)
print("Plot setup complete.")
Code cell 4
import numpy.linalg as la
from scipy import signal
COLORS = {
"primary": "#0077BB",
"secondary": "#EE7733",
"tertiary": "#009988",
"error": "#CC3311",
"neutral": "#555555",
"highlight": "#EE3377",
}
rng = np.random.default_rng(42)
np.set_printoptions(precision=4, suppress=True)
mpl.rcParams["figure.max_open_warning"] = 0
def header(title):
print("\n" + "=" * len(title))
print(title)
print("=" * len(title))
def report(name, cond):
print(f"{'PASS' if cond else 'FAIL'} - {name}")
return cond
def sample_acf(x, max_lag):
x = np.asarray(x, dtype=float)
x = x - x.mean()
denom = np.dot(x, x)
vals = [1.0]
for h in range(1, max_lag + 1):
vals.append(np.dot(x[:-h], x[h:]) / denom)
return np.array(vals)
def sample_pacf(x, max_lag):
x = np.asarray(x, dtype=float)
x = x - x.mean()
pacf = [1.0]
for h in range(1, max_lag + 1):
y = x[h:]
X = np.column_stack([x[h-k:-k] for k in range(1, h + 1)])
beta = la.lstsq(X, y, rcond=None)[0]
pacf.append(beta[-1])
return np.array(pacf)
def rolling_mean(x, window):
x = np.asarray(x, dtype=float)
kernel = np.ones(window) / window
return np.convolve(x, kernel, mode="valid")
def rolling_var(x, window):
x = np.asarray(x, dtype=float)
mu = rolling_mean(x, window)
sq = rolling_mean(x**2, window)
return sq - mu**2
def simulate_ar(phi, n, sigma=1.0, c=0.0, burn=250):
phi = np.asarray(phi, dtype=float)
p = len(phi)
e = rng.normal(scale=sigma, size=n + burn + p)
x = np.zeros(n + burn + p)
mu = c / (1 - phi.sum()) if abs(1 - phi.sum()) > 1e-8 else 0.0
x[:p] = mu
for t in range(p, n + burn + p):
x[t] = c + phi @ x[t-p:t][::-1] + e[t]
return x[burn + p:]
def simulate_ma(theta, n, sigma=1.0, mu=0.0, burn=50):
theta = np.asarray(theta, dtype=float)
q = len(theta)
e = rng.normal(scale=sigma, size=n + burn + q + 1)
x = np.zeros(n + burn + q)
for t in range(q, n + burn + q):
x[t] = mu + e[t]
for j in range(1, q + 1):
x[t] += theta[j-1] * e[t-j]
return x[burn + q:]
def simulate_arma(phi, theta, n, sigma=1.0, c=0.0, burn=300):
phi = np.asarray(phi, dtype=float)
theta = np.asarray(theta, dtype=float)
p, q = len(phi), len(theta)
e = rng.normal(scale=sigma, size=n + burn + max(p, q) + 1)
x = np.zeros(n + burn + max(p, q) + 1)
mu = c / (1 - phi.sum()) if p > 0 and abs(1 - phi.sum()) > 1e-8 else 0.0
x[:max(p, q)+1] = mu
for t in range(max(p, q) + 1, len(x)):
ar_part = sum(phi[j] * x[t-j-1] for j in range(p))
ma_part = e[t] + sum(theta[j] * e[t-j-1] for j in range(q))
x[t] = c + ar_part + ma_part
return x[burn + max(p, q) + 1:]
def periodogram(x):
x = np.asarray(x, dtype=float)
x = x - x.mean()
freqs = np.fft.rfftfreq(len(x), d=1.0)
power = np.abs(np.fft.rfft(x))**2 / len(x)
return freqs, power
def kalman_filter_1d(obs, q, r, m0=0.0, p0=1.0):
m, p = m0, p0
means, vars_ = [], []
pred_means, pred_vars = [], []
for y in obs:
m_pred = m
p_pred = p + q
pred_means.append(m_pred)
pred_vars.append(p_pred)
if np.isnan(y):
m, p = m_pred, p_pred
else:
k = p_pred / (p_pred + r)
m = m_pred + k * (y - m_pred)
p = (1 - k) * p_pred
means.append(m)
vars_.append(p)
return np.array(means), np.array(vars_), np.array(pred_means), np.array(pred_vars)
print("Extra imports and helper functions loaded.")
1. Intuition
1.1 Why Time Order Matters
The same values can tell very different stories depending on their temporal arrangement.
Code cell 6
ordered = np.array([79, 80, 81, 82, 300], dtype=float)
shuffled = np.array([82, 300, 79, 81, 80], dtype=float)
def lag1_corr(x):
return np.corrcoef(x[:-1], x[1:])[0, 1]
header("Why order matters")
print(f"Ordered mean/var : {ordered.mean():.2f}, {ordered.var():.2f}")
print(f"Shuffled mean/var : {shuffled.mean():.2f}, {shuffled.var():.2f}")
print(f"Ordered lag-1 corr: {lag1_corr(ordered):.3f}")
print(f"Shuffled lag-1 corr: {lag1_corr(shuffled):.3f}")
report("Means match after shuffling", np.isclose(ordered.mean(), shuffled.mean()))
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(ordered, marker="o", color=COLORS["primary"])
axes[0].set_title("Ordered observations")
axes[0].set_xlabel("Time")
axes[0].set_ylabel("Value")
axes[1].plot(shuffled, marker="o", color=COLORS["secondary"])
axes[1].set_title("Same values, shuffled")
axes[1].set_xlabel("Index")
axes[1].set_ylabel("Value")
fig.tight_layout()
plt.show()
1.2 From Snapshots to Sequences
Marginal summaries alone do not reveal serial dependence.
Code cell 8
iid_like = rng.normal(loc=0.0, scale=1.0, size=240)
ar_like = simulate_ar([0.85], n=240)
ar_like = ar_like / ar_like.std()
header("Snapshots vs sequences")
print(f"IID sample mean/var : {iid_like.mean():.3f}, {iid_like.var():.3f}")
print(f"AR sample mean/var : {ar_like.mean():.3f}, {ar_like.var():.3f}")
print(f"IID lag-1 ACF : {sample_acf(iid_like, 1)[1]:.3f}")
print(f"AR lag-1 ACF : {sample_acf(ar_like, 1)[1]:.3f}")
report("AR sequence is more persistent than IID", abs(sample_acf(ar_like, 1)[1]) > abs(sample_acf(iid_like, 1)[1]))
if HAS_MPL:
fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
axes[0].plot(iid_like[:120], color=COLORS["secondary"])
axes[0].set_title("IID-like sequence")
axes[0].set_ylabel("Value")
axes[1].plot(ar_like[:120], color=COLORS["primary"])
axes[1].set_title("AR-like sequence")
axes[1].set_xlabel("Time")
axes[1].set_ylabel("Value")
fig.tight_layout()
plt.show()
1.3 Historical Timeline
Time series analysis developed across a century, with computational breakthroughs arriving late and enabling modern ML applications.
| Year | Contributor | Contribution |
|---|---|---|
| 1927 | Yule | Autoregressive models for sunspot series |
| 1937 | Slutsky, Walker | Moving average models and oscillatory behavior |
| 1970 | Box & Jenkins | Unified ARIMA framework and systematic identification |
| 1979 | Kalman (applied widely) | State-space filtering enters engineering and econometrics |
| 1982 | Engle | ARCH models for volatility clustering |
| 1986 | Holt-Winters extended | Seasonal exponential smoothing systematized |
| 1990s | Hamilton, others | Regime-switching, cointegration mainstream in economics |
| 2017 | Vaswani et al. | Transformer attention patterns encode sequence dependencies |
| 2017 | Taylor & Letham | Prophet — scalable additive forecasting at Meta |
| 2020s | LLM era | Long-range sequence modeling drives NLP, audio, and time-series foundation models |
Pattern: Classical statistics gave the language (stationarity, ACF, ARIMA); machine learning gave the scalable nonlinear approximators.
1.4 Why Time Series Matters for AI
Forecasting and uncertainty horizons show up immediately in operations and monitoring.
Code cell 11
phi = 0.8
mu = 0.0
current = 2.0
horizons = np.arange(1, 11)
forecast_means = mu + phi**horizons * (current - mu)
forecast_vars = (1 - phi**(2 * horizons)) / (1 - phi**2)
header("Forecast memory and uncertainty growth")
print("Horizon ForecastMean ForecastSD")
for h, m, v in zip(horizons, forecast_means, forecast_vars):
print(f"{h:>7d} {m:>12.4f} {np.sqrt(v):>10.4f}")
report("Forecast uncertainty grows with horizon", np.all(np.diff(forecast_vars) > 0))
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(horizons, forecast_means, marker="o", color=COLORS["primary"], label="Forecast mean")
ax.fill_between(horizons,
forecast_means - 1.96 * np.sqrt(forecast_vars),
forecast_means + 1.96 * np.sqrt(forecast_vars),
color=COLORS["primary"], alpha=0.2, label="Approx. 95% band")
ax.set_title("AR(1) forecast memory vs uncertainty growth")
ax.set_xlabel("Forecast horizon")
ax.set_ylabel("Predicted value")
ax.legend()
fig.tight_layout()
plt.show()
2. Formal Definitions
2.1 Time Series as Observed Stochastic Processes
One observed path should be distinguished from the ensemble process that generated it.
Code cell 13
paths = np.stack([simulate_ar([0.7], n=120) for _ in range(200)])
one_path = paths[0]
ensemble_mean = paths.mean(axis=0)
header("One realization vs ensemble process")
print(f"Single-path sample mean : {one_path.mean():.4f}")
print(f"Ensemble time-mean mean : {ensemble_mean.mean():.4f}")
print(f"Single-path lag-1 ACF : {sample_acf(one_path, 1)[1]:.4f}")
report("Ensemble and single path are different objects", np.abs(one_path.mean() - ensemble_mean.mean()) > 1e-3)
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(one_path, color=COLORS["primary"], alpha=0.8, label="One realization")
ax.plot(ensemble_mean, color=COLORS["secondary"], linestyle="--", label="Ensemble mean at each time")
ax.set_title("Observed realization vs ensemble behavior")
ax.set_xlabel("Time")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
2.2 Lag Operator, Backshift Notation, and Difference Operators
Lag notation compresses temporal algebra into polynomial form.
Code cell 15
x = np.array([10, 12, 15, 14, 19, 21, 25], dtype=float)
lag1 = x[:-1]
first_diff = np.diff(x)
seasonal_diff = x[3:] - x[:-3]
header("Lag and difference operators")
print("Original series :", x)
print("L x_t (lag-1 aligned) :", lag1)
print("First difference :", first_diff)
print("Seasonal difference s=3 :", seasonal_diff)
report("First difference length is T-1", len(first_diff) == len(x) - 1)
2.3 Mean Function, Autocovariance, and Autocorrelation
The ACF summarizes lag dependence on a normalized scale.
Code cell 17
ar_sample = simulate_ar([0.6], n=400)
acf_est = sample_acf(ar_sample, 12)
acf_theory = 0.6 ** np.arange(13)
header("Sample ACF vs theoretical ACF for AR(1)")
print("lag sample theory")
for h in range(6):
print(f"{h:>3d} {acf_est[h]:>7.4f} {acf_theory[h]:>7.4f}")
report("Sample ACF at lag 1 is positive", acf_est[1] > 0)
if HAS_MPL:
fig, ax = plt.subplots()
lags = np.arange(13)
ax.bar(lags - 0.15, acf_est, width=0.3, color=COLORS["primary"], label="Sample")
ax.bar(lags + 0.15, acf_theory, width=0.3, color=COLORS["secondary"], alpha=0.8, label="Theory")
ax.set_title("Autocorrelation of an AR(1) process")
ax.set_xlabel("Lag")
ax.set_ylabel("ACF")
ax.legend()
fig.tight_layout()
plt.show()
2.4 White Noise, Random Walks, and Trend-Plus-Noise Models
Similar-looking lines can imply very different dependence structures.
Code cell 19
n = 240
white = rng.normal(scale=1.0, size=n)
walk = np.cumsum(rng.normal(scale=1.0, size=n))
trend = 0.05 * np.arange(n) + rng.normal(scale=1.0, size=n)
header("White noise vs random walk vs trend-plus-noise")
print(f"Lag-1 ACF white noise : {sample_acf(white, 1)[1]:.4f}")
print(f"Lag-1 ACF random walk : {sample_acf(walk, 1)[1]:.4f}")
print(f"Lag-1 ACF trend+noise : {sample_acf(trend, 1)[1]:.4f}")
report("Random walk is more persistent than white noise", sample_acf(walk, 1)[1] > sample_acf(white, 1)[1])
if HAS_MPL:
fig, axes = plt.subplots(3, 2, figsize=(12, 9))
series = [white, walk, trend]
names = ["White noise", "Random walk", "Trend plus noise"]
colors = [COLORS["secondary"], COLORS["primary"], COLORS["tertiary"]]
for i, (s, name, color) in enumerate(zip(series, names, colors)):
axes[i, 0].plot(s, color=color)
axes[i, 0].set_title(name)
axes[i, 0].set_xlabel("Time")
axes[i, 0].set_ylabel("Value")
acf_vals = sample_acf(s, 20)
axes[i, 1].bar(np.arange(21), acf_vals, color=color)
axes[i, 1].set_title(f"{name} ACF")
axes[i, 1].set_xlabel("Lag")
axes[i, 1].set_ylabel("ACF")
fig.tight_layout()
plt.show()
2.5 Stationarity: Strict vs Weak
Weak stationarity shows up as stable mean and variance across windows.
Code cell 21
stationary = simulate_ar([0.5], n=320)
nonstationary = np.cumsum(rng.normal(size=320))
win = 40
stat_mean = rolling_mean(stationary, win)
stat_var = rolling_var(stationary, win)
non_mean = rolling_mean(nonstationary, win)
non_var = rolling_var(nonstationary, win)
header("Rolling moments as a stationarity diagnostic")
print(f"Stationary rolling-mean range : {stat_mean.min():.3f} to {stat_mean.max():.3f}")
print(f"Nonstationary rolling-mean range : {non_mean.min():.3f} to {non_mean.max():.3f}")
print(f"Stationary rolling-var range : {stat_var.min():.3f} to {stat_var.max():.3f}")
print(f"Nonstationary rolling-var range : {non_var.min():.3f} to {non_var.max():.3f}")
report("Nonstationary series has less stable rolling mean", np.ptp(non_mean) > np.ptp(stat_mean))
if HAS_MPL:
fig, axes = plt.subplots(2, 2, figsize=(12, 8), sharex="col")
axes[0, 0].plot(stationary, color=COLORS["primary"])
axes[0, 0].set_title("Stationary series")
axes[0, 1].plot(nonstationary, color=COLORS["error"])
axes[0, 1].set_title("Nonstationary series")
axes[1, 0].plot(stat_mean, label="Rolling mean", color=COLORS["secondary"])
axes[1, 0].plot(stat_var, label="Rolling variance", color=COLORS["tertiary"])
axes[1, 0].legend()
axes[1, 1].plot(non_mean, label="Rolling mean", color=COLORS["secondary"])
axes[1, 1].plot(non_var, label="Rolling variance", color=COLORS["tertiary"])
axes[1, 1].legend()
for ax in axes.ravel():
ax.set_xlabel("Time")
ax.set_ylabel("Value")
fig.tight_layout()
plt.show()
3. Core Theory I: Dependence, Stationarity, and Diagnostics
3.1 Why Stationarity Matters
Stable lag relationships make historical estimation useful for future forecasting.
Code cell 23
stationary = simulate_ar([0.7], n=400)
drifted = 0.03 * np.arange(400) + rng.normal(scale=1.0, size=400)
def half_summary(x):
half = len(x) // 2
return x[:half].mean(), x[half:].mean()
s1, s2 = half_summary(stationary)
d1, d2 = half_summary(drifted)
header("Why stationarity matters for estimation")
print(f"Stationary half means: {s1:.3f}, {s2:.3f}")
print(f"Drifted half means : {d1:.3f}, {d2:.3f}")
report("Drifted series changes mean more than stationary series", abs(d2 - d1) > abs(s2 - s1))
3.2 Autocorrelation Function (ACF)
Different process families leave distinct ACF signatures.
Code cell 25
white = rng.normal(size=500)
ar = simulate_ar([0.75], n=500)
t = np.arange(500)
seasonal = np.sin(2 * np.pi * t / 24) + 0.4 * rng.normal(size=500)
acfs = {
"White noise": sample_acf(white, 36),
"AR(1)": sample_acf(ar, 36),
"Seasonal": sample_acf(seasonal, 36),
}
header("ACF signatures")
print(f"Seasonal lag-24 ACF: {acfs['Seasonal'][24]:.4f}")
print(f"AR lag-1 ACF : {acfs['AR(1)'][1]:.4f}")
report("Seasonal series spikes at lag 24", abs(acfs["Seasonal"][24]) > 0.5)
if HAS_MPL:
fig, axes = plt.subplots(1, 3, figsize=(15, 4), sharey=True)
for ax, (name, vals), color in zip(axes, acfs.items(), [COLORS["secondary"], COLORS["primary"], COLORS["tertiary"]]):
ax.bar(np.arange(len(vals)), vals, color=color)
ax.set_title(name)
ax.set_xlabel("Lag")
ax.set_ylabel("ACF")
fig.tight_layout()
plt.show()
3.3 Partial Autocorrelation Function (PACF)
PACF highlights direct lag effects after controlling for shorter lags.
Code cell 27
ar2 = simulate_ar([0.8, -0.25], n=600)
ma1 = simulate_ma([0.7], n=600)
pacf_ar2 = sample_pacf(ar2, 12)
pacf_ma1 = sample_pacf(ma1, 12)
header("PACF signatures")
print("AR(2) PACF first lags:", pacf_ar2[:5])
print("MA(1) PACF first lags:", pacf_ma1[:5])
report("AR(2) PACF has strong lag-2 component", abs(pacf_ar2[2]) > 0.1)
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
axes[0].bar(np.arange(len(pacf_ar2)), pacf_ar2, color=COLORS["primary"])
axes[0].set_title("PACF of AR(2)")
axes[1].bar(np.arange(len(pacf_ma1)), pacf_ma1, color=COLORS["secondary"])
axes[1].set_title("PACF of MA(1)")
for ax in axes:
ax.set_xlabel("Lag")
ax.set_ylabel("PACF")
fig.tight_layout()
plt.show()
3.4 Differencing, Unit Roots, and Random Walk Behavior
Differencing can stabilize an integrated level process.
Code cell 29
walk = np.cumsum(rng.normal(size=500))
diff_walk = np.diff(walk)
acf_walk = sample_acf(walk, 20)
acf_diff = sample_acf(diff_walk, 20)
header("Differencing a random walk")
print(f"Lag-1 ACF in levels : {acf_walk[1]:.4f}")
print(f"Lag-1 ACF after diff : {acf_diff[1]:.4f}")
report("Differencing reduces persistence", abs(acf_diff[1]) < abs(acf_walk[1]))
if HAS_MPL:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes[0, 0].plot(walk, color=COLORS["error"])
axes[0, 0].set_title("Random walk levels")
axes[0, 1].bar(np.arange(21), acf_walk, color=COLORS["error"])
axes[0, 1].set_title("ACF of levels")
axes[1, 0].plot(diff_walk, color=COLORS["primary"])
axes[1, 0].set_title("First difference")
axes[1, 1].bar(np.arange(21), acf_diff, color=COLORS["primary"])
axes[1, 1].set_title("ACF after differencing")
for ax in axes.ravel():
ax.set_xlabel("Time / Lag")
ax.set_ylabel("Value")
fig.tight_layout()
plt.show()
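As a cross-check on the visual diagnostic, and assuming statsmodels is available in the environment (it is not a core dependency of this notebook), an augmented Dickey-Fuller test should fail to reject a unit root in the levels while rejecting it for the differences.

try:
    from statsmodels.tsa.stattools import adfuller
    for name, s in [("levels", walk), ("differences", diff_walk)]:
        stat, pvalue = adfuller(s)[:2]
        print(f"ADF on {name:<11}: stat={stat:7.3f}, p-value={pvalue:.4f}")
except ImportError:
    print("statsmodels not installed; skipping the ADF cross-check.")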
3.5 Wold Decomposition Intuition
A stationary AR process can be viewed as a filtered innovation sequence.
Code cell 31
phi = 0.6
psi = phi ** np.arange(12)
innovations = rng.normal(size=400)
x = np.zeros(400)
for t in range(1, 400):
x[t] = phi * x[t-1] + innovations[t]
x_recon = np.zeros_like(x)
for t in range(400):
s = 0.0
for j in range(min(t + 1, len(psi))):
s += psi[j] * innovations[t-j]
x_recon[t] = s
header("Wold-style innovation representation")
print("First six impulse-response weights:", psi[:6])
print(f"Truncated reconstruction RMSE: {np.sqrt(np.mean((x[50:] - x_recon[50:])**2)):.4f}")
report("Impulse response decays for |phi|<1", np.all(np.diff(psi) < 0))
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].stem(np.arange(len(psi)), psi, linefmt=COLORS["primary"], markerfmt="o", basefmt=" ")
axes[0].set_title("AR(1) impulse response weights")
axes[0].set_xlabel("Lag j")
axes[0].set_ylabel(r"$\psi_j$")
axes[1].plot(x[50:150], color=COLORS["secondary"], label="AR(1) path")
axes[1].plot(x_recon[50:150], color=COLORS["primary"], linestyle="--", label="Truncated innovation sum")
axes[1].set_title("Filtered-shock view of the process")
axes[1].set_xlabel("Time")
axes[1].set_ylabel("Value")
axes[1].legend()
fig.tight_layout()
plt.show()
4. Core Theory II: Linear Time-Series Models
4.1 Autoregressive Models AR(p)
Root conditions determine whether the recursion is stable.
Code cell 33
phi = np.array([1.2, -0.32])
roots = np.roots(np.array([-phi[1], -phi[0], 1.0]))
ar2 = simulate_ar(phi, n=400)
header("AR(2) stability via characteristic roots")
print("AR coefficients:", phi)
print("Characteristic roots:", roots)
print("Root magnitudes:", np.abs(roots))
report("Roots lie outside the unit circle", np.all(np.abs(roots) > 1))
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(ar2[:150], color=COLORS["primary"])
ax.set_title("Simulated AR(2) process")
ax.set_xlabel("Time")
ax.set_ylabel("Value")
fig.tight_layout()
plt.show()
4.2 Moving-Average Models MA(q)
MA models cut off in the ACF because only a finite number of shocks echo forward.
Code cell 35
ma = simulate_ma([0.8], n=500)
ma_acf = sample_acf(ma, 12)
header("MA(1) autocorrelation cutoff")
print("First six ACF values:", ma_acf[:6])
report("Lag-2 ACF is small for MA(1)", abs(ma_acf[2]) < 0.15)
if HAS_MPL:
fig, ax = plt.subplots()
ax.bar(np.arange(len(ma_acf)), ma_acf, color=COLORS["secondary"])
ax.set_title("ACF of an MA(1) process")
ax.set_xlabel("Lag")
ax.set_ylabel("ACF")
fig.tight_layout()
plt.show()
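The cutoff also has a closed form: an MA(1) with coefficient theta has lag-1 autocorrelation theta / (1 + theta^2) and exactly zero autocorrelation beyond lag 1, so the sample estimate above can be compared against theory directly.

theta = 0.8
rho1_theory = theta / (1 + theta**2)   # MA(1) lag-1 ACF; higher lags are zero
print(f"Theoretical lag-1 ACF: {rho1_theory:.4f}")
print(f"Sample lag-1 ACF     : {ma_acf[1]:.4f}")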
4.3 ARMA Models
When both level persistence and shock echo matter, both ACF and PACF usually decay.
Code cell 37
arma = simulate_arma([0.6], [0.5], n=600)
arma_acf = sample_acf(arma, 12)
arma_pacf = sample_pacf(arma, 12)
header("ARMA(1,1) signatures")
print("ACF first five values :", arma_acf[:5])
print("PACF first five values:", arma_pacf[:5])
report("Both ACF and PACF show nontrivial early lags", np.any(np.abs(arma_acf[1:4]) > 0.1) and np.any(np.abs(arma_pacf[1:4]) > 0.1))
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
axes[0].bar(np.arange(len(arma_acf)), arma_acf, color=COLORS["primary"])
axes[0].set_title("ACF of ARMA(1,1)")
axes[1].bar(np.arange(len(arma_pacf)), arma_pacf, color=COLORS["secondary"])
axes[1].set_title("PACF of ARMA(1,1)")
for ax in axes:
ax.set_xlabel("Lag")
ax.set_ylabel("Value")
fig.tight_layout()
plt.show()
4.4 ARIMA Models
Differencing can expose stationary ARMA-style structure in an integrated series.
Code cell 39
innovations = simulate_arma([0.5], [0.3], n=500)
arima_series = np.cumsum(innovations)
arima_diff = np.diff(arima_series)
header("ARIMA intuition through differencing")
print(f"Lag-1 ACF of levels : {sample_acf(arima_series, 1)[1]:.4f}")
print(f"Lag-1 ACF of differences: {sample_acf(arima_diff, 1)[1]:.4f}")
report("Differencing exposes more stationary dependence", abs(sample_acf(arima_diff, 1)[1]) < abs(sample_acf(arima_series, 1)[1]))
if HAS_MPL:
fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)
axes[0].plot(arima_series, color=COLORS["error"])
axes[0].set_title("Integrated series")
axes[0].set_ylabel("Level")
axes[1].plot(arima_diff, color=COLORS["primary"])
axes[1].set_title("First difference")
axes[1].set_xlabel("Time")
axes[1].set_ylabel("Change")
fig.tight_layout()
plt.show()
4.5 Model Identification with ACF/PACF and Information Criteria
Information criteria help arbitrate among plausible lag orders.
Code cell 41
x = simulate_ar([0.75, -0.18], n=420)
def fit_ar_conditional(x, p):
y = x[p:]
X = np.column_stack([x[p-k-1:-k-1] for k in range(p)])
beta = la.lstsq(X, y, rcond=None)[0]
resid = y - X @ beta
n = len(resid)
rss = np.dot(resid, resid)
aic = n * np.log(rss / n) + 2 * p
bic = n * np.log(rss / n) + p * np.log(n)
return beta, resid, aic, bic
header("Approximate AIC/BIC across AR orders")
results = []
for p in range(1, 6):
beta, resid, aic, bic = fit_ar_conditional(x, p)
results.append((p, aic, bic))
print(f"AR({p}) -> AIC={aic:8.3f}, BIC={bic:8.3f}")
best_aic = min(results, key=lambda row: row[1])[0]
best_bic = min(results, key=lambda row: row[2])[0]
print(f"Best AIC order: AR({best_aic})")
print(f"Best BIC order: AR({best_bic})")
report("Information criteria choose a low-order model", best_aic <= 3 and best_bic <= 3)
5. Core Theory III: Forecasting, Seasonality, and State Space
5.1 One-Step and Multi-Step Forecasting
Multi-step uncertainty grows because more unobserved shocks enter the forecast.
Code cell 43
phi = 0.82
sigma = 1.0
history = simulate_ar([phi], n=180)
last = history[-1]
horizons = np.arange(1, 13)
forecast = phi**horizons * last
fvar = sigma**2 * (1 - phi**(2 * horizons)) / (1 - phi**2)
header("AR(1) one-step and multi-step forecasts")
print("h mean sd")
for h, m, s in zip(horizons, forecast, np.sqrt(fvar)):
print(f"{h:>2d} {m:>7.3f} {s:>7.3f}")
report("Forecast intervals widen with horizon", np.all(np.diff(fvar) > 0))
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(np.arange(len(history)), history, color=COLORS["neutral"], label="Observed history")
future_index = np.arange(len(history), len(history) + len(horizons))
ax.plot(future_index, forecast, marker="o", color=COLORS["primary"], label="Forecast mean")
ax.fill_between(future_index,
forecast - 1.96 * np.sqrt(fvar),
forecast + 1.96 * np.sqrt(fvar),
color=COLORS["primary"], alpha=0.2, label="Approx. 95% band")
ax.set_title("Forecast path with horizon-dependent uncertainty")
ax.set_xlabel("Time")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
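Where the band widths come from, as a worked equation: iterating $x_{t+h} = \phi x_{t+h-1} + \varepsilon_{t+h}$ gives

$$x_{t+h} = \phi^h x_t + \sum_{j=0}^{h-1} \phi^j \varepsilon_{t+h-j}, \qquad \operatorname{Var}(x_{t+h} \mid x_t) = \sigma^2 \sum_{j=0}^{h-1} \phi^{2j} = \sigma^2 \, \frac{1 - \phi^{2h}}{1 - \phi^2},$$

which is exactly the fvar expression in the code above.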
5.2 Seasonality and Seasonal Decomposition
Additive decomposition separates level, periodic structure, and irregular remainder.
Code cell 45
n = 24 * 14
t = np.arange(n)
true_trend = 30 + 0.03 * t
true_seasonal = 3.5 * np.sin(2 * np.pi * t / 24) + 1.5 * np.cos(2 * np.pi * t / 24)
noise = rng.normal(scale=0.9, size=n)
series = true_trend + true_seasonal + noise
trend_est = rolling_mean(series, 24)
aligned = series[11:-12]
seasonal_raw = aligned - trend_est
seasonal_template = np.array([seasonal_raw[np.arange(i, len(seasonal_raw), 24)].mean() for i in range(24)])
seasonal_template -= seasonal_template.mean()
seasonal_est = seasonal_template[np.arange(len(seasonal_raw)) % 24]  # phase-aligned with seasonal_raw, which starts at t = 11
resid = seasonal_raw - seasonal_est
header("Simple seasonal decomposition")
print(f"Estimated seasonal template mean: {seasonal_template.mean():.6f}")
print(f"Residual mean (trimmed sample) : {resid.mean():.6f}")
report("Seasonal template is centered", abs(seasonal_template.mean()) < 1e-8)
if HAS_MPL:
fig, axes = plt.subplots(3, 1, figsize=(10, 8), sharex=True)
axes[0].plot(series, color=COLORS["primary"])
axes[0].set_title("Observed seasonal series")
axes[1].plot(true_trend, color=COLORS["secondary"], label="True trend")
axes[1].plot(np.arange(11, n-12), trend_est, color=COLORS["highlight"], linestyle="--", label="Moving-average trend")
axes[1].legend()
axes[1].set_title("Trend component")
axes[2].plot(seasonal_template, marker="o", color=COLORS["tertiary"])
axes[2].set_title("Estimated seasonal template (24-step period)")
axes[2].set_xlabel("Seasonal index")
for ax in axes:
ax.set_ylabel("Value")
fig.tight_layout()
plt.show()
5.3 SARIMA and Seasonal Structure
Seasonal differencing targets repeated structure at lag s.
Code cell 47
n = 24 * 20
t = np.arange(n)
seasonal_series = 2.8 * np.sin(2 * np.pi * t / 24) + simulate_ar([0.4], n=n, sigma=0.7)
seasonal_diff = seasonal_series[24:] - seasonal_series[:-24]
acf_before = sample_acf(seasonal_series, 30)
acf_after = sample_acf(seasonal_diff, 30)
header("Seasonal differencing")
print(f"Lag-24 ACF before differencing: {acf_before[24]:.4f}")
print(f"Lag-24 ACF after differencing : {acf_after[24]:.4f}")
report("Seasonal differencing reduces lag-24 correlation", abs(acf_after[24]) < abs(acf_before[24]))
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4), sharey=True)
axes[0].bar(np.arange(31), acf_before, color=COLORS["secondary"])
axes[0].set_title("ACF before seasonal differencing")
axes[1].bar(np.arange(31), acf_after, color=COLORS["primary"])
axes[1].set_title("ACF after seasonal differencing")
for ax in axes:
ax.set_xlabel("Lag")
ax.set_ylabel("ACF")
fig.tight_layout()
plt.show()
5.4 State-Space Models
Hidden-state dynamics separate signal evolution from noisy observations.
Code cell 49
n = 180
latent = np.zeros(n)
obs = np.zeros(n)
q, r = 0.12, 0.9
latent[0] = 0.0
obs[0] = latent[0] + rng.normal(scale=np.sqrt(r))
for t in range(1, n):
latent[t] = latent[t-1] + rng.normal(scale=np.sqrt(q))
obs[t] = latent[t] + rng.normal(scale=np.sqrt(r))
header("Linear-Gaussian state-space simulation")
print(f"Process noise variance q: {q:.3f}")
print(f"Observation noise variance r: {r:.3f}")
print(f"Observation RMSE vs state : {np.sqrt(np.mean((obs - latent)**2)):.4f}")
report("Observations are noisier than the latent state", np.std(obs - latent) > 0.5)
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(latent, color=COLORS["primary"], label="Latent state")
ax.plot(obs, color=COLORS["secondary"], alpha=0.6, label="Observed measurement")
ax.set_title("State-space model: hidden state and noisy observations")
ax.set_xlabel("Time")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
5.5 Kalman Filter
Recursive Gaussian updating improves state estimates from noisy measurements.
Code cell 51
filtered_mean, filtered_var, pred_mean, pred_var = kalman_filter_1d(obs, q=q, r=r, m0=0.0, p0=1.0)
obs_rmse = np.sqrt(np.mean((obs - latent)**2))
filt_rmse = np.sqrt(np.mean((filtered_mean - latent)**2))
header("Kalman filtering a local-level model")
print(f"Observation RMSE: {obs_rmse:.4f}")
print(f"Filtered RMSE : {filt_rmse:.4f}")
print(f"Final filtered variance: {filtered_var[-1]:.4f}")
report("Kalman filter improves over raw observations", filt_rmse < obs_rmse)
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(latent, color=COLORS["neutral"], label="True state")
ax.plot(obs, color=COLORS["secondary"], alpha=0.35, label="Observations")
ax.plot(filtered_mean, color=COLORS["primary"], label="Filtered mean")
ax.set_title("Kalman filter tracking a hidden state")
ax.set_xlabel("Time")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
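For this local-level model, the filtered variance should settle to a fixed point of the variance recursion. A small sketch, iterating the same update the filter uses: the steady-state variance and gain are a common, robust way to initialise a filter (see mistake 6 in section 8).

p_ss = 1.0
for _ in range(500):                      # iterate the variance recursion to convergence
    p_pred = p_ss + q
    p_ss = p_pred * r / (p_pred + r)      # equivalent to (1 - k) * p_pred
k_ss = (p_ss + q) / (p_ss + q + r)        # steady-state Kalman gain
print(f"Steady-state filtered variance: {p_ss:.4f}")
print(f"Steady-state Kalman gain      : {k_ss:.4f}")
print(f"Final filtered variance above : {filtered_var[-1]:.4f}")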
6. Advanced Topics
6.1 Spectral Analysis and the Frequency-Domain View
Frequency space reveals temporal scale directly.
Code cell 53
n = 512
t = np.arange(n)
low_freq = np.sin(2 * np.pi * t / 64) + 0.2 * rng.normal(size=n)
high_freq = np.sin(2 * np.pi * t / 8) + 0.2 * rng.normal(size=n)
f_low, p_low = periodogram(low_freq)
f_high, p_high = periodogram(high_freq)
header("Frequency-domain view")
print(f"Dominant low-frequency component : {f_low[np.argmax(p_low[1:]) + 1]:.4f}")
print(f"Dominant high-frequency component: {f_high[np.argmax(p_high[1:]) + 1]:.4f}")
report("Low-frequency signal peaks at lower frequency than high-frequency signal",
f_low[np.argmax(p_low[1:]) + 1] < f_high[np.argmax(p_high[1:]) + 1])
if HAS_MPL:
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes[0, 0].plot(low_freq, color=COLORS["primary"])
axes[0, 0].set_title("Low-frequency signal")
axes[0, 1].plot(f_low[1:], p_low[1:], color=COLORS["primary"])
axes[0, 1].set_title("Spectrum of low-frequency signal")
axes[1, 0].plot(high_freq, color=COLORS["secondary"])
axes[1, 0].set_title("High-frequency signal")
axes[1, 1].plot(f_high[1:], p_high[1:], color=COLORS["secondary"])
axes[1, 1].set_title("Spectrum of high-frequency signal")
for ax in axes.ravel():
ax.set_xlabel("Time / Frequency")
ax.set_ylabel("Value / Power")
fig.tight_layout()
plt.show()
6.2 Periodogram and Fourier Decomposition
The periodogram is a practical frequency detector for finite samples.
Code cell 55
n = 480
t = np.arange(n)
true_period = 30
mixed = 1.8 * np.sin(2 * np.pi * t / true_period) + 0.8 * np.sin(2 * np.pi * t / 10) + rng.normal(scale=0.6, size=n)
freqs, power = periodogram(mixed)
dom_idx = np.argmax(power[1:]) + 1
dom_freq = freqs[dom_idx]
est_period = 1 / dom_freq
header("Periodogram-based frequency detection")
print(f"True dominant period: {true_period}")
print(f"Estimated dominant period: {est_period:.2f}")
report("Estimated period is close to the true dominant period", abs(est_period - true_period) < 3.0)
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(freqs[1:], power[1:], color=COLORS["highlight"])
ax.axvline(dom_freq, color=COLORS["neutral"], linestyle="--", label=f"Estimated period {est_period:.1f}")
ax.set_title("Periodogram of a noisy periodic signal")
ax.set_xlabel("Frequency")
ax.set_ylabel("Power")
ax.legend()
fig.tight_layout()
plt.show()
6.3 Kalman Smoothing, Forecasting, and Missing Data
State-space models handle missing observations gracefully by propagating uncertainty forward.
Code cell 57
obs_missing = obs.copy()
missing_idx = np.arange(20, len(obs_missing), 17)
obs_missing[missing_idx] = np.nan
filt_missing, var_missing, _, _ = kalman_filter_1d(obs_missing, q=q, r=r, m0=0.0, p0=1.0)
header("Filtering with missing data")
print(f"Number of missing observations: {np.isnan(obs_missing).sum()}")
print(f"RMSE at missing locations : {np.sqrt(np.mean((filt_missing[missing_idx] - latent[missing_idx])**2)):.4f}")
report("Filter produces finite imputations at missing points", np.isfinite(filt_missing[missing_idx]).all())
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(latent, color=COLORS["neutral"], label="True state")
ax.plot(obs_missing, linestyle="None", marker="o", markersize=4, color=COLORS["secondary"], alpha=0.6, label="Observed / missing")
ax.plot(filt_missing, color=COLORS["primary"], label="Filtered state")
ax.set_title("Kalman filtering with missing observations")
ax.set_xlabel("Time")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
6.4 Multivariate Time Series Preview
In multiple dimensions, lag dependence becomes matrix-valued and cross-series effects matter.
Code cell 59
A = np.array([[0.7, 0.15], [0.05, 0.6]])
x = np.zeros((250, 2))
eps = rng.normal(scale=0.5, size=(250, 2))
for t in range(1, len(x)):
x[t] = A @ x[t-1] + eps[t]
header("Multivariate lag dependence preview")
cross_lag = np.corrcoef(x[:-1, 0], x[1:, 1])[0, 1]
print(f"Cross-lag correlation corr(X1_t, X2_t+1): {cross_lag:.4f}")
report("Cross-series dependence is present", abs(cross_lag) > 0.05)
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(x[:, 0], color=COLORS["primary"], label="Series 1")
ax.plot(x[:, 1], color=COLORS["secondary"], label="Series 2")
ax.set_title("Vector autoregressive preview")
ax.set_xlabel("Time")
ax.set_ylabel("Value")
ax.legend()
fig.tight_layout()
plt.show()
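A stability check mirrors the AR root condition of section 4.1: a VAR(1) is stationary when every eigenvalue of the transition matrix lies strictly inside the unit circle.

eigvals = la.eigvals(A)
print("Eigenvalues of A     :", eigvals)
print("Eigenvalue magnitudes:", np.abs(eigvals))
print("Stationary VAR(1)?   :", bool(np.all(np.abs(eigvals) < 1)))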
6.5 Change Points, Regime Shifts, and Structural Breaks
When the data-generating rules change, a stationary model can fail for the right reason.
Code cell 61
x = np.r_[rng.normal(loc=0.0, scale=1.0, size=120),
rng.normal(loc=2.5, scale=1.0, size=120)]
left = np.array([x[max(0, i-20):i].mean() if i > 0 else np.nan for i in range(len(x))])
right = np.array([x[i:min(len(x), i+20)].mean() if i < len(x) - 1 else np.nan for i in range(len(x))])
score = np.abs(right - left)
est_break = np.nanargmax(score)
header("Detecting a structural break")
print(f"True break location : 120")
print(f"Estimated break location: {est_break}")
report("Estimated break is close to truth", abs(est_break - 120) <= 10)
if HAS_MPL:
fig, axes = plt.subplots(2, 1, figsize=(10, 7), sharex=True)
axes[0].plot(x, color=COLORS["primary"])
axes[0].axvline(120, color=COLORS["neutral"], linestyle="--", label="True break")
axes[0].axvline(est_break, color=COLORS["error"], linestyle=":", label="Estimated break")
axes[0].set_title("Series with a mean shift")
axes[0].legend()
axes[1].plot(score, color=COLORS["error"])
axes[1].set_title("Rolling mean-shift score")
axes[1].set_xlabel("Time")
axes[1].set_ylabel("Score")
fig.tight_layout()
plt.show()
7. Applications in Machine Learning
7.1 Forecasting for Demand, Traffic, and Monitoring
Classical seasonal baselines still matter in real operational forecasting.
Code cell 63
n = 24 * 28
t = np.arange(n)
demand = 50 + 8 * np.sin(2 * np.pi * t / 24) + 0.03 * t + rng.normal(scale=1.2, size=n)
train, test = demand[:-24], demand[-24:]
naive_forecast = np.repeat(train[-1], 24)
seasonal_forecast = train[-24:]
naive_rmse = np.sqrt(np.mean((test - naive_forecast)**2))
seasonal_rmse = np.sqrt(np.mean((test - seasonal_forecast)**2))
header("Operational forecasting baseline")
print(f"Naive last-value RMSE : {naive_rmse:.4f}")
print(f"Seasonal baseline RMSE: {seasonal_rmse:.4f}")
report("Seasonal baseline beats naive forecast", seasonal_rmse < naive_rmse)
if HAS_MPL:
fig, ax = plt.subplots()
ax.plot(np.arange(len(train)), train, color=COLORS["neutral"], label="Train")
test_idx = np.arange(len(train), len(train) + len(test))
ax.plot(test_idx, test, color=COLORS["primary"], label="Test truth")
ax.plot(test_idx, naive_forecast, color=COLORS["error"], linestyle="--", label="Naive")
ax.plot(test_idx, seasonal_forecast, color=COLORS["secondary"], linestyle="--", label="Seasonal baseline")
ax.set_title("Forecasting demand with classical baselines")
ax.set_xlabel("Time")
ax.set_ylabel("Demand")
ax.legend()
fig.tight_layout()
plt.show()
7.2-7.5 Modern Sequence Modeling Links
Temporal encodings, hidden-state recursions, and pretrained forecasters still depend on the same notions of scale, memory, and seasonality.
Code cell 65
positions = np.arange(48)
freqs = np.array([1/48, 1/24, 1/12, 1/6])
encoding = np.column_stack([
np.sin(2 * np.pi * positions[:, None] * freqs),
np.cos(2 * np.pi * positions[:, None] * freqs),
])
A = np.array([[0.92, 0.0], [0.0, 0.55]])
state = np.zeros((80, 2))
state[0] = np.array([1.0, 1.0])
for t in range(1, len(state)):
state[t] = A @ state[t-1]
header("Modern sequence-model links")
print(f"Temporal encoding matrix shape: {encoding.shape}")
print(f"Slow state half-life indicator : {np.where(state[:,0] < 0.5)[0][0]}")
print(f"Fast state half-life indicator : {np.where(state[:,1] < 0.5)[0][0]}")
report("Different state dimensions remember information at different scales",
np.where(state[:,0] < 0.5)[0][0] > np.where(state[:,1] < 0.5)[0][0])
if HAS_MPL:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
im = axes[0].imshow(encoding.T, aspect="auto", cmap="viridis")
axes[0].set_title("Fourier-style temporal features")
axes[0].set_xlabel("Position")
axes[0].set_ylabel("Feature index")
fig.colorbar(im, ax=axes[0], label="Value")
axes[1].plot(state[:, 0], color=COLORS["primary"], label="Slow mode")
axes[1].plot(state[:, 1], color=COLORS["secondary"], label="Fast mode")
axes[1].set_title("State-space memory at multiple timescales")
axes[1].set_xlabel("Step")
axes[1].set_ylabel("State value")
axes[1].legend()
fig.tight_layout()
plt.show()
8. Common Mistakes
| # | Mistake | Why it is wrong | Fix |
|---|---|---|---|
| 1 | Applying OLS without checking stationarity | Non-stationary regressors produce spurious regressions | Difference or detrend first; check ADF/KPSS |
| 2 | Interpreting the ACF alone to identify AR order | The ACF of an AR process decays gradually instead of cutting off | Use the PACF, which cuts off at lag p for AR(p) |
| 3 | Seasonal differencing without regular differencing | Can leave non-seasonal unit root in place | Apply both ∇ and ∇_s as needed |
| 4 | Using MSE as the only forecast metric | MSE heavily penalises outliers and ignores calibration | Report MAE/MASE for point forecasts and CRPS for probabilistic ones |
| 5 | Treating a train/test split like i.i.d. data | A random split leaks future information into training | Always split by time; use expanding or rolling windows (see the sketch after this table) |
| 6 | Ignoring Kalman filter initialisation | Poor prior covariance P0 distorts early estimates | Use diffuse prior or steady-state initialisation |
| 7 | Assuming Transformer positional encoding captures periodicity | Sinusoidal PE does not explicitly encode seasonal patterns | Add Fourier features or seasonal embeddings |
| 8 | Confusing autocorrelation with cross-correlation | ACF is a single-series property | Use CCF for lead-lag relationships between two series |
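A minimal sketch of mistakes 4 and 5 done right: rolling-origin evaluation that only ever trains on past data, reporting MAE alongside RMSE. The series and fold sizes are illustrative.

demo = np.cumsum(rng.normal(size=300))                   # illustrative nonstationary series
horizon, n_folds = 10, 5
for fold in range(n_folds):
    split = len(demo) - (n_folds - fold) * horizon
    fold_train, fold_test = demo[:split], demo[split:split + horizon]
    forecast = np.repeat(fold_train[-1], horizon)        # naive last-value baseline
    mae = np.mean(np.abs(fold_test - forecast))
    rmse = np.sqrt(np.mean((fold_test - forecast)**2))
    print(f"Fold {fold + 1}: train size {split:3d}, MAE {mae:.3f}, RMSE {rmse:.3f}")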
Recap
The classical toolkit already contains the essential questions modern sequence models must answer: what depends on what, what repeats, what is hidden, how forecast uncertainty grows, and when the data-generating rules have changed.