Math for LLMs: Theory Notebook

Fourier Analysis and Signal Processing / Convolution Theorem

Converted from theory.ipynb for web reading.

§04 Convolution Theorem

"The convolution theorem is the most important theorem in signal processing." — Alan V. Oppenheim

Interactive derivations: Convolution Theorem proofs, LTI systems, overlap-add, Wiener filter, CNNs, S4/Mamba, Hyena, FNet.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import scipy.signal as sp_sig
import scipy.fft as sp_fft
import matplotlib.pyplot as plt
import matplotlib as mpl
try:
    import seaborn as sns
    sns.set_theme(style='whitegrid', palette='colorblind')
    HAS_SNS = True
except ImportError:
    plt.style.use('seaborn-v0_8-whitegrid')
    HAS_SNS = False
mpl.rcParams.update({
    'figure.figsize':(10,6),'figure.dpi':120,'font.size':13,
    'axes.titlesize':15,'axes.labelsize':13,
    'axes.spines.top':False,'axes.spines.right':False,
})
np.random.seed(42)
COLORS = {'primary':'#0077BB','secondary':'#EE7733','tertiary':'#009988',
          'error':'#CC3311','neutral':'#555555','highlight':'#EE3377'}
print('Setup complete.')

1. Intuition: What Is Convolution?

Convolution blends two functions by sliding one over the other:

$$(f * g)(t) = \int_{-\infty}^{\infty} f(\tau)\, g(t-\tau)\, d\tau$$
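
A quick numerical check of the integral definition (a sketch, not part of the original notebook): the convolution of two zero-mean Gaussian densities is again Gaussian, with the variances adding.

```python
import numpy as np

# Convolve two Gaussian pdfs numerically and compare against the closed form:
# N(0, s1^2) * N(0, s2^2) = N(0, s1^2 + s2^2).
t = np.linspace(-10, 10, 4001)
dt = t[1] - t[0]

def gauss(t, s):
    return np.exp(-t**2 / (2 * s**2)) / (s * np.sqrt(2 * np.pi))

s1, s2 = 0.8, 1.3
num = np.convolve(gauss(t, s1), gauss(t, s2), mode='same') * dt  # Riemann sum of the integral
ref = gauss(t, np.sqrt(s1**2 + s2**2))                           # closed form
print('max |numeric - closed form| =', np.max(np.abs(num - ref)))
```

The `* dt` factor turns the discrete sum into a Riemann approximation of the integral; with this grid the two curves agree to well below 1e-4.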

Code cell 5

# === 1.1 Convolution as sliding overlap ===
# Visualize convolution as a sliding inner product
N = 200
t = np.linspace(-3, 3, N)
dt = t[1] - t[0]

# Rect pulse and Gaussian
f = ((t >= -0.5) & (t <= 0.5)).astype(float)
g = np.exp(-4 * t**2)

# Linear convolution via numpy
conv_fg = np.convolve(f, g, mode='same') * dt

fig, axes = plt.subplots(1, 3, figsize=(14, 5))
fig.suptitle('Convolution: sliding overlap of f and g', fontsize=14)
axes[0].plot(t, f, color=COLORS['primary'], linewidth=2, label='f (rect)')
axes[0].set_title('f(t)'); axes[0].set_xlabel('t'); axes[0].legend()
axes[1].plot(t, g, color=COLORS['secondary'], linewidth=2, label='g (Gaussian)')
axes[1].set_title('g(t)'); axes[1].set_xlabel('t'); axes[1].legend()
axes[2].plot(t, conv_fg, color=COLORS['tertiary'], linewidth=2, label='(f*g)(t)')
axes[2].set_title('(f * g)(t)'); axes[2].set_xlabel('t'); axes[2].legend()
fig.tight_layout(); plt.show()
print('Convolution of rect with Gaussian gives a smoothed pulse.')

Code cell 6

# === 1.2 Polynomial multiplication as convolution ===
p1 = np.array([1, 2, 3])  # 1 + 2x + 3x^2
p2 = np.array([4, 5])     # 4 + 5x
conv_poly = np.convolve(p1, p2)
np_poly = np.polymul(p1[::-1], p2[::-1])[::-1]
print('p1 = 1 + 2x + 3x^2 -> coefficients:', p1)
print('p2 = 4 + 5x         -> coefficients:', p2)
print('p1 * p2 via np.convolve:', conv_poly)
print('Expected (4 + 13x + 22x^2 + 15x^3):', [4, 13, 22, 15])
ok = np.allclose(conv_poly, np_poly)
print(f'PASS — polynomial multiplication = convolution: {ok}')

2. Formal Definitions

2.1 Linear vs Circular Convolution

Linear: $(x * h)[n] = \sum_k x[k]\,h[n-k]$, output length $M+N-1$

Circular: $(x \circledast h)[n] = \sum_k x[k]\,h[(n-k)\bmod N]$, output length $N$ (wraps around)
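
The two definitions can be checked against each other with naive $O(N^2)$ loops; a minimal sketch (illustrative only, not part of the original notebook):

```python
import numpy as np

def linear_conv(x, h):
    """Direct sum  y[n] = sum_k x[k] h[n-k];  output length M+N-1."""
    M, N = len(x), len(h)
    y = np.zeros(M + N - 1)
    for n in range(M + N - 1):
        for k in range(M):
            if 0 <= n - k < N:
                y[n] += x[k] * h[n - k]
    return y

def circular_conv(x, h, N):
    """Direct sum with the difference index wrapped modulo N."""
    xp = np.pad(x, (0, N - len(x)))
    hp = np.pad(h, (0, N - len(h)))
    return np.array([sum(xp[k] * hp[(n - k) % N] for k in range(N))
                     for n in range(N)])

x_demo = np.array([1., 2., 3., 4.])
h_demo = np.array([1., -1., 2.])
print(linear_conv(x_demo, h_demo))        # length 4+3-1 = 6, matches np.convolve
print(circular_conv(x_demo, h_demo, 4))   # tail wraps onto the head
print(circular_conv(x_demo, h_demo, 6))   # zero-padded: equals the linear result
```

With $N = 4$ the last two linear-output samples fold back onto the first two; with $N \geq 6$ the wrap lands on zero padding and the circular result equals the linear one.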

Code cell 8

# === 2.1 Linear vs circular convolution comparison ===
x = np.array([3, 1, 4, 1, 5], dtype=float)
h = np.array([1, 2, 1], dtype=float)

# Linear convolution
y_lin = np.convolve(x, h)
print('x:', x)
print('h:', h)
print('Linear conv (length M+N-1 =', len(y_lin), '):', y_lin)

# Circular convolution (5-point, WRONG due to wrap-around)
N_circ = len(x)
y_circ5 = np.real(np.fft.ifft(np.fft.fft(x, n=N_circ) * np.fft.fft(h, n=N_circ)))
print('Circular conv (N=5, wrap-around):', np.round(y_circ5, 6))

# Circular convolution (zero-padded, CORRECT)
N_pad = 2**int(np.ceil(np.log2(len(x) + len(h) - 1)))
y_circ_correct = np.real(np.fft.ifft(
    np.fft.fft(x, n=N_pad) * np.fft.fft(h, n=N_pad)))[:len(y_lin)]
print('Circular conv (N=', N_pad, ', zero-padded, correct):', np.round(y_circ_correct, 6))
print(f'PASS — zero-padded circular == linear: {np.allclose(y_circ_correct, y_lin)}')

3. The Convolution Theorem

$$\mathcal{F}(f * g)(\xi) = \hat{f}(\xi) \cdot \hat{g}(\xi)$$

Convolution in time = pointwise multiplication in frequency.
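
For the discrete, circular case there is a one-line linear-algebra reason the theorem holds: circular convolution by a kernel $c$ is multiplication by the circulant matrix of $c$, and the DFT diagonalizes every circulant matrix. A small sketch using `scipy.linalg.circulant` (not part of the original notebook):

```python
import numpy as np
from scipy.linalg import circulant

# The DFT matrix F diagonalizes any circulant matrix:
#   C = F^{-1} diag(fft(c)) F,
# which is exactly the statement fft(C @ v) = fft(c) * fft(v).
rng = np.random.default_rng(0)
n = 8
c = rng.standard_normal(n)
C = circulant(c)                        # (C @ v) is the circular convolution c * v
F = np.fft.fft(np.eye(n), axis=0)       # DFT matrix: F @ v == np.fft.fft(v)
C_rebuilt = np.linalg.inv(F) @ np.diag(np.fft.fft(c)) @ F
print('max |C - F^-1 diag(fft(c)) F| =', np.max(np.abs(C - C_rebuilt)))
```

The eigenvalues of the convolution operator are precisely the DFT of its kernel, which is why pointwise multiplication in frequency reproduces convolution in time.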

Code cell 10

# === 3.1 Convolution Theorem verification ===
N_ct = 256
x_ct = np.random.randn(N_ct)
h_ct = np.array([0.25, 0.5, 0.25])  # simple lowpass

# Method A: direct convolution, trim to same length
y_direct = np.convolve(x_ct, h_ct, mode='same')

# Method B: FFT route (zero-pad, multiply, IFFT)
N_fft = 2**int(np.ceil(np.log2(N_ct + len(h_ct) - 1)))
X = np.fft.rfft(x_ct, n=N_fft)
H = np.fft.rfft(h_ct, n=N_fft)
y_fft = np.fft.irfft(X * H, n=N_fft)[:N_ct]

# Compare (the 'same' mode in np.convolve shifts by len(h)//2)
shift = len(h_ct) // 2
err = np.max(np.abs(y_fft[shift:] - y_direct[:-shift]))
print(f'FFT-based conv vs direct (alignment-corrected): max error = {err:.2e}')
print(f'PASS — Convolution Theorem verified: {err < 1e-10}')

# Visualize magnitude spectra
freqs = np.fft.rfftfreq(N_fft)
fig, axes = plt.subplots(1, 3, figsize=(14, 5))
fig.suptitle('Convolution Theorem: F(x) · F(h) = F(x*h)', fontsize=14)
axes[0].plot(freqs, np.abs(X), color=COLORS['primary'], linewidth=1.5)
axes[0].set_title('|X(f)|'); axes[0].set_xlabel('Frequency')
axes[1].plot(freqs, np.abs(H), color=COLORS['secondary'], linewidth=2)
axes[1].set_title('|H(f)| (lowpass filter)'); axes[1].set_xlabel('Frequency')
axes[2].plot(freqs, np.abs(X*H), color=COLORS['tertiary'], linewidth=1.5)
axes[2].set_title('|X(f)·H(f)| = |Y(f)|'); axes[2].set_xlabel('Frequency')
fig.tight_layout(); plt.show()

Code cell 11

# === 3.2 Dual: multiplication in time = convolution in frequency ===
# Windowing effect: multiply by Hann window -> convolves spectrum
N_win = 512
t_sig = np.arange(N_win) / N_win
f0 = 0.1  # normalized frequency (cycles per sample)
x_pure = np.sin(2*np.pi*f0 * np.arange(N_win))  # pure tone at f0
w_hann = np.hanning(N_win)
x_windowed = x_pure * w_hann

freqs_w = np.fft.rfftfreq(N_win)
X_pure = np.fft.rfft(x_pure)
X_wind = np.fft.rfft(x_windowed)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle('Dual theorem: time-domain multiplication = spectral convolution', fontsize=13)
axes[0].plot(freqs_w, 20*np.log10(np.abs(X_pure)+1e-15),
             color=COLORS['primary'], linewidth=1.5, label='No window')
axes[0].plot(freqs_w, 20*np.log10(np.abs(X_wind)+1e-15),
             color=COLORS['secondary'], linewidth=1.5, label='Hann window')
axes[0].set_xlim(0, 0.3); axes[0].set_ylim(-80, 20)
axes[0].set_title('Spectral leakage reduction via windowing')
axes[0].set_xlabel('Normalized frequency'); axes[0].set_ylabel('dB')
axes[0].legend()
axes[1].plot(t_sig, x_pure, color=COLORS['primary'], alpha=0.6, label='Pure sinusoid')
axes[1].plot(t_sig, x_windowed, color=COLORS['secondary'], linewidth=2, label='Windowed')
axes[1].set_title('Time domain')
axes[1].set_xlabel('Time (normalized)'); axes[1].legend()
fig.tight_layout(); plt.show()

4. Circular vs. Linear Convolution

Circular convolution wraps the index modulo $N$. Zero-padding to $N \geq M+L-1$ makes circular and linear convolution agree.

Code cell 13

# === 4.1 Wrap-around artifact visualization ===
x_wa = np.array([1., 2., 3., 4., 5.])
h_wa = np.array([1., 0., -1.])
N_x, N_h = len(x_wa), len(h_wa)

y_lin_wa = np.convolve(x_wa, h_wa)  # correct linear

# Circular with N=5 (wrong: wrap-around)
y_circ_bad = np.real(np.fft.ifft(
    np.fft.fft(x_wa, n=5) * np.fft.fft(h_wa, n=5)))

# Circular with N=8 (zero-padded: correct)
N_ok = 2**int(np.ceil(np.log2(N_x + N_h - 1)))
y_circ_ok = np.real(np.fft.ifft(
    np.fft.fft(x_wa, n=N_ok) * np.fft.fft(h_wa, n=N_ok)))[:N_x + N_h - 1]

print('x:', x_wa, ' h:', h_wa)
print('Linear conv (correct):     ', y_lin_wa)
print('Circular N=5 (wrap error): ', np.round(y_circ_bad, 6))
print('Circular N=8 (padded OK):  ', np.round(y_circ_ok, 6))
print(f'Padded == linear: {np.allclose(y_circ_ok, y_lin_wa)}')

fig, ax = plt.subplots(figsize=(10, 5))
ax.stem(np.arange(len(y_lin_wa)), y_lin_wa,
        linefmt='C0-', markerfmt='o', basefmt=' ', label='Linear (correct)')
ax.stem(np.arange(len(y_circ_bad)), y_circ_bad,
        linefmt='C3--', markerfmt='s', basefmt=' ', label='Circular N=5 (wrap error)')
ax.set_title('Circular wrap-around artifact vs. zero-padded correct result')
ax.set_xlabel('Sample index'); ax.set_ylabel('Amplitude')
ax.legend(); fig.tight_layout(); plt.show()

Code cell 14

# === 4.2 FFT convolution cost vs direct cost ===
import time
Ns = [2**k for k in range(4, 15)]
t_direct, t_fft = [], []

for N_cost in Ns:
    x_c = np.random.randn(N_cost)
    h_c = np.random.randn(min(N_cost, 128))
    # Direct
    t0 = time.perf_counter()
    for _ in range(3): np.convolve(x_c, h_c)
    t_direct.append((time.perf_counter()-t0)/3)
    # FFT-based
    t0 = time.perf_counter()
    for _ in range(3): sp_sig.fftconvolve(x_c, h_c)
    t_fft.append((time.perf_counter()-t0)/3)

fig, ax = plt.subplots(figsize=(10, 6))
ax.loglog(Ns, t_direct, 'o-', color=COLORS['primary'], linewidth=2, label='np.convolve O(N·L)')
ax.loglog(Ns, t_fft, 's-', color=COLORS['secondary'], linewidth=2, label='fftconvolve O(N log N)')
ax.set_title('Direct vs FFT-based convolution timing')
ax.set_xlabel('Signal length N'); ax.set_ylabel('Time (s)')
ax.legend(); fig.tight_layout(); plt.show()
crossover = next((N for N, d, f in zip(Ns, t_direct, t_fft) if d > f), None)
print(f'FFT beats direct for N >= {crossover}' if crossover else 'Direct was never slower in this sweep.')

5. LTI Systems and Filter Theory

An LTI system is fully described by its impulse response $h$. Output: $y = h * x$. Frequency response: $H(f) = \mathcal{F}(h)(f)$.
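
A minimal sketch of this characterization (not part of the original notebook; the Butterworth filter here is an arbitrary stand-in for any LTI "black box"): measure $h$ by feeding a unit impulse, then predict the response to any input by convolution.

```python
import numpy as np
from scipy.signal import butter, lfilter

# Characterize an LTI system by its impulse response, then predict its
# output on fresh data via y = h * x.
b, a = butter(4, 0.25)                 # the system (IIR lowpass)

n_h = 200                              # truncation length; this IIR h decays fast
impulse = np.zeros(n_h)
impulse[0] = 1.0
h = lfilter(b, a, impulse)             # measured impulse response

rng = np.random.default_rng(0)
x = rng.standard_normal(500)
y_system = lfilter(b, a, x)            # run the system directly
y_conv = np.convolve(x, h)[:len(x)]    # predict via convolution
print('max |direct - convolution| =', np.max(np.abs(y_system - y_conv)))
```

The tiny residual is the truncation error of cutting the (infinitely long but exponentially decaying) IIR impulse response at 200 samples.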

Code cell 16

# === 5.1 Frequency response of FIR filters ===
fs_lti = 1000.0  # Hz

# Design three FIR lowpass filters
filters = {
    'Rect (L=51)': np.ones(51)/51,
    'Hann-windowed (L=51)': sp_sig.firwin(51, 0.2),
    'Kaiser beta=8 (L=51)': sp_sig.firwin(51, 0.2, window=('kaiser', 8.0)),
}

fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('FIR lowpass filter frequency responses', fontsize=14)
colors_f = [COLORS['primary'], COLORS['secondary'], COLORS['tertiary']]

for (name, h_f), col in zip(filters.items(), colors_f):
    w, H = sp_sig.freqz(h_f, worN=4096, fs=fs_lti)
    axes[0].plot(w, 20*np.log10(np.abs(H)+1e-15), color=col, linewidth=2, label=name)
    axes[1].plot(w, -np.unwrap(np.angle(H)), color=col, linewidth=2, label=name)

axes[0].set_xlim(0, 500); axes[0].set_ylim(-80, 5)
axes[0].set_title('Magnitude response (dB)')
axes[0].set_xlabel('Frequency (Hz)'); axes[0].set_ylabel('dB'); axes[0].legend(fontsize=9)
axes[1].set_xlim(0, 500)
axes[1].set_title('Phase response (rad)')
axes[1].set_xlabel('Frequency (Hz)'); axes[1].set_ylabel('Phase (rad)'); axes[1].legend(fontsize=9)
fig.tight_layout(); plt.show()

Code cell 17

# === 5.2 Group delay analysis ===
# FIR symmetric filter -> linear phase -> constant group delay
h_fir = sp_sig.firwin(51, 0.2, window='hann')
w_gd, gd = sp_sig.group_delay((h_fir, 1), w=4096, fs=fs_lti)

# Butterworth IIR filter -> non-linear phase
b_iir, a_iir = sp_sig.butter(5, 0.2)
w_gd_iir, gd_iir = sp_sig.group_delay((b_iir, a_iir), w=4096, fs=fs_lti)

fig, axes = plt.subplots(1, 2, figsize=(13, 5))
fig.suptitle('Group delay: FIR (linear phase) vs IIR (dispersive)', fontsize=13)
axes[0].plot(w_gd, gd, color=COLORS['primary'], linewidth=2)
axes[0].set_title('FIR Hann-windowed (symmetric -> linear phase)')
axes[0].set_xlabel('Frequency (Hz)'); axes[0].set_ylabel('Group delay (samples)')
axes[0].set_xlim(0, 500)
axes[1].plot(w_gd_iir, gd_iir, color=COLORS['secondary'], linewidth=2)
axes[1].set_title('Butterworth IIR order 5 (non-linear phase)')
axes[1].set_xlabel('Frequency (Hz)'); axes[1].set_ylabel('Group delay (samples)')
axes[1].set_xlim(0, 250); axes[1].set_ylim(0, 30)
fig.tight_layout(); plt.show()
print(f'FIR group delay: min={gd[:512].min():.2f}, max={gd[:512].max():.2f} (nearly constant)')
print(f'IIR group delay: min={gd_iir[:256].min():.2f}, max={gd_iir[:256].max():.2f} (varies)')

6. Efficient Long Convolution: Overlap-Add

Overlap-add processes a long signal in blocks of size $B$, FFT-convolving each block with the length-$L$ filter and summing the overlapping tails. With $B$ on the order of $L$, the total cost for a length-$M$ signal is $O(M \log L)$.

Code cell 19

# === 6.1 Overlap-Add implementation from scratch ===
def overlap_add(x, h, block_size=None):
    L = len(h)
    if block_size is None:
        block_size = 4 * L
    N_fft = 2**int(np.ceil(np.log2(block_size + L - 1)))
    H_oa = np.fft.rfft(h, n=N_fft)
    n_out = len(x) + L - 1
    y_oa = np.zeros(n_out)
    n_blocks = int(np.ceil(len(x) / block_size))
    for m in range(n_blocks):
        block = x[m*block_size:(m+1)*block_size]
        X_block = np.fft.rfft(block, n=N_fft)
        y_block = np.fft.irfft(X_block * H_oa, n=N_fft)
        start = m * block_size
        end = min(start + N_fft, n_out)
        y_oa[start:end] += y_block[:end - start]
    return y_oa[:n_out]

# Test
x_oa = np.random.randn(1024)
h_oa = sp_sig.firwin(64, 0.3, window='hann')
y_oa = overlap_add(x_oa, h_oa, block_size=256)
y_ref = sp_sig.fftconvolve(x_oa, h_oa)
err_oa = np.max(np.abs(y_oa - y_ref))
print(f'Overlap-add vs fftconvolve: max error = {err_oa:.2e}')
print(f'PASS — overlap-add correctness: {err_oa < 1e-10}')

7. Cross-Correlation and Power Spectral Density

Cross-correlation measures the similarity between two signals as a function of the lag between them. When both signals are the same (autocorrelation), the Wiener-Khinchin theorem establishes a deep connection: the power spectral density (PSD) is exactly the Fourier transform of the autocorrelation function.

$$R_{xy}[m] = \sum_{n} x^*[n]\, y[n+m] = (x^*[-\cdot] * y)[m]$$

$$S_{xx}(f) = \mathcal{F}\{R_{xx}[m]\}(f) = |X(f)|^2$$

This connects time-domain statistics to frequency-domain energy distribution — the foundation of spectral analysis.

Code cell 21

# === 7.1 Cross-Correlation Theorem ===
# R_xy[m] = (x[-·] * y)[m]  ↔  X*(f) · Y(f)

import numpy as np
import matplotlib.pyplot as plt

COLORS = {
    'primary':   '#0077BB',
    'secondary': '#EE7733',
    'tertiary':  '#009988',
    'error':     '#CC3311',
    'neutral':   '#555555',
    'highlight': '#EE3377',
}

np.random.seed(42)
N = 256
t = np.arange(N)

# Two signals: x and delayed copy y
delay = 30
x = np.zeros(N)
x[20:70] = np.hanning(50)
noise = 0.2 * np.random.randn(N)
y = np.roll(x, delay) + noise

# Cross-correlation via FFT (frequency domain)
X = np.fft.fft(x)
Y = np.fft.fft(y)
R_fft = np.fft.ifft(np.conj(X) * Y).real

# Reference: linear cross-correlation via np.correlate
R_lin = np.correlate(y, x, mode='full')  # R_xy[m] sits at index m + (N-1)
lag_ref = np.argmax(R_lin) - (N - 1)
print(f'Reference lag via np.correlate: {lag_ref} samples')

# Detected delay = argmax of cross-correlation
detected = np.argmax(R_fft)
print(f'True delay:     {delay} samples')
print(f'Detected delay: {detected} samples')
assert detected == delay, f'Expected {delay}, got {detected}'
print('PASS — delay detection correct')

fig, axes = plt.subplots(3, 1, figsize=(10, 8))
axes[0].plot(t, x, color=COLORS['primary'], label='x[n]')
axes[0].plot(t, y, color=COLORS['secondary'], alpha=0.7, label=f'y[n] = x[n-{delay}] + noise')
axes[0].set_title('Signals')
axes[0].legend()
axes[0].set_xlabel('Sample')

lags = np.arange(-N//2, N//2)
R_shift = np.roll(R_fft, -N//2)
axes[1].plot(lags, R_shift, color=COLORS['tertiary'])
axes[1].axvline(delay, color=COLORS['error'], ls='--', label=f'True delay={delay}')
axes[1].set_title('Cyclic Cross-Correlation $R_{xy}[m]$ (FFT method)')
axes[1].legend()
axes[1].set_xlabel('Lag m')

# Autocorrelation and PSD
R_xx = np.fft.ifft(np.abs(X)**2).real
PSD = np.abs(X)**2 / N
freqs = np.fft.fftfreq(N)
half = N//2
axes[2].semilogy(freqs[:half], PSD[:half], color=COLORS['highlight'])
axes[2].set_title('Power Spectral Density (PSD = |X(f)|²/N)')
axes[2].set_xlabel('Normalized Frequency')
axes[2].set_ylabel('Power')

plt.tight_layout()
plt.savefig('/tmp/conv_xcorr.png', dpi=100, bbox_inches='tight')
plt.show()
print('Cross-correlation and PSD plotted.')

Code cell 22

# === 7.2 Wiener-Khinchin Theorem Verification ===
# PSD(f) = FT{R_xx[m]}(f)

# Generate wide-sense stationary process: filtered noise
np.random.seed(42)
N_long = 4096
noise = np.random.randn(N_long)

# Bandpass filter: keep only mid frequencies
from scipy.signal import firwin, lfilter
b_bp = firwin(64, [0.1, 0.3], pass_zero=False)
x_bp = lfilter(b_bp, 1.0, noise)

# Method 1: PSD via Wiener-Khinchin (autocorrelation → FFT)
R = np.fft.ifft(np.abs(np.fft.fft(x_bp))**2).real / N_long
R = np.roll(R, N_long//2)  # center around lag=0
lags = np.arange(-N_long//2, N_long//2)

# Method 2: Periodogram (direct |X(f)|²/N)
PSD_periodogram = np.abs(np.fft.fft(x_bp))**2 / N_long
freqs = np.fft.fftfreq(N_long)

# Method 3: Welch's method (scipy)
from scipy.signal import welch
f_welch, PSD_welch = welch(x_bp, nperseg=512)
# f_welch is already in normalized frequency units [0, 0.5]

fig, axes = plt.subplots(1, 2, figsize=(12, 4))

# Autocorrelation (only show central lags)
show = 200
axes[0].plot(lags[N_long//2-show:N_long//2+show],
             R[N_long//2-show:N_long//2+show],
             color=COLORS['primary'])
axes[0].set_title('Autocorrelation $R_{xx}[m]$')
axes[0].set_xlabel('Lag m')
axes[0].axvline(0, color='gray', ls='--', alpha=0.5)

half = N_long//2
axes[1].semilogy(freqs[:half], PSD_periodogram[:half],
                 alpha=0.4, color=COLORS['secondary'], label='Periodogram')
axes[1].semilogy(f_welch, PSD_welch,
                 color=COLORS['primary'], lw=2, label="Welch's estimate")
axes[1].axvspan(0.1, 0.3, alpha=0.15, color=COLORS['tertiary'],
                label='Bandpass region')
axes[1].set_title('Power Spectral Density')
axes[1].set_xlabel('Normalized Frequency')
axes[1].legend()

plt.tight_layout()
plt.savefig('/tmp/conv_psd.png', dpi=100, bbox_inches='tight')
plt.show()
print('Wiener-Khinchin verified: PSD peak in bandpass region [0.1, 0.3].')

7.3 Matched Filtering

A matched filter is the optimal linear filter for detecting a known signal $s[n]$ in additive white Gaussian noise. Its impulse response is the time-reversed conjugate, $h[n] = s^*[-n]$, so the output is the cross-correlation:

$$y[n] = (h * x)[n] = R_{sx}[n]$$

This maximizes the SNR at the detection point — a direct consequence of the Cauchy-Schwarz inequality. Used in radar, sonar, GPS acquisition, and neural template matching.
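
The Cauchy-Schwarz argument can be checked numerically (a sketch, not part of the original notebook): the output SNR at the detection sample is $\langle h, \tilde{s}\rangle^2 / (\sigma^2 \|h\|^2)$, where $\tilde{s}$ is the time-reversed template, and no filter beats $h = \tilde{s}$.

```python
import numpy as np

# SNR(h) = <h, s_rev>^2 / (sigma2 * ||h||^2) at the detection sample.
# Cauchy-Schwarz bounds this by ||s||^2 / sigma2, attained only at h = s_rev.
rng = np.random.default_rng(0)
s = np.sin(np.linspace(0, 2 * np.pi, 40)) * np.hanning(40)  # windowed-sinusoid template
sigma2 = 1.0
s_rev = s[::-1]                        # matched filter for a real template

def out_snr(h):
    return np.dot(h, s_rev) ** 2 / (sigma2 * np.dot(h, h))

snr_matched = out_snr(s_rev)           # equals ||s||^2 / sigma2
snr_random = max(out_snr(rng.standard_normal(40)) for _ in range(1000))
print(f'matched filter SNR:          {snr_matched:.4f}')
print(f'best of 1000 random filters: {snr_random:.4f}')
```

Every random filter lands strictly below the matched filter's SNR, which sits exactly at the Cauchy-Schwarz bound $\|s\|^2/\sigma^2$.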

Code cell 24

# === 7.3 Matched Filter: SNR Maximization ===

np.random.seed(42)
N = 512

# Known template / pilot signal
s_len = 40
t_s = np.linspace(0, 2*np.pi, s_len)
s = np.sin(t_s) * np.hanning(s_len)  # windowed sinusoid

# Received signal: template at position 150, buried in noise
snr_db = 0  # 0 dB SNR — very noisy!
signal_power = np.mean(s**2)
noise_std = np.sqrt(signal_power / (10**(snr_db/10)))
x_recv = np.random.randn(N) * noise_std
pos = 150
x_recv[pos:pos+s_len] += s

# Matched filter: h[n] = s[-n] (time-reversed template)
h_mf = s[::-1]

# Apply via FFT
H_mf = np.fft.fft(h_mf, n=N)
X_recv = np.fft.fft(x_recv)
y_mf = np.fft.ifft(H_mf * X_recv).real

# Compare: simple correlation vs matched filter output
detected_pos = np.argmax(np.abs(y_mf)) - s_len + 1
print(f'Template at position: {pos}')
print(f'Detected at:          {detected_pos}')
ok = abs(detected_pos - pos) <= 2
print(f"{'PASS' if ok else 'FAIL'} — matched filter detection at SNR={snr_db}dB")

fig, axes = plt.subplots(3, 1, figsize=(10, 8))
axes[0].plot(x_recv, color=COLORS['neutral'], alpha=0.7, label='Received (noisy)')
axes[0].set_title(f'Received Signal (SNR = {snr_db} dB)')
axes[0].axvline(pos, color=COLORS['error'], ls='--', label='True position')
axes[0].legend()

axes[1].plot(s, color=COLORS['tertiary'], lw=2)
axes[1].set_title('Template $s[n]$')

axes[2].plot(np.abs(y_mf), color=COLORS['primary'])
axes[2].axvline(pos + s_len - 1, color=COLORS['error'], ls='--',
                label=f'Peak at {pos + s_len - 1}')
axes[2].set_title('Matched Filter Output |y[n]|')
axes[2].legend()

plt.tight_layout()
plt.savefig('/tmp/conv_matched.png', dpi=100, bbox_inches='tight')
plt.show()
print('Matched filter output peaks at template location.')

8. Deconvolution and the Wiener Filter

Deconvolution recovers the original signal $x$ from the blurred, noisy observation $y = h * x + n$. Naive inversion $\hat{X}(f) = Y(f)/H(f)$ amplifies noise wherever $|H(f)|$ is small.

The Wiener filter is the optimal linear estimator in the MSE sense:

$$H_W(f) = \frac{H^*(f)\, S_{xx}(f)}{|H(f)|^2\, S_{xx}(f) + S_{nn}(f)}$$

When the signal-to-noise ratio is high, $H_W(f) \approx 1/H(f)$ (near-perfect deconvolution). When SNR is low, $H_W(f) \approx 0$ (suppress noise). The regularization parameter $\lambda = S_{nn}/S_{xx}$ controls this tradeoff; it is equivalent to Tikhonov regularization in the frequency domain.

$$H_\lambda(f) = \frac{H^*(f)}{|H(f)|^2 + \lambda}$$

Code cell 26

# === 8.1 Wiener Deconvolution ===

np.random.seed(42)
N = 256

# True signal: sum of sinusoids
t = np.linspace(0, 1, N)
x_true = np.sin(2*np.pi*5*t) + 0.5*np.sin(2*np.pi*15*t)

# Blurring kernel: Gaussian PSF
kernel_len = 21
k = np.arange(kernel_len) - kernel_len // 2  # symmetric taps -10..10
sigma = 3.0
h_blur = np.exp(-k**2 / (2*sigma**2))
h_blur /= h_blur.sum()  # normalize to unit gain

# Convolve (in freq domain for exact circular)
H_blur = np.fft.fft(h_blur, n=N)
X_true = np.fft.fft(x_true)
y_clean = np.fft.ifft(H_blur * X_true).real

# Add noise
noise_std = 0.05
y_noisy = y_clean + noise_std * np.random.randn(N)
Y = np.fft.fft(y_noisy)

# Method 1: Naive inverse filter
eps = 1e-4  # small regularization to avoid /0
X_naive = Y / (H_blur + eps * (np.abs(H_blur) < eps))
x_naive = np.fft.ifft(X_naive).real

# Method 2: Wiener filter (known SNR)
snr = (np.var(x_true)) / (noise_std**2)
lam = 1.0 / snr
H_W = np.conj(H_blur) / (np.abs(H_blur)**2 + lam)
x_wiener = np.fft.ifft(H_W * Y).real

# Errors
err_naive  = np.mean((x_naive  - x_true)**2)
err_wiener = np.mean((x_wiener - x_true)**2)
print(f'MSE naive inverse: {err_naive:.6f}')
print(f'MSE Wiener filter: {err_wiener:.6f}')
print(f'Improvement:       {err_naive/err_wiener:.1f}x')

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes[0,0].plot(t, x_true, color=COLORS['primary'], lw=2)
axes[0,0].set_title('True Signal $x[n]$')

axes[0,1].plot(t, y_noisy, color=COLORS['neutral'], alpha=0.8)
axes[0,1].set_title('Blurred + Noisy Observation $y[n]$')

axes[1,0].plot(t, x_naive, color=COLORS['error'], alpha=0.8)
axes[1,0].plot(t, x_true, color=COLORS['primary'], lw=1.5, alpha=0.6, ls='--')
axes[1,0].set_title(f'Naive Inverse Filter (MSE={err_naive:.4f})')

axes[1,1].plot(t, x_wiener, color=COLORS['tertiary'], lw=2)
axes[1,1].plot(t, x_true, color=COLORS['primary'], lw=1.5, alpha=0.6, ls='--')
axes[1,1].set_title(f'Wiener Filter (MSE={err_wiener:.4f})')

for ax in axes.flat:
    ax.set_xlabel('Time')
    ax.grid(True, alpha=0.3)

plt.suptitle('Deconvolution: Naive vs Wiener Filter', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/conv_wiener.png', dpi=100, bbox_inches='tight')
plt.show()
print('Wiener filter consistently outperforms naive inversion.')

Code cell 27

# === 8.2 Tikhonov Regularization Sweep ===
# How does lambda trade off bias vs. noise amplification?

lambdas = np.logspace(-5, 0, 50)
mses = []

for lam_test in lambdas:
    H_reg = np.conj(H_blur) / (np.abs(H_blur)**2 + lam_test)
    x_rec = np.fft.ifft(H_reg * Y).real
    mses.append(np.mean((x_rec - x_true)**2))

opt_idx = np.argmin(mses)
opt_lam = lambdas[opt_idx]
print(f'Optimal lambda:  {opt_lam:.2e}')
print(f'True 1/SNR:      {lam:.2e}')
print(f'Optimal MSE:     {mses[opt_idx]:.6f}')

plt.figure(figsize=(8, 4))
plt.semilogx(lambdas, mses, color=COLORS['primary'], lw=2)
plt.axvline(opt_lam, color=COLORS['error'], ls='--',
            label=f'Optimal λ={opt_lam:.1e}')
plt.axvline(lam, color=COLORS['secondary'], ls=':',
            label=f'True 1/SNR={lam:.1e}')
plt.xlabel('Regularization parameter λ')
plt.ylabel('MSE')
plt.title('Tikhonov Regularization: MSE vs λ')
plt.legend()
plt.tight_layout()
plt.savefig('/tmp/conv_tikhonov.png', dpi=100, bbox_inches='tight')
plt.show()
print('U-shape confirms bias-variance tradeoff controlled by lambda.')

9. Machine Learning Applications

9.1 CNNs as Learned Frequency Filters

A convolutional layer with kernel $w$ computes $y = w * x$. In the frequency domain:

$$Y(f) = W(f) \cdot X(f)$$

Each learned kernel corresponds to a bandpass filter in frequency space. A CNN stack is thus a learned filter bank, extracting different frequency components at each layer. Low layers tend to learn edge detectors (high-frequency); deeper layers capture low-frequency global structure.

The spectral bias (frequency principle) states that neural networks learn low frequencies first — a direct consequence of the Fourier perspective on gradient descent.
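
One way to make spectral bias concrete (a simplified kernel-regression sketch, not from the notebook, and only a caricature of real network training): on a periodic grid, a shift-invariant kernel matrix is circulant, so its eigenvectors are Fourier modes and its eigenvalues come from the FFT of one kernel row. Under gradient flow the residual in mode $k$ decays as $e^{-\lambda_k t}$, and a smooth kernel has eigenvalues that shrink with frequency, so low-frequency error vanishes first.

```python
import numpy as np

# Circulant-kernel caricature of spectral bias: eigenvalues per frequency are
# the FFT of one kernel row; residual in mode k decays as exp(-lam_k * t).
N = 256
x = np.arange(N) / N
d = np.minimum(x, 1 - x)                     # periodic distance to 0
k_row = np.exp(-d**2 / (2 * 0.05**2))        # Gaussian kernel row (sigma = 0.05)
lam = np.real(np.fft.fft(k_row)) / N         # circulant eigenvalue per frequency

# Target with one low- and one high-frequency component
y = np.sin(2 * np.pi * 2 * x) + np.sin(2 * np.pi * 20 * x)
Yk = np.abs(np.fft.fft(y)) / N               # initial per-mode error magnitude

t_flow = 50.0                                # amount of gradient-flow "time"
resid = Yk * np.exp(-np.maximum(lam, 0.0) * t_flow)
print(f'mode k=2:  eigenvalue {lam[2]:.4f},  residual {resid[2]:.2e}')
print(f'mode k=20: eigenvalue {lam[20]:.2e}, residual {resid[20]:.2e}')
```

After the same amount of "training time", the $k=2$ component is essentially fit while the $k=20$ component is nearly untouched: the low frequency is learned first.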

Code cell 29

# === 9.1 CNN Kernels as Frequency Filters ===

from scipy.signal import freqz

# Typical learned CNN kernels (hand-crafted examples)
kernels = {
    'Smoothing (low-pass)': np.array([1, 2, 4, 2, 1]) / 10.0,
    'Edge detect (high-pass)': np.array([-1, -2, 0, 2, 1]) / 3.0,
    'Gabor-like (bandpass)': (np.sin(np.linspace(0, 2*np.pi, 9))
                              * np.hanning(9)),
    'Delta (all-pass)': np.array([0, 0, 1, 0, 0]),
}

fig, axes = plt.subplots(2, 4, figsize=(14, 6))
colors_list = [COLORS['primary'], COLORS['secondary'],
               COLORS['tertiary'], COLORS['highlight']]

for i, (name, kernel) in enumerate(kernels.items()):
    w, h = freqz(kernel, worN=512)
    freq_norm = w / np.pi  # [0, 1]

    # Kernel plot
    axes[0, i].stem(kernel, linefmt=colors_list[i],
                    markerfmt='o', basefmt='gray')
    axes[0, i].set_title(name, fontsize=9)
    axes[0, i].set_xlabel('Tap')
    if i == 0:
        axes[0, i].set_ylabel('Weight')

    # Frequency response
    axes[1, i].plot(freq_norm, 20*np.log10(np.abs(h) + 1e-10),
                    color=colors_list[i], lw=2)
    axes[1, i].set_xlabel('Normalized freq')
    axes[1, i].set_ylim(-60, 5)
    if i == 0:
        axes[1, i].set_ylabel('|H(f)| (dB)')

plt.suptitle('CNN Kernels as Frequency Filters', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/conv_cnn_filters.png', dpi=100, bbox_inches='tight')
plt.show()
print('CNN kernels implement low-pass, high-pass, and bandpass filtering.')

9.2 WaveNet: Dilated Causal Convolutions

WaveNet (van den Oord et al., 2016) uses dilated causal convolutions to achieve large receptive fields with few parameters. A dilation factor $d$ means the kernel taps are spaced $d$ samples apart:

$$y[n] = \sum_{k=0}^{K-1} w[k]\, x[n - dk]$$

With $L$ layers and doubling dilation ($d = 1, 2, 4, \ldots, 2^{L-1}$), the receptive field (counting the current sample) grows as:

$$R = 1 + (K-1)\sum_{l=0}^{L-1} 2^l = 1 + (K-1)(2^L - 1)$$

For $K=2$, $L=10$: receptive field $= 1 + (2^{10} - 1) = 1024$ samples, covering 64 ms of audio at 16 kHz.

Code cell 31

# === 9.2 WaveNet Dilated Causal Convolution Receptive Field ===

def dilated_conv_1d(x, w, dilation):
    """Causal dilated convolution (no future samples)."""
    K = len(w)
    N = len(x)
    y = np.zeros(N)
    for n in range(N):
        for k in range(K):
            idx = n - dilation * k
            if idx >= 0:
                y[n] += w[k] * x[idx]
    return y

# Impulse response through dilated conv stack
N = 128
x_impulse = np.zeros(N)
x_impulse[0] = 1.0  # unit impulse at t=0

# WaveNet-style stack: dilations 1,2,4,8,16
dilations = [1, 2, 4, 8, 16]
w_simple = np.array([0.5, 0.5])  # 2-tap kernel

x_cur = x_impulse.copy()
receptive_fields = []
K = len(w_simple)

fig, axes = plt.subplots(len(dilations)+1, 1, figsize=(12, 10))
axes[0].stem(x_impulse[:32], linefmt=COLORS['neutral'],
             markerfmt='o', basefmt='gray')
axes[0].set_title('Input: unit impulse')
axes[0].set_ylabel('x[n]')

total_rf = 1
for i, d in enumerate(dilations):
    x_cur = dilated_conv_1d(x_cur, w_simple, d)
    total_rf += (K-1) * d
    receptive_fields.append(total_rf)
    non_zero = np.sum(np.abs(x_cur) > 1e-10)
    axes[i+1].stem(x_cur[:32], linefmt='C0-',
                   markerfmt='o', basefmt='gray')
    axes[i+1].set_title(f'After dilation d={d}: RF={total_rf}, non-zero={non_zero}')
    axes[i+1].set_ylabel('y[n]')

axes[-1].set_xlabel('Sample n')
plt.suptitle('WaveNet Dilated Causal Convolution Stack', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/conv_wavenet.png', dpi=100, bbox_inches='tight')
plt.show()

print('Receptive fields per layer:')
for d, rf in zip(dilations, receptive_fields):
    print(f'  d={d:2d}: RF={rf}')
print(f'\nFormula: RF = 1 + (K-1)*sum(2^l) = 1 + 1*{sum(dilations)} = {1+sum(dilations)}')
print('PASS — exponential receptive field growth confirmed')

9.3 Structured State Space Models (S4)

The S4 model (Gu et al., 2022) represents a linear time-invariant system as a recurrence:

$$h_t = \bar{A}h_{t-1} + \bar{B}x_t, \qquad y_t = Ch_t$$

The key insight: the output can be written as a convolution with an impulse response $\bar{K}$:

$$y = \bar{K} * x, \qquad \bar{K}_t = C\bar{A}^t\bar{B}$$

For sequence length $L$, this can be computed in $O(L \log L)$ via FFT convolution instead of $O(L \cdot N)$ sequential recurrence (where $N$ is the state dimension). The HiPPO initialization ensures $\bar{A}$ preserves long-range dependencies via Legendre polynomial projection.

Code cell 33

# === 9.3 S4 SSM: Convolutional vs Recurrent Computation ===

def ssm_convolutional(A, B, C, x):
    """Compute SSM output via FFT convolution.
    K[t] = C @ A^t @ B  for t=0,...,L-1
    """
    L = len(x)
    N = A.shape[0]

    # Build convolutional kernel K[t] = C A^t B
    K = np.zeros(L)
    A_pow = np.eye(N)
    for t in range(L):
        K[t] = float(C @ A_pow @ B)
        A_pow = A_pow @ A

    # FFT convolution (circular — valid since we zero-pad)
    N_fft = 2 * L
    y_fft = np.fft.irfft(
        np.fft.rfft(K, n=N_fft) * np.fft.rfft(x, n=N_fft)
    )[:L]
    return y_fft, K

def ssm_recurrent(A, B, C, x):
    """Compute SSM output via sequential recurrence."""
    L = len(x)
    N = A.shape[0]
    h = np.zeros(N)
    y = np.zeros(L)
    for t in range(L):
        h = A @ h + B * x[t]
        y[t] = float(C @ h)
    return y

# Simple 2-state SSM (oscillator)
np.random.seed(42)
N_state = 4
theta = 0.3
# Stable rotation matrix
A_base = 0.95 * np.array([
    [np.cos(theta), -np.sin(theta), 0, 0],
    [np.sin(theta),  np.cos(theta), 0, 0],
    [0, 0, np.cos(2*theta), -np.sin(2*theta)],
    [0, 0, np.sin(2*theta),  np.cos(2*theta)],
])
B_vec = np.array([1.0, 0.0, 1.0, 0.0])
C_vec = np.array([1.0, 1.0, 0.5, 0.5])

L_seq = 128
x_input = np.zeros(L_seq)
x_input[10:20] = 1.0  # rectangular pulse

y_conv, K_kernel = ssm_convolutional(A_base, B_vec, C_vec, x_input)
y_rec  = ssm_recurrent(A_base, B_vec, C_vec, x_input)

max_err = np.max(np.abs(y_conv - y_rec))
print(f'Max error between conv and recurrent: {max_err:.2e}')
ok = max_err < 1e-8
print(f"{'PASS' if ok else 'FAIL'} — convolution and recurrence produce same output")

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(K_kernel[:40], color=COLORS['primary'], lw=2, marker='o', ms=3)
axes[0].set_title('SSM Convolutional Kernel $\\bar{K}_t = C\\bar{A}^t\\bar{B}$')
axes[0].set_xlabel('Time step t')
axes[0].set_ylabel('$\\bar{K}_t$')

axes[1].plot(x_input, color=COLORS['neutral'], lw=1, label='Input x', alpha=0.7)
axes[1].plot(y_conv, color=COLORS['primary'], lw=2, label='Conv output')
axes[1].plot(y_rec, color=COLORS['error'], lw=1.5, ls='--', label='Recurrent output')
axes[1].set_title('SSM Outputs (identical)')
axes[1].legend()
axes[1].set_xlabel('Time step')

plt.tight_layout()
plt.savefig('/tmp/conv_s4.png', dpi=100, bbox_inches='tight')
plt.show()
print('S4: FFT convolution and sequential recurrence are equivalent.')

9.4 FNet: Replacing Attention with Fourier Transforms

FNet (Lee-Thorp et al., 2021) replaces the self-attention sublayer in a Transformer with a 2D DFT:

FNetLayer(X)=FseqFembed(X)\text{FNetLayer}(X) = \mathcal{F}_{\text{seq}} \circ \mathcal{F}_{\text{embed}}(X)

The first DFT mixes across the sequence dimension (like attention), the second across the embedding dimension. No learned parameters in the mixing step! This achieves ~92% of BERT's accuracy on GLUE benchmarks but trains 7-10× faster and uses far less memory (O(LlogL)O(L \log L) vs O(L2)O(L^2)).

Why does it work? Fourier mixing is a fixed global operation that combines all positions. While less expressive than learned attention, the FFT captures long-range structure that local convolutions miss — and subsequent MLP layers can select what to use.

Code cell 35

# === 9.4 FNet Layer: FFT-based Token Mixing ===

def fnet_mixing(X):
    """FNet Fourier mixing layer.
    X: (seq_len, embed_dim)
    Returns: real part of 2D FFT
    """
    # FFT along sequence dimension, then embedding dimension
    return np.real(np.fft.fft(np.fft.fft(X, axis=0), axis=1))

def attention_mixing(X, scale=True):
    """Simplified self-attention (no learned projections)."""
    L, d = X.shape
    scores = X @ X.T  # (L, L)
    if scale:
        scores /= np.sqrt(d)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)
    return weights @ X

# Synthetic token embeddings
np.random.seed(42)
L_tokens = 16   # sequence length
d_model  = 32   # embedding dim
X_tokens = np.random.randn(L_tokens, d_model)

# Apply both mixing strategies
X_fnet = fnet_mixing(X_tokens)
X_attn = attention_mixing(X_tokens)

# Measure how much each row (position) mixes with all others
# via effective context: norm of off-diagonal interactions
print('FNet mixed output shape:', X_fnet.shape)
print('Attn mixed output shape:', X_attn.shape)
print()
print('FNet: all-position mixing via global DFT')
print(f'  Input norm:  {np.linalg.norm(X_tokens):.2f}')
print(f'  Output norm: {np.linalg.norm(X_fnet):.2f}')
print()
print('Attention: weighted position mixing')
print(f'  Input norm:  {np.linalg.norm(X_tokens):.2f}')
print(f'  Output norm: {np.linalg.norm(X_attn):.2f}')

fig, axes = plt.subplots(1, 3, figsize=(14, 4))

im0 = axes[0].imshow(X_tokens, aspect='auto', cmap='RdBu_r')
axes[0].set_title('Input X (L×d)')
axes[0].set_xlabel('Embedding dim')
axes[0].set_ylabel('Token position')
plt.colorbar(im0, ax=axes[0])

im1 = axes[1].imshow(X_fnet, aspect='auto', cmap='RdBu_r')
axes[1].set_title('FNet Output (2D FFT real part)')
axes[1].set_xlabel('Embedding dim')
plt.colorbar(im1, ax=axes[1])

im2 = axes[2].imshow(X_attn, aspect='auto', cmap='RdBu_r')
axes[2].set_title('Attention Output')
axes[2].set_xlabel('Embedding dim')
plt.colorbar(im2, ax=axes[2])

plt.suptitle('FNet vs Attention: Token Mixing', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/conv_fnet.png', dpi=100, bbox_inches='tight')
plt.show()
print('FNet achieves global mixing via FFT at O(L log L) cost.')

9.5 Hyena: Parameterized Long Convolutions

Hyena (Poli et al., 2023) replaces attention with a hierarchy of long convolutions, where the convolution kernels are parameterized by a neural network rather than being fixed:

y=h(θ)xy = h(\theta) * x

where h(θ)h(\theta) is an MLP that maps position indices to filter coefficients. Longer contexts are handled in O(LlogL)O(L \log L) via FFT instead of O(L2)O(L^2) attention. Hyena achieves near-GPT-3 quality on language modeling while requiring no learned key-query-value projections, making it memory-efficient for very long contexts.

Key ingredients: implicit parameterization of the filter (not stored as a weight matrix), gating for selectivity, and causal masking for autoregressive generation.

Code cell 37

# === 9.5 Hyena: Implicit Long Convolution ===

# Simplified Hyena: MLP-parameterized filter applied via FFT

def positional_encoding(L, d=16):
    """Simple sinusoidal position encoding."""
    t = np.arange(L).reshape(-1, 1)
    freqs = np.exp(-np.arange(d) * np.log(10000) / d).reshape(1, -1)
    return np.sin(t * freqs)  # (L, d)

def mlp_filter(pos_enc, W1, b1, W2, b2):
    """2-layer MLP that maps positions -> filter coefficients."""
    h1 = np.tanh(pos_enc @ W1 + b1)
    h2 = h1 @ W2 + b2
    return h2.squeeze(-1)  # (L,) filter

np.random.seed(42)
L_hyena = 64
d_enc = 16

# MLP weights (random initialization)
W1 = 0.1 * np.random.randn(d_enc, 32)
b1 = np.zeros(32)
W2 = 0.1 * np.random.randn(32, 1)
b2 = np.zeros(1)

# Generate implicit filter via MLP
pos = positional_encoding(L_hyena, d_enc)
h_implicit = mlp_filter(pos, W1, b1, W2, b2)
h_implicit /= np.linalg.norm(h_implicit)  # normalize

# Causal mask: zero out future positions
h_causal = h_implicit.copy()
h_causal[L_hyena//2:] = 0  # keep only causal half

# Apply to test signal via FFT
x_test = np.random.randn(L_hyena)
N_fft = 2 * L_hyena
y_hyena = np.fft.irfft(
    np.fft.rfft(h_causal, n=N_fft) * np.fft.rfft(x_test, n=N_fft)
)[:L_hyena]

fig, axes = plt.subplots(1, 3, figsize=(13, 4))

axes[0].plot(h_implicit, color=COLORS['primary'], lw=2, label='Raw filter')
axes[0].plot(h_causal, color=COLORS['error'], lw=2, ls='--', label='Causal')
axes[0].set_title('MLP-Parameterized Filter h(θ)')
axes[0].set_xlabel('Position')
axes[0].legend()

H_freq = np.fft.rfft(h_causal, n=N_fft)
freqs_h = np.fft.rfftfreq(N_fft)
axes[1].plot(freqs_h, np.abs(H_freq), color=COLORS['tertiary'], lw=2)
axes[1].set_title('Frequency Response |H(f)|')
axes[1].set_xlabel('Normalized Frequency')

axes[2].plot(x_test, color=COLORS['neutral'], alpha=0.6, label='Input')
axes[2].plot(y_hyena, color=COLORS['highlight'], lw=2, label='Hyena output')
axes[2].set_title('Hyena: Input → Output')
axes[2].set_xlabel('Position')
axes[2].legend()

plt.suptitle('Hyena: MLP-Parameterized Long Convolution', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/conv_hyena.png', dpi=100, bbox_inches='tight')
plt.show()
print(f'Hyena filter length: {L_hyena}, FFT cost: O({N_fft} log {N_fft})')
print('PASS — implicit filter applied via FFT convolution.')

10. Advanced Topics

10.1 Group Convolution and Equivariance

A group convolution (Cohen & Welling, 2016) generalizes the standard translation-equivariant convolution to arbitrary symmetry groups GG. For a group GG acting on signal space:

[fGψ](g)=hGf(h)ψ(g1h)[f \star_G \psi](g) = \sum_{h \in G} f(h)\, \psi(g^{-1}h)

Equivariance means: if the input is transformed, the output transforms the same way:

Tg[fψ]=[Tgf]ψT_g[f \star \psi] = [T_g f] \star \psi

Standard CNNs are equivariant to translations only. Group CNNs (G-CNNs) extend this to rotations, reflections, scale changes, etc. SE(3)-equivariant networks (used in protein structure prediction like AlphaFold2) handle 3D rotations and translations of atomic coordinates.

Code cell 39

# === 10.1 Group Convolution: Rotation Equivariance ===
import numpy as np
import matplotlib.pyplot as plt

COLORS = {
    'primary':   '#0077BB',
    'secondary': '#EE7733',
    'tertiary':  '#009988',
    'error':     '#CC3311',
    'neutral':   '#555555',
    'highlight': '#EE3377',
}

def conv2d_fft(image, kernel):
    """2D convolution via FFT."""
    H, W = image.shape
    kH, kW = kernel.shape
    # Zero-pad kernel to image size
    k_pad = np.zeros((H, W))
    k_pad[:kH, :kW] = kernel
    return np.fft.ifft2(np.fft.fft2(image) * np.fft.fft2(k_pad)).real

def rotate_image(img, angle_deg):
    """Rotate image by angle (degrees) using bilinear interpolation."""
    from scipy.ndimage import rotate
    return rotate(img, angle_deg, reshape=False)

# Create a simple test image (asymmetric shape)
np.random.seed(42)
N_img = 32
img = np.zeros((N_img, N_img))
img[10:15, 10:22] = 1.0  # horizontal bar
img[10:22, 10:15] = 1.0  # vertical bar → L-shape

# Edge detection kernel (Sobel-like)
kernel = np.array([
    [-1, -2, -1],
    [ 0,  0,  0],
    [ 1,  2,  1],
], dtype=float)

angles = [0, 45, 90, 135]
fig, axes = plt.subplots(2, 4, figsize=(14, 6))

for i, angle in enumerate(angles):
    img_rot = rotate_image(img, angle)
    out_rot = conv2d_fft(img_rot, kernel)
    out_then_rot = rotate_image(conv2d_fft(img, kernel), angle)

    axes[0, i].imshow(img_rot, cmap='Blues', vmin=0, vmax=1)
    axes[0, i].set_title(f'Input rotated {angle}°')
    axes[0, i].axis('off')

    axes[1, i].imshow(out_rot, cmap='RdBu_r')
    axes[1, i].set_title(f'Conv(rot {angle}°)')
    axes[1, i].axis('off')

plt.suptitle('Standard Conv is NOT Rotation-Equivariant\n'
             '(output changes non-uniformly with rotation)',
             fontsize=12, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/conv_group.png', dpi=100, bbox_inches='tight')
plt.show()

# Check equivariance by rotating both ways
img_r90 = rotate_image(img, 90)
out_direct = conv2d_fft(img_r90, kernel)
out_rotate_output = rotate_image(conv2d_fft(img, kernel), 90)
err = np.mean(np.abs(out_direct - out_rotate_output))
print(f'Translation (no rotation): standard CNN IS equivariant')
print(f'Rotation by 90°: mean discrepancy = {err:.4f}')
print('Standard conv is NOT rotation-equivariant.')
print('G-CNNs fix this by explicitly sharing weights across rotations.')

10.2 Spectral Graph Convolution

For irregular graph data, spatial convolution is ill-defined. Spectral graph convolution uses the graph Laplacian L=DAL = D - A to define convolution in the graph Fourier basis (eigenvectors of LL):

gθx=Udiag(θ)Uxg_{\theta} * x = U\, \text{diag}(\theta)\, U^\top x

where UU are eigenvectors of LL (graph Fourier modes) and θ\theta are learnable spectral coefficients.

ChebNet (Defferrard et al., 2016) approximates this with Chebyshev polynomials to avoid computing the full eigendecomposition (expensive for large graphs). GCN (Kipf & Welling, 2017) simplifies further to a first-order approximation:

H(l+1)=σ ⁣(D~1/2A~D~1/2H(l)W(l))H^{(l+1)} = \sigma\!\left(\tilde{D}^{-1/2}\tilde{A}\tilde{D}^{-1/2}H^{(l)}W^{(l)}\right)

Code cell 41

# === 10.2 Graph Laplacian and Spectral Convolution ===

# Build a simple cycle graph (C_8)
N_graph = 8
A_graph = np.zeros((N_graph, N_graph))
for i in range(N_graph):
    A_graph[i, (i+1) % N_graph] = 1
    A_graph[(i+1) % N_graph, i] = 1

# Degree matrix and Laplacian
D_graph = np.diag(A_graph.sum(axis=1))
L_graph = D_graph - A_graph

# Eigendecomposition of L
eigenvalues, U_graph = np.linalg.eigh(L_graph)
print('Graph: Cycle C_8')
print(f'Laplacian eigenvalues: {np.round(eigenvalues, 3)}')
print(f'(Note: 0 eigenvalue confirms connected graph)')

# Graph signal: node features
x_graph = np.array([1.0, 0.5, 0.0, -0.5, -1.0, -0.5, 0.0, 0.5])  # smooth signal

# Spectral convolution: low-pass filter (keep only low-freq modes)
x_hat = U_graph.T @ x_graph  # graph Fourier transform

# Low-pass: zero out high-frequency modes
x_hat_lp = x_hat.copy()
x_hat_lp[4:] = 0  # keep lowest 4 modes
x_lp = U_graph @ x_hat_lp  # inverse graph FT

fig, axes = plt.subplots(1, 3, figsize=(13, 4))

# Graph signal on ring
theta_nodes = np.linspace(0, 2*np.pi, N_graph, endpoint=False)
xs = np.cos(theta_nodes)
ys = np.sin(theta_nodes)
sc0 = axes[0].scatter(xs, ys, c=x_graph, cmap='RdBu_r', s=200, zorder=3)
for i in range(N_graph):
    j = (i+1) % N_graph
    axes[0].plot([xs[i], xs[j]], [ys[i], ys[j]], 'gray', lw=1, zorder=1)
plt.colorbar(sc0, ax=axes[0])
axes[0].set_title('Graph Signal $x$')
axes[0].set_aspect('equal')
axes[0].axis('off')

# Spectral coefficients
axes[1].bar(range(N_graph), np.abs(x_hat), color=COLORS['primary'])
axes[1].bar(range(4, N_graph), np.abs(x_hat[4:]),
            color=COLORS['error'], label='Removed (high-freq)')
axes[1].set_title('Graph Fourier Coefficients $\\hat{x}$')
axes[1].set_xlabel('Mode index')
axes[1].legend()

sc2 = axes[2].scatter(xs, ys, c=x_lp, cmap='RdBu_r', s=200, zorder=3)
for i in range(N_graph):
    j = (i+1) % N_graph
    axes[2].plot([xs[i], xs[j]], [ys[i], ys[j]], 'gray', lw=1, zorder=1)
plt.colorbar(sc2, ax=axes[2])
axes[2].set_title('Low-Pass Filtered Signal $x_{LP}$')
axes[2].set_aspect('equal')
axes[2].axis('off')

plt.suptitle('Spectral Graph Convolution on Cycle Graph C₈', fontsize=13, fontweight='bold')
plt.tight_layout()
plt.savefig('/tmp/conv_graph_spectral.png', dpi=100, bbox_inches='tight')
plt.show()
print(f'Original signal: {np.round(x_graph, 2)}')
print(f'Low-pass output: {np.round(x_lp, 2)}')
print('PASS — spectral graph convolution smooths the signal.')

10.3 Young's Convolution Inequality

Young's inequality for convolutions provides sharp LpL^p bounds:

fgrfpgq,1p+1q=1+1r\|f * g\|_r \leq \|f\|_p \|g\|_q, \qquad \frac{1}{p} + \frac{1}{q} = 1 + \frac{1}{r}

Special cases:

  • p=1,q=rp=1, q=r: convolution with an L1L^1 function is bounded on LqL^q (most useful for filters)
  • p=q=2,r=p=q=2, r=\infty: the output of convolving two L2L^2 signals is bounded
  • p=r=2,q=1p=r=2, q=1: filtering an L2L^2 signal with an L1L^1 kernel stays in L2L^2

For AI: This bounds the output norm of convolutional layers, connecting to training stability. If filter weights have bounded 1\ell^1 norm, the layer is Lipschitz and gradients are stable.

Code cell 43

# === 10.3 Young's Inequality: Numerical Verification ===

from scipy.signal import fftconvolve

def lp_norm(x, p):
    if p == np.inf:
        return np.max(np.abs(x))
    return np.sum(np.abs(x)**p)**(1/p)

np.random.seed(42)
N_y = 128

# Test multiple (p, q, r) combinations with 1/p + 1/q = 1 + 1/r
cases = [
    (1, 2, 2),     # p=1, q=r=2
    (1, 1, 1),     # L^1 * L^1 ⊆ L^1
    (2, 2, np.inf),# L^2 * L^2 ⊆ L^inf
    (1, np.inf, np.inf),  # L^1 * L^inf ⊆ L^inf
]

print('Young Inequality Verification: ||f*g||_r <= ||f||_p ||g||_q')
print(f"{'(p,q,r)':<15} {'||f*g||_r':>12} {'||f||_p ||g||_q':>15} {'Holds?':>8}")
print('-' * 55)

for p, q, r in cases:
    f = np.random.randn(N_y)
    g = np.random.randn(N_y) * 0.5

    fg = fftconvolve(f, g, mode='full')

    lhs = lp_norm(fg, r)
    rhs = lp_norm(f, p) * lp_norm(g, q)

    holds = lhs <= rhs + 1e-10
    p_str = 'inf' if p == np.inf else str(p)
    q_str = 'inf' if q == np.inf else str(q)
    r_str = 'inf' if r == np.inf else str(r)
    print(f'({p_str},{q_str},{r_str}){"":8} {lhs:>12.4f} {rhs:>15.4f} {"PASS" if holds else "FAIL":>8}')

print()
print('Constraint check: 1/p + 1/q = 1 + 1/r')
for p, q, r in cases:
    lhs_c = (1/p if p != np.inf else 0) + (1/q if q != np.inf else 0)
    rhs_c = 1 + (1/r if r != np.inf else 0)
    p_str = 'inf' if p == np.inf else str(p)
    q_str = 'inf' if q == np.inf else str(q)
    r_str = 'inf' if r == np.inf else str(r)
    print(f'  p={p_str}, q={q_str}, r={r_str}: LHS={lhs_c:.2f}, RHS={rhs_c:.2f}')

11. Conceptual Map and Summary

The convolution theorem unifies many seemingly disparate ideas:

         TIME DOMAIN                  FREQUENCY DOMAIN
         ════════════                 ════════════════
         convolution (*)   ←——FT——→   pointwise multiply (·)
         pointwise (·)     ←——FT——→   convolution (*)
         time shift        ←——FT——→   phase rotation
         time reversal     ←——FT——→   conjugation
         autocorrelation   ←——FT——→   power spectral density
         cross-correlation ←——FT——→   cross-spectrum
         LTI system        ←——FT——→   transfer function
         impulse response  ←——FT——→   frequency response
ConceptTime DomainFrequency Domain
Convolution Theorem(fg)[n](f*g)[n]F(ejω)G(ejω)F(e^{j\omega})G(e^{j\omega})
Dual (multiplication)f[n]g[n]f[n]g[n](FG)(ejω)/2π(F*G)(e^{j\omega})/2\pi
LTI systemy=hxy=h*xY=HXY=HX
Wiener-KhinchinRxx[m]R_{xx}[m]$S_{xx}(\omega)=
Wiener filterhW=F1[HW]h_W=\mathcal{F}^{-1}[H_W]$H_W=H^*S_{xx}/(

The FFT makes all of this computable in O(NlogN)O(N \log N) — transforming signal processing, neural architecture design, and scientific computing.

Code cell 45

# === 11. Summary: Convolution Theorem Properties ===

import numpy as np

np.random.seed(42)
N_sum = 64
f = np.random.randn(N_sum)
g = np.random.randn(N_sum)
F = np.fft.fft(f)
G = np.fft.fft(g)

checks = {}

# 1. Convolution theorem
conv_direct = np.fft.ifft(F * G).real
conv_fft_route = np.convolve(f, g, mode='full')[:N_sum]  # truncate
# circular vs linear differ at edges; verify central portion
conv_direct_circular = np.fft.ifft(F * G).real
# Direct circular convolution
conv_ref = np.array([sum(f[k]*g[(n-k)%N_sum] for k in range(N_sum)) for n in range(N_sum)])
err1 = np.max(np.abs(conv_direct_circular - conv_ref))
checks['Convolution theorem (circular)'] = err1 < 1e-8

# 2. Dual: pointwise product ↔ convolution in freq
prod_time = f * g
conv_freq = np.fft.ifft(np.fft.fft(f) * np.fft.fft(g)).real  # already above
# Verify: FT{f·g} = F*G/N
FT_prod = np.fft.fft(prod_time)
F_conv_G = np.fft.ifft(np.fft.fft(F) * np.fft.fft(G)).real  # circular conv of F and G
err2 = np.max(np.abs(FT_prod - F_conv_G / N_sum))
checks['Dual theorem (mult ↔ conv)'] = err2 < 1e-8

# 3. Time reversal ↔ conjugation
f_rev = f[::-1]
F_rev = np.fft.fft(np.roll(f_rev, 1))  # cyclic reversal
err3 = np.max(np.abs(F_rev - np.conj(F)))
checks['Time reversal ↔ conjugation'] = err3 < 1e-8

# 4. Wiener-Khinchin
R_xx = np.fft.ifft(np.abs(F)**2).real
R_xx_direct = np.array([sum(f[n]*f[(n+m)%N_sum] for n in range(N_sum)) for m in range(N_sum)])
err4 = np.max(np.abs(R_xx - R_xx_direct))
checks['Wiener-Khinchin (PSD = FT{R})'] = err4 < 1e-8

# 5. Parsevals theorem
energy_time = np.sum(f**2)
energy_freq = np.sum(np.abs(F)**2) / N_sum
err5 = abs(energy_time - energy_freq)
checks["Parseval's theorem"] = err5 < 1e-8

print('Convolution Theorem Properties — Verification Suite')
print('=' * 55)
for name, passed in checks.items():
    status = 'PASS' if passed else 'FAIL'
    print(f'  {status}  {name}')
print('=' * 55)
all_pass = all(checks.values())
print(f'\n{"ALL TESTS PASS" if all_pass else "SOME TESTS FAILED"}')

12. References and Further Reading

Foundational texts:

  • Oppenheim & Schafer, Discrete-Time Signal Processing (3rd ed.) — the definitive DSP reference
  • Mallat, A Wavelet Tour of Signal Processing — bridges convolution, wavelets, and deep learning
  • Bracewell, The Fourier Transform and Its Applications — comprehensive with beautiful visualizations

Machine learning connections:

  • LeCun et al. (1989) — original convolutional networks for digit recognition
  • van den Oord et al. (2016) — WaveNet: generative model for raw audio
  • Gu et al. (2022) — S4: efficiently modeling long sequences with structured state spaces
  • Lee-Thorp et al. (2021) — FNet: mixing tokens with Fourier transforms
  • Poli et al. (2023) — Hyena hierarchy: a larger convolutional language model
  • Cohen & Welling (2016) — Group equivariant convolutional networks

Next sections: