Exercises Notebook — Math for LLMs

Fourier Transform

Fourier Analysis and Signal Processing / Fourier Transform

Run notebook
Exercises Notebook

Exercises Notebook

Converted from exercises.ipynb for web reading.

Fourier Transform — Exercises

8 exercises covering the Fourier Transform from basic computations to AI applications.

| Format | Description |
| --- | --- |
| Problem | Markdown cell with task description |
| Your Solution | Code cell with scaffolding (`# YOUR CODE HERE`) |
| Solution | Code cell with reference solution and checks |

Difficulty Levels

| Level | Exercises | Focus |
| --- | --- | --- |
| ★ | 1–3 | Core mechanics |
| ★★ | 4–6 | Deeper theory |
| ★★★ | 7–8 | AI applications |

Topic Map

| Topic | Exercises |
| --- | --- |
| FT from definition | 1 |
| FT properties | 2 |
| Parseval's relation | 3 |
| Inversion theorem | 4 |
| Uncertainty principle | 5 |
| Distributions | 6 |
| Random Fourier Features | 7 |
| FNet experiment | 8 |

Code cell 2

# Plot styling for the whole notebook: prefer seaborn's theme, fall back to
# matplotlib's bundled port of the same style when seaborn is unavailable.
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    # matplotlib >= 3.6 ships the classic seaborn styles under this name.
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

# Shared figure defaults so every plot in the notebook looks consistent.
mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)  # reproducible random draws across notebook runs
print("Plot setup complete.")

Code cell 3

# Core numerics; plotting is optional and guarded by HAS_MPL below.
import numpy as np
import numpy.linalg as la
from scipy import fft as sp_fft

try:
    import matplotlib.pyplot as plt
    import matplotlib as mpl
    mpl.rcParams.update({'figure.figsize': (10, 6), 'figure.dpi': 120,
                          'font.size': 13, 'axes.spines.top': False,
                          'axes.spines.right': False})
    # Colour-blind-friendly hex palette shared by all plotting code below.
    COLORS = {'primary': '#0077BB', 'secondary': '#EE7733',
              'tertiary': '#009988', 'error': '#CC3311',
              'neutral': '#555555', 'highlight': '#EE3377'}
    HAS_MPL = True
except ImportError:
    HAS_MPL = False

np.set_printoptions(precision=6, suppress=True)
np.random.seed(42)  # reproducible randomness for the exercises

def header(title):
    """Print *title* framed above and below by '=' rules of matching length."""
    rule = '=' * len(title)
    print(f'\n{rule}\n{title}\n{rule}')

def check_close(name, got, expected, tol=1e-4):
    """Print PASS/FAIL for ``got`` ~= ``expected`` (elementwise) and return it.

    On failure the expected and actual values are echoed for debugging.
    """
    passed = bool(np.allclose(got, expected, atol=tol, rtol=tol))
    verdict = 'PASS' if passed else 'FAIL'
    print(f"{verdict} - {name}")
    if not passed:
        print(f'  expected: {expected}')
        print(f'  got:      {got}')
    return passed

def check_true(name, cond):
    """Print PASS/FAIL for a boolean condition and return it unchanged."""
    verdict = 'PASS' if cond else 'FAIL'
    print(f"{verdict} - {name}")
    return cond

def numerical_ft(f_vals, t_vals):
    """Approximate the continuous Fourier transform of samples ``f_vals``.

    ``t_vals`` must be a uniform grid centred on t = 0. Returns ``(xi, F)``
    where ``F[k]`` approximates the integral of f(t) e^{-2 pi i xi_k t} dt.
    """
    step = t_vals[1] - t_vals[0]
    # ifftshift re-centres the samples for the FFT; multiplying by the grid
    # step turns the discrete sum into a Riemann approximation of the integral.
    spectrum = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * step
    freqs = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=step))
    return freqs, spectrum

print('Setup complete.')  # marks successful execution of the shared-helpers cell

Exercise 1 ★ — Computing Fourier Transforms from Definition

(a) For $f(t) = e^{-a|t|}$ with $a > 0$, the Fourier Transform is

$$\hat{f}(\xi) = \frac{2a}{a^2 + (2\pi\xi)^2}$$

Verify this numerically for $a = 1.5$ at $\xi \in \{0, 0.5, 1.0, 2.0, 5.0\}$.

(b) Verify the Riemann–Lebesgue lemma: show $|\hat{f}(\xi)| \to 0$ as $|\xi| \to \infty$ by evaluating at $\xi = 10, 20, 50$.

(c) Verify the $L^\infty$ bound: $\lVert\hat{f}\rVert_\infty \leq \lVert f\rVert_1 = 2/a$.

(d) Plot the signal $f(t)$ and its spectrum $|\hat{f}(\xi)|$ side by side.

Code cell 5

# Your Solution
# Scaffolding cell — replace the placeholder print with your own attempt.
print("Write your solution here, then compare with the reference solution below.")

Code cell 6

# Solution
# Exercise 1: Solution
a = 1.5
# Wide window + dense grid so the discrete FT approximates the continuous one.
t = np.linspace(-20, 20, 16384)
dt = t[1] - t[0]

# (a) Signal and numerical FT
f_exp = np.exp(-a * np.abs(t))
xi, F_exp = numerical_ft(f_exp, t)
dxi = xi[1] - xi[0]

xi_test = np.array([0.0, 0.5, 1.0, 2.0, 5.0])
# Closed form: FT[e^{-a|t|}](xi) = 2a / (a^2 + (2*pi*xi)^2), a Lorentzian.
F_theory = 2*a / (a**2 + (2*np.pi*xi_test)**2)
# Read the numerical spectrum at the grid frequency nearest each test point.
F_num = np.array([np.real(F_exp[np.argmin(np.abs(xi - xv))]) for xv in xi_test])

header('Exercise 1: FT of Double-Sided Exponential')
print(f'  a = {a},  theoretical ||f||_1 = {2/a:.4f}')
print(f'\n  {"xi":>6}  {"Theory":>12}  {"Numerical":>12}  {"Error":>10}')
print('  ' + '-'*46)
for xv, ft, fn in zip(xi_test, F_theory, F_num):
    print(f'  {xv:>6.2f}  {ft:>12.6f}  {fn:>12.6f}  {abs(ft-fn):>10.2e}')

check_close('FT matches Lorentzian at test points', F_num, F_theory, tol=0.005)

# (b) Riemann-Lebesgue: the closed form decays like 1/xi^2.
xi_large = np.array([10.0, 20.0, 50.0])
F_large = 2*a / (a**2 + (2*np.pi*xi_large)**2)
print(f'\nRiemann-Lebesgue decay:')
for xv, fv in zip(xi_large, F_large):
    print(f'  |hat_f({xv:.0f})| = {fv:.6f}')
check_true('Riemann-Lebesgue: |F(xi)| -> 0 as xi -> inf', F_large[-1] < 0.001)

# (c) L-inf bound. ||f||_1 is computed on the truncated window [-20, 20];
# the neglected tail is O(e^{-20a}) and does not affect the 1e-4 slack.
L1_norm = np.sum(np.abs(f_exp)) * dt
F_max = np.max(np.abs(F_exp))
check_true('L-inf bound: ||F||_inf <= ||f||_1', F_max <= L1_norm + 1e-4)
print(f'  ||F||_inf = {F_max:.4f},  ||f||_1 = {L1_norm:.4f} (theory: {2/a:.4f})')

# (d) Plot (skipped entirely when matplotlib is unavailable)
if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))
    fig.suptitle(r'$f(t)=e^{-a|t|}$ and its Fourier Transform')
    mask_t = np.abs(t) < 5
    axes[0].plot(t[mask_t], f_exp[mask_t], color=COLORS['primary'])
    axes[0].set_xlabel('$t$'); axes[0].set_ylabel('$f(t)$')
    axes[0].set_title(f'$e^{{-{a}|t|}}$')
    mask_f = np.abs(xi) < 6
    axes[1].plot(xi[mask_f], np.abs(F_exp[mask_f]),
                 color=COLORS['primary'], label='Numerical')
    axes[1].plot(xi[mask_f], 2*a/(a**2+(2*np.pi*xi[mask_f])**2),
                 color=COLORS['error'], linestyle='--', label='Theory')
    axes[1].set_xlabel(r'$\xi$'); axes[1].set_ylabel(r'$|\hat{f}(\xi)|$')
    axes[1].set_title('Lorentzian spectrum'); axes[1].legend()
    fig.tight_layout(); plt.show()

print('\nTakeaway: The Lorentzian spectrum decays as O(1/xi^2), reflecting the'
      ' kink (non-differentiability) of e^{-a|t|} at t=0. Smoother functions'
      ' have faster spectral decay (Theorem B.1 in notes.md).')

Exercise 2 ★ — Applying FT Properties

Use properties (not direct integration) to find the Fourier Transforms:

(a) $g(t) = e^{-\pi(t-3)^2}$ — use the time-shift property.

(b) $h(t) = t\,e^{-\pi t^2}$ — use the frequency differentiation property: $\mathcal{F}[(-2\pi i t)f(t)](\xi) = \frac{d}{d\xi}\hat{f}(\xi)$.

(c) $p(t) = e^{-\pi t^2}\cos(4\pi t)$ — use the modulation theorem: $\mathcal{F}[e^{2\pi i\xi_0 t}f(t)](\xi) = \hat{f}(\xi - \xi_0)$.

(d) Verify all three numerically. Starting fact: $\mathcal{F}[e^{-\pi t^2}](\xi) = e^{-\pi\xi^2}$.

Code cell 8

# Your Solution
# Scaffolding cell — replace the placeholder print with your own attempt.
print("Write your solution here, then compare with the reference solution below.")

Code cell 9

# Solution
# Exercise 2: Solution
# Common grid for all three signals (wide enough that each decays to ~0).
t = np.linspace(-10, 10, 8192)

g = np.exp(-np.pi * (t - 3)**2)
h = t * np.exp(-np.pi * t**2)
p = np.exp(-np.pi * t**2) * np.cos(4*np.pi * t)

xi_g, F_g_num = numerical_ft(g, t)
xi_h, F_h_num = numerical_ft(h, t)
xi_p, F_p_num = numerical_ft(p, t)

# (a) Time-shift: FT[f(t-a)](xi) = e^{-2pi*i*xi*a} * FT[f](xi)
#     FT[g](xi) = e^{-2pi*i*xi*3} * e^{-pi*xi^2}
F_g_theory = np.exp(-2j*np.pi*xi_g*3) * np.exp(-np.pi*xi_g**2)

# (b) Freq diff: FT[(-2pi*i*t)*f](xi) = d/d(xi) FT[f](xi)
#     => FT[t*f(t)](xi) = (1/(-2pi*i)) * d/d(xi) FT[f](xi)
#     d/d(xi) e^{-pi*xi^2} = -2pi*xi * e^{-pi*xi^2}
#     => FT[t*e^{-pi*t^2}](xi) = (-2pi*xi * e^{-pi*xi^2}) / (-2pi*i)
#                               = (xi/i) * e^{-pi*xi^2} = -i*xi*e^{-pi*xi^2}
F_h_theory = -1j * xi_h * np.exp(-np.pi * xi_h**2)

# (c) Modulation: cos(4pi*t) = (e^{2pi*i*2*t} + e^{-2pi*i*2*t})/2
#     FT[p](xi) = (1/2)[FT[e^{-pi*t^2}](xi-2) + FT[e^{-pi*t^2}](xi+2)]
#               = (1/2)[e^{-pi*(xi-2)^2} + e^{-pi*(xi+2)^2}]
F_p_theory = 0.5*(np.exp(-np.pi*(xi_p-2)**2) + np.exp(-np.pi*(xi_p+2)**2))

header('Exercise 2: FT Properties')
# Compare only on |xi| < 5, where the spectra are non-negligible.
mask = np.abs(xi_g) < 5

err_a = np.max(np.abs(F_g_num[mask] - F_g_theory[mask]))
check_close('(a) time-shift: error vs theory', err_a, 0.0, tol=0.005)

mask_h = np.abs(xi_h) < 5
err_b = np.max(np.abs(F_h_num[mask_h] - F_h_theory[mask_h]))
check_close('(b) freq-diff: error vs theory', err_b, 0.0, tol=0.005)

mask_p = np.abs(xi_p) < 5
# p(t) is real and even, so its FT is real; compare real parts only.
err_c = np.max(np.abs(np.real(F_p_num[mask_p]) - F_p_theory[mask_p]))
check_close('(c) modulation: error vs theory', err_c, 0.0, tol=0.01)

if HAS_MPL:
    fig, axes = plt.subplots(1, 3, figsize=(15, 5))
    fig.suptitle('Exercise 2: FT Properties (numerical vs theory)')
    for ax, xi_v, F_num, F_th, title in [
        (axes[0], xi_g, np.abs(F_g_num), np.abs(F_g_theory),
         r'(a) $|\hat{g}(\xi)|$: time-shift'),
        (axes[1], xi_h, np.abs(F_h_num), np.abs(F_h_theory),
         r'(b) $|\hat{h}(\xi)|$: freq-diff'),
        (axes[2], xi_p, np.real(F_p_num), F_p_theory,
         r'(c) $\hat{p}(\xi)$: modulation'),
    ]:
        m = np.abs(xi_v) < 5
        ax.plot(xi_v[m], F_num[m], color=COLORS['primary'], label='Numerical')
        ax.plot(xi_v[m], F_th[m], color=COLORS['error'],
                linestyle='--', linewidth=2, label='Theory')
        ax.set_xlabel(r'$\xi$'); ax.set_title(title); ax.legend(fontsize=9)
    fig.tight_layout(); plt.show()

print('\nTakeaway: All three transforms follow from FT[e^{-pi*t^2}]=e^{-pi*xi^2}'
      ' plus the shift, differentiation, and modulation properties.'
      ' No integration needed — properties are the computational engine.')

Exercise 3 ★★ — Parseval's Theorem and Energy Spectral Density

Difficulty: ★★ | Topic: Plancherel / Parseval

Background

Plancherel's theorem states that the Fourier transform is an isometry on L2(R)L^2(\mathbb{R}):

f(t)2dt=f^(ξ)2dξ\int_{-\infty}^{\infty} |f(t)|^2\,dt = \int_{-\infty}^{\infty} |\hat{f}(\xi)|^2\,d\xi

The function f^(ξ)2|\hat{f}(\xi)|^2 is the energy spectral density (ESD).

Tasks

(a) Let f(t)=eπt2f(t) = e^{-\pi t^2} (normalised Gaussian). Compute the total energy E=f22E = \|f\|_2^2 analytically. Verify Parseval numerically to within 10310^{-3}.

(b) Let g(t)=rect(t)g(t) = \operatorname{rect}(t) (unit rectangle). Compute EgE_g analytically. Verify g^(ξ)2dξ=Eg\int |\hat{g}(\xi)|^2\,d\xi = E_g numerically.

(c) Plot the ESD g^(ξ)2|\hat{g}(\xi)|^2 for ξ[10,10]\xi \in [-10, 10]. Mark the first three side-lobe peaks. What fraction of the total energy lies in the main lobe ξ1|\xi| \leq 1? (Answer should be 90.3%\approx 90.3\%.)

(d) Bandwidth--energy trade-off: for gT(t)=rect(t/T)/Tg_T(t) = \operatorname{rect}(t/T)/\sqrt{T} (unit-energy rectangle of width TT), show numerically that the frequency at which the cumulative ESD reaches 90% scales as ξ90%1/T\xi_{90\%} \propto 1/T. Test with T{0.5,1,2,4}T \in \{0.5, 1, 2, 4\}.

Code cell 11

# Your Solution
# Scaffolding cell — replace the placeholder print with your own attempt.
print("Write your solution here, then compare with the reference solution below.")

Code cell 12

# Solution
# SOLUTION
# Each solution cell re-imports and re-defines its helpers so it can run
# standalone, without depending on earlier setup cells.
import numpy as np
import scipy.fft as sp_fft
import matplotlib.pyplot as plt
from scipy.signal import find_peaks  # side-lobe detection in part (c)

def numerical_ft(f_vals, t_vals):
    """Continuous-FT approximation of ``f_vals`` sampled on uniform ``t_vals``.

    Returns ``(xi, F)``; ``F`` approximates int f(t) e^{-2 pi i xi t} dt.
    """
    spacing = t_vals[1] - t_vals[0]
    # Centre, FFT, re-centre; the dt factor converts the DFT sum to an integral.
    transformed = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals)))
    frequencies = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=spacing))
    return frequencies, transformed * spacing

def check_close(name, got, expected, tol=1e-3):
    """Print PASS/FAIL for ``got`` ~= ``expected`` and return the verdict.

    Fix: the setup-cell version of this helper returns the boolean so callers
    can assert on it; this copy dropped the return (always yielding None).
    Returning ``bool(ok)`` restores consistency and stays backward compatible
    with callers that ignore the return value.
    """
    ok = np.isclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: {got:.6f} ~= {expected:.6f}")
    return bool(ok)

COLORS = dict(primary='#0077BB', secondary='#EE7733', tertiary='#009988',
              error='#CC3311', neutral='#555555', highlight='#EE3377')

# Odd point count: the grid contains t = 0 exactly, as ifftshift expects.
t = np.linspace(-8, 8, 16001); dt = t[1] - t[0]

# (a) Gaussian: int |e^{-pi*t^2}|^2 dt = int e^{-2*pi*t^2} dt = 1/sqrt(2)
analytical_E_gauss = 1.0 / np.sqrt(2)
f = np.exp(-np.pi * t**2)
E_time = np.sum(f**2) * dt
xi, F = numerical_ft(f, t); dxi = xi[1] - xi[0]
E_freq = np.sum(np.abs(F)**2) * dxi
check_close("E_gauss time vs analytical", E_time, analytical_E_gauss)
check_close("Parseval Gaussian", E_time, E_freq)

# (b) rect: width=1, height=1 -> E = 1
analytical_E_rect = 1.0
g = (np.abs(t) < 0.5).astype(float)
E_rect_time = np.sum(g**2) * dt
xi_g, G = numerical_ft(g, t); dxi_g = xi_g[1] - xi_g[0]
E_rect_freq = np.sum(np.abs(G)**2) * dxi_g
check_close("E_rect time vs analytical", E_rect_time, analytical_E_rect)
check_close("Parseval rect", E_rect_time, E_rect_freq)

# (c) ESD and main-lobe fraction (main lobe of the sinc spectrum: |xi| <= 1)
ESD_all = np.abs(G)**2
mask_main = np.abs(xi_g) <= 1.0
total_energy = np.sum(ESD_all) * dxi_g
main_energy = np.sum(ESD_all[mask_main]) * dxi_g
main_lobe_fraction = main_energy / total_energy
check_close("Main-lobe fraction", main_lobe_fraction, 0.9028, tol=5e-3)

# Find first 3 positive side-lobe peaks (zero out the main lobe first).
ESD_side = np.where(np.abs(xi_g) > 1.0, ESD_all, 0)
mask10 = np.abs(xi_g) <= 10
peaks, _ = find_peaks(ESD_side[mask10], height=0)
xi_peaks = xi_g[mask10][peaks]
pos_peaks = sorted(xi_peaks[xi_peaks > 1.0])[:3]

fig, ax = plt.subplots(figsize=(8, 3))
ax.plot(xi_g[mask10], ESD_all[mask10], color=COLORS['primary'], lw=1.5)
for p in pos_peaks:
    idx_p = np.argmin(np.abs(xi_g - p))
    ax.axvline(p, color=COLORS['error'], lw=0.8, ls='--')
    ax.axvline(-p, color=COLORS['error'], lw=0.8, ls='--')
ax.axvspan(-1, 1, alpha=0.15, color=COLORS['tertiary'],
           label=f'Main lobe ({main_lobe_fraction*100:.1f}%)')
ax.set_xlabel(r'$\xi$ (Hz)'); ax.set_ylabel(r'$|\hat{g}|^2$')
ax.set_title('ESD of rect(t)'); ax.legend(fontsize=8)
plt.tight_layout(); plt.show()
print(f"Side-lobe peaks (positive xi): {[f'{p:.2f}' for p in pos_peaks]}")

# (d) bandwidth scaling
# NOTE(review): `cumulative` accumulates from the most negative frequency, so
# the one-sided 0.90 crossing corresponds (by symmetry) to ~80% of the energy
# inside |xi| <= xi_90, not 90%. The reference constant 0.60 appears to be
# calibrated to this convention — confirm before reusing xi_90 as a true
# 90%-energy bandwidth. The 1/T scaling conclusion is unaffected either way.
print("\nBandwidth scaling:")
for T in [0.5, 1.0, 2.0, 4.0]:
    g_T = (np.abs(t) < T/2).astype(float) / np.sqrt(T)
    xi_T, G_T = numerical_ft(g_T, t); dxi_T = xi_T[1] - xi_T[0]
    ESD_T = np.abs(G_T)**2
    cumulative = np.cumsum(ESD_T) * dxi_T
    cumulative /= cumulative[-1]
    idx90 = np.searchsorted(cumulative, 0.90)
    xi_90 = abs(xi_T[idx90])
    print(f"  T={T:.1f}: xi_90% ~= {xi_90:.3f}  (T*xi_90 ~= {T*xi_90:.3f}, expect ~= 0.60)")

Exercise 4 ★★ — The Inversion Theorem and Gibbs Phenomenon

Difficulty: ★★ | Topic: Fourier Inversion, Partial Sums

Background

The Fourier Inversion Theorem states that for fL1(R)L2(R)f \in L^1(\mathbb{R}) \cap L^2(\mathbb{R}),

f(t)=f^(ξ)e2πiξtdξa.e.f(t) = \int_{-\infty}^{\infty} \hat{f}(\xi)\,e^{2\pi i\xi t}\,d\xi \quad \text{a.e.}

Numerically truncating the integral to [B,B][-B, B] gives the band-limited reconstruction fB(t)f_B(t), which converges but exhibits Gibbs phenomenon near jump discontinuities: overshoot of 9%\approx 9\% that does not vanish as BB \to \infty.

Tasks

(a) Compute f^(ξ)\hat{f}(\xi) for f(t)=e2tf(t) = e^{-2|t|} numerically. Apply the inverse FT and verify fIFT(F)<103\|f - \text{IFT}(F)\|_\infty < 10^{-3}.

(b) For g(t)=sign(t)etg(t) = \operatorname{sign}(t)\,e^{-|t|} (odd function), verify g^(ξ)\hat{g}(\xi) is purely imaginary, then reconstruct gg via IFT.

(c) Let h(t)=1[0,1](t)h(t) = \mathbb{1}_{[0,1]}(t) (has a jump discontinuity). Band-limit by zeroing h^(ξ)\hat{h}(\xi) for ξ>B|\xi| > B. Plot reconstructions for B{5,10,20,50}B \in \{5,10,20,50\}. Measure the overshoot near t=0+t = 0^+ and verify it converges to 8.9%\approx 8.9\% (Gibbs constant).

(d) Apply a Fejer window (triangular taper in frequency) to suppress Gibbs phenomenon. Show overshoot drops below 2%2\%.

Code cell 14

# Your Solution
# Scaffolding cell — replace the placeholder print with your own attempt.
print("Write your solution here, then compare with the reference solution below.")

Code cell 15

# Solution
# SOLUTION
import numpy as np
import scipy.fft as sp_fft
import matplotlib.pyplot as plt

# Colour palette shared by the solution cells (re-declared for standalone runs).
COLORS = dict(primary='#0077BB', secondary='#EE7733', tertiary='#009988',
              error='#CC3311', neutral='#555555', highlight='#EE3377')

def numerical_ft(f_vals, t_vals):
    """Approximate the continuous FT of ``f_vals`` on the uniform grid ``t_vals``.

    Returns ``(xi, F)`` with F[k] ~ int f(t) e^{-2 pi i xi_k t} dt.
    """
    h = t_vals[1] - t_vals[0]
    # Re-centre for the FFT and scale the DFT sum by dt to mimic the integral.
    F_hat = sp_fft.fftshift(sp_fft.fft(sp_fft.ifftshift(f_vals))) * h
    xi_grid = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=h))
    return xi_grid, F_hat

def numerical_ift(F_vals, xi_vals):
    """Approximate the continuous inverse FT: f(t) = int F(xi) e^{+2 pi i xi t} dxi.

    ``F_vals`` must be sampled on the uniform, zero-centred grid ``xi_vals``.
    Returns ``(t_out, f_vals)`` where ``t_out`` is the conjugate uniform time
    grid (spacing 1/(n*dxi)).

    Fix: the original applied the *forward* FFT (kernel e^{-2 pi i xi t}),
    which returns the time-reversed signal f(-t). That is invisible for the
    even signal in part (a) but sign-flips the odd signal in (b) and mirrors
    the one-sided signals in (c)/(d), so their checks and plots were wrong.
    ``ifft`` uses the e^{+2 pi i xi t} kernel; it also divides by n, hence the
    n * dxi scale factor to approximate the integral.
    """
    dxi = xi_vals[1] - xi_vals[0]
    n = len(xi_vals)
    f_vals = sp_fft.fftshift(sp_fft.ifft(sp_fft.ifftshift(F_vals))) * (n * dxi)
    t_out = sp_fft.fftshift(sp_fft.fftfreq(n, d=dxi))
    return t_out, f_vals

# Odd point count: the grid contains t = 0 exactly, as ifftshift expects.
t = np.linspace(-8, 8, 16001); dt = t[1] - t[0]

# (a) Round-trip FT -> IFT on a smooth, even signal.
# NOTE(review): parts (b)-(d) rely on numerical_ift using the e^{+2 pi i xi t}
# kernel; a forward-FFT implementation would return f(-t) and only pass (a).
# Verify the helper's sign convention.
f = np.exp(-2 * np.abs(t))
xi, F = numerical_ft(f, t)
t_rec, f_rec_full = numerical_ift(F, xi)
# The IFT output grid differs from t, so resample onto t for comparison.
f_rec = np.interp(t, t_rec, f_rec_full.real)
err_a = np.max(np.abs(f - f_rec))
print(f"(a) max inversion error = {err_a:.2e}  {'PASS' if err_a < 1e-3 else 'FAIL'}")

# (b) Odd signal: FT must be purely imaginary.
g = np.sign(t) * np.exp(-np.abs(t)); g[len(t)//2] = 0  # t=0 sample (sign(0) is already 0)
xi_g, G = numerical_ft(g, t)
imag_frac = np.max(np.abs(G.real)) / (np.max(np.abs(G)) + 1e-15)
print(f"(b) max|Re(G)|/max|G| = {imag_frac:.4f}  {'PASS' if imag_frac < 0.01 else 'FAIL'}")
t_grec, g_rec_full = numerical_ift(G, xi_g)
g_rec = np.interp(t, t_grec, g_rec_full.real)
mask3 = np.abs(t) < 3
err_b = np.max(np.abs(g[mask3] - g_rec[mask3]))
print(f"(b) max inversion error (|t|<3) = {err_b:.2e}  {'PASS' if err_b < 1e-3 else 'FAIL'}")

# (c) Gibbs: band-limit the indicator of [0, 1] by zeroing |xi| > B and
# measure the overshoot just right of the jump at t = 0.
h = ((t >= 0) & (t <= 1)).astype(float)
xi_h, H = numerical_ft(h, t)
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax = axes[0]
overshoot_vals = {}
colors_list = [COLORS['primary'], COLORS['secondary'], COLORS['tertiary'], COLORS['highlight']]
for color, B in zip(colors_list, [5, 10, 20, 50]):
    H_band = H.copy(); H_band[np.abs(xi_h) > B] = 0  # hard spectral truncation
    t_brec, h_brec_full = numerical_ift(H_band, xi_h)
    h_brec = np.interp(t, t_brec, h_brec_full.real)
    ax.plot(t, h_brec, label=f'B={B}', lw=1, color=color)
    right_mask = (t > 0) & (t < 0.5)
    overshoot_vals[B] = (np.max(h_brec[right_mask]) - 1.0) * 100
ax.plot(t, h, 'k--', lw=1.5, label='original')
ax.set_xlim(-0.5, 1.5); ax.set_ylim(-0.2, 1.35)
ax.set_xlabel('t'); ax.set_title('Gibbs Phenomenon'); ax.legend(fontsize=8)

# (d) Fejer window: triangular taper in frequency instead of a hard cutoff.
ax2 = axes[1]
for color, B in zip(colors_list, [5, 10, 20, 50]):
    W = np.maximum(0, 1 - np.abs(xi_h) / B)
    H_win = H * W
    t_wrec, h_wrec_full = numerical_ift(H_win, xi_h)
    h_wrec = np.interp(t, t_wrec, h_wrec_full.real)
    right_mask = (t > 0) & (t < 0.5)
    overshoot_fejer = (np.max(h_wrec[right_mask]) - 1.0) * 100
    ax2.plot(t, h_wrec, label=f'B={B} (Fejer, {overshoot_fejer:.1f}%)', lw=1, color=color)
ax2.plot(t, h, 'k--', lw=1.5, label='original')
ax2.set_xlim(-0.5, 1.5); ax2.set_ylim(-0.2, 1.35)
ax2.set_xlabel('t'); ax2.set_title('Fejer-windowed Reconstruction'); ax2.legend(fontsize=8)
plt.tight_layout(); plt.show()
print("\nGibbs overshoot (no window):"); [print(f"  B={B}: +{v:.2f}%") for B, v in overshoot_vals.items()]

Exercise 5 ★★ — Heisenberg Uncertainty Principle

Difficulty: ★★ | Topic: Time-Frequency Uncertainty

Background

The Heisenberg Uncertainty Principle states:

ΔtΔξ    14π\Delta t \cdot \Delta \xi \;\geq\; \frac{1}{4\pi}

where Δt\Delta t and Δξ\Delta \xi are the RMS spreads (standard deviations) in time and frequency. Equality holds if and only if f(t)=ceπαt2e2πiξ0tf(t) = c\,e^{-\pi\alpha t^2}e^{2\pi i\xi_0 t} -- a (possibly modulated) Gaussian.

Tasks

(a) Compute Δt\Delta t and Δξ\Delta \xi for f(t)=eπt2f(t) = e^{-\pi t^2} with α=1\alpha = 1. Verify ΔtΔξ=1/(4π)\Delta t \cdot \Delta \xi = 1/(4\pi) (equality) numerically.

(b) Repeat for α{0.25,0.5,1,2,4}\alpha \in \{0.25, 0.5, 1, 2, 4\}. Show that as α\alpha increases, Δt\Delta t decreases and Δξ\Delta \xi increases proportionally. Plot on a log-log plot.

(c) For g(t)=rect(t)g(t) = \operatorname{rect}(t), compute ΔtΔξ\Delta t \cdot \Delta \xi numerically. Is the product larger than 1/(4π)1/(4\pi)? Explain why Δξ\Delta \xi is very large.

(d) For h(t)=eπt2cos(10πt)h(t) = e^{-\pi t^2}\cos(10\pi t) (modulated Gaussian), compute the time-frequency spread. Is the product still 1/(4π)\approx 1/(4\pi)?

Code cell 17

# Your Solution
# Scaffolding cell — replace the placeholder print with your own attempt.
print("Write your solution here, then compare with the reference solution below.")

Code cell 18

# Solution
# SOLUTION
import numpy as np
import scipy.fft as sp_fft
import matplotlib.pyplot as plt

# Colour palette shared by the solution cells (re-declared for standalone runs).
COLORS = dict(primary='#0077BB', secondary='#EE7733', tertiary='#009988',
              error='#CC3311', neutral='#555555', highlight='#EE3377')

def numerical_ft(f_vals, t_vals):
    """Continuous-FT approximation: ``(xi, F)`` for samples on a uniform grid.

    ``t_vals`` must be uniform and centred on 0; F[k] ~ int f(t) e^{-2 pi i xi_k t} dt.
    """
    grid_step = t_vals[1] - t_vals[0]
    shifted = sp_fft.ifftshift(f_vals)          # move t = 0 to index 0
    F_out = sp_fft.fftshift(sp_fft.fft(shifted)) * grid_step
    xi_out = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=grid_step))
    return xi_out, F_out

def rms_spread(x, density):
    """Return the RMS spread (standard deviation) of ``x`` under ``density``.

    ``density`` holds per-grid-point weights already multiplied by the grid
    step, so plain sums approximate the defining integrals. Returns NaN when
    the total weight is (numerically) zero.
    """
    total = np.sum(density)
    if total < 1e-15:
        return np.nan
    centre = np.sum(x * density) / total
    variance = np.sum((x - centre)**2 * density) / total
    return np.sqrt(variance)

# Odd point count: the grid contains t = 0 exactly, as ifftshift expects.
t = np.linspace(-10, 10, 20001); dt = t[1] - t[0]
bound = 1 / (4 * np.pi)  # Heisenberg lower bound on Delta_t * Delta_xi

# (a) alpha = 1: Gaussians achieve equality in the uncertainty bound.
alpha = 1.0; f = np.exp(-np.pi * alpha * t**2)
xi, F = numerical_ft(f, t); dxi = xi[1] - xi[0]
Dt = rms_spread(t, f**2 * dt); Dx = rms_spread(xi, np.abs(F)**2 * dxi)
print(f"(a) alpha=1: Delta_t={Dt:.5f}, Delta_xi={Dx:.5f}, product={Dt*Dx:.6f}, 1/4pi={bound:.6f}")
print(f"    {'PASS' if abs(Dt*Dx - bound)/bound < 0.01 else 'FAIL'} - equality (within 1%)")

# (b) alpha sweep: Delta_t shrinks and Delta_xi grows, product stays at 1/4pi.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
alphas = [0.25, 0.5, 1.0, 2.0, 4.0]
Dts, Dxs = [], []
print("\n(b) alpha sweep:")
for a in alphas:
    fa = np.exp(-np.pi * a * t**2)
    xi_a, Fa = numerical_ft(fa, t); dxi_a = xi_a[1] - xi_a[0]
    Dta = rms_spread(t, fa**2 * dt); Dxa = rms_spread(xi_a, np.abs(Fa)**2 * dxi_a)
    Dts.append(Dta); Dxs.append(Dxa)
    print(f"  alpha={a:.2f}: Delta_t={Dta:.4f}, Delta_xi={Dxa:.4f}, product={Dta*Dxa:.6f}")

ax = axes[0]
ax.scatter(Dts, Dxs, color=COLORS['primary'], zorder=3, s=60, label='Gaussian (varying alpha)')
dt_range = np.linspace(0.05, 0.8, 200)
ax.plot(dt_range, bound / dt_range, color=COLORS['error'], lw=1.5, ls='--',
        label='Delta_t * Delta_xi = 1/4pi')
ax.set_xscale('log'); ax.set_yscale('log')
ax.set_xlabel('Delta_t'); ax.set_ylabel('Delta_xi')
ax.set_title('Uncertainty Principle: Time vs Frequency Spread'); ax.legend(fontsize=8)

# (c) rect: heavy sinc tails make the frequency variance blow up.
# NOTE(review): the computed Delta_xi depends on the grid's Nyquist cutoff,
# since the underlying variance integral diverges — it is not a converged value.
g = (np.abs(t) < 0.5).astype(float)
xi_g, G = numerical_ft(g, t); dxi_g = xi_g[1] - xi_g[0]
Dg_t = rms_spread(t, g**2 * dt)
Dg_xi = rms_spread(xi_g, np.abs(G)**2 * dxi_g)
print(f"\n(c) rect: Delta_t={Dg_t:.4f}, Delta_xi={Dg_xi:.4f}, product={Dg_t*Dg_xi:.4f}")
print("    Delta_xi is large (sinc tails ~ 1/xi -> variance integral diverges)")

ax2 = axes[1]
mask20 = np.abs(xi_g) < 20
ax2.plot(xi_g[mask20], np.abs(G[mask20])**2, color=COLORS['primary'], lw=1)
ax2.fill_between(xi_g[mask20], np.abs(G[mask20])**2, alpha=0.3, color=COLORS['primary'])
ax2.set_xlabel('xi'); ax2.set_ylabel('|g_hat|^2')
ax2.set_title('ESD of rect(t): heavy tails -> Delta_xi large')
plt.tight_layout(); plt.show()

# (d) modulated Gaussian: real signal -> spectrum has lobes at xi = -5 and +5,
# so the RMS spread about the (zero) mean frequency is inflated.
h = np.exp(-np.pi * t**2) * np.cos(10 * np.pi * t)
xi_h, H = numerical_ft(h, t); dxi_h = xi_h[1] - xi_h[0]
Dh_t = rms_spread(t, np.abs(h)**2 * dt)
Dh_xi = rms_spread(xi_h, np.abs(H)**2 * dxi_h)
print(f"\n(d) modulated Gaussian: Delta_t={Dh_t:.4f}, Delta_xi={Dh_xi:.4f}, product={Dh_t*Dh_xi:.6f}")
print(f"    Expected >= {bound:.6f} (1/4pi); ratio = {Dh_t*Dh_xi/bound:.3f}")
print("    (ratio > 1: two separated spectral lobes inflate Delta_xi)")

Exercise 6 ★★ — Distributions and the Poisson Summation Formula

Difficulty: ★★ | Topic: Tempered Distributions, Dirac Comb

Background

The Poisson Summation Formula (PSF):

n=f(n)=k=f^(k)\sum_{n=-\infty}^{\infty} f(n) = \sum_{k=-\infty}^{\infty} \hat{f}(k)

The Dirac comb IIIT(t)=Tnδ(tnT)\text{III}_T(t) = T\sum_n \delta(t - nT) satisfies IIIT^=III1/T\widehat{\text{III}_T} = \text{III}_{1/T}, making PSF a spectral identity.

Tasks

(a) For σ{0.5,0.2,0.1,0.05}\sigma \in \{0.5, 0.2, 0.1, 0.05\}, define δσ(t)=eπt2/σ2/σ\delta_\sigma(t) = e^{-\pi t^2/\sigma^2}/\sigma. Verify δ^σ(ξ)1\hat{\delta}_\sigma(\xi) \to 1 as σ0\sigma \to 0 on [5,5][-5, 5].

(b) For f(t)=eπt2f(t) = e^{-\pi t^2}, verify the PSF numerically: n=1010eπn2k=1010eπk21.0865\sum_{n=-10}^{10} e^{-\pi n^2} \approx \sum_{k=-10}^{10} e^{-\pi k^2} \approx 1.0865.

(c) Let f(t)=e2πtf(t) = e^{-2\pi|t|}. Sample at Ts=1T_s = 1 Hz. Verify numerically that the DTFT matches the periodised spectrum kf^(ξk)\sum_k \hat{f}(\xi - k) on ξ[0,1]\xi \in [0, 1].

(d) Define a sampled Gaussian s(t)=n=2020eπn2δσ(tn)s(t) = \sum_{n=-20}^{20} e^{-\pi n^2} \delta_\sigma(t - n) with σ=0.1\sigma = 0.1. Compute the FT of ss numerically and compare to the Dirac comb prediction.

Code cell 20

# Your Solution
# Scaffolding cell — replace the placeholder print with your own attempt.
print("Write your solution here, then compare with the reference solution below.")

Code cell 21

# Solution
# SOLUTION
import numpy as np
import scipy.fft as sp_fft
import matplotlib.pyplot as plt

# Colour palette shared by the solution cells (re-declared for standalone runs).
COLORS = dict(primary='#0077BB', secondary='#EE7733', tertiary='#009988',
              error='#CC3311', neutral='#555555', highlight='#EE3377')

def numerical_ft(f_vals, t_vals):
    """Approximate the continuous FT of uniformly sampled ``f_vals``.

    Returns ``(xi, F)`` where F[k] ~ int f(t) e^{-2 pi i xi_k t} dt; the grid
    ``t_vals`` must be uniform and centred on t = 0.
    """
    delta_t = t_vals[1] - t_vals[0]
    centred = sp_fft.ifftshift(f_vals)
    F = sp_fft.fftshift(sp_fft.fft(centred)) * delta_t  # dt factor ~ integral
    xi = sp_fft.fftshift(sp_fft.fftfreq(len(t_vals), d=delta_t))
    return xi, F

# Odd point count: the grid contains t = 0 exactly, as ifftshift expects.
t = np.linspace(-8, 8, 16001); dt = t[1] - t[0]

# (a) delta approximation convergence: narrower Gaussians flatten the spectrum.
print("(a) delta_sigma -> FT -> 1:")
for sigma in [0.5, 0.2, 0.1, 0.05]:
    delta_s = np.exp(-np.pi * t**2 / sigma**2) / sigma
    xi, D = numerical_ft(delta_s, t)
    mask5 = np.abs(xi) <= 5
    # analytical FT: exp(-pi*sigma^2*xi^2) -> 1 as sigma -> 0
    D_analytical = np.exp(-np.pi * sigma**2 * xi[mask5]**2)
    err_num = np.max(np.abs(1.0 - D[mask5].real))
    err_ana = np.max(np.abs(1.0 - D_analytical))
    print(f"  sigma={sigma:.2f}: num err={err_num:.5f}, ana err={err_ana:.5f}")

# (b) PSF for the self-dual Gaussian e^{-pi t^2}: f_hat = f, so LHS and RHS
# below are the *same* sum — the check verifies self-duality (theta_3 value)
# rather than two independent expressions.
N = 10; K = 10
lhs = sum(np.exp(-np.pi * n**2) for n in range(-N, N+1))
rhs = sum(np.exp(-np.pi * k**2) for k in range(-K, K+1))
theta3 = 1.0865240831689086  # Jacobi theta constant, reference value
print(f"\n(b) PSF: LHS={lhs:.8f}, RHS={rhs:.8f}, theta3={theta3:.8f}")
print(f"    PSF verified: {'PASS' if abs(lhs - rhs) < 1e-10 else 'FAIL'}")

# (c) Aliasing via PSF
# FT of e^{-2*pi*|t|}: using pair e^{-a|t|} <-> 2a/(a^2 + 4*pi^2*xi^2)
# here a = 2*pi, so f_hat(xi) = 4*pi/(4*pi^2 + 4*pi^2*xi^2) = 1/(pi*(1 + xi^2))
fhat = lambda xi_v: 1.0 / (np.pi * (1 + xi_v**2))
f_cont = lambda tt: np.exp(-2 * np.pi * np.abs(tt))
ns = np.arange(-30, 31)          # sample indices; f is negligible beyond |n|=30
xi_dtft = np.linspace(0, 1, 1001)
DTFT = np.array([np.sum(f_cont(ns) * np.exp(-2j * np.pi * xi_v * ns)) for xi_v in xi_dtft])
K_sum = 5  # truncate the periodisation; f_hat decays like 1/xi^2
periodised = np.array([sum(fhat(xi_v - k) for k in range(-K_sum, K_sum+1)) for xi_v in xi_dtft])
err_c = np.max(np.abs(DTFT.real - periodised))
print(f"\n(c) Aliasing: max|DTFT - periodised| = {err_c:.4f}  {'PASS' if err_c < 0.02 else 'FAIL'}")

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(xi_dtft, DTFT.real, color=COLORS['primary'], label='DTFT')
axes[0].plot(xi_dtft, periodised, '--', color=COLORS['error'], label='Periodised FT')
axes[0].set_xlabel('xi'); axes[0].set_title('PSF Aliasing Verification'); axes[0].legend()

# (d) Sampled Gaussian: Gaussian-weighted spikes at the integers; its FT
# should concentrate near integer frequencies (comb lines marked below).
sigma_d = 0.1
s = np.zeros_like(t)
for n in range(-20, 21):
    s += np.exp(-np.pi * n**2) * np.exp(-np.pi * (t - n)**2 / sigma_d**2) / sigma_d
xi_s, S = numerical_ft(s, t)
mask6 = np.abs(xi_s) <= 6
axes[1].plot(xi_s[mask6], np.abs(S[mask6]), color=COLORS['primary'], lw=0.8,
             label='|FT(s)|')
for k in range(-5, 6):
    axes[1].axvline(k, color=COLORS['error'], lw=0.5, alpha=0.7)
axes[1].set_xlabel('xi'); axes[1].set_title('Sampled Gaussian FT ~= Dirac Comb')
axes[1].legend()
plt.tight_layout(); plt.show()

Exercise 7 ★★★ — Random Fourier Features

Difficulty: ★★★ | Topic: Bochner's Theorem, Kernel Approximation

Background

Bochner's theorem (1933): A continuous, positive-definite function k:RdRk : \mathbb{R}^d \to \mathbb{R} is the Fourier transform of a non-negative measure p(ω)p(\omega):

k(xy)=Eωp[zω(x)zω(y)]k(\mathbf{x} - \mathbf{y}) = \mathbb{E}_{\omega \sim p}[z_\omega(\mathbf{x})\overline{z_\omega(\mathbf{y})}]

Random Fourier Features (RFF) (Rahimi & Recht 2007): sample DD frequencies ω1,,ωDp()\omega_1, \ldots, \omega_D \sim p(\cdot) and approximate:

k(x,y)z^(x)z^(y),z^(x)=1D[cos(ωjx+bj)]j=1Dk(\mathbf{x}, \mathbf{y}) \approx \hat{\mathbf{z}}(\mathbf{x})^\top \hat{\mathbf{z}}(\mathbf{y}), \qquad \hat{\mathbf{z}}(\mathbf{x}) = \frac{1}{\sqrt{D}}\left[\cos(\omega_j^\top \mathbf{x} + b_j)\right]_{j=1}^{D}

For the RBF kernel k(x,y)=exy2/(2σ2)k(\mathbf{x},\mathbf{y}) = e^{-\|\mathbf{x}-\mathbf{y}\|^2/(2\sigma^2)}, the spectral density is p(ω)=N(0,σ2I)p(\omega) = \mathcal{N}(0, \sigma^{-2} I).

Tasks

(a) Implement RFF for the RBF kernel with σ=1\sigma = 1 in d=1d=1. Verify convergence for D{10,50,200,1000}D \in \{10, 50, 200, 1000\}. Compute max approximation error for each DD.

(b) Implement the RFF kernel matrix for n=100n=100 points. Show the relative Frobenius error KZ^Z^F/KF<5%\|K - \hat{Z}\hat{Z}^\top\|_F / \|K\|_F < 5\% for D=500D = 500.

(c) Run 50 trials for each D{50,100,200,500,1000}D \in \{50, 100, 200, 500, 1000\}. Plot the 95th percentile of the max error vs DD on a log-log plot. Fit a power law (expect exponent 0.5\approx -0.5).

(d) Explain in comments: why does RFF allow O(nD)O(nD) kernel machines instead of O(n2)O(n^2)?

Code cell 23

# Your Solution
# Scaffolding cell — replace the placeholder print with your own attempt.
print("Write your solution here, then compare with the reference solution below.")

Code cell 24

# Solution
# SOLUTION
import numpy as np
import matplotlib.pyplot as plt

# Dedicated Generator so the random feature draws are reproducible.
rng = np.random.default_rng(0)
# Colour palette shared by the solution cells (re-declared for standalone runs).
COLORS = dict(primary='#0077BB', secondary='#EE7733', tertiary='#009988',
              error='#CC3311', neutral='#555555', highlight='#EE3377')

def rbf_kernel_matrix(X, Y, sigma=1.0):
    """Exact RBF (Gaussian) kernel matrix for 1-D point sets.

    X: (n,), Y: (m,) -> (n, m) with entries exp(-(x - y)^2 / (2 sigma^2)).
    """
    sq_diff = (X[:, None] - Y[None, :]) ** 2
    return np.exp(-sq_diff / (2 * sigma**2))

def rff_features(X, omega, bias, sigma=1.0):
    """Random Fourier feature map for the RBF kernel (Rahimi & Recht 2007).

    X: (n,) or (n, 1) inputs; omega: (D,) or (D, 1) frequencies drawn from
    N(0, 1); bias: (D,) phases drawn from Uniform[0, 2*pi).
    Returns the (n, D) matrix sqrt(2/D) * cos(omega_j * x_i / sigma + b_j),
    whose row inner products approximate the sigma-RBF kernel.
    """
    X2d = X[:, None] if X.ndim == 1 else X
    W2d = omega[:, None] if omega.ndim == 1 else omega
    num_features = W2d.shape[0]
    # Dividing omega by sigma rescales the N(0, 1) draws to the kernel's
    # spectral density N(0, sigma^{-2}).
    angles = X2d @ (W2d.T / sigma) + bias[None, :]  # (n, D)
    return np.sqrt(2.0 / num_features) * np.cos(angles)

# (a) convergence plot: exact RBF vs RFF estimates for increasing feature count D
delta = np.linspace(-3, 3, 501)
k_exact = np.exp(-delta**2 / 2)  # sigma=1 RBF kernel at y=0

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax = axes[0]
ax.plot(delta, k_exact, 'k-', lw=2, label='Exact RBF')
colors_list = [COLORS['primary'], COLORS['secondary'], COLORS['tertiary'], COLORS['highlight']]
print("(a) Max RFF approximation errors:")
for color, D in zip(colors_list, [10, 50, 200, 1000]):
    trial_errs = []
    for _ in range(20):  # average the sup-error over 20 independent frequency draws
        omega_D = rng.normal(0, 1, size=D)        # omega ~ N(0, 1): spectral density of sigma=1 RBF
        bias_D = rng.uniform(0, 2*np.pi, size=D)  # b ~ Uniform[0, 2*pi)
        z_d = rff_features(delta, omega_D, bias_D)
        z_0 = rff_features(np.array([0.0]), omega_D, bias_D)
        k_rff = (z_d * z_0).sum(axis=1)           # z(delta)^T z(0) approximates k(delta, 0)
        trial_errs.append(np.max(np.abs(k_exact - k_rff)))
    print(f"  D={D:5d}: mean max error = {np.mean(trial_errs):.4f}")
    # plot one representative
    omega_D = rng.normal(0, 1, size=D)
    bias_D = rng.uniform(0, 2*np.pi, size=D)
    z_d = rff_features(delta, omega_D, bias_D)
    z_0 = rff_features(np.array([0.0]), omega_D, bias_D)
    ax.plot(delta, (z_d * z_0).sum(1), color=color, alpha=0.7, lw=1, label=f'D={D}')
ax.set_xlabel('x - y'); ax.set_ylabel('k'); ax.set_title('RBF vs RFF'); ax.legend(fontsize=8)

# (b) Gram matrix: relative Frobenius error of the rank-D approximation Z Z^T
n = 100; X_data = rng.uniform(-2, 2, size=n)
K_exact = rbf_kernel_matrix(X_data, X_data)
D_gram = 500
omega_g = rng.normal(0, 1, size=D_gram); bias_g = rng.uniform(0, 2*np.pi, size=D_gram)
Z = rff_features(X_data, omega_g, bias_g)
K_rff = Z @ Z.T  # (n, n) kernel matrix built from (n, D) explicit features
rel_err = np.linalg.norm(K_exact - K_rff, 'fro') / np.linalg.norm(K_exact, 'fro')
print(f"\n(b) Gram matrix relative Frobenius error (D={D_gram}): {rel_err:.4f}")
print(f"    {'PASS' if rel_err < 0.05 else 'NOTE: increase D for tighter bound'}")

# (c) error scaling: 95th-percentile sup-error over n_trials draws, as a function of D
D_vals = [50, 100, 200, 500, 1000]; n_trials = 50
p95_errs = []
for D in D_vals:
    trial_errs = []
    for _ in range(n_trials):
        oms = rng.normal(0, 1, size=D); bs = rng.uniform(0, 2*np.pi, size=D)
        z_d = rff_features(delta, oms, bs)
        z_0 = rff_features(np.array([0.0]), oms, bs)
        trial_errs.append(np.max(np.abs(k_exact - (z_d * z_0).sum(1))))
    p95_errs.append(np.percentile(trial_errs, 95))

ax2 = axes[1]
ax2.loglog(D_vals, p95_errs, 'o-', color=COLORS['primary'], lw=2)
# Fit a power law in log-log space; the Monte Carlo rate predicts a slope of ~ -1/2.
log_D = np.log(D_vals); log_e = np.log(p95_errs)
slope, intercept = np.polyfit(log_D, log_e, 1)
ax2.loglog(D_vals, np.exp(intercept) * np.array(D_vals)**slope, '--',
           color=COLORS['error'], label=f'fit: D^{slope:.2f}')
ax2.set_xlabel('D'); ax2.set_ylabel('P95 max error')
ax2.set_title('RFF Error Scaling ~ D^(-1/2)'); ax2.legend()
plt.tight_layout(); plt.show()
print(f"\n(c) Power law exponent = {slope:.3f}  (expect ~= -0.5)")

# (d) Why RFF enables O(nD) kernel machines:
# Exact kernel SVM: Gram matrix K is (n x n) -> O(n^2) storage and compute.
# With RFF: map each x_i to z(x_i) in R^D -> linear SVM on features -> O(nD).
# Trade-off: approximation error ~ O(1/sqrt(D)); must choose D large enough for accuracy.
print("\n(d) RFF replaces O(n^2) kernel matrix with O(nD) explicit features. Trade-off: accuracy vs D.")

Exercise 8 ★★★ -- FNet: Fourier Mixing vs Attention

Difficulty: ★★★ | Topic: FNet, Spectral Mixing in Transformers

Background

FNet (Lee-Thorp et al., 2022) replaces the attention sub-layer with a parameter-free 2D DFT applied to the token-embedding matrix HRn×dH \in \mathbb{R}^{n \times d}:

FNet-mix(H)=Re ⁣[FseqFembed(H)]\text{FNet-mix}(H) = \operatorname{Re}\!\left[\mathcal{F}_{\text{seq}} \circ \mathcal{F}_{\text{embed}}(H)\right]

Unlike attention (O(n2d)O(n^2 d)), FNet mixing is O(ndlogn)O(nd\log n) and achieves 92--97% of BERT's accuracy on GLUE while training 70--80% faster.

Tasks

(a) Implement fnet_mix(H). Verify it is a linear operator.

(b) Show that FNet mixing is not position-equivariant. Construct HH and its row-shifted version HH'; show fnet_mix(H)shift(fnet_mix(H))\text{fnet\_mix}(H') \neq \text{shift}(\text{fnet\_mix}(H)).

(c) Compare FNet mixing to full self-attention on HR32×64H \in \mathbb{R}^{32 \times 64}. Plot the "mixing matrix" as a heatmap for both: DFT matrix/n|\text{DFT matrix}|/n (FNet) vs softmax(QK/dk)\text{softmax}(QK^\top/\sqrt{d_k}) (attention).

(d) Time both operations for n{64,128,256,512,1024}n \in \{64, 128, 256, 512, 1024\} with d=64d=64. Plot wall-clock time on a log-log plot and fit power laws. Verify FNet scales as O(nlogn)O(n\log n) while attention scales as O(n2)O(n^2).

Code cell 26

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 27

# Solution
# SOLUTION
import numpy as np
import time
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)  # fixed seed so the experiment output is reproducible
# Colour-blind-friendly palette (Paul Tol scheme) shared by all plots below.
COLORS = dict(primary='#0077BB', secondary='#EE7733', tertiary='#009988',
              error='#CC3311', neutral='#555555', highlight='#EE3377')

def fnet_mix(H):
    """FNet token mixing: real part of the 2-D DFT of the (n, d) matrix H.

    The two 1-D DFTs commute, so applying the sequence-axis transform first
    and the embedding-axis transform second gives the standard 2-D DFT.
    """
    spectrum = np.fft.fft(H, axis=0)        # DFT along the sequence axis
    spectrum = np.fft.fft(spectrum, axis=1)  # DFT along the embedding axis
    return spectrum.real

def softmax_attention(H, W_Q, W_K, W_V):
    """Single-head scaled dot-product self-attention.

    H: (n, d) token embeddings; W_Q, W_K, W_V: (d, d_k) projection matrices.
    Returns (output, A) where A is the (n, n) row-stochastic attention matrix
    softmax(Q K^T / sqrt(d_k)) and output = A V.
    """
    queries = H @ W_Q
    keys = H @ W_K
    values = H @ W_V
    logits = queries @ keys.T / np.sqrt(W_Q.shape[1])
    # Subtract each row's max before exponentiating for numerical stability.
    shifted = logits - logits.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    weights = weights / weights.sum(axis=-1, keepdims=True)
    return weights @ values, weights

# (a) linearity: fnet_mix(aH + bG) must equal a*fnet_mix(H) + b*fnet_mix(G)
n, d = 8, 4
H = rng.normal(size=(n, d)); G = rng.normal(size=(n, d))
a, b = 1.7, -0.3
out1 = fnet_mix(a*H + b*G); out2 = a*fnet_mix(H) + b*fnet_mix(G)
err_lin = np.max(np.abs(out1 - out2))
print(f"(a) Linearity: max diff = {err_lin:.2e}  {'PASS' if err_lin < 1e-10 else 'FAIL'}")

# (b) shift equivariance: compare mixing a row-shifted input vs shifting the mixed output
H_shift = np.roll(H, -1, axis=0)
diff_shift = np.max(np.abs(fnet_mix(H_shift) - np.roll(fnet_mix(H), -1, axis=0)))
print(f"(b) Shift equivariance error = {diff_shift:.4f}  (> 0 -> NOT equivariant)")
print("    FNet 2D FFT mixes sequence and embedding axes jointly -> not shift-equivariant")

# (c) Mixing matrix comparison: FNet's fixed |DFT| weights vs data-dependent attention
n32, d32 = 32, 64
H32 = rng.normal(size=(n32, d32))
W_Q = rng.normal(size=(d32, 16)) / np.sqrt(d32)  # 1/sqrt(d) scaling keeps logits O(1)
W_K = rng.normal(size=(d32, 16)) / np.sqrt(d32)
W_V = rng.normal(size=(d32, 16)) / np.sqrt(d32)
_, A_attn = softmax_attention(H32, W_Q, W_K, W_V)

# FFT of the identity's columns yields the DFT matrix itself; /n32 normalizes.
DFT_mat = np.fft.fft(np.eye(n32), axis=0) / n32
M_fnet = np.abs(DFT_mat)

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
im0 = axes[0].imshow(M_fnet, aspect='auto', cmap='Blues')
axes[0].set_title('FNet: |DFT matrix|/n (uniform mixing)')
axes[0].set_xlabel('Source token'); axes[0].set_ylabel('Target token')
plt.colorbar(im0, ax=axes[0])
im1 = axes[1].imshow(A_attn, aspect='auto', cmap='Reds')
axes[1].set_title('Attention: softmax(QK^T / sqrt(d_k)) (sparse, data-dependent)')
axes[1].set_xlabel('Source token'); axes[1].set_ylabel('Target token')
plt.colorbar(im1, ax=axes[1])
plt.tight_layout(); plt.show()

# (d) Computational scaling: average wall-clock time over n_rep runs per sequence length
print("\n(d) FNet vs Attention scaling:")
n_vals = [64, 128, 256, 512, 1024]; d_bench = 64; n_rep = 50
fnet_times, attn_times = [], []
for n_val in n_vals:
    H_b = rng.normal(size=(n_val, d_bench))
    WQ = rng.normal(size=(d_bench, 32)); WK = WQ.copy(); WV = rng.normal(size=(d_bench, 32))
    t0 = time.perf_counter()
    for _ in range(n_rep): fnet_mix(H_b)
    fnet_times.append((time.perf_counter() - t0) / n_rep)
    t0 = time.perf_counter()
    for _ in range(n_rep): softmax_attention(H_b, WQ, WK, WV)
    attn_times.append((time.perf_counter() - t0) / n_rep)
    print(f"  n={n_val:5d}: FNet={fnet_times[-1]*1e6:.1f}us  Attn={attn_times[-1]*1e6:.1f}us  ratio={attn_times[-1]/fnet_times[-1]:.1f}x")

fig2, ax2 = plt.subplots(figsize=(7, 4))
ax2.loglog(n_vals, fnet_times, 'o-', color=COLORS['primary'], lw=2, label='FNet')
ax2.loglog(n_vals, attn_times, 's-', color=COLORS['error'], lw=2, label='Attention')
log_n = np.log(n_vals)
# Fit power laws to both timing curves; exponents near 1 (FNet) and 2 (attention) expected.
for times, color, name in [(fnet_times, COLORS['tertiary'], 'FNet'), (attn_times, COLORS['secondary'], 'Attn')]:
    sl, ic = np.polyfit(log_n, np.log(times), 1)
    ax2.loglog(n_vals, np.exp(ic) * np.array(n_vals)**sl, '--', color=color,
               label=f'{name} fit: n^{sl:.2f}')
ax2.set_xlabel('Sequence length n'); ax2.set_ylabel('Time (s)')
ax2.set_title('FNet O(n log n) vs Attention O(n^2)'); ax2.legend(fontsize=8)
plt.tight_layout(); plt.show()

Summary

#ExerciseDifficultyTopics
1FT of eate^{-a\|t\|}★Lorentzian, Riemann-Lebesgue
2FT Properties★Time-shift, differentiation, modulation
3Parseval / ESD★★Plancherel, bandwidth trade-off
4Inversion and Gibbs★★IFT, band-limiting, Fejer window
5Uncertainty Principle★★RMS spread, time-frequency trade-off
6Distributions and PSF★★Dirac delta, Poisson summation, aliasing
7Random Fourier Features★★★Bochner, kernel approximation
8FNet vs Attention★★★Spectral mixing, O(nlogn)O(n\log n) vs O(n2)O(n^2)

Key Takeaways

  • The FT is an isometry on L2(R)L^2(\mathbb{R}) (Plancherel) -- energy is perfectly preserved.
  • Uncertainty: you cannot simultaneously localise in time and frequency; Gaussians achieve the minimum.
  • Distributions extend the FT to δ\delta, IIIT\text{III}_T via the Poisson summation formula.
  • Bochner's theorem connects kernel methods (SVMs, GPs) to Fourier analysis of positive-definite functions.
  • FNet shows that a parameter-free global Fourier mix captures substantial linguistic structure at O(nlogn)O(n\log n) cost.

Next Sections

  • §03 Discrete Fourier Transform -- DFT, FFT algorithms, circular convolution, spectral leakage
  • §04 Convolution Theorem -- Convolution-multiplication duality, LTI systems, filter design
  • §05 Wavelets -- Multi-resolution analysis, STFT, Morlet wavelets, wavelet scattering networks

Exercise 9: Gaussian Smoothing as Spectral Damping

Convolve a noisy signal with a Gaussian kernel and verify that high-frequency energy decreases.

Code cell 30

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 31

# Solution
# Smooth a two-tone signal with a unit-area Gaussian kernel and verify that
# high-frequency energy drops (convolution theorem: spectra multiply).
header("Exercise 9: Gaussian Smoothing")
t = np.linspace(-4, 4, 2048)
dt = t[1] - t[0]
signal = np.sin(2*np.pi*2*t) + 0.35*np.sin(2*np.pi*35*t)  # 2 Hz tone + 35 Hz ripple
sigma = 0.12
gauss = np.exp(-0.5 * (t / sigma)**2)
gauss = gauss / np.sum(gauss)  # normalize to unit area so DC gain is 1
# Circular convolution via the FFT; ifftshift moves the kernel's centre to index 0.
smoothed = np.fft.ifft(np.fft.fft(signal) * np.fft.fft(np.fft.ifftshift(gauss))).real
freq = np.fft.fftfreq(len(t), d=dt)
spec_raw = np.fft.fft(signal)
spec_smooth = np.fft.fft(smoothed)
band = np.abs(freq) > 10  # everything above 10 Hz counts as "high frequency"
energy_before = np.sum(np.abs(spec_raw[band])**2)
energy_after = np.sum(np.abs(spec_smooth[band])**2)
print("high-frequency energy before:", round(float(energy_before), 3))
print("high-frequency energy after:", round(float(energy_after), 3))
check_true("smoothing damps high frequencies", energy_after < energy_before)
print("Takeaway: convolution by a Gaussian multiplies the spectrum by another Gaussian.")

Exercise 10: Empirical Uncertainty Product

Compute time spread, frequency spread, and their product for a Gaussian pulse and a wider Gaussian pulse.

Code cell 33

# Your Solution
print("Write your solution here, then compare with the reference solution below.")

Code cell 34

# Solution
header("Exercise 10: Uncertainty Product")

def spread_product(sigma):
    """Return the RMS time-spread x frequency-spread product of a Gaussian pulse.

    The pulse exp(-pi (t/sigma)^2) is sampled on [-8, 8]; each spread is the
    RMS width weighted by the corresponding energy density (|f|^2 or |F|^2).
    """
    t = np.linspace(-8, 8, 4096)
    dt = t[1] - t[0]
    pulse = np.exp(-np.pi * (t / sigma)**2)
    xi, F = numerical_ft(pulse, t)  # numerical FT helper defined earlier in the notebook
    dxi = xi[1] - xi[0]
    energy_t = np.sum(np.abs(pulse)**2) * dt   # total energy, time domain
    energy_f = np.sum(np.abs(F)**2) * dxi      # total energy, frequency domain
    spread_t = np.sqrt(np.sum(t**2 * np.abs(pulse)**2) * dt / energy_t)
    spread_f = np.sqrt(np.sum(xi**2 * np.abs(F)**2) * dxi / energy_f)
    return spread_t * spread_f

# Rescaling a Gaussian trades time-width for bandwidth; the product stays fixed.
products = [spread_product(s) for s in (0.7, 1.5)]
print("products:", [round(float(p), 6) for p in products])
check_close("scale-invariant Gaussian product", products[0], products[1], tol=5e-4)
print("Takeaway: rescaling moves uncertainty between domains but cannot beat the product bound.")