Series and Sequences — Theory Notebook

Converted from theory.ipynb for web reading.

"An infinite series is merely a sequence of partial sums — the question is whether that sequence has a limit."

Interactive exploration of sequences, series convergence, power series, Taylor approximations, and their roles in modern machine learning.

Code cell 2

import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl

try:
    import seaborn as sns
    sns.set_theme(style="whitegrid", palette="colorblind")
    HAS_SNS = True
except ImportError:
    plt.style.use("seaborn-v0_8-whitegrid")
    HAS_SNS = False

mpl.rcParams.update({
    "figure.figsize":    (10, 6),
    "figure.dpi":         120,
    "font.size":           13,
    "axes.titlesize":      15,
    "axes.labelsize":      13,
    "xtick.labelsize":     11,
    "ytick.labelsize":     11,
    "legend.fontsize":     11,
    "legend.framealpha":   0.85,
    "lines.linewidth":      2.0,
    "axes.spines.top":     False,
    "axes.spines.right":   False,
    "savefig.bbox":       "tight",
    "savefig.dpi":         150,
})
np.random.seed(42)
print("Plot setup complete.")

Code cell 3

import numpy as np
import numpy.linalg as la
from scipy import integrate, special, stats
from math import factorial
import matplotlib.patches as patches

COLORS = {
    "primary": "#0077BB",
    "secondary": "#EE7733",
    "tertiary": "#009988",
    "error": "#CC3311",
    "neutral": "#555555",
    "highlight": "#EE3377",
}
HAS_MPL = True
np.set_printoptions(precision=8, suppress=True)
np.random.seed(42)

def header(title):
    print("\n" + "=" * len(title))
    print(title)
    print("=" * len(title))

def check_true(name, cond):
    ok = bool(cond)
    print(f"{'PASS' if ok else 'FAIL'} - {name}")
    return ok

def check_close(name, got, expected, tol=1e-8):
    ok = np.allclose(got, expected, atol=tol, rtol=tol)
    print(f"{'PASS' if ok else 'FAIL'} - {name}: got {got}, expected {expected}")
    return ok



def centered_diff(f, x, h=1e-6):
    return (f(x + h) - f(x - h)) / (2 * h)

def forward_diff(f, x, h=1e-6):
    return (f(x + h) - f(x)) / h

def backward_diff(f, x, h=1e-6):
    return (f(x) - f(x - h)) / h



def grad_check(f, x, analytic_grad, h=1e-6):
    x = np.asarray(x, dtype=float)
    analytic_grad = np.asarray(analytic_grad, dtype=float)
    numeric_grad = np.zeros_like(x, dtype=float)
    for idx in np.ndindex(x.shape):
        x_plus = x.copy(); x_minus = x.copy()
        x_plus[idx] += h; x_minus[idx] -= h
        numeric_grad[idx] = (f(x_plus) - f(x_minus)) / (2 * h)
    denom = la.norm(analytic_grad) + la.norm(numeric_grad) + 1e-12
    return la.norm(analytic_grad - numeric_grad) / denom



def check(name, got, expected, tol=1e-8):
    return check_close(name, got, expected, tol=tol)

print("Chapter helper setup complete.")

1. Sequences — Visualizing Convergence

A sequence $(a_n)$ converges to $L$ if for every $\epsilon > 0$ there exists $N$ such that $|a_n - L| < \epsilon$ for all $n > N$.

Key examples:

  • $a_n = 1/n \to 0$ (converges)
  • $a_n = (-1)^n$ (diverges — oscillates)
  • $a_n = (1 + 1/n)^n \to e$ (converges)

Code cell 5

# === 1.1 Sequence Convergence Visualization ===

ns = np.arange(1, 101)

sequences = {
    r'$1/n \to 0$': (1/ns, 0),
    r'$(-1)^n/n \to 0$': ((-1)**ns / ns, 0),
    r'$(1+1/n)^n \to e$': ((1 + 1/ns)**ns, np.e),
    r'$n/(n+1) \to 1$': (ns/(ns+1), 1),
}

if HAS_MPL:
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    for ax, (label, (seq, limit)) in zip(axes.flat, sequences.items()):
        ax.plot(ns, seq, 'o', markersize=3, color='steelblue', alpha=0.7)
        ax.axhline(limit, color='red', lw=2, linestyle='--', label=f'Limit = {limit:.4f}')
        ax.set_title(label, fontsize=13)
        ax.set_xlabel('n'); ax.legend()
    plt.suptitle('Sequence Convergence Examples', fontsize=15)
    plt.tight_layout()
    plt.show()

# Numerical verification
print('Convergence verification:')
print(f'  (1 + 1/n)^n at n=10000: {(1 + 1/10000)**10000:.10f}')
print(f'  e = {np.e:.10f}')
check('(1+1/n)^n → e', (1+1/10000)**10000, np.e, tol=1e-4)

Code cell 6

# === 1.2 Epsilon-N Definition — Interactive Proof ===

# Prove: 1/n -> 0
# For given epsilon, find N such that |1/n - 0| < epsilon for all n > N

print('Epsilon-N demo for a_n = 1/n -> 0')
print('=' * 50)

for epsilon in [0.1, 0.01, 0.001, 0.0001]:
    N = int(np.ceil(1 / epsilon))  # Need n > 1/epsilon
    # Verify: for all n > N, |1/n| < epsilon
    test_ns = np.arange(N+1, N+100)
    max_error = np.max(1/test_ns)
    satisfied = max_error < epsilon
    print(f'  epsilon={epsilon:.4f}: N={N}, max |a_n - 0| for n>N = {max_error:.6f} < {epsilon}? {satisfied}')

print()
print('Proof: given epsilon > 0, choose N = ceil(1/epsilon).')
print('Then for n > N: |1/n| < 1/N <= epsilon. QED')

2. Monotone Convergence Theorem

Theorem: Every bounded monotone sequence converges.

If $(a_n)$ is increasing and bounded above, it converges to $\sup\{a_n\}$.

Key application: $a_n = (1 + 1/n)^n$ is increasing and bounded by 3 — this proves $e$ exists without knowing its value in advance.

Code cell 8

# === 2.1 Monotone Convergence Theorem Demo ===

ns = np.arange(1, 501)  # n = 1, ..., 500 (so the n=500 printouts below are accurate)

# Sequence 1: increasing, bounded above
a = (1 + 1/ns)**ns  # Converges to e
print('a_n = (1+1/n)^n:')
print(f'  Increasing? {np.all(np.diff(a) > 0)}')
print(f'  Bounded above by 3? {np.all(a < 3)}')
print(f'  Limit (n=500): {a[-1]:.10f}')
print(f'  True e:        {np.e:.10f}')

# Sequence 2: decreasing, bounded below
b = 1 + 1/ns  # Decreasing to 1
print(f'\nb_n = 1 + 1/n:')
print(f'  Decreasing? {np.all(np.diff(b) < 0)}')
print(f'  Bounded below by 1? {np.all(b > 1)}')
print(f'  Limit (n=500): {b[-1]:.10f}')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    axes[0].plot(ns[:50], a[:50], 'b-o', ms=3)
    axes[0].axhline(np.e, color='r', ls='--', label=f'e = {np.e:.4f}')
    axes[0].set_title('Increasing, bounded above: $(1+1/n)^n \\to e$')
    axes[0].set_xlabel('n'); axes[0].legend()
    axes[1].plot(ns[:50], b[:50], 'g-o', ms=3)
    axes[1].axhline(1, color='r', ls='--', label='Limit = 1')
    axes[1].set_title('Decreasing, bounded below: $1+1/n \\to 1$')
    axes[1].set_xlabel('n'); axes[1].legend()
    plt.suptitle('Monotone Convergence Theorem', fontsize=14)
    plt.tight_layout(); plt.show()

3. Geometric Series

$$\sum_{n=0}^\infty r^n = \frac{1}{1-r}, \quad |r| < 1$$

Proof: $S_N = 1 + r + \cdots + r^N = \frac{1-r^{N+1}}{1-r} \to \frac{1}{1-r}$ as $N \to \infty$ when $|r| < 1$.

ML connection: Adam bias correction, discounted returns in RL.

Code cell 10

# === 3.1 Geometric Series — Convergence and Applications ===

def geometric_partial_sum(r, N):
    """Sum of r^0 + r^1 + ... + r^N"""
    return (1 - r**(N+1)) / (1 - r)

def geometric_sum(r):
    """Exact sum: 1/(1-r) for |r| < 1"""
    return 1 / (1 - r)

print('Geometric series convergence:')
for r in [0.5, 0.9, 0.99, -0.5]:
    exact = geometric_sum(r)
    partial_100 = geometric_partial_sum(r, 100)
    print(f'  r={r:5.2f}: exact={exact:.8f}, N=100 approx={partial_100:.8f}, '
          f'error={abs(exact-partial_100):.2e}')

print()
print('=== RL: Discounted return G_0 = sum gamma^t * r_t (r_t=1 constant) ===')
for gamma in [0.9, 0.95, 0.99]:
    G0 = 1 / (1 - gamma)  # exact geometric sum
    G0_sim = sum(gamma**t for t in range(1000))  # simulated
    print(f'  gamma={gamma}: G_0 = 1/(1-gamma) = {G0:.4f}, simulated (1000 steps) = {G0_sim:.4f}')

print()
print('=== Adam: bias correction uses geometric series ===')
beta1, T = 0.9, [1, 5, 10, 50, 100]
for t in T:
    correction = 1 - beta1**t
    print(f'  t={t:3d}: 1 - beta^t = {correction:.6f}, correction factor = {1/correction:.4f}')

Code cell 11

# === 3.2 Harmonic Series Diverges — p-Series Comparison ===

ns = np.arange(1, 10001)

# Harmonic series partial sums
harmonic_partial = np.cumsum(1/ns)

# p-series for comparison
p2_partial = np.cumsum(1/ns**2)  # converges to pi^2/6
p15_partial = np.cumsum(1/ns**1.5)  # converges
p09_partial = np.cumsum(1/ns**0.9)  # diverges

print('Series behavior at n=10000:')
print(f'  Harmonic Sigma(1/n):     S_10000 = {harmonic_partial[-1]:.4f} (diverges, grows like ln(n))')
print(f'  p=0.9: Sigma(1/n^0.9):   S_10000 = {p09_partial[-1]:.4f} (diverges)')
print(f'  p=1.5: Sigma(1/n^1.5):   S_10000 = {p15_partial[-1]:.4f} (converges)')
print(f'  p=2:   Sigma(1/n^2):     S_10000 = {p2_partial[-1]:.6f} (converges to pi^2/6={np.pi**2/6:.6f})')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    idx = np.logspace(0, 4, 200).astype(int) - 1
    axes[0].semilogx(ns[idx], harmonic_partial[idx], label='Harmonic (p=1)', lw=2)
    axes[0].semilogx(ns[idx], p09_partial[idx], label='p=0.9', lw=2)
    axes[0].semilogx(ns[idx], p15_partial[idx], label='p=1.5', lw=2)
    axes[0].semilogx(ns[idx], p2_partial[idx], label='p=2', lw=2)
    axes[0].set_xlabel('n (log scale)'); axes[0].set_ylabel('Partial sum')
    axes[0].set_title('p-Series: Partial Sums'); axes[0].legend()
    # Show harmonic grows like ln(n)
    axes[1].semilogx(ns[idx], harmonic_partial[idx], 'b-', label='Harmonic', lw=2)
    axes[1].semilogx(ns[idx], np.log(ns[idx]), 'r--', label='ln(n)', lw=2)
    axes[1].set_xlabel('n (log scale)'); axes[1].set_title('Harmonic ~ ln(n)'); axes[1].legend()
    plt.tight_layout(); plt.show()

4. Convergence Tests

| Test | Best for | Converges when | Diverges when | Inconclusive |
|------|----------|----------------|---------------|--------------|
| Divergence | Any | | $a_n \not\to 0$ | $a_n \to 0$ |
| Ratio | Factorials, $c^n$ | $L < 1$ | $L > 1$ | $L = 1$ |
| Root | $(f(n))^n$ | $L < 1$ | $L > 1$ | $L = 1$ |
| Alternating | $\sum(-1)^n b_n$ | $b_n \searrow 0$ | | $b_n$ not decreasing |
| Integral | Decreasing $f(n)$ | $\int f < \infty$ | $\int f = \infty$ | $f$ not monotone |
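The ratio and alternating tests get demos in the cells below, but the integral test does not, so here is a minimal added sketch (not an original cell). For decreasing $f$, $\sum f(n)$ converges exactly when $\int_1^\infty f(x)\,dx$ is finite; a closed-form antiderivative of $x^{-p}$ stands in for numerical quadrature.

# === Integral test sketch (added; uses f(x) = x^(-p)) ===

import numpy as np

def integral_1_to_b(p, b):
    """Closed form of integral_1^b x^(-p) dx."""
    return np.log(b) if p == 1.0 else (b**(1.0 - p) - 1.0) / (1.0 - p)

print('Integral test: does integral_1^b x^(-p) dx stabilize as b grows?')
for p in [0.9, 1.0, 1.5, 2.0]:
    vals = [integral_1_to_b(p, b) for b in (1e2, 1e4, 1e6)]
    verdict = 'converges' if p > 1 else 'diverges'
    print(f'  p={p}: b=1e2/1e4/1e6 -> {vals[0]:10.4f} {vals[1]:10.4f} {vals[2]:10.4f}'
          f'  => series {verdict}')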

Code cell 13

# === 4.1 Ratio Test Demo ===

print('=== Ratio Test Examples ===')

from math import factorial

def ratio_test_limit(a_fn, N=50):
    """Estimate ratio |a_{n+1}/a_n| as n -> infinity"""
    ratios = [abs(a_fn(n+1)/a_fn(n)) for n in range(10, N)]
    return ratios[-1]  # converged value

examples = [
    ('sum 2^n/n!',   lambda n: 2**n / factorial(n),    'L=0 < 1, converges'),
    ('sum n!/n^n',   lambda n: factorial(n) / n**n,    'L=1/e < 1, converges'),
    ('sum n!*x^n x=0.1', lambda n: factorial(n)*0.1**n, 'L->inf, diverges'),
    ('sum 1/n^2',    lambda n: 1/n**2,                  'L=1 (inconclusive)'),
]

for name, a_fn, expected in examples:
    L = ratio_test_limit(a_fn, N=30)
    verdict = 'converges' if L < 0.999 else ('diverges' if L > 1.001 else 'inconclusive')
    print(f'  {name}: L = {L:.6f} -> {verdict}')
    print(f'    (Expected: {expected})')

print()
print('=== Alternating Series Test ===')
# Alternating harmonic: sum (-1)^(n+1)/n = ln(2)
alt_harmonic = sum((-1)**(n+1)/n for n in range(1, 100001))
print(f'Alternating harmonic (100k terms): {alt_harmonic:.10f}')
print(f'ln(2):                              {np.log(2):.10f}')
check('Alternating harmonic = ln(2)', alt_harmonic, np.log(2), tol=1e-5)

5. Power Series and Radius of Convergence

A power series $\sum_{n=0}^\infty c_n(x-a)^n$ converges on an interval $(a-R,\, a+R)$, where $R$ is the radius of convergence:

$$R = \lim_{n\to\infty}\left|\frac{c_n}{c_{n+1}}\right| \quad \text{(ratio method)}$$

Inside the radius: absolute convergence. Outside: divergence. Endpoints: must check separately.
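A minimal added sketch of the endpoint caveat, using $\sum (-1)^{n+1} x^n/n$ (the $\ln(1+x)$ series, $R = 1$): at $x = 1$ it converges to $\ln 2$, while at $x = -1$ every term becomes $-1/n$ and the series diverges.

# === Endpoint check sketch (added) ===

import numpy as np

def log_series_partial(x, N):
    """Partial sum of sum_{n=1}^{N} (-1)^(n+1) x^n / n (radius R = 1)."""
    n = np.arange(1, N + 1)
    return np.sum((-1.0)**(n + 1) * x**n / n)

for N in [100, 1000, 10000]:
    plus = log_series_partial(1.0, N)    # alternating harmonic -> ln(2)
    minus = log_series_partial(-1.0, N)  # every term is -1/n -> diverges
    print(f'N={N:5d}: x=+1 -> {plus:.6f} (ln 2 = {np.log(2):.6f}), '
          f'x=-1 -> {minus:.3f} (keeps falling)')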

Code cell 15

# === 5.1 Radius of Convergence — Numerical Computation ===

import numpy as np

def compute_R_ratio(coeffs):
    """Estimate R = lim |c_n/c_{n+1}| from coefficient sequence"""
    ratios = [abs(coeffs[n]/coeffs[n+1]) for n in range(len(coeffs)-1)]
    return ratios[-1]  # last ratio as estimate of limit

# Example 1: sum x^n/n! (exponential series)
from math import factorial
N = 30
c_exp = [1/factorial(n) for n in range(N)]
R_exp = compute_R_ratio(c_exp)
print(f'exp series c_n = 1/n!: R = {R_exp:.2e} (-> infinity, converges everywhere)')

# Example 2: sum x^n (geometric series)
c_geom = [1.0] * N
R_geom = compute_R_ratio(c_geom)
print(f'Geometric series c_n = 1: R = {R_geom:.4f} (should be 1)')

# Example 3: sum x^n/n (log series)
c_log = [1/n for n in range(1, N+1)]
R_log = compute_R_ratio(c_log)
print(f'Log series c_n = 1/n: R = {R_log:.4f} (should be 1)')

# Example 4: sum n!*x^n
c_fact = [float(factorial(n)) for n in range(1, N+1)]
R_fact = compute_R_ratio(c_fact)
print(f'Factorial series c_n = n!: R = {R_fact:.4f} (-> 0)')

# Visualize convergence for geometric series sum x^n at different x
print()
print('Geometric series sum x^n at various x (N=50 terms):')
for x in [-0.9, -0.5, 0.0, 0.5, 0.9, 0.99, 1.0, 1.1]:
    S50 = sum(x**n for n in range(50))
    exact = 1/(1-x) if abs(x) < 1 else float('inf')
    print(f'  x={x:5.2f}: S_50={S50:12.4f}, exact={str(round(exact,4)):>10}')

Code cell 16

# === 5.2 Power Series — Visualization Within/Outside Radius ===

import numpy as np

# Partial sum of geometric series at order N
def geom_partial(x, N):
    return sum(x**n for n in range(N))

if HAS_MPL:
    x_vals = np.linspace(-1.5, 1.5, 400)
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left: partial sums converging inside |x| < 1
    x_inside = np.linspace(-0.95, 0.95, 200)
    exact = 1 / (1 - x_inside)
    axes[0].plot(x_inside, exact, 'k-', lw=3, label='Exact: 1/(1-x)')
    for N in [1, 3, 5, 10, 20]:
        approx = np.array([geom_partial(x, N) for x in x_inside])
        axes[0].plot(x_inside, approx, alpha=0.6, label=f'N={N}')
    axes[0].set_ylim(-5, 5); axes[0].set_xlim(-1.05, 1.05)
    axes[0].axvline(-1, ls=':', color='gray'); axes[0].axvline(1, ls=':', color='gray')
    axes[0].set_title('Partial sums inside radius of convergence |x|<1')
    axes[0].set_xlabel('x'); axes[0].legend(fontsize=9)

    # Right: error vs n inside radius for different x values
    ns = np.arange(1, 51)
    for x, col in [(0.5, 'blue'), (0.8, 'orange'), (0.95, 'red')]:
        errors = [abs(geom_partial(x, n) - 1/(1-x)) for n in ns]
        axes[1].semilogy(ns, errors, color=col, label=f'x={x}')
    axes[1].set_xlabel('N (number of terms)'); axes[1].set_ylabel('|Error|')
    axes[1].set_title('Convergence speed depends on |x|/R'); axes[1].legend()
    plt.tight_layout(); plt.show()

6. Taylor Series

$$f(x) = \sum_{n=0}^\infty \frac{f^{(n)}(a)}{n!}(x-a)^n = f(a) + f'(a)(x-a) + \frac{f''(a)}{2}(x-a)^2 + \cdots$$

Key Maclaurin series (at $a=0$):

| Function | Series | Radius |
|----------|--------|--------|
| $e^x$ | $\sum x^n/n!$ | $\infty$ |
| $\sin x$ | $\sum (-1)^n x^{2n+1}/(2n+1)!$ | $\infty$ |
| $\cos x$ | $\sum (-1)^n x^{2n}/(2n)!$ | $\infty$ |
| $\ln(1+x)$ | $\sum (-1)^{n+1} x^n/n$ | $1$ |
| $1/(1-x)$ | $\sum x^n$ | $1$ |

Code cell 18

# === 6.1 Maclaurin Series Verification ===

import numpy as np
from math import factorial

def taylor_exp(x, N):
    return sum(x**n / factorial(n) for n in range(N))

def taylor_sin(x, N):
    return sum((-1)**n * x**(2*n+1) / factorial(2*n+1) for n in range(N))

def taylor_cos(x, N):
    return sum((-1)**n * x**(2*n) / factorial(2*n) for n in range(N))

def taylor_log1p(x, N):
    return sum((-1)**(n+1) * x**n / n for n in range(1, N+1))

print('Taylor Series Verification (N=20 terms):')
test_points = [0.5, 1.0, 2.0]
N = 20

for x in test_points:
    print(f'\n  x = {x}:')
    e_approx = taylor_exp(x, N)
    check(f'    exp({x})', e_approx, np.exp(x), tol=1e-10)
    s_approx = taylor_sin(x, N)
    check(f'    sin({x})', s_approx, np.sin(x), tol=1e-10)
    c_approx = taylor_cos(x, N)
    check(f'    cos({x})', c_approx, np.cos(x), tol=1e-10)

# Log series only valid for |x| < 1
for x in [0.2, 0.5, 0.9]:
    l_approx = taylor_log1p(x, 50)
    check(f'log(1+{x})', l_approx, np.log(1+x), tol=1e-5)

Code cell 19

# === 6.2 Taylor Polynomial Approximation Quality ===

import numpy as np
from math import factorial

if HAS_MPL:
    x = np.linspace(-3*np.pi, 3*np.pi, 500)
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # sin(x) Taylor polynomials
    axes[0].plot(x, np.sin(x), 'k-', lw=3, label='sin(x)')
    for N, color in [(1, 'red'), (3, 'orange'), (5, 'green'), (7, 'blue'), (9, 'purple')]:
        T_N = sum((-1)**n * x**(2*n+1)/factorial(2*n+1) for n in range((N+1)//2))
        axes[0].plot(x, np.clip(T_N, -5, 5), color=color, alpha=0.7, label=f'T_{N}(x)')
    axes[0].set_ylim(-3, 3); axes[0].set_xlim(-3*np.pi, 3*np.pi)
    axes[0].set_title('Taylor Polynomials for sin(x)')
    axes[0].set_xlabel('x'); axes[0].legend(fontsize=9)

    # Error as function of x for fixed N
    x_err = np.linspace(-5, 5, 300)
    for N, color in [(3, 'red'), (5, 'blue'), (7, 'green'), (9, 'purple')]:
        T_N = sum((-1)**n * x_err**(2*n+1)/factorial(2*n+1) for n in range((N+1)//2))
        err = np.abs(np.sin(x_err) - T_N)
        axes[1].semilogy(x_err, err + 1e-20, color=color, label=f'N={N}')
    axes[1].set_xlabel('x'); axes[1].set_ylabel('|sin(x) - T_N(x)|')
    axes[1].set_title('Approximation Error vs x')
    axes[1].legend(); axes[1].set_ylim(1e-20, 1e5)
    plt.tight_layout(); plt.show()

Code cell 20

# === 6.3 Lagrange Remainder — Verified Error Bounds ===

import numpy as np
from math import factorial

# Bound for e^x Taylor approximation at x = 1
# |R_N(1)| <= e^1 / (N+1)! (since |f^(N+1)(x)| = e^x <= e^1 on [0,1])

print('Lagrange remainder bound for e^x at x=1 (a=0):')
print(f'True value: e = {np.e:.15f}')
print()

for N in [5, 10, 15, 20]:
    T_N = sum(1/factorial(n) for n in range(N+1))
    actual_error = abs(np.e - T_N)
    lagrange_bound = np.e / factorial(N+1)  # M = e, |x-a|^(N+1) = 1
    print(f'  N={N:2d}: T_N = {T_N:.15f}')
    print(f'        Actual error:    {actual_error:.3e}')
    print(f'        Lagrange bound:  {lagrange_bound:.3e}')
    print(f'        Bound holds?     {actual_error <= lagrange_bound}')

print()
# How many terms needed for 1e-15 precision?
for N in range(1, 25):
    bound = np.e / factorial(N+1)
    if bound < 1e-15:
        print(f'N={N} terms gives Lagrange bound {bound:.2e} < 1e-15 (float64 precision)')
        break

7. ML Applications

7.1 Softmax Temperature

$$\text{softmax}(\mathbf{x}/T)_i = \frac{e^{x_i/T}}{\sum_j e^{x_j/T}}$$

  • $T \to 0$: one-hot on argmax (hard attention, Gumbel-Softmax)
  • $T \to \infty$: uniform distribution (knowledge distillation soft targets)
  • Taylor analysis: $e^{x_i/T} \approx 1 + x_i/T$ for large $T$

Code cell 22

# === 7.1 Softmax Temperature Analysis ===

import numpy as np

def softmax_temp(x, T):
    x = np.array(x, dtype=float)
    z = x / T
    z -= z.max()  # numerical stability
    e = np.exp(z)
    return e / e.sum()

logits = np.array([2.0, 1.0, 0.1, -1.0])
print('Logits:', logits)
print()
print(f"{'T':>8} | {'p[0]':>10} | {'p[1]':>10} | {'p[2]':>10} | {'p[3]':>10} | Entropy")
print('-' * 65)

temps = [0.01, 0.1, 0.5, 1.0, 2.0, 10.0, 100.0]
for T in temps:
    p = softmax_temp(logits, T)
    H = -np.sum(p * np.log(p + 1e-30))  # entropy
    print(f'{T:8.2f} | {p[0]:10.6f} | {p[1]:10.6f} | {p[2]:10.6f} | {p[3]:10.6f} | {H:.4f}')

print(f'  Uniform entropy (max): {np.log(4):.4f}')
print(f'  One-hot entropy (min): 0.0000')

# Verify: Taylor says for large T, p_i ≈ 1/n + (x_i - mean(x))/(n*T)
T = 100.0
p = softmax_temp(logits, T)
n = len(logits)
taylor_approx = 1/n + (logits - logits.mean())/(n * T)
print(f'\nT={T}: Softmax vs first-order Taylor:')
for i in range(n):
    print(f'  p[{i}] = {p[i]:.8f}, Taylor: {taylor_approx[i]:.8f}')

Code cell 23

# === 7.2 Adam Bias Correction via Geometric Series ===

import numpy as np

np.random.seed(42)

# Simulate Adam first moment with and without bias correction
T = 100
beta1 = 0.9
true_grad = 1.0  # constant true gradient
noise = 0.5

gradients = true_grad + noise * np.random.randn(T)

m = 0.0
m_raw = []
m_corrected = []

for t in range(1, T+1):
    g = gradients[t-1]
    m = beta1 * m + (1 - beta1) * g
    m_raw.append(m)
    m_corrected.append(m / (1 - beta1**t))

# Theoretical bias: E[m_t] = (1 - beta^t) * true_grad
theoretical_raw = [(1 - beta1**t) * true_grad for t in range(1, T+1)]

print(f'True gradient: {true_grad}')
print(f'beta1 = {beta1}')
print()
print(f"{'t':>5} | {'m_raw':>12} | {'m_corrected':>14} | {'theory_raw':>12}")
for t in [1, 2, 5, 10, 20, 50, 100]:
    print(f'{t:5d} | {m_raw[t-1]:12.6f} | {m_corrected[t-1]:14.6f} | {theoretical_raw[t-1]:12.6f}')

# Geometric series proof
print()
print('Geometric series identity: sum_{k=0}^{t-1} beta^k = (1-beta^t)/(1-beta)')
t_check = 10
geom_sum = sum(beta1**k for k in range(t_check))
formula = (1 - beta1**t_check) / (1 - beta1)
check(f'Geometric sum = formula (t={t_check})', geom_sum, formula)

Code cell 24

# === 7.3 Linear Attention via Taylor Expansion ===

import numpy as np

# Compare standard vs linear attention on a small example
np.random.seed(42)
n, d = 8, 4  # sequence length, head dim

Q = np.random.randn(n, d) / np.sqrt(d)
K = np.random.randn(n, d) / np.sqrt(d)
V = np.random.randn(n, d)

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

# Standard attention: O(n^2 d)
scores = Q @ K.T / np.sqrt(d)  # n x n
attn = softmax(scores)           # n x n
out_standard = attn @ V          # n x d

# Linear attention (1st-order Taylor: phi(x) = [1, x])
# exp(q.k) approx 1 + q.k = phi(q).phi(k) where phi(x) = [1, x1, ..., xd].
# The softmax scores carry a 1/sqrt(d) factor, so fold it into the features
# by rescaling q and k with d**(-1/4) each before featurizing.
def phi(x):
    return np.hstack([np.ones((x.shape[0], 1)), x])  # [1, x] features

pQ = phi(Q / d**0.25)  # n x (d+1)
pK = phi(K / d**0.25)  # n x (d+1)

# Compute S = sum_j phi(k_j) v_j^T and z = sum_j phi(k_j) in O(n*d)
S = pK.T @ V  # (d+1) x d
z = pK.sum(axis=0)  # (d+1,)

out_linear = np.array([pQ[i] @ S / (pQ[i] @ z) for i in range(n)])  # n x d

error = np.abs(out_standard - out_linear).mean()
print(f'Standard attention output shape: {out_standard.shape}')
print(f'Linear attention output shape:   {out_linear.shape}')
print(f'Mean absolute error: {error:.6f}')
print(f'Relative error: {error / np.abs(out_standard).mean():.4f}')
print()
print('Standard vs Linear (first 3 rows, first 2 dims):')
for i in range(3):
    print(f'  Row {i}: standard={out_standard[i,:2]}, linear={out_linear[i,:2]}')

8. Fourier Series and Positional Encodings

The sinusoidal positional encoding from the original Transformer (Vaswani et al., 2017):

$$PE_{p,2k} = \sin\left(\frac{p}{10000^{2k/d}}\right), \quad PE_{p,2k+1} = \cos\left(\frac{p}{10000^{2k/d}}\right)$$

This is a Fourier encoding: position $p$ is represented as a superposition of sinusoids at $d/2$ different frequencies $\omega_k = 10000^{-2k/d}$.

Code cell 26

# === 8.1 Sinusoidal Positional Encoding Visualization ===

import numpy as np

def sinusoidal_pe(positions, d_model):
    """Compute sinusoidal positional encodings."""
    PE = np.zeros((len(positions), d_model))
    for i, pos in enumerate(positions):
        for k in range(d_model // 2):
            omega = 1.0 / (10000 ** (2*k / d_model))
            PE[i, 2*k]   = np.sin(pos * omega)
            PE[i, 2*k+1] = np.cos(pos * omega)
    return PE

# Compute PEs for positions 0..99 and d_model=64
positions = np.arange(100)
d = 64
PE = sinusoidal_pe(positions, d)

print(f'PE shape: {PE.shape} (positions x d_model)')
print(f'PE values range: [{PE.min():.4f}, {PE.max():.4f}]')

# Key property: inner product PE[m] . PE[n] depends on m-n
def pe_dot(m, n, PE):
    return PE[m] @ PE[n]

print()
print('Inner products PE[m].PE[n] (should depend mainly on |m-n|):')
for (m, n) in [(0, 0), (10, 10), (0, 5), (10, 15), (0, 10), (10, 20)]:
    dot = pe_dot(m, n, PE)
    print(f'  m={m:2d}, n={n:2d}, |m-n|={abs(m-n):2d}: dot = {dot:.4f}')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    im = axes[0].imshow(PE, aspect='auto', cmap='RdBu', vmin=-1, vmax=1)
    plt.colorbar(im, ax=axes[0])
    axes[0].set_xlabel('Dimension (d)'); axes[0].set_ylabel('Position')
    axes[0].set_title('Sinusoidal Positional Encodings')
    # Show a few frequency channels
    for k, color in [(0, 'blue'), (2, 'orange'), (8, 'green')]:
        axes[1].plot(positions[:40], PE[:40, 2*k], color=color,
                     label=f'dim {2*k} (freq={1/10000**(2*k/d):.4f})')
    axes[1].set_xlabel('Position'); axes[1].set_title('PE values across positions (sin channels)')
    axes[1].legend(fontsize=9)
    plt.tight_layout(); plt.show()

Code cell 27

# === 8.2 Log-Sum-Exp Numerical Stability ===

import numpy as np

def naive_logsumexp(x):
    return np.log(np.sum(np.exp(x)))

def stable_logsumexp(x):
    m = np.max(x)
    return m + np.log(np.sum(np.exp(x - m)))

# Test cases
test_cases = [
    np.array([1.0, 2.0, 3.0]),         # normal range
    np.array([100.0, 200.0, 300.0]),    # large values
    np.array([-100.0, -50.0, 0.0]),     # large negative
    np.array([700.0, 710.0, 720.0]),    # near float64 overflow
    np.array([1000.0, 1001.0, 1002.0]),# overflow for float32
]

print(f"{'x':>35} | {'naive':>15} | {'stable':>15}")
print('-' * 75)
for x in test_cases:
    try:
        naive = naive_logsumexp(x)
    except Exception:  # np.exp overflows to inf with a warning rather than raising
        naive = float('nan')
    stable = stable_logsumexp(x)
    print(f'{str(x.tolist()):>35} | {naive:>15.6f} | {stable:>15.6f}')

# Verify with scipy
from scipy.special import logsumexp
x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
check('stable_lse matches scipy', stable_logsumexp(x), logsumexp(x))

Summary

| Concept | Key Formula | ML Connection |
|---------|-------------|---------------|
| Geometric series | $\sum r^n = 1/(1-r)$ | Adam bias correction, RL returns |
| p-series criterion | converges iff $p > 1$ | Robbins-Monro conditions |
| Radius of convergence | $R = \lim \lvert c_n/c_{n+1} \rvert$ | When a Taylor approximation can be trusted |
| Taylor series | $\sum f^{(n)}(a)(x-a)^n/n!$ | Linear attention, optimizer derivations |
| Lagrange remainder | $\lvert R_N \rvert \le M \lvert x-a \rvert^{N+1}/(N+1)!$ | Error bounds for linear attention, GELU |
| Softmax temperature | $T \to 0$: argmax; $T \to \infty$: uniform | Sampling, knowledge distillation |
| Linear attention | $e^{q^\top k} \approx 1 + q^\top k$ | $O(n)$ transformer attention |
| Sinusoidal PE | $\sin(p/10000^{2k/d})$ | Relative position encoding |
| Log-sum-exp | $m + \ln\sum e^{x_i-m}$ | Numerically stable softmax |
| Euler's formula | $e^{i\theta} = \cos\theta + i\sin\theta$ | RoPE complex rotation encoding |
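The last row cites RoPE, which no cell above demonstrates, so here is a minimal added sketch assuming the standard rotary formulation: treat pairs of dimensions as complex numbers and rotate each pair by position times a per-pair frequency. By Euler's formula the rotated inner product between a query at position $m$ and a key at position $n$ depends only on the offset $m - n$.

# === RoPE sketch (added; standard rotary formulation assumed) ===

import numpy as np

np.random.seed(42)
d = 8  # head dim (even); d//2 complex pairs
theta = 10000.0 ** (-2.0 * np.arange(d // 2) / d)  # per-pair frequencies

def rope(x, pos):
    """View x as d//2 complex pairs and multiply by e^{i * pos * theta}."""
    xc = x[0::2] + 1j * x[1::2]
    return xc * np.exp(1j * pos * theta)

q, k = np.random.randn(d), np.random.randn(d)

print('Rotated inner products depend only on m - n:')
for m, n in [(3, 1), (13, 11), (30, 28)]:
    dot = np.vdot(rope(k, n), rope(q, m))  # vdot conjugates its first argument
    print(f'  m={m:2d}, n={n:2d}, m-n={m-n}: Re<q_m, k_n> = {dot.real:+.6f}')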

9. Series Convergence — Deep Dives

9.1 Absolute vs Conditional Convergence

Code cell 30

# === 9.1 Absolute vs. Conditional Convergence ===

import numpy as np

ns = np.arange(1, 10001)

# Alternating harmonic: conditionally convergent
alt_harm = (-1)**(ns+1) / ns
S_alt = np.cumsum(alt_harm)

# Absolute values: diverges (harmonic)
harm = 1.0 / ns
S_harm = np.cumsum(harm)

# Absolutely convergent: sum 1/n^2
p2 = 1.0 / ns**2
S_p2 = np.cumsum(p2)

print('Convergence comparison (N=10000):')
print(f'  Alternating harmonic sum: {S_alt[-1]:.10f}')
print(f'  ln(2):                    {np.log(2):.10f}')
print(f'  Harmonic (absolute val):  {S_harm[-1]:.4f} (diverges ~ ln(10000) = {np.log(10000):.4f})')
print(f'  Sum 1/n^2:                {S_p2[-1]:.10f}')
print(f'  pi^2/6:                   {np.pi**2/6:.10f}')

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    idx = np.arange(0, 1000, 10)  # subsample
    axes[0].plot(ns[idx], S_alt[idx], 'b-', label='Alternating harmonic', lw=2)
    axes[0].axhline(np.log(2), color='r', ls='--', label=f'ln(2)={np.log(2):.4f}')
    axes[0].set_title('Conditional convergence: alternating harmonic')
    axes[0].set_xlabel('N'); axes[0].legend()
    axes[1].plot(ns[idx], S_p2[idx], 'g-', label='Sum 1/n^2', lw=2)
    axes[1].axhline(np.pi**2/6, color='r', ls='--', label=f'π²/6={np.pi**2/6:.4f}')
    axes[1].set_title('Absolute convergence: p-series p=2')
    axes[1].set_xlabel('N'); axes[1].legend()
    plt.tight_layout(); plt.show()

Code cell 31

# === 9.2 Riemann Rearrangement Theorem Demo ===

import numpy as np

# The alternating harmonic series: sum (-1)^(n+1)/n = ln(2)
# Rearrangement: 2 positive, 1 negative => sum = 3/2 * ln(2)

def alt_harmonic_sum(N):
    return sum((-1)**(n+1)/n for n in range(1, N+1))

def rearranged_sum_2pos_1neg(K):
    """Take 2 positive terms then 1 negative term, K cycles"""
    pos_idx, neg_idx = 1, 2  # positives 1/1, 1/3, 1/5, ...; negatives 1/2, 1/4, ...
    total = 0.0
    for _ in range(K):
        # Two positive terms: 1/(2k-1)
        for _ in range(2):
            total += 1.0 / pos_idx
            pos_idx += 2
        # One negative term: -1/(2k)
        total -= 1.0 / neg_idx
        neg_idx += 2
    return total

N = 10000
standard = alt_harmonic_sum(N)
rearranged = rearranged_sum_2pos_1neg(N // 3)
expected_rearranged = 1.5 * np.log(2)  # theoretical: 3/2 * ln(2)

print('Riemann Rearrangement Theorem:')
print(f'  Standard alternating harmonic sum: {standard:.8f}')
print(f'  ln(2) = {np.log(2):.8f}')
print(f'  Rearranged (2 pos, 1 neg):         {rearranged:.8f}')
print(f'  3/2 * ln(2) = {expected_rearranged:.8f}')
print()
print('CONCLUSION: The SAME terms, in different order, give DIFFERENT sums!')
print('This only happens for CONDITIONALLY convergent series.')

10. Taylor Series Manipulation

Substitution: Replace $x$ with $g(x)$ in a known series.

From $e^x = \sum x^n/n!$: substitute $x \to -x^2$ to get $e^{-x^2} = \sum (-1)^n x^{2n}/n!$.

From $1/(1-x) = \sum x^n$: substitute $x \to -x$ and integrate to get $\ln(1+x)$; substitute $x \to -x^2$ to get $1/(1+x^2)$, then integrate to get $\arctan x$.

Code cell 33

# === 10.1 Taylor Series Composition and Integration ===

import numpy as np
from math import factorial

# Derive arctan(x) from 1/(1+x^2)
# 1/(1+x^2) = 1/(1-(-x^2)) = sum (-x^2)^n = sum (-1)^n x^{2n}
# arctan(x) = integral_0^x 1/(1+t^2) dt = sum (-1)^n x^{2n+1}/(2n+1)

def taylor_arctan(x, N):
    return sum((-1)**n * x**(2*n+1) / (2*n+1) for n in range(N))

def taylor_exp_neg_x2(x, N):
    """e^(-x^2) = sum (-1)^n x^(2n) / n!"""
    return sum((-1)**n * x**(2*n) / factorial(n) for n in range(N))

print('arctan(x) via Taylor (larger N needed as |x| -> 1):')
for x, N, tol in [(0.5, 30, 1e-8), (0.9, 200, 1e-8), (1.0, 10000, 1e-4)]:
    approx = taylor_arctan(x, N)
    exact = np.arctan(x)
    check(f'arctan({x}) (N={N})', approx, exact, tol=tol)

print()
print('Leibniz formula: pi/4 = 1 - 1/3 + 1/5 - ...')
for N in [10, 100, 1000, 10000]:
    approx = taylor_arctan(1.0, N)
    print(f'  N={N:5d}: pi/4 approx = {approx:.8f}, error = {abs(approx - np.pi/4):.2e}')

print()
print('e^(-x^2): compare Taylor to numpy (N=30 terms; x=2 needs ~25 terms):')
for x in [0.5, 1.0, 2.0]:
    approx = taylor_exp_neg_x2(x, 30)
    exact = np.exp(-x**2)
    check(f'exp(-{x}^2)', approx, exact, tol=1e-8)

Code cell 34

# === 10.2 Error Function via Taylor Series ===

import numpy as np
from math import factorial

# erf(x) = (2/sqrt(pi)) * integral_0^x e^(-t^2) dt
#        = (2/sqrt(pi)) * sum (-1)^n x^(2n+1) / (n! * (2n+1))

def taylor_erf(x, N):
    coeff = 2 / np.sqrt(np.pi)
    return coeff * sum((-1)**n * x**(2*n+1) / (factorial(n) * (2*n+1))
                       for n in range(N))

print('Error function erf(x) via Taylor series:')
for x in [0.5, 1.0, 1.5, 2.0]:
    for N in [5, 10, 20]:
        approx = taylor_erf(x, N)
        exact = float(special.erf(x))
        err = abs(approx - exact)
        print(f'  erf({x}) N={N:2d}: approx={approx:.8f}, exact={exact:.8f}, err={err:.2e}')

# GELU connection: GELU(x) = x * Phi(x) = x * 0.5 * (1 + erf(x/sqrt(2)))
print()
print('GELU via erf Taylor:')
def gelu_exact(x):
    return x * 0.5 * (1 + special.erf(x/np.sqrt(2)))

def gelu_taylor(x, N=15):
    erf_approx = taylor_erf(x/np.sqrt(2), N)
    return x * 0.5 * (1 + erf_approx)

for x in [-2.0, -1.0, 0.0, 0.5, 1.0, 2.0]:
    g_exact = gelu_exact(x)
    g_taylor = gelu_taylor(x, N=15)
    print(f'  GELU({x:5.1f}): exact={g_exact:.6f}, Taylor={g_taylor:.6f}, err={abs(g_exact-g_taylor):.2e}')

Code cell 35

# === 10.3 GELU vs ReLU — Taylor Perspective ===

import numpy as np

x = np.linspace(-3, 3, 300)

relu = np.maximum(0, x)
gelu = x * 0.5 * (1 + special.erf(x / np.sqrt(2)))
gelu_approx = 0.5 * x * (1 + np.tanh(np.sqrt(2/np.pi) * (x + 0.044715*x**3)))

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    axes[0].plot(x, relu, 'r-', lw=2, label='ReLU')
    axes[0].plot(x, gelu, 'b-', lw=2, label='GELU (exact)')
    axes[0].plot(x, gelu_approx, 'g--', lw=2, label='GELU (tanh approx)')
    axes[0].set_title('ReLU vs GELU')
    axes[0].set_xlabel('x'); axes[0].legend()
    # Derivative comparison
    h = x[1] - x[0]
    drelu = np.gradient(relu, h)
    dgelu = np.gradient(gelu, h)
    axes[1].plot(x, drelu, 'r-', lw=2, label="ReLU'")
    axes[1].plot(x, dgelu, 'b-', lw=2, label="GELU'")
    axes[1].set_title('Derivatives: ReLU is discontinuous, GELU is smooth')
    axes[1].set_xlabel('x'); axes[1].legend()
    plt.tight_layout(); plt.show()

# Verify approximation quality
max_err = np.max(np.abs(gelu - gelu_approx))
print(f'Max error of tanh GELU approximation: {max_err:.6f}')

11. Laplace Approximation

The Laplace approximation fits a Gaussian to a posterior by taking the quadratic Taylor expansion of $\log p(\theta \mid \mathcal{D})$ around the MAP $\hat\theta$:

$$p(\theta \mid \mathcal{D}) \approx \mathcal{N}\left(\hat\theta,\ \left[-\nabla^2 \log p(\hat\theta \mid \mathcal{D})\right]^{-1}\right)$$

Code cell 37

# === 11.1 Laplace Approximation Demo ===

import numpy as np
from scipy.stats import norm

# True posterior: Beta(alpha, beta) distribution, normalized
# log p(theta | data) = (a-1)*log(theta) + (b-1)*log(1-theta) + const
a, b = 7.0, 3.0  # alpha=7 heads, beta=3 tails -> posterior Beta(7,3)

def log_posterior(theta):
    if theta <= 0 or theta >= 1:
        return -np.inf
    return (a-1)*np.log(theta) + (b-1)*np.log(1-theta)

# MAP estimate: mode of Beta(a,b) = (a-1)/(a+b-2)
theta_map = (a - 1) / (a + b - 2)
print(f'MAP estimate: theta_hat = {theta_map:.6f}')

# Hessian at MAP (2nd derivative of log_posterior)
# d/dtheta: (a-1)/theta - (b-1)/(1-theta)
# d^2/dtheta^2: -(a-1)/theta^2 - (b-1)/(1-theta)^2
hessian_at_map = -(a-1)/theta_map**2 - (b-1)/(1-theta_map)**2
laplace_var = -1.0 / hessian_at_map  # Sigma = -H^{-1}
laplace_std = np.sqrt(laplace_var)

print(f'Hessian at MAP: {hessian_at_map:.4f}')
print(f'Laplace variance: {laplace_var:.6f}')
print(f'Laplace std: {laplace_std:.6f}')

# True Beta distribution stats
from scipy.stats import beta
true_mean = a / (a + b)
true_var = a * b / ((a+b)**2 * (a+b+1))
print(f'True Beta mean: {true_mean:.6f}, True Beta var: {true_var:.6f}')
print(f'Laplace mean:   {theta_map:.6f}, Laplace var:   {laplace_var:.6f}')

if HAS_MPL:
    thetas = np.linspace(0.01, 0.99, 300)
    true_pdf = beta.pdf(thetas, a, b)
    laplace_pdf = norm.pdf(thetas, theta_map, laplace_std)
    plt.figure(figsize=(9, 5))
    plt.plot(thetas, true_pdf, 'b-', lw=2, label=f'True Beta({a},{b})')
    plt.plot(thetas, laplace_pdf, 'r--', lw=2, label='Laplace approx (Gaussian)')
    plt.axvline(theta_map, ls=':', color='gray', label=f'MAP={theta_map:.3f}')
    plt.xlabel('theta'); plt.title('Laplace Approximation = Quadratic Taylor Expansion')
    plt.legend(); plt.show()

Code cell 38

# === 12. Learning Rate Schedules as Sequences ===

import numpy as np

T_max = 200
t = np.arange(1, T_max+1)
alpha_0 = 0.1

schedules = {
    'Constant': np.ones(T_max) * alpha_0,
    'Step decay (gamma=0.95)': alpha_0 * 0.95**(t//10),
    'Cosine': alpha_0/2 * (1 + np.cos(np.pi * t / T_max)),
    'Polynomial 1/t': alpha_0 / t,
    'Polynomial 1/sqrt(t)': alpha_0 / np.sqrt(t),
}

print('Robbins-Monro conditions:')
print('  Need: sum(alpha_t) = inf  AND  sum(alpha_t^2) < inf')
print('  (finite partial sums cannot decide this; verdicts below are analytic)')
print()
rm_verdict = {
    'Constant': 'fails: sum(alpha_t^2) = inf',
    'Step decay (gamma=0.95)': 'fails: sum(alpha_t) < inf',
    'Cosine': 'finite-horizon schedule; conditions not applicable',
    'Polynomial 1/t': 'satisfied: sum 1/t = inf, sum 1/t^2 < inf',
    'Polynomial 1/sqrt(t)': 'fails: sum(alpha_t^2) = sum 1/t = inf',
}
for name, alpha in schedules.items():
    print(f'  {name}:')
    print(f'    partial sums (T={T_max}): sum(alpha_t) = {np.sum(alpha):.4f}, '
          f'sum(alpha_t^2) = {np.sum(alpha**2):.6f}')
    print(f'    Robbins-Monro: {rm_verdict[name]}')

if HAS_MPL:
    plt.figure(figsize=(10, 5))
    for name, alpha in schedules.items():
        plt.semilogy(t, alpha, label=name, lw=2)
    plt.xlabel('Step t'); plt.ylabel('Learning rate (log scale)')
    plt.title('Learning Rate Schedules as Sequences')
    plt.legend(fontsize=9); plt.show()

Code cell 39

# === 13. Famous Series: Basel Problem and Euler's Identity ===

import numpy as np

# Basel problem: sum 1/n^2 = pi^2/6
ns = np.arange(1, 1000001)
basel_sum = np.sum(1.0/ns**2)
pi2_6 = np.pi**2 / 6
print(f'Basel problem: sum 1/n^2 (N=1M terms) = {basel_sum:.10f}')
print(f'pi^2/6 = {pi2_6:.10f}')
print(f'Error: {abs(basel_sum - pi2_6):.2e}')
check('Basel problem', basel_sum, pi2_6, tol=1e-5)

print()
# Euler's formula: e^(i*pi) + 1 = 0
# Verified via the exponential series for complex argument
def complex_exp_series(z, N=50):
    from math import factorial
    return sum(z**n / factorial(n) for n in range(N))

e_ipi = complex_exp_series(1j * np.pi, 50)
print(f"Euler's identity: e^(i*pi) + 1 = {e_ipi + 1:.2e} (should be ~0)")
check("e^(i*pi) = -1 (real part)", np.real(e_ipi), -1.0)
check("e^(i*pi) = -1 (imag part)", np.imag(e_ipi), 0.0)

print()
# sin and cos from exp
theta = np.pi / 4
e_itheta = complex_exp_series(1j * theta)
check('Re(e^(i*pi/4)) = cos(pi/4)', np.real(e_itheta), np.cos(theta))
check('Im(e^(i*pi/4)) = sin(pi/4)', np.imag(e_itheta), np.sin(theta))

Code cell 40

# === 14. Uniform vs. Pointwise Convergence ===

import numpy as np

# f_n(x) = x^n on [0, 1]
# Pointwise limit: f(x) = 0 for x in [0,1), f(1) = 1
# Convergence is NOT uniform on [0,1]: sup |x^n - f(x)| = 1 for all n

x = np.linspace(0, 1, 300)
x_interior = np.linspace(0, 0.99, 300)  # exclude x=1

if HAS_MPL:
    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    # Left: x^n curves
    for n, col in [(1, 'blue'), (5, 'orange'), (20, 'green'), (50, 'red')]:
        axes[0].plot(x, x**n, color=col, label=f'n={n}', lw=2)
    # Limiting function
    lim = np.zeros_like(x)
    lim[-1] = 1  # f(1) = 1
    axes[0].step(x, lim, 'k-', lw=3, where='post', label='f(x) = 0 (x<1), 1 (x=1)')
    axes[0].set_title('Pointwise but not uniform: $f_n(x) = x^n$ on [0,1]')
    axes[0].set_xlabel('x'); axes[0].legend(fontsize=9)
    # Right: sup norm vs n
    ns = np.arange(1, 101)
    sup_norms_01 = [np.max(x**n) for n in ns]  # on [0,1]: sup = 1 always
    sup_norms_09 = [np.max(x_interior**n) for n in ns]  # on [0,0.99]: -> 0
    axes[1].semilogy(ns, sup_norms_01, 'r-', lw=2, label='sup on [0,1]: stays at 1')
    axes[1].semilogy(ns, sup_norms_09, 'b-', lw=2, label='sup on [0,0.99]: -> 0 (uniform)')
    axes[1].set_xlabel('n'); axes[1].set_ylabel('sup|f_n - f|')
    axes[1].set_title('Uniform convergence fails on [0,1], holds on [0,0.99]')
    axes[1].legend(); plt.tight_layout(); plt.show()

print('Lesson: uniform convergence on [0,R] for ANY R < 1.')
print('This is why power series converge uniformly on closed sub-intervals inside the radius.')

Final Takeaways

  1. Sequences and series are limits: all convergence analysis reduces to the ε-N definition of limits from §04/01.

  2. Geometric series is the master key: ratio test, Adam bias correction, discounted returns, and power series theory all reduce to $\sum r^n = 1/(1-r)$.

  3. Taylor series = polynomial approximation with error control: the Lagrange remainder tells you exactly how good your approximation is — essential for quantifying the error in linear attention, GELU approximations, etc.

  4. Uniform convergence enables term-by-term operations: the justification for differentiating/integrating power series, and for many limit interchanges in ML theory (see the sketch after this list).

  5. Fourier series = Taylor series in a different basis: both are ways of decomposing functions into basis components. Positional encodings are Fourier series evaluated at discrete positions.
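A minimal added sketch for takeaway 4: inside the radius of convergence, power series may be differentiated term by term. Differentiating $\sum x^n = 1/(1-x)$ gives $\sum n x^{n-1} = 1/(1-x)^2$, which the truncated series reproduces for $|x| < 1$.

# === Term-by-term differentiation sketch (added) ===

import numpy as np

n = np.arange(1, 400)
for x in [0.3, 0.6, 0.9]:
    term_by_term = np.sum(n * x**(n - 1))  # differentiated geometric series
    closed_form = 1.0 / (1.0 - x)**2       # d/dx [1/(1-x)]
    print(f'x={x}: series = {term_by_term:.8f}, 1/(1-x)^2 = {closed_form:.8f}')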


15. Matrix Exponential

For a square matrix $A$, the matrix exponential uses the same power series:

$$e^A = \sum_{n=0}^\infty \frac{A^n}{n!} = I + A + \frac{A^2}{2!} + \cdots$$

This always converges. Applications in ML: continuous state space models (Mamba), SE(3)-equivariant networks (rotation as $e^{\hat\omega}$), flow matching.

Code cell 43

# === 15.1 Matrix Exponential via Power Series ===

import numpy as np
from math import factorial
from scipy.linalg import expm

def matrix_exp_series(A, N=30):
    """e^A = sum A^n / n! via series (naive, for small A)"""
    result = np.eye(A.shape[0])
    An = np.eye(A.shape[0])  # A^0
    for n in range(1, N):
        An = An @ A
        result += An / factorial(n)
    return result

# Example 1: rotation matrix (skew-symmetric A)
theta = np.pi / 4
A_rot = np.array([[0, -theta], [theta, 0]])  # 2D rotation generator
eA = matrix_exp_series(A_rot, N=30)
R_true = np.array([[np.cos(theta), -np.sin(theta)],
                    [np.sin(theta),  np.cos(theta)]])
print('Matrix exponential of rotation generator:')
print(f'e^A =\n{eA}')
print(f'R(theta) =\n{R_true}')
check('e^A = R(pi/4)', eA, R_true)

# Example 2: diagonal matrix
D = np.diag([1.0, 2.0, 3.0])
eD = matrix_exp_series(D, N=30)
eD_true = np.diag([np.e, np.e**2, np.e**3])
print()
print('e^diag(1,2,3) = diag(e, e^2, e^3)')
check('Matrix exp diagonal', eD, eD_true)

# State space model: x(t) = e^{At} x(0)
print()
print('Continuous SSM: x(t) = e^{At} x(0)')
A = np.array([[-0.5, 1.0], [-1.0, -0.5]])  # stable system
x0 = np.array([1.0, 0.0])
for t in [0.0, 0.5, 1.0, 2.0]:
    eAt = expm(A * t)
    xt = eAt @ x0
    print(f'  t={t:.1f}: x(t) = {xt}')

16. Convergence Acceleration

Slowly converging series can be accelerated. The Leibniz formula $\pi/4 = 1 - 1/3 + 1/5 - \cdots$ converges like $O(1/N)$ — very slow.

Machin's formula: $\pi/4 = 4\arctan(1/5) - \arctan(1/239)$

Since $|1/5| = 0.2$ and $|1/239| < 0.005$, each arctangent series converges rapidly.

Code cell 45

# === 16.1 Convergence Acceleration — Machin's Formula for pi ===

import numpy as np

def arctan_series(x, N):
    """arctan(x) = sum (-1)^n x^(2n+1)/(2n+1)"""
    return sum((-1)**n * x**(2*n+1) / (2*n+1) for n in range(N))

print('Computing pi using different series approaches:')
print()

# Leibniz: pi/4 = arctan(1) = 1 - 1/3 + 1/5 - ...
print('Leibniz (1 term = 1 Leibniz term):')
for N in [10, 100, 1000, 10000]:
    pi_approx = 4 * arctan_series(1.0, N)
    err = abs(pi_approx - np.pi)
    print(f'  N={N:5d}: pi = {pi_approx:.8f}, error = {err:.2e}')

print()
# Machin: pi/4 = 4*arctan(1/5) - arctan(1/239)
# arctan(1/5) converges like (1/5)^(2n+1)/(2n+1) -> very fast
print("Machin's formula: pi/4 = 4*arctan(1/5) - arctan(1/239):")
for N in [2, 3, 5, 10]:
    pi_machin = 4 * (4 * arctan_series(1/5, N) - arctan_series(1/239, N))
    err = abs(pi_machin - np.pi)
    print(f'  N={N}: pi = {pi_machin:.12f}, error = {err:.2e}')

# Euler acceleration of alternating series
print()
print('Error comparison at N=5 terms:')
leibniz_err = abs(4*arctan_series(1.0, 5) - np.pi)
machin_err = abs(4*(4*arctan_series(1/5, 5) - arctan_series(1/239, 5)) - np.pi)
print(f'  Leibniz N=5: {leibniz_err:.2e}')
print(f"  Machin N=5:  {machin_err:.2e}")
print(f"  Speedup: {leibniz_err/machin_err:.0f}x")

Code cell 46

# === 17. Zeta Function — Basel and Beyond ===

import numpy as np

# Compute zeta(s) for real s > 1
def zeta_partial(s, N=10000):
    ns = np.arange(1, N+1)
    return np.sum(1.0 / ns**s)

print('Riemann Zeta Function zeta(s) = sum 1/n^s:')
print(f"{'s':>6} | {'zeta(s)':>15} | {'Expected':>15}")
print('-' * 45)

known_values = [
    (2,   np.pi**2/6,    'pi^2/6'),
    (4,   np.pi**4/90,   'pi^4/90'),
    (6,   np.pi**6/945,  'pi^6/945'),
    (3,   1.2020569032,  'Apery constant'),
    (1.5, None,          'converges slowly'),
]

for s, exact, name in known_values:
    z = zeta_partial(s, N=100000)
    if exact is not None:
        print(f'{s:6.1f} | {z:15.10f} | {exact:15.10f} ({name})')
    else:
        print(f'{s:6.1f} | {z:15.10f} | (no closed form: {name})')

# Connection to prime numbers: Euler product formula
# zeta(s) = product_{p prime} 1/(1 - p^{-s})
print()
primes = [2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47]
s = 2
euler_product = 1.0
for p in primes:
    euler_product *= 1 / (1 - p**(-s))
print(f'Euler product over first 15 primes (s=2): {euler_product:.6f}')
print(f'zeta(2) = pi^2/6 = {np.pi**2/6:.6f}')

Code cell 47

# === 18. Higher-Order Taylor Attention Approximations ===

import numpy as np

np.random.seed(42)
n, d = 16, 8
Q = np.random.randn(n, d) * 0.5  # smaller scale for better Taylor approx
K = np.random.randn(n, d) * 0.5
V = np.random.randn(n, d)

def softmax_stable(x):
    x = x - x.max(axis=-1, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=-1, keepdims=True)

# Standard attention
scores = (Q @ K.T) / np.sqrt(d)
attn = softmax_stable(scores)
out_exact = attn @ V

# 1st-order Taylor: exp(s) ~ 1 + s, with s = q.k/sqrt(d). Fold the 1/sqrt(d)
# score scaling into the features by rescaling q and k with d**(-1/4) each.
def phi1(x):
    return np.hstack([np.ones((len(x),1)), x])

# 2nd-order Taylor: exp(s) ~ 1 + s + s^2/2. Since (q.k)^2 = (q x q).(k x k),
# split the 1/2 as 1/sqrt(2) on each side of the outer-product features.
def phi2(x):
    x_sq = (x[:,None,:] * x[:,:,None]).reshape(len(x), -1) / np.sqrt(2)
    return np.hstack([np.ones((len(x),1)), x, x_sq])

def linear_attn(pQ, pK, V):
    S = pK.T @ V  # feature_dim x d
    z = pK.sum(0)  # feature_dim
    return np.array([pQ[i] @ S / (pQ[i] @ z) for i in range(len(pQ))])

Qh, Kh = Q / d**0.25, K / d**0.25  # match the softmax's 1/sqrt(d) scaling
out_1st = linear_attn(phi1(Qh), phi1(Kh), V)
out_2nd = linear_attn(phi2(Qh), phi2(Kh), V)

err_1st = np.abs(out_exact - out_1st).mean()
err_2nd = np.abs(out_exact - out_2nd).mean()
print(f'Standard attention vs approximations (n={n}, d={d}):')
print(f'  1st-order Taylor error: {err_1st:.6f}')
print(f'  2nd-order Taylor error: {err_2nd:.6f}')
print('  (0 would mean a perfect match)')

# Show that error decreases as queries/keys become smaller
print()
print('Error vs. QK scale (smaller -> better approximation):')
for scale in [1.0, 0.5, 0.2, 0.1]:
    Q_s, K_s = Q * scale, K * scale
    scores_s = (Q_s @ K_s.T) / np.sqrt(d)
    out_s = softmax_stable(scores_s) @ V
    out_lin_s = linear_attn(phi1(Q_s / d**0.25), phi1(K_s / d**0.25), V)
    err = np.abs(out_s - out_lin_s).mean()
    max_qk = np.abs(Q_s @ K_s.T / np.sqrt(d)).max()
    print(f'  scale={scale:.1f}, max|q.k|={max_qk:.4f}: error={err:.6f}')

Code cell 48

# === 19. Numerical Stability: Kahan vs Naive Summation ===

import numpy as np

def naive_sum(arr):
    total = np.float32(0.0)  # accumulate in float32 so rounding error is visible
    for x in arr:
        total = np.float32(total + x)
    return float(total)

def kahan_sum(arr):
    total = np.float32(0.0)
    c = np.float32(0.0)  # running compensation for lost low-order bits
    for x in arr:
        y = np.float32(x - c)
        t = np.float32(total + y)
        c = np.float32((t - total) - y)
        total = t
    return float(total)

# Test 1: a million values of 0.1. Each float32 add rounds, so the naive sum
# drifts; Kahan carries the lost bits in the compensation term.
n = 1000000
vals = np.full(n, 0.1, dtype=np.float32)
true_sum = float(np.sum(vals.astype(np.float64)))  # float64 reference

naive = naive_sum(vals)
kahan = kahan_sum(vals)
numpy_sum = float(np.sum(vals))  # numpy uses pairwise summation

print(f'True sum: {true_sum:.4f}')
print(f'Naive:    {naive:.4f} (error={abs(naive-true_sum):.4f})')
print(f'Kahan:    {kahan:.4f} (error={abs(kahan-true_sum):.4f})')
print(f'NumPy:    {numpy_sum:.4f} (error={abs(numpy_sum-true_sum):.4f})')
print()

# More dramatic: small increments onto a large base. Each +1 is below half an
# ulp of 1e8 in float32, so the naive sum never moves; Kahan accumulates them.
vals2 = np.concatenate([[1e8], np.ones(1000)]).astype(np.float32)
true_sum2 = 1e8 + 1000.0
print(f'Large base + 1000 ones: true sum = {true_sum2}')
print(f'Naive:  {naive_sum(vals2):.1f} (error={abs(naive_sum(vals2)-true_sum2):.1f})')
print(f'Kahan:  {kahan_sum(vals2):.1f} (error={abs(kahan_sum(vals2)-true_sum2):.1f})')

Code cell 49

# === 20. Lagrange Remainder — Bounding ML Approximation Errors ===

import numpy as np

# How many Taylor terms needed for exp(x) to be accurate to epsilon?
from math import factorial

def lagrange_bound_exp(x, n):
    """Bound |R_n(x)| for e^x using Lagrange remainder"""
    M = np.exp(abs(x))  # |f^(n+1)(c)| <= e^|x|
    return M * abs(x)**(n+1) / factorial(n+1)

print('Lagrange bounds for e^x Taylor approximation:')
print(f"{'x':>6} | {'N':>4} | {'Bound':>12} | {'Actual err':>12}")

test_cases = [(0.1, [1,2,3]), (1.0, [3,5,10]), (2.0, [5,10,15])]
for x, Ns in test_cases:
    true_val = np.exp(x)
    for N in Ns:
        T_N = sum(x**n/factorial(n) for n in range(N+1))
        actual_err = abs(true_val - T_N)
        bound = lagrange_bound_exp(x, N)
        print(f'{x:6.2f} | {N:4d} | {bound:12.4e} | {actual_err:12.4e}')

print()
# For linear attention: bound on |e^(q.k) - (1 + q.k)|
print('Linear attention 1st-order bound |e^x - (1+x)| <= x^2/2 * e^|x|:')
for x in [0.1, 0.5, 1.0, 2.0]:
    actual = abs(np.exp(x) - (1+x))
    bound = x**2/2 * np.exp(abs(x))
    print(f'  x={x:.1f}: actual={actual:.6f}, bound={bound:.6f}')

print()
print('CONCLUSION: For |q.k| small, linear attention has error O(|q.k|^2).')
print('With unit vectors in R^d: |q.k| <= 1, so error <= e/2 ~ 1.36 per element.')
print('For d large and random q,k: |q.k| ~ 1/sqrt(d), error ~ e/(2d) -> 0.')

Code cell 50

# === 21. Multiple Ways to Define e ===

import numpy as np
from math import factorial

print('Four equivalent definitions of e:')
print()

# Definition 1: e = lim (1 + 1/n)^n
e1 = (1 + 1/1000000)**1000000
print(f'1. lim (1+1/n)^n (n=1M): {e1:.12f}')

# Definition 2: e = sum 1/n!
e2 = sum(1/factorial(n) for n in range(25))
print(f'2. sum 1/n! (25 terms):   {e2:.12f}')

# Definition 3: e is the unique base for which d/dx e^x = e^x,
# i.e. the slope of e^x at x=0 equals e^0 = 1. Verify numerically:
h = 1e-8
deriv_exp_at_0 = (np.exp(h) - np.exp(-h)) / (2*h)
print(f'3. d/dx[e^x] at x=0:      {deriv_exp_at_0:.12f}')

# Definition 4: e = integral_1^e 1/t dt = 1 => ln(e) = 1
from scipy.integrate import quad
for guess in [2.71, 2.718, 2.7182, 2.71828, np.e]:
    area, _ = quad(lambda t: 1/t, 1, guess)
    print(f'4. integral_1^{guess:.5f} 1/t dt = {area:.8f} (should be 1 at e)')

print()
print(f'All definitions agree: e = {np.e:.15f}')
check('sum 1/n! = e', e2, np.e)

22. Connections Summary

SERIES AND SEQUENCES — CONCEPTUAL MAP
════════════════════════════════════════════════════════════════════════

  Sequences
  └── Convergence (ε-N) ──→ limits from §04/01
  └── Monotone Convergence Theorem ──→ proves e exists, value iteration converges
  └── Bolzano-Weierstrass ──→ subsequences; compactness in optimization

  Infinite Series
  └── Geometric series ──→ Adam, RL returns, linear recurrences
  └── Convergence tests ──→ Robbins-Monro; proofs in optimization papers
  └── Absolute vs conditional ──→ Riemann rearrangement; FP summation order

  Power Series
  └── Radius of convergence ──→ when can you trust a Taylor approx?
  └── Term-by-term diff/int ──→ derive ln(1+x) from 1/(1+x)
  └── Uniform convergence ──→ justifies interchange of limits

  Taylor Series
  └── Maclaurin: e^x, sin, cos, ln(1+x) ──→ foundation of all ML approximations
  └── Lagrange remainder ──→ quantify error in linear attention, GELU
  └── Composition/manipulation ──→ softmax temperature analysis

  ML Applications
  └── Softmax temperature ──→ sampling, knowledge distillation
  └── Linear attention ──→ O(n) transformers via 1st-order Taylor
  └── Adam bias correction ──→ geometric series identity
  └── Laplace approximation ──→ quadratic Taylor of log-posterior
  └── Positional encoding ──→ Fourier series at discrete positions
  └── Matrix exponential ──→ SSMs (Mamba), equivariant networks

════════════════════════════════════════════════════════════════════════
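One added sketch to close the map: Bolzano-Weierstrass is the only entry above without a cell of its own. The bounded sequence $a_n = (-1)^n (1 + 1/n)$ diverges, yet its even-index and odd-index subsequences converge (to $+1$ and $-1$), as the theorem guarantees some subsequence must.

# === Bolzano-Weierstrass sketch (added) ===

import numpy as np

n = np.arange(1, 2001)
a = (-1.0)**n * (1 + 1.0/n)  # bounded in [-2, 2] but divergent

print('First terms:', np.round(a[:6], 4))
print(f'Subsequence a_n, n even: last value {a[1::2][-1]:.6f} -> +1')
print(f'Subsequence a_n, n odd:  last value {a[0::2][-1]:.6f} -> -1')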