Docs

advanced data

11 - Advanced Data Handling

📌 What You'll Learn

  • Shallow vs deep copy
  • Memory model and references
  • Collections module
  • Performance considerations
  • Data classes
  • Type hints

📋 Shallow vs Deep Copy

The Problem

# Simple assignment creates a reference, NOT a copy
original = [1, 2, [3, 4]]
reference = original  # Both point to same object!

reference[0] = 100
print(original)  # [100, 2, [3, 4]] - original changed!

Shallow Copy

Creates a new outer object, but does NOT copy the nested objects: the copy and the original share the same inner objects.

import copy

original = [1, 2, [3, 4]]

# Methods to create shallow copy
copy1 = original.copy()
copy2 = original[:]
copy3 = list(original)
copy4 = copy.copy(original)

# Shallow copy: outer list is new, inner list is shared
copy1[0] = 100
print(original)  # [1, 2, [3, 4]] - unchanged!

copy1[2][0] = 100
print(original)  # [1, 2, [100, 4]] - inner list changed!

Deep Copy

Recursively copies the outer object and every object nested inside it, so the copy shares no mutable state with the original.

import copy

original = [1, 2, [3, 4, [5, 6]]]
deep = copy.deepcopy(original)

deep[2][2][0] = 100
print(original)  # [1, 2, [3, 4, [5, 6]]] - unchanged!
print(deep)      # [1, 2, [3, 4, [100, 6]]]

When to Use What?

import copy

# Simple data (no nested mutables): shallow copy is fine
numbers = [1, 2, 3]
# Do not name the result `copy`: that would rebind the name of the `copy`
# module to a list, and the copy.deepcopy() call below would then fail
# with AttributeError.
numbers_copy = numbers.copy()

# Nested data structures: use deepcopy
nested = {"a": [1, 2], "b": {"c": 3}}
safe_copy = copy.deepcopy(nested)

# Performance: shallow copy is faster
# Use deepcopy only when needed

🧠 Memory Model and References

id() and is

a = [1, 2, 3]
b = a           # Same object
c = [1, 2, 3]   # Different object, same value

print(id(a), id(b), id(c))
print(a is b)    # True - same object
print(a is c)    # False - different objects
print(a == c)    # True - same value

Immutable vs Mutable

# Immutable: integers, strings, tuples
x = 10
y = x   # y now refers to the same int object as x
x = 20  # rebinds the name x to a different object; y is not affected
print(y)  # 10 - unchanged (x was rebound; the int 10 itself was never modified)

# Mutable: lists, dicts, sets
a = [1, 2, 3]
b = a        # b is a second name for the same list object
a.append(4)  # mutates that one list in place
print(b)  # [1, 2, 3, 4] - changed!

String Interning

# CPython may reuse (intern) short, identifier-like string literals.
# This is an implementation detail: compare strings with ==, never with `is`.
a = "hello"
b = "hello"
print(a is b)  # Typically True in CPython - same object

# Long/dynamic strings might not be interned
a = "hello world " * 100
b = "hello world " * 100
print(a is b)  # May be False

📦 Collections Module

Counter

from collections import Counter

# Count occurrences
words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']
counter = Counter(words)

print(counter)  # Counter({'apple': 3, 'banana': 2, 'cherry': 1})
print(counter.most_common(2))  # [('apple', 3), ('banana', 2)]
print(counter['apple'])  # 3

# Counter arithmetic
c1 = Counter("aab")
c2 = Counter("abc")
print(c1 + c2)  # Counter({'a': 3, 'b': 2, 'c': 1})

defaultdict

from collections import defaultdict

# Regular dict raises KeyError
regular = {}
# regular['key'].append(1)  # KeyError!

# defaultdict provides default values
dd = defaultdict(list)
dd['key'].append(1)
dd['key'].append(2)
print(dd)  # defaultdict(<class 'list'>, {'key': [1, 2]})

# Common patterns
int_dict = defaultdict(int)      # Default: 0
list_dict = defaultdict(list)    # Default: []
set_dict = defaultdict(set)      # Default: set()

namedtuple

from collections import namedtuple

# Create a named tuple class
Point = namedtuple('Point', ['x', 'y'])
Person = namedtuple('Person', 'name age city')

# Create instances
p = Point(3, 4)
person = Person('Alice', 25, 'NYC')

# Access by name or index
print(p.x, p.y)           # 3 4
print(p[0], p[1])         # 3 4
print(person.name)        # Alice

# Convert to dict
print(p._asdict())        # {'x': 3, 'y': 4}

# Create from dict
data = {'x': 10, 'y': 20}
point = Point(**data)

deque (Double-Ended Queue)

from collections import deque

# Efficient append/pop from both ends
dq = deque([1, 2, 3])

dq.append(4)        # [1, 2, 3, 4]
dq.appendleft(0)    # [0, 1, 2, 3, 4]
dq.pop()            # Returns 4
dq.popleft()        # Returns 0

# Rotate elements
dq = deque([1, 2, 3, 4, 5])
dq.rotate(2)        # [4, 5, 1, 2, 3]
dq.rotate(-2)       # [1, 2, 3, 4, 5]

# Fixed size (drops oldest)
dq = deque(maxlen=3)
dq.extend([1, 2, 3, 4, 5])
print(dq)  # deque([3, 4, 5], maxlen=3)

OrderedDict

from collections import OrderedDict

# Remembers insertion order (Python 3.7+ dicts do too)
od = OrderedDict()
od['a'] = 1
od['b'] = 2
od['c'] = 3

# Move to end
od.move_to_end('a')  # {'b': 2, 'c': 3, 'a': 1}

# Pop first/last (each call removes one pair and returns it as a (key, value) tuple)
od.popitem(last=True)   # Pop last: returns ('a', 1)
od.popitem(last=False)  # Pop first: returns ('b', 2)

ChainMap

from collections import ChainMap

# Combine multiple dicts
defaults = {'color': 'red', 'size': 'medium'}
user_settings = {'color': 'blue'}

combined = ChainMap(user_settings, defaults)
print(combined['color'])  # 'blue' (from user_settings)
print(combined['size'])   # 'medium' (from defaults)

⚡ Performance Considerations

List vs Tuple

import sys

# Tuples are more memory efficient
list_data = [1, 2, 3, 4, 5]
tuple_data = (1, 2, 3, 4, 5)

# sys.getsizeof is shallow: it measures the container only, not the ints
# stored in it. The exact numbers vary by Python version and platform.
print(sys.getsizeof(list_data))   # ~104 bytes
print(sys.getsizeof(tuple_data))  # ~80 bytes

# Use tuple for fixed data

Set for Membership Testing

import timeit

# List: O(n) lookup
# 9999 is the last element of large_list, so each test scans the whole list.
large_list = list(range(10000))
list_time = timeit.timeit(
    '9999 in large_list',
    globals={'large_list': large_list},
    number=10000  # run the statement 10,000 times; timeit returns the total seconds
)

# Set: O(1) lookup
# Average case: the value is hashed and looked up directly, with no scan.
large_set = set(range(10000))
set_time = timeit.timeit(
    '9999 in large_set',
    globals={'large_set': large_set},
    number=10000  # same repetition count, so the two totals are comparable
)

print(f"List: {list_time:.4f}s")  # Much slower
print(f"Set: {set_time:.4f}s")    # Much faster

List Comprehension vs Loop

import timeit

# Loop
def with_loop():
    """Return the list of i * 2 for i in range(1000), built with an explicit loop."""
    result = []
    for i in range(1000):
        # One append() method call per element.
        result.append(i * 2)
    return result

# List comprehension (faster)
def with_comprehension():
    """Return the list of i * 2 for i in range(1000), built with a list comprehension."""
    return [i * 2 for i in range(1000)]

# Comprehension is typically 20-30% faster
# (the exact gain depends on the Python version - measure with timeit)

🏷️ Data Classes (Python 3.7+)

from dataclasses import dataclass, field
from typing import List

@dataclass
class Person:
    """A person record with a name, an age, and an optional email address."""

    name: str
    age: int
    email: str = ""  # Default value (fields with defaults must come after fields without)

    def greet(self) -> str:
        """Return a greeting string that includes this person's name."""
        return f"Hello, I'm {self.name}"

# Automatically generates __init__, __repr__, __eq__
p1 = Person("Alice", 25)
p2 = Person("Alice", 25)

print(p1)           # Person(name='Alice', age=25, email='')
print(p1 == p2)     # True

# Advanced options
@dataclass(frozen=True)  # Immutable
class Point:
    """A point with x and y coordinates.

    Because of frozen=True, assigning to a field after construction raises
    dataclasses.FrozenInstanceError. Together with the default eq=True this
    also makes instances hashable.
    """

    x: float
    y: float

@dataclass
class Team:
    """A named team with a list of member names."""

    name: str
    # default_factory=list gives every instance its own new empty list.
    # A plain `= []` default is rejected by dataclass with ValueError,
    # because that single list would be shared by all instances.
    members: List[str] = field(default_factory=list)

📝 Type Hints

from typing import List, Dict, Optional, Union, Tuple, Callable

# Basic types
def greet(name: str) -> str:
    """Return the greeting "Hello, <name>" for the given name."""
    return f"Hello, {name}"

# Collections
def process(items: List[int]) -> Dict[str, int]:
    """Summarize a list of integers.

    Returns a dict with two keys: "sum" (the total of all items) and
    "count" (the number of items).
    """
    total = sum(items)
    how_many = len(items)
    return {"sum": total, "count": how_many}

# Optional (can be None)
def find_user(id: int) -> Optional[str]:
    """Placeholder lookup that shows an Optional return type.

    This example always returns None; a real implementation would return
    the user's name when one is found.
    """
    # NOTE(review): the parameter name `id` shadows the built-in id() inside
    # this function; it is kept so the signature stays unchanged.
    return None  # or user name

# Union (multiple types)
def process_input(value: Union[int, str]) -> str:
    """Return the str() conversion of an int or str value."""
    return str(value)

# Tuple
def get_coords() -> Tuple[float, float]:
    """Return the fixed pair (1.0, 2.0); Tuple[float, float] means exactly two floats."""
    return (1.0, 2.0)

# Callable
def apply(func: Callable[[int], int], value: int) -> int:
    """Call func with value and return its result.

    Callable[[int], int] describes a callable that takes one int argument
    and returns an int.
    """
    return func(value)

# Type alias
# Vector is just another name for List[float]; the two are interchangeable.
Vector = List[float]
def scale(v: Vector, factor: float) -> Vector:
    """Return a new list with every element of v multiplied by factor; v is not modified."""
    return [x * factor for x in v]

🎯 Next Steps

After mastering advanced data handling, proceed to 12_real_development to learn about os, sys, logging, and argparse!

Advanced Data - Python Tutorial | DeepML