Docs
advanced data
11 - Advanced Data Handling
📌 What You'll Learn
- Shallow vs deep copy
- Memory model and references
- Collections module
- Performance considerations
- Data classes
- Type hints
📋 Shallow vs Deep Copy
The Problem
# Simple assignment creates a reference, NOT a copy
original = [1, 2, [3, 4]]
reference = original # Both names point to the same list object!
reference[0] = 100 # Mutates the single shared list
print(original) # [100, 2, [3, 4]] - original changed!
Shallow Copy
Copies the outer object but NOT nested objects.
import copy
original = [1, 2, [3, 4]]
# Four equivalent ways to create a shallow copy of a list
copy1 = original.copy()
copy2 = original[:]
copy3 = list(original)
copy4 = copy.copy(original)
# Shallow copy: outer list is new, inner list is shared
copy1[0] = 100 # Replaces a slot in the new outer list only
print(original) # [1, 2, [3, 4]] - unchanged!
copy1[2][0] = 100 # Mutates the inner list that original also references
print(original) # [1, 2, [100, 4]] - inner list changed!
Deep Copy
Copies everything recursively.
import copy
original = [1, 2, [3, 4, [5, 6]]]
deep = copy.deepcopy(original) # Nested lists are copied too, at every level
deep[2][2][0] = 100 # Changes only the copy's innermost list
print(original) # [1, 2, [3, 4, [5, 6]]] - unchanged!
print(deep) # [1, 2, [3, 4, [100, 6]]]
When to Use What?
import copy

# Simple data (no nested mutables): shallow copy is fine
numbers = [1, 2, 3]
# Do not call the result `copy`: that name would shadow the `copy` module
# and the copy.deepcopy() call below would fail with AttributeError.
numbers_copy = numbers.copy()
# Nested data structures: use deepcopy
nested = {"a": [1, 2], "b": {"c": 3}}
safe_copy = copy.deepcopy(nested)
# Performance: shallow copy is faster
# Use deepcopy only when needed
🧠 Memory Model and References
id() and is
# `is` compares object identity (what id() reports); `==` compares values
a = [1, 2, 3]
b = a # Same object
c = [1, 2, 3] # Different object, same value
print(id(a), id(b), id(c)) # First two ids match; the third differs
print(a is b) # True - same object
print(a is c) # False - different objects
print(a == c) # True - same value
Immutable vs Mutable
# Immutable: integers, strings, tuples
x = 10
y = x
x = 20 # Rebinds x to a different object; y still refers to 10
print(y) # 10 - unchanged (new object created)
# Mutable: lists, dicts, sets
a = [1, 2, 3]
b = a
a.append(4) # In-place change, visible through every name bound to the list
print(b) # [1, 2, 3, 4] - changed!
String Interning
# Python may reuse (intern) small strings - an implementation detail,
# so never rely on `is` to compare strings; use `==` for equality
a = "hello"
b = "hello"
print(a is b) # True - same object!
# Long/dynamic strings might not be interned
a = "hello world " * 100
b = "hello world " * 100
print(a is b) # May be False, even though a == b is True
📦 Collections Module
Counter
from collections import Counter
# Count occurrences
words = ['apple', 'banana', 'apple', 'cherry', 'banana', 'apple']
counter = Counter(words)
print(counter) # Counter({'apple': 3, 'banana': 2, 'cherry': 1})
print(counter.most_common(2)) # [('apple', 3), ('banana', 2)]
print(counter['apple']) # 3
# Counter arithmetic: `+` adds the counts key by key
c1 = Counter("aab") # a: 2, b: 1
c2 = Counter("abc") # a: 1, b: 1, c: 1
print(c1 + c2) # Counter({'a': 3, 'b': 2, 'c': 1})
defaultdict
from collections import defaultdict
# Regular dict raises KeyError
regular = {}
# regular['key'].append(1) # KeyError!
# defaultdict provides default values
dd = defaultdict(list)
dd['key'].append(1) # Missing key: an empty list is created first, then appended to
dd['key'].append(2)
print(dd) # defaultdict(<class 'list'>, {'key': [1, 2]})
# Common patterns
int_dict = defaultdict(int) # Default: 0
list_dict = defaultdict(list) # Default: []
set_dict = defaultdict(set) # Default: set()
namedtuple
from collections import namedtuple
# Create a named tuple class
# Field names can be given as a list or as one space-separated string
Point = namedtuple('Point', ['x', 'y'])
Person = namedtuple('Person', 'name age city')
# Create instances
p = Point(3, 4)
person = Person('Alice', 25, 'NYC')
# Access by name or index
print(p.x, p.y) # 3 4
print(p[0], p[1]) # 3 4
print(person.name) # Alice
# Convert to dict
print(p._asdict()) # {'x': 3, 'y': 4}
# Create from dict
data = {'x': 10, 'y': 20}
point = Point(**data) # Point(x=10, y=20)
deque (Double-Ended Queue)
from collections import deque
# Efficient append/pop from both ends
dq = deque([1, 2, 3])
dq.append(4) # [1, 2, 3, 4]
dq.appendleft(0) # [0, 1, 2, 3, 4]
dq.pop() # Returns 4
dq.popleft() # Returns 0
# Rotate elements
dq = deque([1, 2, 3, 4, 5])
dq.rotate(2) # [4, 5, 1, 2, 3] - positive n rotates to the right
dq.rotate(-2) # [1, 2, 3, 4, 5] - negative n rotates to the left
# Fixed size (drops oldest)
dq = deque(maxlen=3)
dq.extend([1, 2, 3, 4, 5]) # 1 and 2 are discarded as 4 and 5 arrive
print(dq) # deque([3, 4, 5], maxlen=3)
OrderedDict
from collections import OrderedDict
# Remembers insertion order (Python 3.7+ dicts do too)
od = OrderedDict()
od['a'] = 1
od['b'] = 2
od['c'] = 3
# Move to end
od.move_to_end('a') # {'b': 2, 'c': 3, 'a': 1}
# Pop first/last
od.popitem(last=True) # Pop last -> ('a', 1)
od.popitem(last=False) # Pop first -> ('b', 2)
ChainMap
from collections import ChainMap
# Combine multiple dicts
defaults = {'color': 'red', 'size': 'medium'}
user_settings = {'color': 'blue'}
# Lookups try the maps in the order given, so user_settings wins over defaults
combined = ChainMap(user_settings, defaults)
print(combined['color']) # 'blue' (from user_settings)
print(combined['size']) # 'medium' (from defaults)
⚡ Performance Considerations
List vs Tuple
import sys
# Tuples are more memory efficient
# NOTE: sys.getsizeof is shallow (it does not count the contained objects),
# and the byte counts below vary by Python version and platform
list_data = [1, 2, 3, 4, 5]
tuple_data = (1, 2, 3, 4, 5)
print(sys.getsizeof(list_data)) # ~104 bytes
print(sys.getsizeof(tuple_data)) # ~80 bytes
# Use tuple for fixed data
Set for Membership Testing
import timeit
# List: O(n) lookup
# 9999 is the last element, so each list test scans the whole list
large_list = list(range(10000))
list_time = timeit.timeit(
'9999 in large_list',
globals={'large_list': large_list},
number=10000
)
# Set: O(1) lookup
large_set = set(range(10000))
set_time = timeit.timeit(
'9999 in large_set',
globals={'large_set': large_set},
number=10000
)
# Each figure is the total time for all 10000 repetitions, not per lookup
print(f"List: {list_time:.4f}s") # Much slower
print(f"Set: {set_time:.4f}s") # Much faster
List Comprehension vs Loop
import timeit
# Loop
def with_loop():
    """Build the doubles of 0..999 with an explicit for-loop and append().

    Deliberately written as a loop so it can be compared against the
    comprehension version.
    """
    result = []
    for i in range(1000):
        result.append(i * 2)
    return result
# List comprehension (faster)
def with_comprehension():
    """Build the doubles of 0..999 with a list comprehension."""
    return [i * 2 for i in range(1000)]
# Comprehension is typically 20-30% faster (the exact gain varies by Python version)
🏷️ Data Classes (Python 3.7+)
from dataclasses import dataclass, field
from typing import List
@dataclass
class Person:
    """A person record; ``email`` is optional and defaults to an empty string."""

    name: str
    age: int
    email: str = ""  # Default value

    def greet(self) -> str:
        """Return a short greeting that includes the person's name."""
        return f"Hello, I'm {self.name}"
# Automatically generates __init__, __repr__, __eq__
p1 = Person("Alice", 25) # email falls back to its default ""
p2 = Person("Alice", 25)
print(p1) # Person(name='Alice', age=25, email='')
print(p1 == p2) # True - the generated __eq__ compares field values
# Advanced options
@dataclass(frozen=True)  # Immutable
class Point:
    """A 2-D point whose fields cannot be reassigned after creation."""

    x: float
    y: float
@dataclass
class Team:
    """A named team with a list of member names."""

    name: str
    # default_factory gives every instance its own fresh list; a plain
    # mutable default is rejected by dataclasses.
    members: List[str] = field(default_factory=list)
📝 Type Hints
from typing import List, Dict, Optional, Union, Tuple, Callable
# Basic types
def greet(name: str) -> str:
    """Return a greeting for *name*."""
    return f"Hello, {name}"
# Collections
def process(items: List[int]) -> Dict[str, int]:
    """Return the sum and the number of *items* under the keys "sum" and "count"."""
    return {"sum": sum(items), "count": len(items)}
# Optional (can be None)
def find_user(id: int) -> Optional[str]:
    """Placeholder lookup that shows an Optional return type.

    This stub always returns None; a real implementation would return the
    user's name when one is found.
    """
    return None  # or user name
# Union (multiple types)
def process_input(value: Union[int, str]) -> str:
    """Return *value* converted to a string."""
    return str(value)
# Tuple
def get_coords() -> Tuple[float, float]:
    """Return a fixed example coordinate pair."""
    return (1.0, 2.0)
# Callable
def apply(func: Callable[[int], int], value: int) -> int:
    """Call *func* with *value* and return its result."""
    return func(value)
# Type alias
Vector = List[float]


def scale(v: Vector, factor: float) -> Vector:
    """Return a new vector with every component of *v* multiplied by *factor*."""
    return [x * factor for x in v]
🎯 Next Steps
After mastering advanced data handling, proceed to 12_real_development to learn about os, sys, logging, and argparse!