# config.py
"""
Configuration handling for the scraper.
"""
from dataclasses import dataclass, field
from pathlib import Path

import yaml


@dataclass
class Target:
    """A scraping target."""
    name: str
    url: str
    selectors: dict[str, str] = field(default_factory=dict)
    follow_links: bool = False
    max_pages: int = 1


@dataclass
class Settings:
    """Scraper settings."""
    rate_limit: float = 1.0  # seconds between requests
    timeout: int = 30  # per-request timeout, in seconds
    user_agent: str = "PythonScraper/1.0 (Educational)"
    output_format: str = "json"
    output_file: str = "output.json"
    max_retries: int = 3
    cache_enabled: bool = True
    cache_dir: str = ".cache"


@dataclass
class ScraperConfig:
    """Complete scraper configuration."""
    targets: list[Target]
    settings: Settings = field(default_factory=Settings)
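

# For reference, a YAML file these classes map onto might look like this
# (an illustrative sketch; the names, URL, and selectors are assumptions,
# not prescribed by the loader):
#
#   targets:
#     - name: books
#       url: https://example.com/books
#       selectors:
#         title: h1
#         price: .price
#       follow_links: true
#       max_pages: 5
#   settings:
#     rate_limit: 0.5
#     output_file: books.json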


def load_config(path: Path) -> ScraperConfig:
    """
    Load configuration from a YAML file.

    Args:
        path: Path to the YAML config file.

    Returns:
        A populated ScraperConfig.
    """
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}  # safe_load returns None for an empty file

    # Parse targets; 'url' is the only required key per target
    targets = []
    for t in data.get('targets', []):
        targets.append(Target(
            name=t.get('name', 'unnamed'),
            url=t['url'],
            selectors=t.get('selectors', {}),
            follow_links=t.get('follow_links', False),
            max_pages=t.get('max_pages', 1),
        ))

    # Parse settings; unknown keys are ignored, and missing keys fall back to
    # the dataclass defaults rather than being repeated here
    settings_data = data.get('settings', {})
    settings = Settings(**{
        k: v for k, v in settings_data.items()
        if k in Settings.__dataclass_fields__
    })

    return ScraperConfig(targets=targets, settings=settings)
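

# A minimal usage sketch: run this file directly for a quick smoke test.
# The file name and the YAML contents below are illustrative assumptions,
# not part of the loader itself.
if __name__ == "__main__":
    from textwrap import dedent

    demo = Path("demo_config.yaml")  # hypothetical throwaway file
    demo.write_text(dedent("""\
        targets:
          - name: books
            url: https://example.com/books
        settings:
          rate_limit: 0.5
    """), encoding="utf-8")

    cfg = load_config(demo)
    print(cfg.targets[0].name)       # -> books
    print(cfg.settings.rate_limit)   # -> 0.5 (overridden by the file)
    print(cfg.settings.timeout)      # -> 30 (dataclass default)

    demo.unlink()  # clean up the throwaway file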