PYTHONPython

config

real world projects / web scraper / scraper

PYTHON
config.py🐍
"""
Configuration handling for the scraper.
"""

from dataclasses import dataclass, field, fields
from pathlib import Path
from typing import Optional

import yaml


@dataclass
class Target:
    """A scraping target: one site entry from the config's ``targets`` list."""
    name: str  # human-readable label; loader falls back to 'unnamed'
    url: str  # starting URL to fetch (required key in the YAML entry)
    selectors: dict[str, str] = field(default_factory=dict)  # extraction rules; presumably name -> CSS selector — confirm against scraper
    follow_links: bool = False  # NOTE(review): looks like it enables crawling linked pages — confirm in scraper code
    max_pages: int = 1  # presumably an upper bound on pages fetched per target — verify against caller


@dataclass
class Settings:
    """Scraper settings; every field has a default so a config file may omit any of them."""
    rate_limit: float = 1.0  # seconds between requests
    timeout: int = 30  # presumably the per-request timeout in seconds — confirm against HTTP client usage
    user_agent: str = "PythonScraper/1.0 (Educational)"  # User-Agent header value sent with requests
    output_format: str = "json"  # serialization format for results
    output_file: str = "output.json"  # path results are written to
    max_retries: int = 3  # retry budget for failed requests
    cache_enabled: bool = True  # whether responses are cached between runs
    cache_dir: str = ".cache"  # directory used for the response cache


@dataclass
class ScraperConfig:
    """Complete scraper configuration: the list of targets plus global settings."""
    targets: list[Target]  # one entry per 'targets' item in the YAML file
    settings: Settings = field(default_factory=Settings)  # global options; defaults used when the file has no 'settings' section


def load_config(path: Path) -> ScraperConfig:
    """
    Load configuration from a YAML file.

    Args:
        path: Path to the YAML config file.

    Returns:
        ScraperConfig built from the file. Missing keys fall back to the
        dataclass defaults; an empty file yields a config with no targets
        and default settings.

    Raises:
        FileNotFoundError: If *path* does not exist.
        KeyError: If a target entry is missing the required 'url' key.
    """
    # Explicit UTF-8 so parsing does not depend on the platform locale.
    with open(path, encoding="utf-8") as f:
        # safe_load returns None for an empty/whitespace-only file;
        # treat that the same as an empty mapping instead of crashing
        # on data.get with an AttributeError.
        data = yaml.safe_load(f) or {}

    # Parse targets; 'url' is the only required key per entry.
    targets = [
        Target(
            name=t.get('name', 'unnamed'),
            url=t['url'],  # KeyError here signals a malformed entry
            selectors=t.get('selectors', {}),
            follow_links=t.get('follow_links', False),
            max_pages=t.get('max_pages', 1),
        )
        for t in data.get('targets', [])
    ]

    # Build Settings from recognized keys only. This keeps every default
    # in exactly one place (the Settings dataclass) instead of duplicating
    # each value here, and silently ignores unknown keys rather than
    # raising TypeError from the constructor.
    settings_data = data.get('settings', {})
    known = {f.name for f in fields(Settings)}
    settings = Settings(**{k: v for k, v in settings_data.items() if k in known})

    return ScraperConfig(targets=targets, settings=settings)
PreviousNext