Python — Real World Projects / Web Scraper — main.py
"""
Main entry point for the web scraper.
"""

import asyncio
import argparse
import sys
from pathlib import Path

from .config import load_config, ScraperConfig
from .crawler import Crawler
from .exporters import export_data


async def run_scraper(config: ScraperConfig):
    """Run the scraper with given configuration."""
    print(f"Starting scraper with {len(config.targets)} target(s)")

    # The crawler is an async context manager; it owns the HTTP session
    # for the whole run, so every target shares one connection pool.
    async with Crawler(config.settings) as crawler:
        results = []

        for target in config.targets:
            print(f"\nScraping: {target.name} ({target.url})")

            try:
                scraped = await crawler.scrape(target)
            except Exception as e:
                # Best-effort: one failing target must not abort the rest.
                print(f"  ✗ Error: {e}")
                continue

            results.append(scraped)
            field_count = len(scraped.get('data', {}))
            print(f"  ✓ Extracted {field_count} fields")

        # Only write an output file when at least one target succeeded.
        if results:
            saved_path = export_data(
                results,
                format=config.settings.output_format,
                filename=config.settings.output_file,
            )
            print(f"\nResults saved to: {saved_path}")

        return results


def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="Web scraper with configurable selectors"
    )
    parser.add_argument('config', nargs='?',
                        help='Path to YAML configuration file')
    parser.add_argument('--url', help='Single URL to scrape')
    parser.add_argument('--selector', help='CSS selector to extract')
    parser.add_argument('--output', default='output.json',
                        help='Output file path')

    args = parser.parse_args()

    # Two ways to obtain a config: a YAML file, or an ad-hoc single-URL
    # "quick mode" assembled entirely from the CLI flags.
    if args.config:
        config = load_config(Path(args.config))
    elif args.url:
        from .config import Target, Settings
        quick_target = Target(
            name="quick",
            url=args.url,
            selectors={"content": args.selector or "body"},
        )
        config = ScraperConfig(
            targets=[quick_target],
            settings=Settings(output_file=args.output),
        )
    else:
        # Neither a config file nor a URL: nothing to do.
        parser.print_help()
        sys.exit(1)

    try:
        asyncio.run(run_scraper(config))
    except KeyboardInterrupt:
        print("\nScraping interrupted")
        sys.exit(1)


if __name__ == "__main__":
    main()