# main.py
"""
Main entry point for the web scraper.
"""
import asyncio
import argparse
import sys
from pathlib import Path
from .config import load_config, ScraperConfig
from .crawler import Crawler
from .exporters import export_data
async def run_scraper(config: ScraperConfig):
    """Scrape every configured target and export whatever was collected.

    Opens a single Crawler for the whole run, scrapes each target in order
    (a failure on one target is reported and does not stop the others),
    exports the accumulated results when there are any, and returns them.

    Args:
        config: Scraper configuration holding ``targets`` and ``settings``.

    Returns:
        The list of per-target result dicts (possibly empty).
    """
    print(f"Starting scraper with {len(config.targets)} target(s)")
    async with Crawler(config.settings) as crawler:
        collected = []
        for tgt in config.targets:
            print(f"\nScraping: {tgt.name} ({tgt.url})")
            try:
                # Append and report inside the try so a malformed result
                # is handled the same way as a scrape failure.
                scraped = await crawler.scrape(tgt)
                collected.append(scraped)
                print(f" ✓ Extracted {len(scraped.get('data', {}))} fields")
            except Exception as exc:
                print(f" ✗ Error: {exc}")
        if collected:
            saved_to = export_data(
                collected,
                format=config.settings.output_format,
                filename=config.settings.output_file,
            )
            print(f"\nResults saved to: {saved_to}")
        return collected
def main():
    """CLI entry point: parse arguments, build a config, run the scraper.

    Three modes, in priority order:
      1. Positional ``config`` — load a YAML configuration file.
      2. ``--url`` — quick mode, a single synthesized target.
      3. Neither — print help and exit with status 1.
    """
    parser = argparse.ArgumentParser(
        description="Web scraper with configurable selectors"
    )
    parser.add_argument('config', nargs='?', help='Path to YAML configuration file')
    parser.add_argument('--url', help='Single URL to scrape')
    parser.add_argument('--selector', help='CSS selector to extract')
    parser.add_argument('--output', default='output.json', help='Output file path')
    args = parser.parse_args()

    if args.config:
        config = load_config(Path(args.config))
    elif args.url:
        # Quick mode: synthesize a one-target config from the CLI flags.
        from .config import Target, Settings
        selector = args.selector if args.selector else "body"
        quick_target = Target(
            name="quick",
            url=args.url,
            selectors={"content": selector},
        )
        config = ScraperConfig(
            targets=[quick_target],
            settings=Settings(output_file=args.output),
        )
    else:
        parser.print_help()
        sys.exit(1)

    try:
        asyncio.run(run_scraper(config))
    except KeyboardInterrupt:
        print("\nScraping interrupted")
        sys.exit(1)
# Run the CLI only when this module is executed directly (not on import).
if __name__ == "__main__":
    main()