"""
HTML parsing and data extraction.
"""

from typing import Optional
from bs4 import BeautifulSoup


def parse_html(html: str, selectors: dict[str, str]) -> dict:
    """
    Parse HTML and extract data using CSS selectors.

    Each selector is matched against the document; the shape of the value
    stored for a field depends on how many elements matched:

    - no match       -> None
    - exactly one    -> a single extract_element() dict
    - more than one  -> a list of extract_element() dicts

    Args:
        html: HTML content to parse
        selectors: Dict mapping field names to CSS selectors

    Returns:
        Dict mapping each field name to None, a dict, or a list of dicts.

    Example:
        selectors = {
            'title': 'h1',
            'paragraphs': 'p',
            'links': 'a[href]'
        }
        data = parse_html(html, selectors)
        # Returns e.g.:
        # {'title': {'text': 'Page Title'},
        #  'paragraphs': [{'text': '...'}, ...],
        #  'links': [{'text': '...', 'href': '...'}, ...]}
    """
    soup = BeautifulSoup(html, 'html.parser')
    result: dict[str, dict | list[dict] | None] = {}

    for field_name, selector in selectors.items():
        elements = soup.select(selector)

        if not elements:
            result[field_name] = None
        elif len(elements) == 1:
            # Single match: store the dict directly rather than a 1-item list.
            result[field_name] = extract_element(elements[0])
        else:
            result[field_name] = [extract_element(el) for el in elements]

    return result


def extract_element(element) -> dict:
    """
    Pull the interesting fields out of a single parsed element.

    Args:
        element: BeautifulSoup element (Tag-like; needs .name, .get_text,
            .get and item access)

    Returns:
        Dict with the whitespace-stripped text under 'text'; anchors with
        an href additionally get 'href', and images get 'src' and 'alt'.
    """
    data = {'text': element.get_text(strip=True)}

    tag = element.name
    if tag == 'a':
        # Only record href when the anchor actually carries one.
        if element.get('href'):
            data['href'] = element['href']
    elif tag == 'img':
        # Images always get src/alt, defaulting to empty strings.
        data['src'] = element.get('src', '')
        data['alt'] = element.get('alt', '')

    return data


def extract_table(html: str, selector: str = 'table') -> list[dict]:
    """
    Extract data from the first HTML table matching *selector*.

    Headers are taken from the first <thead> row, or — if there is no
    <thead> — from the table's first row.

    Args:
        html: HTML content
        selector: CSS selector for the table (default: 'table')

    Returns:
        List with one entry per data row: a dict of header -> cell text
        when headers were found, otherwise a plain list of cell texts.
        Empty list when no table matches.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.select_one(selector)

    if not table:
        return []

    # Get headers: prefer an explicit <thead> row, else the first row.
    headers = []
    header_row = table.select_one('thead tr') or table.select_one('tr')
    if header_row:
        headers = [th.get_text(strip=True) for th in header_row.select('th, td')]

    # Get data rows. When there is no <tbody>, skip the first row (the
    # header) via the slice.
    rows = []
    for tr in table.select('tbody tr') or table.select('tr')[1:]:
        # Bug fix: with a <tbody> but no <thead>, the header row lives
        # inside the <tbody> and would otherwise be re-emitted as data.
        if tr is header_row:
            continue
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        if cells:
            if headers:
                rows.append(dict(zip(headers, cells)))
            else:
                # NOTE(review): this branch returns a list, not a dict,
                # diverging from the annotated list[dict] return type —
                # kept for backward compatibility.
                rows.append(cells)

    return rows


def extract_links(html: str, base_url: Optional[str] = None) -> list[str]:
    """
    Extract all hyperlinks from HTML.

    Fragment-only links ('#...') and 'javascript:' pseudo-links are
    skipped. When *base_url* is given, links that are not already
    absolute http(s) URLs are resolved against it.

    Args:
        html: HTML content
        base_url: Base URL for resolving relative links (optional)

    Returns:
        Deduplicated URLs in first-seen document order.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    links = []

    for a in soup.select('a[href]'):
        href = a['href']

        # Skip in-page anchors and javascript pseudo-links
        if href.startswith('#') or href.startswith('javascript:'):
            continue

        # Convert relative to absolute
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)

        links.append(href)

    # Bug fix: list(set(...)) returned links in arbitrary order; dict keys
    # preserve insertion order, so this dedupes deterministically.
    return list(dict.fromkeys(links))