# parser.py
"""
HTML parsing and data extraction.
"""
from typing import Optional
from bs4 import BeautifulSoup
def parse_html(html: str, selectors: dict[str, str]) -> dict:
    """
    Parse HTML and extract data using CSS selectors.

    Each selector is run against the document; matched elements are
    converted to plain dicts by ``extract_element``.

    Args:
        html: HTML content to parse
        selectors: Dict mapping field names to CSS selectors

    Returns:
        Dict keyed by field name. For each field the value is:
          - ``None`` if the selector matched nothing,
          - a single element dict (e.g. ``{'text': ...}``) for exactly
            one match,
          - a list of element dicts for multiple matches.

    Example:
        selectors = {
            'title': 'h1',
            'paragraphs': 'p',
            'links': 'a[href]'
        }
        data = parse_html(html, selectors)
        # Returns: {'title': {'text': 'Page Title'},
        #           'paragraphs': [{'text': ...}, ...],
        #           'links': [{'text': ..., 'href': ...}, ...]}
    """
    soup = BeautifulSoup(html, 'html.parser')
    result = {}
    for field_name, selector in selectors.items():
        elements = soup.select(selector)
        if not elements:
            result[field_name] = None
        elif len(elements) == 1:
            # Single match is unwrapped rather than returned as a 1-item list.
            result[field_name] = extract_element(elements[0])
        else:
            result[field_name] = [extract_element(el) for el in elements]
    return result
def extract_element(element) -> dict:
    """
    Convert a BeautifulSoup element into a plain dict.

    Args:
        element: BeautifulSoup element (Tag)

    Returns:
        Dict always containing 'text' (whitespace-stripped). Anchor
        tags additionally carry 'href' when one is present and
        non-empty; image tags always carry 'src' and 'alt'
        (defaulting to '').
    """
    info = {'text': element.get_text(strip=True)}
    tag = element.name
    if tag == 'a':
        # Only attach href when the attribute exists and is non-empty.
        if element.get('href'):
            info['href'] = element['href']
    elif tag == 'img':
        info['src'] = element.get('src', '')
        info['alt'] = element.get('alt', '')
    return info
def extract_table(html: str, selector: str = 'table') -> list:
    """
    Extract data from the first HTML table matching *selector*.

    Args:
        html: HTML content
        selector: CSS selector for the table (default: 'table')

    Returns:
        List of rows. If a header row is found (the ``thead tr`` row,
        or the table's first ``tr`` as a fallback), each row is a dict
        mapping header text to cell text; extra cells beyond the
        headers are dropped by ``zip``. If no headers are found, each
        row is a plain list of cell strings. Returns [] when no table
        matches.
    """
    soup = BeautifulSoup(html, 'html.parser')
    table = soup.select_one(selector)
    if not table:
        return []

    # Header cells: prefer an explicit <thead>, else treat the first
    # row as the header.
    headers = []
    header_row = table.select_one('thead tr') or table.select_one('tr')
    if header_row:
        headers = [th.get_text(strip=True) for th in header_row.select('th, td')]

    # Data rows: prefer <tbody> rows; otherwise skip the first row,
    # which was consumed as the header above.
    rows = []
    for tr in table.select('tbody tr') or table.select('tr')[1:]:
        cells = [td.get_text(strip=True) for td in tr.select('td')]
        if cells:
            if headers:
                # zip truncates to the shorter side, so ragged rows
                # lose their trailing cells silently.
                rows.append(dict(zip(headers, cells)))
            else:
                rows.append(cells)
    return rows
def extract_links(html: str, base_url: Optional[str] = None) -> list[str]:
    """
    Extract all hyperlink targets from HTML.

    Args:
        html: HTML content
        base_url: Base URL used to resolve relative links; when None,
            relative links are returned as-is.

    Returns:
        Deduplicated list of URLs in first-seen document order.
        Fragment-only links ('#...') and 'javascript:' links are
        skipped.
    """
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, 'html.parser')
    # dict.fromkeys deduplicates while preserving insertion order,
    # unlike list(set(...)) whose order changes with hash randomization.
    seen: dict[str, None] = {}
    for a in soup.select('a[href]'):
        href = a['href']
        # Skip in-page anchors and javascript pseudo-links.
        if href.startswith(('#', 'javascript:')):
            continue
        # Resolve relative URLs against base_url when provided.
        if base_url and not href.startswith(('http://', 'https://')):
            href = urljoin(base_url, href)
        seen[href] = None
    return list(seen)