# test_scraper.py
"""Tests for web scraper."""
import pytest
from scraper.parser import parse_html, extract_table, extract_links
# Sample HTML page used as the shared fixture for the parser and link tests.
# It contains one <h1>, two <p> elements, and two <a> elements: the first with
# an absolute href (https://example.com), the second with a relative href
# (/relative).
SAMPLE_HTML = """
<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
<h1>Main Title</h1>
<p>First paragraph.</p>
<p>Second paragraph.</p>
<a href="https://example.com">Example</a>
<a href="/relative">Relative Link</a>
</body>
</html>
"""
# Sample HTML table used as the fixture for the table-extraction tests.
# The <thead> holds one header row (Name, Value); the <tbody> holds two data
# rows (Alpha/100 and Beta/200).
TABLE_HTML = """
<table>
<thead>
<tr><th>Name</th><th>Value</th></tr>
</thead>
<tbody>
<tr><td>Alpha</td><td>100</td></tr>
<tr><td>Beta</td><td>200</td></tr>
</tbody>
</table>
"""
class TestParser:
    """Tests for ``parse_html`` run against the ``SAMPLE_HTML`` fixture.

    Every test passes a ``{field_name: selector}`` mapping and inspects the
    entry stored under ``field_name`` in the returned result.
    """

    def test_parse_single_element(self):
        """A selector matching one element yields an entry exposing its text."""
        result = parse_html(SAMPLE_HTML, {'title': 'h1'})

        # The fixture has exactly one <h1>; the entry is indexed directly by
        # 'text' here, not by position.
        assert result['title']['text'] == 'Main Title'

    def test_parse_multiple_elements(self):
        """A selector matching several elements yields one entry per match."""
        result = parse_html(SAMPLE_HTML, {'paragraphs': 'p'})

        # The fixture has two <p> elements; the first one is checked to pin
        # the expected ordering.
        assert len(result['paragraphs']) == 2
        assert result['paragraphs'][0]['text'] == 'First paragraph.'

    def test_parse_not_found(self):
        """A selector matching nothing maps its field name to ``None``."""
        result = parse_html(SAMPLE_HTML, {'missing': '.nonexistent'})

        # The key must still be present; only its value is None.
        assert result['missing'] is None

    def test_parse_links(self):
        """Matched anchor entries expose their ``href`` value."""
        result = parse_html(SAMPLE_HTML, {'links': 'a[href]'})

        # Both anchors in the fixture carry an href; the first is the
        # absolute link.
        assert len(result['links']) == 2
        assert result['links'][0]['href'] == 'https://example.com'
class TestTableExtraction:
    """Tests for ``extract_table`` run against the ``TABLE_HTML`` fixture."""

    def test_extract_table(self):
        """Rows are returned as records keyed by the header cell text."""
        rows = extract_table(TABLE_HTML)

        # Only the two <tbody> rows are expected; the <thead> row must not
        # be counted as data.
        assert len(rows) == 2

        # Cell values are compared as strings ('100', not 100).
        assert rows[0]['Name'] == 'Alpha'
        assert rows[0]['Value'] == '100'
class TestLinkExtraction:
    """Tests for ``extract_links`` run against the ``SAMPLE_HTML`` fixture."""

    def test_extract_links(self):
        """The absolute link from the fixture is among the returned links."""
        links = extract_links(SAMPLE_HTML)

        assert 'https://example.com' in links

    def test_extract_links_with_base(self):
        """With ``base_url`` given, the relative href is returned as absolute."""
        links = extract_links(SAMPLE_HTML, base_url='https://test.com')

        # '/relative' in the fixture is expected to be joined onto base_url.
        assert 'https://test.com/relative' in links