# Python
#
# test scraper
#
# real world projects / web scraper / tests
#
# test_scraper.py
"""Tests for web scraper."""

import pytest
from scraper.parser import parse_html, extract_table, extract_links


# Sample HTML for testing: a minimal page with a <title>, one <h1>,
# two <p> elements, and two anchors (one absolute href, one relative href).
# Used by the TestParser and TestLinkExtraction cases in this module.
SAMPLE_HTML = """
<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
    <h1>Main Title</h1>
    <p>First paragraph.</p>
    <p>Second paragraph.</p>
    <a href="https://example.com">Example</a>
    <a href="/relative">Relative Link</a>
</body>
</html>
"""

# Sample table markup for testing: a header row with the cells "Name" and
# "Value", followed by two body rows (Alpha/100 and Beta/200).
# Used by the TestTableExtraction case in this module.
TABLE_HTML = """
<table>
    <thead>
        <tr><th>Name</th><th>Value</th></tr>
    </thead>
    <tbody>
        <tr><td>Alpha</td><td>100</td></tr>
        <tr><td>Beta</td><td>200</td></tr>
    </tbody>
</table>
"""


class TestParser:
    """Checks parse_html selector lookups against SAMPLE_HTML."""

    def test_parse_single_element(self):
        """The 'h1' selector yields an entry whose text is the heading."""
        parsed = parse_html(SAMPLE_HTML, {'title': 'h1'})
        heading = parsed['title']
        assert heading['text'] == 'Main Title'

    def test_parse_multiple_elements(self):
        """The 'p' selector yields two entries, the first paragraph first."""
        parsed = parse_html(SAMPLE_HTML, {'paragraphs': 'p'})
        paragraphs = parsed['paragraphs']
        assert len(paragraphs) == 2
        first = paragraphs[0]
        assert first['text'] == 'First paragraph.'

    def test_parse_not_found(self):
        """A selector that matches nothing maps its key to None."""
        parsed = parse_html(SAMPLE_HTML, {'missing': '.nonexistent'})
        assert parsed['missing'] is None

    def test_parse_links(self):
        """The 'a[href]' selector yields two entries that expose 'href'."""
        parsed = parse_html(SAMPLE_HTML, {'links': 'a[href]'})
        anchors = parsed['links']
        assert len(anchors) == 2
        assert anchors[0]['href'] == 'https://example.com'


class TestTableExtraction:
    """Checks extract_table against TABLE_HTML."""

    def test_extract_table(self):
        """Two rows come back; the first is keyed by the header cells."""
        table_rows = extract_table(TABLE_HTML)
        assert len(table_rows) == 2
        first_row = table_rows[0]
        assert first_row['Name'] == 'Alpha'
        assert first_row['Value'] == '100'


class TestLinkExtraction:
    """Checks extract_links against SAMPLE_HTML."""

    def test_extract_links(self):
        """The absolute href from the sample page is among the results."""
        found = extract_links(SAMPLE_HTML)
        assert 'https://example.com' in found

    def test_extract_links_with_base(self):
        """With base_url given, the relative href appears joined to it."""
        resolved = extract_links(SAMPLE_HTML, base_url='https://test.com')
        expected = 'https://test.com/relative'
        assert expected in resolved
# Previous | Next