Python web scraping example using Playwright
| """ | |
| Advanced Web Scraper with Playwright | |
| This script demonstrates a robust web scraping approach using Playwright that: | |
| - Uses proper browser headers to reduce detection | |
| - Implements logging to track execution | |
| - Follows scraping best practices | |
| - Handles common scraping errors | |
| - Saves scraped data to CSV | |
| """ | |
| import asyncio | |
| import csv | |
| import logging | |
| import random | |
| import time | |
| from datetime import datetime | |
| from pathlib import Path | |
| from typing import Dict, List, Optional, Union | |
| from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Response | |


# Configure logging
def setup_logging() -> None:
    """Set up logging to both console and file."""
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"scraper_{timestamp}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )


# Browser headers and configurations
def get_browser_config() -> Dict:
    """Return browser configuration with realistic headers and settings."""
    # List of common user agents
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
    ]

    return {
        "user_agent": random.choice(user_agents),
        "viewport": {
            "width": random.randint(1280, 1920),
            "height": random.randint(720, 1080),
        },
        "locale": "en-US",
        "timezone_id": "America/New_York",
        "extra_headers": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Sec-GPC": "1",
        }
    }


class PlaywrightScraper:
    """A robust web scraper using Playwright."""

    def __init__(self, headless: bool = True, slow_mo: int = 50):
        """
        Initialize the scraper.

        Args:
            headless: Whether to run browser in headless mode
            slow_mo: Slow down operations by this many milliseconds
        """
        self.headless = headless
        self.slow_mo = slow_mo
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        self.data: List[Dict] = []

    async def __aenter__(self):
        """Set up browser when using as context manager."""
        await self.setup_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up resources when exiting context manager."""
        await self.close()

    async def setup_browser(self) -> None:
        """Initialize browser with custom settings to avoid detection."""
        logging.info("Setting up browser...")

        # Keep a reference to the Playwright driver so it can be stopped in close()
        self.playwright = await async_playwright().start()

        # Get browser configuration
        config = get_browser_config()

        # Launch browser with specific settings
        self.browser = await self.playwright.chromium.launch(
            headless=self.headless,
            slow_mo=self.slow_mo
        )

        # Create context with custom settings
        self.context = await self.browser.new_context(
            user_agent=config["user_agent"],
            viewport=config["viewport"],
            locale=config["locale"],
            timezone_id=config["timezone_id"],
            extra_http_headers=config["extra_headers"]
        )

        # Add additional browser behaviors to seem more human
        await self.context.add_init_script("""
            // Modify navigator properties to avoid fingerprinting
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
        """)

        self.page = await self.context.new_page()

        # Set up response handling to detect blocking
        self.page.on("response", self._handle_response)

        logging.info(f"Browser set up complete with user agent: {config['user_agent']}")

    async def _handle_response(self, response: Response) -> None:
        """
        Monitor responses for potential blocking or CAPTCHA challenges.

        Args:
            response: The HTTP response object
        """
        if response.url == self.page.url:
            if response.status >= 400:
                logging.warning(f"Received error status {response.status} for {response.url}")

            # Look for common blocking indicators in response body
            if response.status == 200:
                text = await response.text()
                lower_text = text.lower()
                if any(term in lower_text for term in ["captcha", "blocked", "rate limit", "denied"]):
                    logging.warning(f"Possible blocking detected on {response.url}")

    async def add_human_behavior(self) -> None:
        """Add random scrolling and pauses to appear more human-like."""
        # Random scrolling
        await self.page.evaluate("""
            () => {
                window.scrollTo({
                    top: Math.floor(Math.random() * document.body.scrollHeight * 0.7),
                    behavior: 'smooth'
                });
            }
        """)

        # Small delay
        await asyncio.sleep(random.uniform(1.0, 3.0))

        # Scroll back up randomly
        if random.random() > 0.5:
            await self.page.evaluate("""
                () => {
                    window.scrollTo({
                        top: 0,
                        behavior: 'smooth'
                    });
                }
            """)
            await asyncio.sleep(random.uniform(0.5, 2.0))

    async def navigate(self, url: str, wait_for_selector: str = "body") -> bool:
        """
        Navigate to URL and wait for page to load.

        Args:
            url: The URL to navigate to
            wait_for_selector: CSS selector to wait for

        Returns:
            True if navigation was successful, False otherwise
        """
        try:
            logging.info(f"Navigating to: {url}")

            # Navigate with timeout
            response = await self.page.goto(url, wait_until="networkidle", timeout=30000)

            # Check for successful navigation
            if not response:
                logging.error(f"No response received from {url}")
                return False

            if response.status >= 400:
                logging.error(f"Error response ({response.status}) from {url}")
                return False

            # Wait for main content to be available
            await self.page.wait_for_selector(wait_for_selector, timeout=10000)

            # Add random human-like behavior
            await self.add_human_behavior()

            # Add random delay between 2-5 seconds
            delay = random.uniform(2, 5)
            logging.debug(f"Waiting for {delay:.2f} seconds...")
            await asyncio.sleep(delay)

            return True

        except Exception as e:
            logging.error(f"Navigation error: {e}")
            return False

    async def extract_data(self, selectors: Dict[str, str]) -> Dict:
        """
        Extract data from page using provided selectors.

        Args:
            selectors: Dictionary of data names and their CSS selectors

        Returns:
            Dictionary of extracted data
        """
        data = {}

        for key, selector in selectors.items():
            try:
                # Try text content first
                element = await self.page.query_selector(selector)
                if element:
                    data[key] = await element.text_content()
                    data[key] = data[key].strip() if data[key] else ""
                else:
                    data[key] = ""
                    logging.warning(f"Selector '{selector}' for '{key}' not found")
            except Exception as e:
                data[key] = ""
                logging.error(f"Error extracting '{key}' with selector '{selector}': {e}")

        return data

    async def scrape_multiple_pages(self,
                                    base_url: str,
                                    total_pages: int,
                                    selectors: Dict[str, str],
                                    url_pattern: str = "{}?page={}",
                                    output_file: str = "scraped_data.csv") -> None:
        """
        Scrape multiple pages and save results.

        Args:
            base_url: The base URL to scrape
            total_pages: Number of pages to scrape
            selectors: Dictionary of CSS selectors for data extraction
            url_pattern: Pattern for constructing page URLs
            output_file: Path to save CSV output
        """
        self.data = []

        for page_num in range(1, total_pages + 1):
            url = url_pattern.format(base_url, page_num)
            logging.info(f"Processing page {page_num}/{total_pages}")

            # Navigate to page
            success = await self.navigate(url)
            if not success:
                logging.warning(f"Skipping page {page_num} due to navigation error")
                continue

            # Extract items from page (assuming a list of items)
            items = await self.page.query_selector_all("div.item")  # Adjust selector for your target site
            if not items:
                logging.warning(f"No items found on page {page_num}")
                continue

            logging.info(f"Found {len(items)} items on page {page_num}")

            # Process each item
            for i, item in enumerate(items):
                try:
                    # Query each selector relative to this item
                    item_data = {}
                    for key, selector in selectors.items():
                        element = await item.query_selector(selector)
                        if element:
                            item_data[key] = (await element.text_content()).strip()
                        else:
                            item_data[key] = ""

                    # Add metadata
                    item_data["page"] = page_num
                    item_data["item_number"] = i + 1
                    item_data["timestamp"] = datetime.now().isoformat()

                    self.data.append(item_data)
                except Exception as e:
                    logging.error(f"Error processing item {i+1} on page {page_num}: {e}")

            # Save progress after each page
            self.save_to_csv(output_file)

            # Add random delay between pages (3-7 seconds)
            delay = random.uniform(3, 7)
            logging.info(f"Waiting {delay:.2f} seconds before next page...")
            await asyncio.sleep(delay)

    def save_to_csv(self, filename: str) -> None:
        """
        Save scraped data to CSV file.

        Args:
            filename: Path to save the CSV file
        """
        if not self.data:
            logging.warning("No data to save")
            return

        try:
            # Ensure all rows have the same keys (columns)
            all_keys = set()
            for item in self.data:
                all_keys.update(item.keys())

            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=sorted(all_keys))
                writer.writeheader()
                writer.writerows(self.data)

            logging.info(f"Data saved to {filename}")
        except Exception as e:
            logging.error(f"Error saving data to CSV: {e}")

    async def close(self) -> None:
        """Close browser and clean up resources."""
        if self.browser:
            await self.browser.close()
            logging.info("Browser closed")
        if self.playwright:
            # Stop the Playwright driver started in setup_browser()
            await self.playwright.stop()


async def main():
    """Main entry point for the scraper."""
    # Set up logging
    setup_logging()

    # Example usage
    target_url = "https://example.com/products"

    # Define the data selectors (adjust for your target site)
    selectors = {
        "title": "h2.product-title",
        "price": "span.price",
        "description": "div.description",
        "rating": "div.rating"
    }

    try:
        async with PlaywrightScraper(headless=True) as scraper:
            # Example: scrape 3 pages
            await scraper.scrape_multiple_pages(
                base_url=target_url,
                total_pages=3,
                selectors=selectors,
                output_file="products_data.csv"
            )

            logging.info(f"Scraping complete. Total items collected: {len(scraper.data)}")
    except Exception as e:
        logging.error(f"Scraping failed: {e}")


if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())
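
Setup note: the script depends on the playwright package (pip install playwright) and a downloaded browser binary (playwright install chromium). The extract_data method above is defined but never exercised by main(); the sketch below shows one possible way to use it for a single page. It assumes the gist has been saved locally as scraper.py, and the URL and selectors are placeholders that would need to be adapted to a real site, not values from the original gist.

"""Minimal single-page usage sketch; module name, URL and selectors are hypothetical."""
import asyncio
import logging

# Assumes the code above was saved as scraper.py in the same directory.
from scraper import PlaywrightScraper, setup_logging


async def scrape_one_page() -> None:
    setup_logging()

    url = "https://example.com/article"   # placeholder URL
    selectors = {                         # placeholder selectors
        "headline": "h1",
        "author": "span.author",
    }

    async with PlaywrightScraper(headless=True) as scraper:
        # navigate() returns False on errors, so guard the extraction step
        if await scraper.navigate(url, wait_for_selector="h1"):
            data = await scraper.extract_data(selectors)
            logging.info(f"Extracted: {data}")


if __name__ == "__main__":
    asyncio.run(scrape_one_page())

Because navigate() already inserts random delays and human-like scrolling, a single request like this needs no extra throttling; for multi-page jobs, scrape_multiple_pages handles pagination and per-page delays itself.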