@benlacey57
Created April 27, 2025 11:36
Python web scraping example using Playwright
"""
Advanced Web Scraper with Playwright
This script demonstrates a robust web scraping approach using Playwright that:
- Uses proper browser headers to reduce detection
- Implements logging to track execution
- Follows scraping best practices
- Handles common scraping errors
- Saves scraped data to CSV
"""
import asyncio
import csv
import logging
import random
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional

from playwright.async_api import async_playwright, Browser, BrowserContext, Page, Playwright, Response
# Configure logging
def setup_logging() -> None:
    """Set up logging to both console and file."""
    log_dir = Path("logs")
    log_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"scraper_{timestamp}.log"

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )
# Browser headers and configurations
def get_browser_config() -> Dict:
    """Return browser configuration with realistic headers and settings."""
    # List of common user agents
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.0 Safari/605.1.15"
    ]

    return {
        "user_agent": random.choice(user_agents),
        "viewport": {
            "width": random.randint(1280, 1920),
            "height": random.randint(720, 1080),
        },
        "locale": "en-US",
        "timezone_id": "America/New_York",
        "extra_headers": {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Sec-GPC": "1",
        }
    }
class PlaywrightScraper:
    """A robust web scraper using Playwright."""

    def __init__(self, headless: bool = True, slow_mo: int = 50):
        """
        Initialize the scraper.

        Args:
            headless: Whether to run the browser in headless mode
            slow_mo: Slow down operations by this many milliseconds
        """
        self.headless = headless
        self.slow_mo = slow_mo
        self.playwright: Optional[Playwright] = None
        self.browser: Optional[Browser] = None
        self.context: Optional[BrowserContext] = None
        self.page: Optional[Page] = None
        self.data: List[Dict] = []
    async def __aenter__(self):
        """Set up browser when using as a context manager."""
        await self.setup_browser()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up resources when exiting the context manager."""
        await self.close()
    async def setup_browser(self) -> None:
        """Initialize the browser with custom settings to avoid detection."""
        logging.info("Setting up browser...")
        self.playwright = await async_playwright().start()

        # Get browser configuration
        config = get_browser_config()

        # Launch browser with specific settings
        self.browser = await self.playwright.chromium.launch(
            headless=self.headless,
            slow_mo=self.slow_mo
        )

        # Create context with custom settings
        self.context = await self.browser.new_context(
            user_agent=config["user_agent"],
            viewport=config["viewport"],
            locale=config["locale"],
            timezone_id=config["timezone_id"],
            extra_http_headers=config["extra_headers"]
        )

        # Add additional browser behaviors to seem more human
        await self.context.add_init_script("""
            // Modify navigator properties to avoid fingerprinting
            Object.defineProperty(navigator, 'webdriver', { get: () => undefined });
        """)

        self.page = await self.context.new_page()

        # Set up response handling to detect blocking
        self.page.on("response", self._handle_response)
        logging.info(f"Browser setup complete with user agent: {config['user_agent']}")
    async def _handle_response(self, response: Response) -> None:
        """
        Monitor responses for potential blocking or CAPTCHA challenges.

        Args:
            response: The HTTP response object
        """
        if response.url == self.page.url:
            if response.status >= 400:
                logging.warning(f"Received error status {response.status} for {response.url}")

            # Look for common blocking indicators in the response body
            if response.status == 200:
                text = await response.text()
                lower_text = text.lower()
                if any(term in lower_text for term in ["captcha", "blocked", "rate limit", "denied"]):
                    logging.warning(f"Possible blocking detected on {response.url}")
    async def add_human_behavior(self) -> None:
        """Add random scrolling and pauses to appear more human-like."""
        # Random scrolling
        await self.page.evaluate("""
            () => {
                window.scrollTo({
                    top: Math.floor(Math.random() * document.body.scrollHeight * 0.7),
                    behavior: 'smooth'
                });
            }
        """)

        # Small delay
        await asyncio.sleep(random.uniform(1.0, 3.0))

        # Scroll back up randomly
        if random.random() > 0.5:
            await self.page.evaluate("""
                () => {
                    window.scrollTo({
                        top: 0,
                        behavior: 'smooth'
                    });
                }
            """)
            await asyncio.sleep(random.uniform(0.5, 2.0))
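    # Optional sketch (not part of the original gist and not called by default):
    # the docstring above mentions human-like behavior, and mouse movement can be
    # simulated with Playwright's Mouse API. The coordinate ranges and step counts
    # below are illustrative assumptions.
    async def add_random_mouse_movement(self) -> None:
        """Move the mouse to a few random points within the viewport."""
        viewport = self.page.viewport_size or {"width": 1280, "height": 720}
        for _ in range(random.randint(2, 4)):
            x = random.randint(0, viewport["width"] - 1)
            y = random.randint(0, viewport["height"] - 1)
            # steps > 1 makes the movement gradual rather than an instant jump
            await self.page.mouse.move(x, y, steps=random.randint(5, 15))
            await asyncio.sleep(random.uniform(0.2, 0.8))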
    async def navigate(self, url: str, wait_for_selector: str = "body") -> bool:
        """
        Navigate to a URL and wait for the page to load.

        Args:
            url: The URL to navigate to
            wait_for_selector: CSS selector to wait for

        Returns:
            True if navigation was successful, False otherwise
        """
        try:
            logging.info(f"Navigating to: {url}")

            # Navigate with timeout
            response = await self.page.goto(url, wait_until="networkidle", timeout=30000)

            # Check for successful navigation
            if not response:
                logging.error(f"No response received from {url}")
                return False
            if response.status >= 400:
                logging.error(f"Error response ({response.status}) from {url}")
                return False

            # Wait for the main content to be available
            await self.page.wait_for_selector(wait_for_selector, timeout=10000)

            # Add random human-like behavior
            await self.add_human_behavior()

            # Add a random delay of 2-5 seconds
            delay = random.uniform(2, 5)
            logging.debug(f"Waiting for {delay:.2f} seconds...")
            await asyncio.sleep(delay)

            return True
        except Exception as e:
            logging.error(f"Navigation error: {e}")
            return False
    async def extract_data(self, selectors: Dict[str, str]) -> Dict:
        """
        Extract data from the current page using the provided selectors.

        Args:
            selectors: Dictionary mapping field names to CSS selectors

        Returns:
            Dictionary of extracted data
        """
        data = {}
        for key, selector in selectors.items():
            try:
                # Use the element's text content
                element = await self.page.query_selector(selector)
                if element:
                    text = await element.text_content()
                    data[key] = text.strip() if text else ""
                else:
                    data[key] = ""
                    logging.warning(f"Selector '{selector}' for '{key}' not found")
            except Exception as e:
                data[key] = ""
                logging.error(f"Error extracting '{key}' with selector '{selector}': {e}")
        return data
    async def scrape_multiple_pages(self,
                                    base_url: str,
                                    total_pages: int,
                                    selectors: Dict[str, str],
                                    url_pattern: str = "{}?page={}",
                                    output_file: str = "scraped_data.csv") -> None:
        """
        Scrape multiple pages and save the results.

        Args:
            base_url: The base URL to scrape
            total_pages: Number of pages to scrape
            selectors: Dictionary of CSS selectors for data extraction
            url_pattern: Pattern for constructing page URLs
            output_file: Path to save CSV output
        """
        self.data = []

        for page_num in range(1, total_pages + 1):
            url = url_pattern.format(base_url, page_num)
            logging.info(f"Processing page {page_num}/{total_pages}")

            # Navigate to the page
            success = await self.navigate(url)
            if not success:
                logging.warning(f"Skipping page {page_num} due to navigation error")
                continue

            # Extract items from the page (assuming a list of items)
            items = await self.page.query_selector_all("div.item")  # Adjust selector for your target site
            if not items:
                logging.warning(f"No items found on page {page_num}")
                continue
            logging.info(f"Found {len(items)} items on page {page_num}")

            # Process each item
            for i, item in enumerate(items):
                try:
                    # Extract each field relative to this item
                    item_data = {}
                    for key, selector in selectors.items():
                        element = await item.query_selector(selector)
                        if element:
                            text = await element.text_content()
                            item_data[key] = text.strip() if text else ""
                        else:
                            item_data[key] = ""

                    # Add metadata
                    item_data["page"] = page_num
                    item_data["item_number"] = i + 1
                    item_data["timestamp"] = datetime.now().isoformat()
                    self.data.append(item_data)
                except Exception as e:
                    logging.error(f"Error processing item {i + 1} on page {page_num}: {e}")

            # Save progress after each page
            self.save_to_csv(output_file)

            # Add a random delay of 3-7 seconds between pages
            delay = random.uniform(3, 7)
            logging.info(f"Waiting {delay:.2f} seconds before next page...")
            await asyncio.sleep(delay)
    def save_to_csv(self, filename: str) -> None:
        """
        Save scraped data to a CSV file.

        Args:
            filename: Path to save the CSV file
        """
        if not self.data:
            logging.warning("No data to save")
            return

        try:
            # Ensure all rows share the same set of columns
            all_keys = set()
            for item in self.data:
                all_keys.update(item.keys())

            with open(filename, 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=sorted(all_keys))
                writer.writeheader()
                writer.writerows(self.data)
            logging.info(f"Data saved to {filename}")
        except Exception as e:
            logging.error(f"Error saving data to CSV: {e}")
    async def close(self) -> None:
        """Close the browser and clean up resources."""
        if self.browser:
            await self.browser.close()
            logging.info("Browser closed")
        if self.playwright:
            await self.playwright.stop()
async def main():
    """Main entry point for the scraper."""
    # Set up logging
    setup_logging()

    # Example usage
    target_url = "https://example.com/products"

    # Define the data selectors (adjust for your target site)
    selectors = {
        "title": "h2.product-title",
        "price": "span.price",
        "description": "div.description",
        "rating": "div.rating"
    }

    try:
        async with PlaywrightScraper(headless=True) as scraper:
            # Example: scrape 3 pages
            await scraper.scrape_multiple_pages(
                base_url=target_url,
                total_pages=3,
                selectors=selectors,
                output_file="products_data.csv"
            )
            logging.info(f"Scraping complete. Total items collected: {len(scraper.data)}")
    except Exception as e:
        logging.error(f"Scraping failed: {e}")
if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())