Spider Streaming Results Example
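The script below streams crawl results from the Spider API as JSON Lines: it posts one or more URLs to the crawl endpoint, processes each result as it arrives, saves the returned page content under a content/ directory, and totals the per-page crawl cost reported by the API.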
import os
import requests
import argparse
import time
import logging
import logging.handlers
from pathlib import Path
from datetime import datetime
from typing import Dict

import jsonlines
import asyncio
import dotenv

dotenv.load_dotenv()

# Configuration parameters
SPIDER_API_KEY = os.environ.get("SPIDER_API_KEY")
CRAWL_LIMIT = 100
SPIDER_REQUEST_TIMEOUT = 30
SPIDER_API_ENDPOINT = "https://api.spider.cloud/v1/crawl"
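# NOTE: the API key is read from the environment, or from a .env file loaded by
# python-dotenv above. A minimal .env next to this script would look like:
#   SPIDER_API_KEY=your-spider-api-key
# (illustrative value; not part of the original gist)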
def setup_logging() -> None:
    """Configure logging to both file and console with timestamped files."""
    script_dir = Path(__file__).parent
    log_dir = script_dir / "logs"
    log_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"crawler_single_process_{timestamp}.log"

    file_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    console_formatter = logging.Formatter("%(levelname)s: %(message)s")

    file_handler = logging.handlers.RotatingFileHandler(
        log_file, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
    )
    file_handler.setFormatter(file_formatter)

    latest_symlink = log_dir / "crawler_single_process_latest.log"
    try:
        if latest_symlink.exists():
            latest_symlink.unlink()
        latest_symlink.symlink_to(log_file.name)
    except OSError:
        pass  # Symlinks might not work on all systems

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(console_formatter)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)

    logging.info(f"Logging to: {log_file}")


setup_logging()
logger = logging.getLogger(__name__)
def process_item(
    item: Dict, total_crawl_cost: float, url_count: int
) -> tuple[float, int]:
    """Process a single scraped item, save its content, and log any issues."""
    logger.info(f"Processing item: {item}")
    url = item.get("url")
    url_count += 1
    status = item.get("status")
    if status != 200:
        logger.warning(f"Non-200 status code {status} for URL: {url}")

    content = item.get("content", {})
    if not content or (isinstance(content, dict) and not any(content.values())):
        logger.warning(f"Empty content received for URL: {url}")
    else:
        # Simplified file saving in 'content' directory
        filename = f"{url.replace('://', '_').replace('/', '_')}.html"
        script_dir = os.path.dirname(os.path.abspath(__file__))
        content_dir = os.path.join(script_dir, "content")
        os.makedirs(content_dir, exist_ok=True)
        html_file_path = os.path.join(content_dir, filename)
        try:
            with open(html_file_path, "w", encoding="utf-8") as html_file:
                html_file.write(content if isinstance(content, str) else str(content))
            logger.info(f"Saved HTML content to {html_file_path}")
        except OSError as e:
            logger.error(f"Error saving file: {e}")
            # Return the running totals so the caller's tuple unpacking still works
            return total_crawl_cost, url_count

    costs = item.get("costs") or {}
    total_crawl_cost += float(costs.get("total_cost", 0.0))
    logger.info(f"Processed URL {url_count}: {url} (status: {status})")
    return total_crawl_cost, url_count
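# Rough shape of a streamed result item, inferred from the fields process_item
# reads above (not an official schema; consult the Spider API docs):
#   {
#       "url": "https://example.com/page",
#       "status": 200,
#       "content": "...",                 # markdown text, per return_format
#       "costs": {"total_cost": 0.0001},
#   }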
async def scrape_url(base_urls: list[str]) -> None:
    """Crawl one or more base URLs using the Spider API and stream the results."""
    headers = {
        "Authorization": f"Bearer {SPIDER_API_KEY}",
        "Content-Type": "application/jsonl",
    }
    # Avoid writing the raw API key into the logs; just confirm it was loaded
    logger.info(f"Spider API key loaded: {bool(SPIDER_API_KEY)}")

    default_crawler_params = {
        "return_format": "markdown",
        "respect_robots": False,
        "fingerprint": True,
        "metadata": False,
        "cache": False,
        "proxy_enabled": False,
        "anti_bot": True,
        "request": "smart",
        "store_data": False,
        "limit": CRAWL_LIMIT,
        "request_timeout": 90,
        "wait_for": {
            "dom": {
                "timeout": {"secs": 5, "nanos": 500},
                "selector": "body",
            },
            "idle_network": {
                "timeout": {"secs": 8, "nanos": 0},
            },
        },
    }
    if len(base_urls) > 1:
        crawler_params = []
        for base_url in base_urls:
            crawler_params.append({**default_crawler_params, "url": base_url})
    else:
        base_url = base_urls[0]
        crawler_params = {**default_crawler_params, "url": base_url}

    logger.info(f"Crawling with crawler_params: {crawler_params}")

    crawl_start_time = time.time()
    total_crawl_cost = 0.0
    url_count = 0

    logger.info(f"Crawler endpoint: {SPIDER_API_ENDPOINT}")
    try:
        response = requests.post(
            SPIDER_API_ENDPOINT,
            headers=headers,
            json=crawler_params,
            stream=True,
            timeout=(30, SPIDER_REQUEST_TIMEOUT),
        )
        response.raise_for_status()
        # Let urllib3 decode gzip/deflate so the JSONL reader sees plain text
        response.raw.decode_content = True
        reader = jsonlines.Reader(response.raw)
        for value in reader:
            try:
                total_crawl_cost, url_count = process_item(
                    value, total_crawl_cost, url_count
                )
            except Exception as e:
                logger.error(f"Error processing item: {e}")
                continue
    except jsonlines.InvalidLineError as e:
        logger.error(f"Invalid JSON line received: {e}")

    crawl_end_time = time.time()
    crawl_duration = crawl_end_time - crawl_start_time
    logger.info(f"Processed {url_count} URLs for {', '.join(base_urls)}")
    logger.info(f"Crawl took {crawl_duration:.2f} seconds")
    logger.info(f"Total crawl cost: ${total_crawl_cost:.5f}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Web scraping script for multiple URLs")
    parser.add_argument("--urls", type=str, nargs="+", required=True, help="One or more URLs to scrape")
    args = parser.parse_args()
    asyncio.run(scrape_url(base_urls=args.urls))
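To try it (assuming the file is saved as spider_stream.py; that filename is not part of the gist): install requests, jsonlines, and python-dotenv, put SPIDER_API_KEY in the environment or a .env file, and run: python spider_stream.py --urls https://example.com https://example.org. Saved page content lands in a content/ directory and logs in a logs/ directory next to the script.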