@gbertb
Created May 5, 2025 06:37
Spider Streaming Results Example
import os
import requests
import argparse
import time
import logging
import logging.handlers
from pathlib import Path
from datetime import datetime
from typing import Dict
import jsonlines
import asyncio
import dotenv
dotenv.load_dotenv()
# Configuration parameters
SPIDER_API_KEY = os.environ.get("SPIDER_API_KEY")
CRAWL_LIMIT = 100
SPIDER_REQUEST_TIMEOUT = 30
SPIDER_API_ENDPOINT = "https://api.spider.cloud/v1/crawl"
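
# A minimal .env sketch for this script (the variable name comes from the
# lookup above; the value shown is only a placeholder, not a real key):
#   SPIDER_API_KEY=your-spider-api-key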
def setup_logging() -> None:
    """Configure logging to both file and console with timestamped files."""
    script_dir = Path(__file__).parent
    log_dir = script_dir / "logs"
    log_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"crawler_single_process_{timestamp}.log"

    file_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    console_formatter = logging.Formatter("%(levelname)s: %(message)s")

    file_handler = logging.handlers.RotatingFileHandler(
        log_file, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
    )
    file_handler.setFormatter(file_formatter)

    latest_symlink = log_dir / "crawler_single_process_latest.log"
    try:
        if latest_symlink.exists():
            latest_symlink.unlink()
        latest_symlink.symlink_to(log_file.name)
    except OSError:
        pass  # Symlinks might not work on all systems

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(console_formatter)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)

    logging.info(f"Logging to: {log_file}")

setup_logging()
logger = logging.getLogger(__name__)
def process_item(
    item: Dict, total_crawl_cost: float, url_count: int
) -> tuple[float, int]:
    """Process a single scraped item and log issues."""
    logger.info(f"Processing item: {item}")
    url = item.get("url")
    url_count += 1

    status = item.get("status")
    if status != 200:
        logger.warning(f"Non-200 status code {status} for URL: {url}")

    content = item.get("content", {})
    if not content or (isinstance(content, dict) and not any(content.values())):
        logger.warning(f"Empty content received for URL: {url}")
    else:
        # Simplified file saving in 'content' directory
        filename = f"{url.replace('://', '_').replace('/', '_')}.html"
        script_dir = os.path.dirname(os.path.abspath(__file__))
        content_dir = os.path.join(script_dir, "content")
        os.makedirs(content_dir, exist_ok=True)
        html_file_path = os.path.join(content_dir, filename)
        try:
            with open(html_file_path, "w", encoding="utf-8") as html_file:
                html_file.write(content)
            logger.info(f"Saved HTML content to {html_file_path}")
        except OSError as e:
            logger.error(f"Error saving file: {e}")
            # Return the running totals so the caller's tuple unpacking still works.
            return total_crawl_cost, url_count

    costs = item.get("costs") or {}
    total_crawl_cost += float(costs.get("total_cost", 0.0))
    logger.info(f"Processed URL {url_count}: {url} (status: {status})")
    return total_crawl_cost, url_count
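
# For reference, a sketch of the fields process_item reads from each streamed
# JSONL record (inferred from the code above; the actual Spider response may
# carry additional keys, and the values here are only illustrative):
# {
#     "url": "https://example.com/page",
#     "status": 200,
#     "content": "...page content in the requested return_format...",
#     "costs": {"total_cost": 0.0001}
# }
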
async def scrape_url(base_urls: list[str]) -> None:
    """Crawl one or more URLs using the Spider API, streaming results as JSON Lines."""
    headers = {
        "Authorization": f"Bearer {SPIDER_API_KEY}",
        "Content-Type": "application/jsonl",
    }
    # Avoid logging the raw API key; only confirm that one is configured.
    logger.info(f"Spider API key configured: {bool(SPIDER_API_KEY)}")

    default_crawler_params = {
        "return_format": "markdown",
        "respect_robots": False,
        "fingerprint": True,
        "metadata": False,
        "cache": False,
        "proxy_enabled": False,
        "anti_bot": True,
        "request": "smart",
        "store_data": False,
        "limit": CRAWL_LIMIT,
        "request_timeout": 90,
        "wait_for": {
            "dom": {
                "timeout": {"secs": 5, "nanos": 500},
                "selector": "body",
            },
            "idle_network": {
                "timeout": {"secs": 8, "nanos": 0},
            },
        },
    }

    # A single URL sends one params object; multiple URLs send a list of them.
    if len(base_urls) > 1:
        crawler_params = [
            {**default_crawler_params, "url": base_url} for base_url in base_urls
        ]
    else:
        crawler_params = {**default_crawler_params, "url": base_urls[0]}

    logger.info(f"Crawling with crawler_params: {crawler_params}")

    crawl_start_time = time.time()
    total_crawl_cost = 0.0
    url_count = 0
    logger.info(f"Crawler endpoint: {SPIDER_API_ENDPOINT}")

    try:
        response = requests.post(
            SPIDER_API_ENDPOINT,
            headers=headers,
            json=crawler_params,
            stream=True,
            timeout=(30, SPIDER_REQUEST_TIMEOUT),
        )
        response.raise_for_status()

        # Ensure any gzip/deflate transfer encoding is decoded before the
        # JSONL reader consumes the raw stream.
        response.raw.decode_content = True
        reader = jsonlines.Reader(response.raw)
        for value in reader:
            try:
                total_crawl_cost, url_count = process_item(
                    value, total_crawl_cost, url_count
                )
            except Exception as e:
                logger.error(f"Error processing item: {e}")
                continue
    except jsonlines.InvalidLineError as e:
        logger.error(f"Invalid JSON line received: {e}")

    crawl_end_time = time.time()
    crawl_duration = crawl_end_time - crawl_start_time
    logger.info(f"Processed {url_count} URLs across {len(base_urls)} site(s)")
    logger.info(f"Crawl took {crawl_duration:.2f} seconds")
    logger.info(f"Total crawl cost: ${total_crawl_cost:.5f}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Web scraping script for multiple URLs")
    parser.add_argument("--urls", type=str, nargs="+", required=True, help="One or more URLs to scrape")
    args = parser.parse_args()
    asyncio.run(scrape_url(base_urls=args.urls))
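
To try the script, set SPIDER_API_KEY in your environment (or a .env file), save the file locally, and pass one or more URLs via the --urls flag defined above. The filename and URLs below are only placeholders; results are streamed as they arrive and each page's content is written to a content/ directory next to the script:

python spider_streaming_example.py --urls https://example.com https://example.org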