Spider Streaming Results Example
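The script below streams crawl results from the Spider API as JSON Lines: it posts one or more URLs to the crawl endpoint, processes each result as it arrives, saves the returned page content under a content/ directory, and totals the per-page crawl cost reported by the API.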
import os
import requests
import argparse
import time
import logging
import logging.handlers
from pathlib import Path
from datetime import datetime
from typing import Dict

import jsonlines
import asyncio
import dotenv

dotenv.load_dotenv()

# Configuration parameters
SPIDER_API_KEY = os.environ.get("SPIDER_API_KEY")
CRAWL_LIMIT = 100
SPIDER_REQUEST_TIMEOUT = 30
SPIDER_API_ENDPOINT = "https://api.spider.cloud/v1/crawl"
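# NOTE: the API key is read from the environment, or from a .env file loaded by
# python-dotenv above. A minimal .env next to this script would look like:
#   SPIDER_API_KEY=your-spider-api-key
# (illustrative value; not part of the original gist)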
def setup_logging() -> None:
    """Configure logging to both file and console with timestamped files."""
    script_dir = Path(__file__).parent
    log_dir = script_dir / "logs"
    log_dir.mkdir(exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = log_dir / f"crawler_single_process_{timestamp}.log"

    file_formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    console_formatter = logging.Formatter("%(levelname)s: %(message)s")

    file_handler = logging.handlers.RotatingFileHandler(
        log_file, maxBytes=10 * 1024 * 1024, backupCount=5, encoding="utf-8"
    )
    file_handler.setFormatter(file_formatter)

    latest_symlink = log_dir / "crawler_single_process_latest.log"
    try:
        if latest_symlink.exists():
            latest_symlink.unlink()
        latest_symlink.symlink_to(log_file.name)
    except OSError:
        pass  # Symlinks might not work on all systems

    console_handler = logging.StreamHandler()
    console_handler.setFormatter(console_formatter)

    root_logger = logging.getLogger()
    root_logger.setLevel(logging.INFO)
    root_logger.addHandler(file_handler)
    root_logger.addHandler(console_handler)

    logging.info(f"Logging to: {log_file}")


setup_logging()
logger = logging.getLogger(__name__)
def process_item(
    item: Dict, total_crawl_cost: float, url_count: int
) -> tuple[float, int]:
    """Process a single scraped item, save its content, and log any issues."""
    logger.info(f"Processing item: {item}")
    url = item.get("url")
    url_count += 1
    status = item.get("status")
    if status != 200:
        logger.warning(f"Non-200 status code {status} for URL: {url}")

    content = item.get("content", {})
    if not content or (isinstance(content, dict) and not any(content.values())):
        logger.warning(f"Empty content received for URL: {url}")
    else:
        # Simplified file saving in 'content' directory
        filename = f"{url.replace('://', '_').replace('/', '_')}.html"
        script_dir = os.path.dirname(os.path.abspath(__file__))
        content_dir = os.path.join(script_dir, "content")
        os.makedirs(content_dir, exist_ok=True)
        html_file_path = os.path.join(content_dir, filename)
        try:
            with open(html_file_path, "w", encoding="utf-8") as html_file:
                html_file.write(content if isinstance(content, str) else str(content))
            logger.info(f"Saved HTML content to {html_file_path}")
        except OSError as e:
            logger.error(f"Error saving file: {e}")
            # Return the running totals so the caller's tuple unpacking still works
            return total_crawl_cost, url_count

    costs = item.get("costs") or {}
    total_crawl_cost += float(costs.get("total_cost", 0.0))
    logger.info(f"Processed URL {url_count}: {url} (status: {status})")
    return total_crawl_cost, url_count
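# Rough shape of a streamed result item, inferred from the fields process_item
# reads above (not an official schema; consult the Spider API docs):
#   {
#       "url": "https://example.com/page",
#       "status": 200,
#       "content": "...",                 # markdown text, per return_format
#       "costs": {"total_cost": 0.0001},
#   }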
async def scrape_url(base_urls: list[str]) -> None:
    """Crawl one or more base URLs using the Spider API and stream the results."""
    headers = {
        "Authorization": f"Bearer {SPIDER_API_KEY}",
        "Content-Type": "application/jsonl",
    }
    # Avoid writing the raw API key into the logs; just confirm it was loaded
    logger.info(f"Spider API key loaded: {bool(SPIDER_API_KEY)}")

    default_crawler_params = {
        "return_format": "markdown",
        "respect_robots": False,
        "fingerprint": True,
        "metadata": False,
        "cache": False,
        "proxy_enabled": False,
        "anti_bot": True,
        "request": "smart",
        "store_data": False,
        "limit": CRAWL_LIMIT,
        "request_timeout": 90,
        "wait_for": {
            "dom": {
                "timeout": {"secs": 5, "nanos": 500},
                "selector": "body",
            },
            "idle_network": {
                "timeout": {"secs": 8, "nanos": 0},
            },
        },
    }
    if len(base_urls) > 1:
        crawler_params = []
        for base_url in base_urls:
            crawler_params.append({**default_crawler_params, "url": base_url})
    else:
        base_url = base_urls[0]
        crawler_params = {**default_crawler_params, "url": base_url}

    logger.info(f"Crawling with crawler_params: {crawler_params}")

    crawl_start_time = time.time()
    total_crawl_cost = 0.0
    url_count = 0

    logger.info(f"Crawler endpoint: {SPIDER_API_ENDPOINT}")
    try:
        response = requests.post(
            SPIDER_API_ENDPOINT,
            headers=headers,
            json=crawler_params,
            stream=True,
            timeout=(30, SPIDER_REQUEST_TIMEOUT),
        )
        response.raise_for_status()
        # Let urllib3 decode gzip/deflate so the JSONL reader sees plain text
        response.raw.decode_content = True
        reader = jsonlines.Reader(response.raw)
        for value in reader:
            try:
                total_crawl_cost, url_count = process_item(
                    value, total_crawl_cost, url_count
                )
            except Exception as e:
                logger.error(f"Error processing item: {e}")
                continue
    except jsonlines.InvalidLineError as e:
        logger.error(f"Invalid JSON line received: {e}")

    crawl_end_time = time.time()
    crawl_duration = crawl_end_time - crawl_start_time
    logger.info(f"Processed {url_count} URLs for {', '.join(base_urls)}")
    logger.info(f"Crawl took {crawl_duration:.2f} seconds")
    logger.info(f"Total crawl cost: ${total_crawl_cost:.5f}")
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Web scraping script for multiple URLs")
    parser.add_argument("--urls", type=str, nargs="+", required=True, help="One or more URLs to scrape")
    args = parser.parse_args()
    asyncio.run(scrape_url(base_urls=args.urls))
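To try it (assuming the file is saved as spider_stream.py; that filename is not part of the gist): install requests, jsonlines, and python-dotenv, put SPIDER_API_KEY in the environment or a .env file, and run: python spider_stream.py --urls https://example.com https://example.org. Saved page content lands in a content/ directory and logs in a logs/ directory next to the script.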