Last active
March 12, 2026 17:25
-
-
Save edsu/e1b92717d3cd7ed0ec98ade99f99392e to your computer and use it in GitHub Desktop.
A simple example of using Cloudflare's Crawl API endpoint to crawl a website and gather the results.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # This is a simplistic Python program that will use Cloudflare's Crawl API [1] | |
| # to crawl a website, and then fetch the results to the filesystem once the job | |
| # is completed. | |
| # | |
| # You run it like so: | |
| # | |
| # uv run crowdflare_crawl.py https://example.com | |
| # | |
| # Note: you will need to set these in your environment or in a `.env` file for | |
| # the program to work: | |
| # | |
| # - CLOUDFRONT_TOKEN | |
| # - CLOUDFRONT_ACCOUNT_ID | |
| # | |
| # In order to create a token you will need to go to the Cloudflare dashboard and | |
| # create a token that has the `Browser Rendering:Edit` permission. | |
| # | |
| # [1] https://developers.cloudflare.com/browser-rendering/rest-api/crawl-endpoint/ | |
| # /// script | |
| # dependencies = ["requests", "dotenv"] | |
| # /// | |
| import json | |
| import os | |
| import pathlib | |
| import sys | |
| import time | |
| import dotenv | |
| import requests | |
| from requests.adapters import HTTPAdapter, Retry | |
# Merge any settings from a local `.env` file into the process environment.
dotenv.load_dotenv()

# Both credentials are mandatory; bail out early with a hint when one is absent
# or empty.
token = os.getenv('CLOUDFRONT_TOKEN')
account_id = os.getenv('CLOUDFRONT_ACCOUNT_ID')
if not (token and account_id):
    sys.exit("Please set CLOUDFRONT_TOKEN and CLOUDFRONT_ACCOUNT_ID environment variables.")

# Sent with every API request to authenticate via the bearer token.
headers = {"Authorization": f"Bearer {token}"}

# Shared session: its HTTPS adapter retries up to 5 times with backoff and
# additionally forces a retry on HTTP 401 responses (for the methods urllib3
# treats as retryable).
https = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[401],
)
https.mount('https://', HTTPAdapter(max_retries=retries))
def main() -> None:
    """
    Crawl the URL given on the command line and save the results to disk.

    Starts a crawl job, waits until it is no longer running, reports the
    final status and then writes whatever results the job produced.

    Exits with a usage message when no URL argument was supplied.
    """
    # Fail with a readable message instead of an IndexError traceback.
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <url>")
    url = sys.argv[1]

    job_id = start_crawl(url)
    print(f"created job {job_id}")

    # The job may stop for reasons other than success, so report the actual
    # status rather than always claiming completion.
    status = wait_for_job(job_id)
    if status == "completed":
        print(f"job {job_id} completed")
    else:
        print(f"job {job_id} stopped with status {status!r}", file=sys.stderr)

    # Results are fetched regardless of status so partial crawls are kept.
    write_results(job_id)
def start_crawl(
    url: str,
    *,
    limit: int = 5000,
    formats: "list[str] | None" = None,
    timeout: float = 60,
) -> str:
    """
    Submit a crawl job for ``url`` and return the value the API reports as
    its ``result`` (used by the rest of this script as the job id).

    Args:
        url: The website to start crawling from.
        limit: Value sent as the request's ``limit`` field.
        formats: Value sent as the request's ``formats`` field; defaults to
            ``["html", "markdown"]``.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the script forever.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        RuntimeError: If the API response does not report success.
    """
    data = {
        "url": url,
        # Copy so a caller-supplied list is never shared with the payload.
        "formats": ["html", "markdown"] if formats is None else list(formats),
        "limit": limit,
    }
    resp = https.post(
        f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl",
        headers=headers,
        json=data,
        timeout=timeout,
    )
    resp.raise_for_status()
    result = resp.json()
    # A 2xx response can still carry an application-level failure.
    if not result.get("success", False):
        raise RuntimeError(f"crawl request for {url} failed: {result}")
    return result["result"]
def wait_for_job(job_id: str, sleep_secs: float = 60, timeout: float = 60) -> str:
    """
    Wait for the crawl job to stop running and return the status.

    Polls the job endpoint, printing progress counters while the reported
    status is ``"running"``, and sleeps between polls.

    Args:
        job_id: Identifier of the crawl job to watch.
        sleep_secs: Seconds to sleep between two polls.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot block the polling loop forever.

    Returns:
        The first status reported by the API that is not ``"running"``.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    job_url = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}"
    while True:
        resp = https.get(
            job_url,
            # Only the job metadata is needed here; presumably this keeps the
            # polling response small -- confirm against the API docs.
            params={"limit": 1},
            headers=headers,
            timeout=timeout,
        )
        resp.raise_for_status()
        job = resp.json()["result"]

        status = job["status"]
        if status != "running":
            return status

        # Progress counters are informational only: tolerate missing or null
        # values rather than aborting a long wait.
        total = int(job.get("total") or 0)
        finished = int(job.get("finished") or 0)
        skipped = int(job.get("skipped") or 0)
        print(f"waiting for {job_id} to complete: total={total} finished={finished} skipped={skipped}")
        time.sleep(sleep_secs)
def write_results(job_id: str, *, timeout: float = 60) -> None:
    """
    Fetch the results of the crawl job and write them as JSON to the filesystem.

    Each response page is written to its own file in the current directory,
    named ``{job_id}-NNN.json`` with a counter starting at 001. Paging
    follows the ``cursor`` value found in each response and stops when no
    cursor is returned or when the cursor stops changing.

    Args:
        job_id: Identifier of the crawl job whose results should be saved.
        timeout: Seconds to wait on each HTTP request before giving up.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
    """
    job_url = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}"
    cursor = None
    count = 0
    while True:
        # The first request carries no cursor; later ones resume from the last.
        params = {} if cursor is None else {"cursor": cursor}
        resp = https.get(
            job_url,
            params=params,
            headers=headers,
            timeout=timeout,
        )
        resp.raise_for_status()
        count += 1
        result = resp.json()

        path = pathlib.Path(f"{job_id}-{count:03}.json")
        # Use a context manager so the file is flushed and closed right away
        # instead of relying on garbage collection.
        with path.open("w", encoding="utf-8") as fh:
            json.dump(result, fh, indent=2)
        print(f"wrote {path}")

        new_cursor = result["result"].get("cursor")
        # Stop on the last page, or if the API keeps handing back the same
        # cursor (which would otherwise loop forever).
        if new_cursor is None or new_cursor == cursor:
            break
        cursor = new_cursor
# Entry point: run the crawl only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment