Skip to content

Instantly share code, notes, and snippets.

@edsu
Last active March 12, 2026 17:25
Show Gist options
  • Select an option

  • Save edsu/e1b92717d3cd7ed0ec98ade99f99392e to your computer and use it in GitHub Desktop.

Select an option

Save edsu/e1b92717d3cd7ed0ec98ade99f99392e to your computer and use it in GitHub Desktop.
A simple example of using Cloudflare's Crawl API endpoint to crawl a website and gather the results.
#!/usr/bin/env python3
# This is a simplistic Python program that will use Cloudflare's Crawl API [1]
# to crawl a website, and then fetch the results to the filesystem once the job
# is completed.
#
# You run it like so:
#
# uv run crowdflare_crawl.py https://example.com
#
# Note: you will need to set these in your environment or in a `.env` file for
# the program to work:
#
# - CLOUDFRONT_TOKEN
# - CLOUDFRONT_ACCOUNT_ID
#
# (NOTE: the variables are named CLOUDFRONT_* but they hold *Cloudflare*
# credentials; the names are kept as-is for backwards compatibility.)
#
# In order to create a token you will need to go to the Cloudflare dashboard and
# create a token that has the `Browser Rendering:Edit` permission.
#
# [1] https://developers.cloudflare.com/browser-rendering/rest-api/crawl-endpoint/
# /// script
# dependencies = ["requests", "dotenv"]
# ///
import json
import os
import pathlib
import sys
import time

import dotenv
import requests
from requests.adapters import HTTPAdapter, Retry

dotenv.load_dotenv()

token = os.environ.get('CLOUDFRONT_TOKEN')
account_id = os.environ.get('CLOUDFRONT_ACCOUNT_ID')
if not token or not account_id:
    sys.exit("Please set CLOUDFRONT_TOKEN and CLOUDFRONT_ACCOUNT_ID environment variables.")

headers = {"Authorization": f"Bearer {token}"}

# Shared HTTPS session with automatic retries. The original list retried only
# 401; a 401 alone will not be fixed by waiting, but transient rate-limit (429)
# and server errors (5xx) are the usual candidates for backoff, so those are
# included as well. 401 is kept for compatibility with the original behavior.
https = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=1,
    status_forcelist=[401, 429, 500, 502, 503, 504],
)
https.mount('https://', HTTPAdapter(max_retries=retries))
def main() -> None:
    """Start a crawl for the URL given on the command line, wait for it to
    finish, then write the paginated results to the current directory.

    Exits with a usage message when no URL argument is supplied (the original
    raised a bare IndexError here).
    """
    if len(sys.argv) < 2:
        sys.exit("usage: crowdflare_crawl.py <url>")
    url = sys.argv[1]

    job_id = start_crawl(url)
    print(f"created job {job_id}")

    wait_for_job(job_id)
    print(f"job {job_id} completed")

    write_results(job_id)
def start_crawl(url: str):
    """Kick off a Cloudflare crawl job for *url* and return the job result
    (as reported under the API response's "result" key).

    Raises requests.HTTPError for non-2xx responses, and Exception carrying
    the full decoded response when the API reports success=false.
    """
    payload = {
        "url": url,
        "formats": ["html", "markdown"],
        "limit": 5000
    }
    endpoint = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl"

    resp = https.post(endpoint, headers=headers, json=payload)
    resp.raise_for_status()

    result = resp.json()
    if result["success"] is False:
        raise Exception(result)

    return result["result"]
def wait_for_job(job_id: str, sleep_secs=60) -> str:
    """
    Wait for the crawl job to stop running and return the status.

    Polls the job status endpoint every `sleep_secs` seconds, printing a
    progress line while the job's status is "running".
    """
    status_url = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}"
    while True:
        resp = https.get(status_url, params={"limit": 1}, headers=headers)
        resp.raise_for_status()

        info = resp.json()["result"]
        if info["status"] != "running":
            return info["status"]

        # Still running: report progress, then pause before polling again.
        total = int(info["total"])
        finished = int(info["finished"])
        skipped = int(info["skipped"])
        print(f"waiting for {job_id} to complete: total={total} finished={finished} skipped={skipped}")
        time.sleep(sleep_secs)
def write_results(job_id: str) -> None:
    """
    Fetch the results of the crawl job and write them as JSON to the filesystem.

    Results are paginated by an opaque cursor; each page is written to a file
    named "<job_id>-<page>.json" in the current directory. Iteration stops when
    the API returns no cursor, or when it repeats the previous cursor (a guard
    against looping forever on a server that echoes the cursor back).
    """
    cursor = None
    count = 0
    while True:
        params = {"cursor": cursor} if cursor is not None else {}
        resp = https.get(
            f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}",
            params=params,
            headers=headers
        )
        resp.raise_for_status()
        count += 1
        result = resp.json()

        # Fix: the original used json.dump(result, path.open("w"), ...), which
        # never closed the file handle; write_text opens, writes, and closes.
        path = pathlib.Path(f"{job_id}-{count:03}.json")
        path.write_text(json.dumps(result, indent=2))
        print(f"wrote {path}")

        new_cursor = result["result"].get("cursor")
        if new_cursor is None or new_cursor == cursor:
            break
        cursor = new_cursor
# Run the crawl only when executed as a script (not when imported as a module).
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment