edsu · March 12, 2026 17:25
diff --git a/cloudflare_crawl.py b/cloudflare_crawl.py
 #!/usr/bin/env python3

 # This is a simplistic Python program that will use Cloudflare's Crawl API [1]
 # to crawl a website, and then fetch the results to the filesystem once the job
 # is completed.
 #
 # You run it like so:
 # 
 #    uv run cloudflare_crawl.py https://example.com
 #
 # Note: you will need to set these in your environment or in a `.env` file for 
 # the program to work:
 #
 # - CLOUDFRONT_TOKEN
 # - CLOUDFRONT_ACCOUNT_ID
 # 
 # In order to create a token you will need to go to the Cloudflare dashboard and
 # create a token that has the `Browser Rendering:Edit` permission.
 # 
 # [1] https://developers.cloudflare.com/browser-rendering/rest-api/crawl-endpoint/

 # /// script
 # dependencies = ["requests", "dotenv"]
 # ///

 import json
 import os
 import pathlib
 import sys
 import time

 import dotenv
 import requests
 from requests.adapters import HTTPAdapter, Retry

 # Pull settings from a local `.env` file (if any) into the environment.
 dotenv.load_dotenv()

 # API credentials. The variable names say CLOUDFRONT even though the requests
 # below go to api.cloudflare.com; the names are kept as-is because they are
 # what users already have set.
 token = os.environ.get('CLOUDFRONT_TOKEN')
 account_id = os.environ.get('CLOUDFRONT_ACCOUNT_ID')
 if not token or not account_id:
    sys.exit("Please set CLOUDFRONT_TOKEN and CLOUDFRONT_ACCOUNT_ID environment variables.")

 # Bearer-token header passed to the API requests made in this file.
 headers = {"Authorization": f"Bearer {token}"}

 # Shared session for all https:// requests, retrying up to 5 times with
 # backoff when the response status is 401.
 # NOTE(review): 401 is normally a permanent auth failure -- confirm that
 # retrying on it (rather than e.g. 429/5xx) is intentional.
 https = requests.Session()
 retries = Retry(total=5, backoff_factor=1, status_forcelist=[401])
 https.mount('https://', HTTPAdapter(max_retries=retries))

 def main() -> None:
    """
    Crawl the URL given on the command line and save the results.

    Starts a crawl job, waits for it to stop running, then writes the results
    to JSON files in the current working directory. Exits with a usage message
    when no URL argument is supplied.
    """
    if len(sys.argv) < 2:
        sys.exit(f"usage: {sys.argv[0]} <url>")

    url = sys.argv[1]
    job_id = start_crawl(url)
    print(f"created job {job_id}")

    # wait_for_job returns as soon as the job is no longer "running", which
    # is not necessarily success, so report the status it actually reached.
    status = wait_for_job(job_id)
    if status == "completed":
        print(f"job {job_id} completed")
    else:
        print(f"job {job_id} stopped with status {status}", file=sys.stderr)

    # Results are fetched regardless of status so partial output is not lost.
    write_results(job_id)


 def start_crawl(url: str):
    """
    Submit a new crawl job for `url` and return the `result` value of the
    API response (printed by the caller as the job id).

    Raises an HTTP error for an unsuccessful status code, and an Exception
    carrying the decoded response body when the API reports `success` as False.
    """
    endpoint = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl"
    payload = {"url": url, "formats": ["html", "markdown"], "limit": 5000}

    response = https.post(endpoint, headers=headers, json=payload)
    response.raise_for_status()

    body = response.json()
    if body["success"] is False:
        raise Exception(body)

    return body["result"]


 def wait_for_job(job_id: str, sleep_secs=60) -> str:
    """
    Poll the crawl job until its status is no longer "running".

    While the job is running, prints a progress line and sleeps `sleep_secs`
    seconds before polling again. Returns the first non-running status.
    """
    job_url = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}"

    while True:
        # limit=1 is passed on every poll; only the status and counters of the
        # response are read here.
        response = https.get(job_url, params={"limit": 1}, headers=headers)
        response.raise_for_status()

        job = response.json()["result"]
        status = job["status"]
        if status != "running":
            return status

        total, finished, skipped = (int(job[key]) for key in ("total", "finished", "skipped"))
        print(f"waiting for {job_id} to complete: total={total} finished={finished} skipped={skipped}")
        time.sleep(sleep_secs)


 def write_results(job_id: str) -> None:
    """
    Fetch the results of the crawl job and write them as JSON to the filesystem.

    Each response page is written to `{job_id}-NNN.json` in the current
    working directory, where NNN is the 1-based page number. Paging follows
    the `cursor` value in each response and stops when no cursor is returned
    or when the cursor stops advancing.

    Raises an HTTP error if any page request returns an unsuccessful status.
    """
    url = f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}"
    cursor = None
    count = 0

    while True:
        # The first request carries no cursor; later ones resume from the last.
        params = {} if cursor is None else {"cursor": cursor}

        resp = https.get(url, params=params, headers=headers)
        resp.raise_for_status()

        count += 1
        result = resp.json()
        path = pathlib.Path(f"{job_id}-{count:03}.json")
        # Use a context manager so the file is flushed and closed right away
        # instead of whenever the handle happens to be garbage collected.
        with path.open("w") as fh:
            json.dump(result, fh, indent=2)
        print(f"wrote {path}")

        new_cursor = result["result"].get("cursor")
        # A repeated cursor would loop forever, so treat it as the end too.
        if new_cursor is None or new_cursor == cursor:
            break
        cursor = new_cursor


 # Only start a crawl when run as a script, not when imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	# This is a simplistic Python program that will use Cloudflare's Crawl API [1]
	# to crawl a website, and then fetch the results to the filesystem once the job
	# is completed.
	#
	# You run it like so:
	#
	# uv run cloudflare_crawl.py https://example.com
	#
	# Note: you will need to set these in your environment or in a `.env` file for
	# the program to work:
	#
	# - CLOUDFRONT_TOKEN
	# - CLOUDFRONT_ACCOUNT_ID
	#
	# In order to create a token you will need to go to the Cloudflare dashboard and
	# create a token that has the `Browser Rendering:Edit` permission.
	#
	# [1] https://developers.cloudflare.com/browser-rendering/rest-api/crawl-endpoint/

	# /// script
	# dependencies = ["requests", "dotenv"]
	# ///

	import json
	import os
	import pathlib
	import sys
	import time

	import dotenv
	import requests
	from requests.adapters import HTTPAdapter, Retry

	# Pull settings from a local `.env` file (if any) into the environment.
	dotenv.load_dotenv()

	# API credentials. The variable names say CLOUDFRONT even though the requests
	# below go to api.cloudflare.com; the names are kept as-is because they are
	# what users already have set.
	token = os.environ.get('CLOUDFRONT_TOKEN')
	account_id = os.environ.get('CLOUDFRONT_ACCOUNT_ID')
	if not token or not account_id:
	    sys.exit("Please set CLOUDFRONT_TOKEN and CLOUDFRONT_ACCOUNT_ID environment variables.")

	# Bearer-token header passed to the API requests made in this file.
	headers = {"Authorization": f"Bearer {token}"}

	# Shared session for all https:// requests, retrying up to 5 times with
	# backoff when the response status is 401.
	# NOTE(review): 401 is normally a permanent auth failure -- confirm that
	# retrying on it (rather than e.g. 429/5xx) is intentional.
	https = requests.Session()
	retries = Retry(total=5, backoff_factor=1, status_forcelist=[401])
	https.mount('https://', HTTPAdapter(max_retries=retries))

	def main() -> None:
	    """
	    Crawl the URL given on the command line and save the results.

	    Starts a crawl job, waits for it to stop running, then writes the
	    results to JSON files in the current working directory. Exits with a
	    usage message when no URL argument is supplied.
	    """
	    if len(sys.argv) < 2:
	        sys.exit(f"usage: {sys.argv[0]} <url>")

	    url = sys.argv[1]
	    job_id = start_crawl(url)
	    print(f"created job {job_id}")

	    # wait_for_job returns as soon as the job is no longer "running", which
	    # is not necessarily success, so report the status it actually reached.
	    status = wait_for_job(job_id)
	    if status == "completed":
	        print(f"job {job_id} completed")
	    else:
	        print(f"job {job_id} stopped with status {status}", file=sys.stderr)

	    # Results are fetched regardless of status so partial output is not lost.
	    write_results(job_id)


	def start_crawl(url: str):
	    """
	    Submit a new crawl job for `url` and return the `result` value of the
	    API response (printed by the caller as the job id).

	    Raises an HTTP error for an unsuccessful status code, and an Exception
	    carrying the decoded response body when the API reports `success` as
	    False.
	    """
	    data = {
	        "url": url,
	        "formats": ["html", "markdown"],
	        "limit": 5000,
	    }

	    resp = https.post(
	        f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl",
	        headers=headers,
	        json=data,
	    )

	    resp.raise_for_status()

	    result = resp.json()

	    # A 2xx response can still carry an application-level failure flag.
	    if result["success"] is False:
	        raise Exception(result)

	    return result["result"]


	def wait_for_job(job_id: str, sleep_secs=60) -> str:
	    """
	    Wait for the crawl job to stop running and return the status.

	    While the job is running, prints a progress line and sleeps
	    `sleep_secs` seconds before polling again.
	    """
	    while True:
	        # limit=1 is passed on every poll; only the status and counters of
	        # the response are read here.
	        resp = https.get(
	            f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}",
	            params={"limit": 1},
	            headers=headers,
	        )

	        resp.raise_for_status()

	        result = resp.json()

	        if result["result"]["status"] != "running":
	            return result["result"]["status"]

	        total = int(result["result"]["total"])
	        finished = int(result["result"]["finished"])
	        skipped = int(result["result"]["skipped"])
	        print(f"waiting for {job_id} to complete: total={total} finished={finished} skipped={skipped}")
	        time.sleep(sleep_secs)


	def write_results(job_id: str) -> None:
	    """
	    Fetch the results of the crawl job and write them as JSON to the filesystem.

	    Each response page is written to `{job_id}-NNN.json` in the current
	    working directory, where NNN is the 1-based page number. Paging follows
	    the `cursor` value in each response and stops when no cursor is
	    returned or when the cursor stops advancing.

	    Raises an HTTP error if any page request returns an unsuccessful status.
	    """
	    cursor = None
	    count = 0

	    while True:
	        # The first request carries no cursor; later ones resume from the last.
	        params = {} if cursor is None else {"cursor": cursor}

	        resp = https.get(
	            f"https://api.cloudflare.com/client/v4/accounts/{account_id}/browser-rendering/crawl/{job_id}",
	            params=params,
	            headers=headers,
	        )

	        resp.raise_for_status()

	        count += 1
	        result = resp.json()
	        path = pathlib.Path(f"{job_id}-{count:03}.json")
	        # Use a context manager so the file is flushed and closed right away
	        # instead of whenever the handle happens to be garbage collected.
	        with path.open("w") as fh:
	            json.dump(result, fh, indent=2)
	        print(f"wrote {path}")

	        new_cursor = result["result"].get("cursor")
	        # A repeated cursor would loop forever, so treat it as the end too.
	        if new_cursor is None or new_cursor == cursor:
	            break
	        cursor = new_cursor


	# Only start a crawl when run as a script, not when imported as a module.
	if __name__ == "__main__":
	    main()
No results found