Mr0grog · November 14, 2025 22:37
diff --git a/README.md b/README.md
diff --git a/epatest-logs.py b/epatest-logs.py
 # /// script
 # dependencies = [
 #     "python-dateutil",
 # ]
 # ///
 from argparse import ArgumentParser
 from collections import defaultdict
 from dataclasses import dataclass, field
 from datetime import datetime, timedelta
 from dateutil.parser import parse as parse_timestamp
 import json
 from pathlib import Path
 import re
 from statistics import median


 @dataclass
 class PageInfo:
    url: str
    index: int
    logs: list[dict] = field(default_factory=list)
    start_time: datetime | None = None
    end_time: datetime | None = None

    @property
    def duration(self) -> timedelta:
        return self.end_time - self.start_time


 def loglines(filepath: str):
    """Yield each record of a JSON-lines log file as a parsed object.

    Blank and whitespace-only lines are skipped.

    Args:
        filepath: Path of the log file to read.

    Yields:
        The result of `json.loads()` for every non-empty line, in file order.

    Raises:
        json.JSONDecodeError: If a line is not valid JSON. The message names
            the file and the 1-based line number of the offending line.
    """
    # JSON text is UTF-8 by specification (RFC 8259). Decoding explicitly keeps
    # the result independent of the platform's locale encoding, which would
    # otherwise garble or reject non-ASCII characters on some systems.
    with open(filepath, encoding="utf-8") as file:
        for line_number, line in enumerate(file, start=1):
            clean_line = line.strip()
            if not clean_line:
                continue
            try:
                record = json.loads(clean_line)
            except json.JSONDecodeError as error:
                # Re-raise the same exception type with enough context to find
                # the bad line (e.g. a truncated final line of a killed crawl).
                raise json.JSONDecodeError(
                    f'{error.msg} in "{filepath}", log line {line_number}',
                    error.doc,
                    error.pos,
                ) from error
            yield record


 def parse_logfile(filepath: str) -> dict[str, PageInfo]:
    """Split one crawler log file into per-page records.

    A page opens on a `worker` / "Starting page" record and closes on a
    `pageStatus` / "Page Finished" record. Every record seen while a page is
    open (including the opening and closing ones) is attached to that page.
    Each record's "timestamp" value is replaced in place by its parsed form.

    Args:
        filepath: Path of the log file to read.

    Returns:
        Finished pages keyed by URL. A page that is still open when the file
        ends is not included.

    Raises:
        RuntimeError: If a page starts while another is still open, or a page
            finishes while none is open.
    """
    print(f'Parsing "{filepath}"')
    finished: dict[str, PageInfo] = {}
    open_page: PageInfo | None = None
    pages_started = 0

    for entry in loglines(filepath):
        when = parse_timestamp(entry["timestamp"])
        entry["timestamp"] = when
        context = entry["context"]

        # Opening record: begin tracking a new page.
        if context == "worker" and entry["message"] == "Starting page":
            new_url = entry["details"]["page"]
            if open_page:
                raise RuntimeError(f"Starting a new page when another has not finished! (old: '{open_page.url}', new: '{new_url}')")
            pages_started += 1
            open_page = PageInfo(url=new_url, index=pages_started, start_time=when)

        # Attach the record before handling a close, so that the closing
        # record itself is kept with the page it ends.
        if open_page:
            open_page.logs.append(entry)

        # Closing record: stamp the end time and file the page away.
        if context == "pageStatus" and entry["message"] == "Page Finished":
            done_url = entry["details"]["page"]
            if not open_page:
                raise RuntimeError(f'Tried to finish page while no page is open! (url: {done_url})')
            open_page.end_time = when
            finished[open_page.url] = open_page
            open_page = None

    return finished


 def parse_collection_logs(collection_path: Path) -> dict[str, PageInfo]:
    """Parse every `*.log` file in a collection's `logs` directory.

    Args:
        collection_path: Directory of one crawl collection; its log files are
            expected directly inside `<collection_path>/logs`.

    Returns:
        Pages from all log files merged into one dict keyed by URL. If the same
        URL appears in more than one file, the file that sorts last wins.

    Raises:
        OSError: If the `logs` directory does not exist or cannot be read.
    """
    logs_directory = collection_path / 'logs'
    results: dict[str, PageInfo] = {}
    # `iterdir()` yields entries in arbitrary order. Sorting makes the parse
    # order (and therefore which duplicate URL wins) reproducible between runs.
    for file in sorted(logs_directory.iterdir()):
        if file.suffix == '.log':
            results.update(parse_logfile(str(file)))

    return results


 # Command-line entry point: parse every collection under `--path`, group the
 # collections by crawler version + tag, and print per-page timing statistics.
 parser = ArgumentParser()
 parser.add_argument(
    "--path",
    type=Path,
    default=Path("./crawls/collections"),
    help="Path to collections directory to analyze",
 )
 args = parser.parse_args()

 collections_path: Path = args.path
 # collections_path = Path('./crawls/collections')
 if not collections_path.is_dir():
    parser.error(f'"{collections_path}" is not a directory')
 # Only sub-directories are collections; stray files (e.g. ".DS_Store") would
 # otherwise crash the log parser.
 collections = sorted(p.name for p in collections_path.iterdir() if p.is_dir())
 if not collections:
    parser.error(f'No collections found in "{collections_path}"')
 crawls = {c: parse_collection_logs(collections_path / c) for c in collections}

 print('')

 # UNGROUPED RESULTS
 # # Show in order that pages were encountered in the first collection. This
 # # hopefully compares like-to-like as much as possible when it comes to caching,
 # # since each collection will have crawled in a *similar* order.
 # for page in sorted(crawls[collections[0]].values(), key=lambda x: x.index):
 #     print(page.url)
 #     for collection in collections:
 #         info = crawls[collection][page.url]
 #         memory = next(
 #             (log["details"] for log in info.logs if log["context"] == "memoryStatus"),
 #             "?"
 #         )
 #         print(f'  {info.duration.total_seconds():5.1f}s | {info.index:>2} | {collection:<50} | Mem: {json.dumps(memory)}')

 # GROUPED RESULTS
 # Group crawls. Expects crawls like "epatest--<version>--<tag>--<index>",
 # e.g. "epatest--1-8-1--basic--1".
 # The version and tag get combined to name a group we'll combine.
 crawl_groups: dict[str, list[dict[str, PageInfo]]] = defaultdict(list)
 for collection in collections:
    # `\d+` for every version component so multi-digit major versions match too.
    version_match = re.search(r'(^|-)\d+-\d+-\d+-+.*$', collection)
    # A name without a recognizable version is grouped under its own name
    # (minus any trailing run index) instead of crashing on `None.group`.
    unprefixed = (version_match.group(0) if version_match else collection).strip('-')
    name = re.sub(r'-+\d+$', '', unprefixed)
    crawl_groups[name].append(crawls[collection])

 # The group order never changes, so sort once rather than once per page.
 sorted_groups = sorted(crawl_groups.items(), key=lambda x: x[0])

 # Show in order that pages were encountered in the first collection. This
 # hopefully compares like-to-like as much as possible when it comes to caching,
 # since each collection will have crawled in a *similar* order.
 print(f"  {'Median Time':>14} | {'Collections':<50} | Individual Times")
 print(f"  {'-' * 14} | {'-' * 50} | {'-' * 20}")
 for page in sorted(crawls[collections[0]].values(), key=lambda x: x.index):
    print(page.url)
    for group, members in sorted_groups:
        # A page may be absent from a crawl (e.g. it never finished there);
        # report on the crawls that do have it.
        page_infos = [c[page.url] for c in members if page.url in c]
        durations = [i.duration.total_seconds() for i in page_infos]
        if not durations:
            # `median()` raises on an empty list, so print a placeholder row.
            print('  ' + ' | '.join([
                f"{'n/a':>14}",
                f"{group:.<50}",
                "(page not found in this group)",
            ]))
            continue
        print('  ' + ' | '.join([
            f"{median(durations):5.1f}s +/-{max(durations) - min(durations):4.1f}",
            f"{group:.<50}",
            ", ".join(f"{d:5.1f}s" for d in durations),
            # ",".join(str(i.index) for i in page_infos),
        ]))

 # # Show a summary and relative timings of all the logs while loading a given
 # # page in a given crawl.
 # print('')
 # print(' total s | incremental s | log')
 # page = crawls["epatest-1-9-0--no-autoscroll-1"]["https://espanol.epa.gov/tri/encontrar-interpretar-y-utilizar-el-tri"]
 # last_time = page.start_time
 # for log in page.logs:
 #     timestamp = log["timestamp"]
 #     details = log["details"]
 #     url = details.get("frameUrl", details.get("url", details.get("page")))
 #     url_text = f"(url='{url}')" if url else ""
 #     print(f'{(timestamp - page.start_time).total_seconds():6.2f} | {(timestamp - last_time).total_seconds():5.2f} | {log['context']}: {log['message']} {url_text}')
 #     last_time = timestamp
diff --git a/epatest.sh b/epatest.sh
 #!/usr/bin/env bash
 #
 # Run one browsertrix-crawler crawl using ./epatest.yaml as its config.
 #
 # Usage: epatest.sh <browsertrix-crawler version> <crawl name>
 #   e.g. epatest.sh 1.9.0 basic-1
 #
 # The collection is named "epatest--<version>--<crawl name>" with dots
 # replaced by dashes. Run from the directory that contains epatest.yaml;
 # output goes to ./crawls/ (mounted as /crawls/ in the container).
 set -eo pipefail

 # BOTH arguments are required. Testing the concatenation "${1}${2}" would only
 # catch the case where neither was given.
 if [[ -z "${1}" || -z "${2}" ]]; then
    echo 'You must specify a browsertrix-crawler version and crawl name as arguments.' >&2
    echo 'For example, `epatest.sh 1.9.0 basic-1`' >&2
    exit 1
 fi

 VERSION="${1}"
 BROWSERTRIX_IMAGE="webrecorder/browsertrix-crawler:${VERSION}"
 # Dots in the version become dashes, e.g. "epatest--1-9-0--basic-1".
 COLLECTION="$(echo "epatest--${VERSION}--${2}" | tr '.' '-')"

 echo "COLLECTION='${COLLECTION}'"

 mkdir -p crawls
 # Both bind mounts use absolute host paths: older Docker releases reject (or
 # treat as a named volume) a `--volume` source that is not absolute.
 docker run \
    --rm \
    --attach stdout --attach stderr \
    --volume "${PWD}/epatest.yaml:/app/config.yaml" \
    --volume "${PWD}/crawls/:/crawls/" \
    "${BROWSERTRIX_IMAGE}" \
    crawl \
    --config /app/config.yaml \
    --collection "${COLLECTION}" \
    --saveState always \
    --logging debug,stats \
    --logLevel debug,info,warn,error,fatal
diff --git a/epatest.yaml b/epatest.yaml
 # browsertrix-crawler configuration used by epatest.sh, which mounts this file
 # into the container as /app/config.yaml.
 #
 # In-page behaviors the crawler runs on each page.
 behaviors:
 - autoscroll
 - autoplay
 - autofetch
 - siteSpecific
 # NOTE(review): presumably seconds (per browsertrix-crawler docs) — confirm.
 pageLoadTimeout: 120
 # NOTE(review): presumably a size in bytes (~8 GB) — confirm.
 rolloverSize: 8000000000
 saveStateHistory: 1
 # NOTE(review): "page" scope presumably limits the crawl to the seed URLs
 # themselves (no link following) — confirm against the crawler docs.
 scopeType: page
 # Pages to crawl.
 seeds:
 - https://hero.epa.gov/hero/index.cfm/search
 - https://hero.epa.gov/
 - https://hero.epa.gov/hero/index.cfm/content/transparency
 - https://hero.epa.gov/hero/index.cfm/content/assessment
 - https://hero.epa.gov/hero/index.cfm/content/basic
 - https://hero.epa.gov/hero/index.cfm/litbrowser/public
 - https://hero.epa.gov/hero/index.cfm/content/howto
 - https://espanol.epa.gov/cai/manual-informativo-sobre-el-radon
 - https://espanol.epa.gov/watersense/en-sequia
 - https://espanol.epa.gov/cai/indoor-airplus-mejores-ambientes-adentro-y-afuera
 - https://espanol.epa.gov/plomo/acciones-para-reducir-la-exposicion-al-plomo
 - https://espanol.epa.gov/espanol/terminos-e
 - https://espanol.epa.gov/espanol/explicacion-sobre-el-oxido-de-etileno-eto
 - https://espanol.epa.gov/espanol/forms/contactenos-sobre-el-sitio-epa-en-espanol-preocupaciones-ambientales-o-alguna
 - https://espanol.epa.gov/programa-fronterizo/calendario-del-programa-fronterizo
 - https://espanol.epa.gov/tri/mision-y-metas-del-programa-del-tri
 - https://espanol.epa.gov/tri/encontrar-interpretar-y-utilizar-el-tri
 - https://espanol.epa.gov/espanol/resumen-del-programa-de-wifia
 - https://espanol.epa.gov/espanol/conceptos-basicos-sobre-el-material-particulado-pm-por-sus-siglas-en-ingles
 - https://espanol.epa.gov/cai/proteja-su-vida-y-la-de-su-familia-evite-el-envenenamiento-con-monoxido-de-carbono
 warcinfo:
  # NOTE(review): "[email protected]" looks like a placeholder left by the page
  # scraper's e-mail obfuscation — restore the real contact address before use.
  operator: '"Environmental Data & Governance Initiative" <[email protected]>'
 # A single worker: the log-analysis script raises an error if one page starts
 # before the previous one has finished.
 workers: 1
diff --git a/run-many.sh b/run-many.sh
 #!/usr/bin/env bash
 #
 # Run the same named crawl several times against several browsertrix-crawler
 # versions, interleaving the versions within each round.
 #
 # Usage: run-many.sh <name> [runs] [version...]
 #   e.g. run-many.sh basic               # 3 runs each of 1.8.1 and 1.9.0
 #        run-many.sh basic 5 1.9.0 1.10.0
 #
 # Each crawl is started as `./epatest.sh <version> "<name>--<run number>"`,
 # so this must be run from the directory that contains epatest.sh.
 set -eo pipefail

 NAME="${1}"
 if [[ -z "${NAME}" ]]; then
    echo 'You must specify a name, e.g. `run-many.sh basic`'
    exit 1
 fi

 # Optional second argument: number of rounds (defaults to the original 3).
 RUNS="${2:-3}"
 if [[ ! "${RUNS}" =~ ^[1-9][0-9]*$ ]]; then
    echo "The number of runs must be a positive integer (got '${RUNS}')." >&2
    exit 1
 fi

 # Optional remaining arguments: versions to compare (defaults to the original
 # pair).
 if [[ $# -gt 2 ]]; then
    VERSIONS=("${@:3}")
 else
    VERSIONS=(1.8.1 1.9.0)
 fi

 for i in $(seq 1 "${RUNS}"); do
    for version in "${VERSIONS[@]}"; do
        echo '--------------------------------------------------------------------'
        echo "Running in ${version}... (run #${i})"
        ./epatest.sh "${version}" "${NAME}--${i}"
        echo ''
    done
    # Extra blank line between rounds.
    echo ''
 done
	# /// script
	# dependencies = [
	# "python-dateutil",
	# ]
	# ///
	from argparse import ArgumentParser
	from collections import defaultdict
	from dataclasses import dataclass, field
	from datetime import datetime, timedelta
	from dateutil.parser import parse as parse_timestamp
	import json
	from pathlib import Path
	import re
	from statistics import median


	@dataclass
	class PageInfo:
	url: str
	index: int
	logs: list[dict] = field(default_factory=list)
	start_time: datetime \| None = None
	end_time: datetime \| None = None

	@property
	def duration(self) -> timedelta:
	return self.end_time - self.start_time


	def loglines(filepath: str):
	    """Yield each record of a JSON-lines log file as a parsed object.

	    Blank and whitespace-only lines are skipped.

	    Args:
	        filepath: Path of the log file to read.

	    Yields:
	        The result of `json.loads()` for every non-empty line, in file order.

	    Raises:
	        json.JSONDecodeError: If a line is not valid JSON. The message names
	            the file and the 1-based line number of the offending line.
	    """
	    # JSON text is UTF-8 by specification (RFC 8259). Decoding explicitly
	    # keeps the result independent of the platform's locale encoding.
	    with open(filepath, encoding="utf-8") as file:
	        for line_number, line in enumerate(file, start=1):
	            clean_line = line.strip()
	            if not clean_line:
	                continue
	            try:
	                record = json.loads(clean_line)
	            except json.JSONDecodeError as error:
	                # Same exception type, plus enough context to find the line.
	                raise json.JSONDecodeError(
	                    f'{error.msg} in "{filepath}", log line {line_number}',
	                    error.doc,
	                    error.pos,
	                ) from error
	            yield record


	def parse_logfile(filepath: str) -> dict[str, PageInfo]:
	    """Split one crawler log file into per-page records.

	    A page opens on a `worker` / "Starting page" record and closes on a
	    `pageStatus` / "Page Finished" record. Every record seen while a page is
	    open (including the opening and closing ones) is attached to that page.
	    Each record's "timestamp" value is replaced in place by its parsed form.

	    Args:
	        filepath: Path of the log file to read.

	    Returns:
	        Finished pages keyed by URL. A page that is still open when the file
	        ends is not included.

	    Raises:
	        RuntimeError: If a page starts while another is still open, or a
	            page finishes while none is open.
	    """
	    print(f'Parsing "{filepath}"')
	    index = 0
	    current_page: PageInfo | None = None
	    pages: dict[str, PageInfo] = {}
	    for log in loglines(filepath):
	        timestamp = parse_timestamp(log["timestamp"])
	        log["timestamp"] = timestamp
	        if log["context"] == "worker" and log["message"] == "Starting page":
	            url = log["details"]["page"]
	            if current_page:
	                raise RuntimeError(f"Starting a new page when another has not finished! (old: '{current_page.url}', new: '{url}')")
	            index += 1
	            current_page = PageInfo(url=url, index=index, start_time=timestamp)

	        # Attach the record before handling a close, so that the closing
	        # record itself is kept with the page it ends.
	        if current_page:
	            current_page.logs.append(log)

	        if log["context"] == "pageStatus" and log["message"] == "Page Finished":
	            url = log["details"]["page"]
	            if not current_page:
	                raise RuntimeError(f'Tried to finish page while no page is open! (url: {url})')
	            current_page.end_time = timestamp
	            pages[current_page.url] = current_page
	            current_page = None

	    return pages


	def parse_collection_logs(collection_path: Path) -> dict[str, PageInfo]:
	    """Parse every `*.log` file in a collection's `logs` directory.

	    Args:
	        collection_path: Directory of one crawl collection; its log files
	            are expected directly inside `<collection_path>/logs`.

	    Returns:
	        Pages from all log files merged into one dict keyed by URL. If the
	        same URL appears in more than one file, the file that sorts last
	        wins.

	    Raises:
	        OSError: If the `logs` directory does not exist or cannot be read.
	    """
	    logs_directory = collection_path / 'logs'
	    results: dict[str, PageInfo] = {}
	    # `iterdir()` yields entries in arbitrary order. Sorting makes the parse
	    # order (and which duplicate URL wins) reproducible between runs.
	    for file in sorted(logs_directory.iterdir()):
	        if file.suffix == '.log':
	            results.update(parse_logfile(str(file)))

	    return results


	# Command-line entry point: parse every collection under `--path`, group the
	# collections by crawler version + tag, and print per-page timing statistics.
	parser = ArgumentParser()
	parser.add_argument(
	    "--path",
	    type=Path,
	    default=Path("./crawls/collections"),
	    help="Path to collections directory to analyze",
	)
	args = parser.parse_args()

	collections_path: Path = args.path
	# collections_path = Path('./crawls/collections')
	if not collections_path.is_dir():
	    parser.error(f'"{collections_path}" is not a directory')
	# Only sub-directories are collections; stray files (e.g. ".DS_Store") would
	# otherwise crash the log parser.
	collections = sorted(p.name for p in collections_path.iterdir() if p.is_dir())
	if not collections:
	    parser.error(f'No collections found in "{collections_path}"')
	crawls = {c: parse_collection_logs(collections_path / c) for c in collections}

	print('')

	# UNGROUPED RESULTS
	# # Show in order that pages were encountered in the first collection. This
	# # hopefully compares like-to-like as much as possible when it comes to caching,
	# # since each collection will have crawled in a *similar* order.
	# for page in sorted(crawls[collections[0]].values(), key=lambda x: x.index):
	#     print(page.url)
	#     for collection in collections:
	#         info = crawls[collection][page.url]
	#         memory = next(
	#             (log["details"] for log in info.logs if log["context"] == "memoryStatus"),
	#             "?"
	#         )
	#         print(f'  {info.duration.total_seconds():5.1f}s | {info.index:>2} | {collection:<50} | Mem: {json.dumps(memory)}')

	# GROUPED RESULTS
	# Group crawls. Expects crawls like "epatest--<version>--<tag>--<index>",
	# e.g. "epatest--1-8-1--basic--1".
	# The version and tag get combined to name a group we'll combine.
	crawl_groups: dict[str, list[dict[str, PageInfo]]] = defaultdict(list)
	for collection in collections:
	    # `\d+` for every version component so multi-digit major versions match.
	    version_match = re.search(r'(^|-)\d+-\d+-\d+-+.*$', collection)
	    # A name without a recognizable version is grouped under its own name
	    # (minus any trailing run index) instead of crashing on `None.group`.
	    unprefixed = (version_match.group(0) if version_match else collection).strip('-')
	    name = re.sub(r'-+\d+$', '', unprefixed)
	    crawl_groups[name].append(crawls[collection])

	# The group order never changes, so sort once rather than once per page.
	sorted_groups = sorted(crawl_groups.items(), key=lambda x: x[0])

	# Show in order that pages were encountered in the first collection. This
	# hopefully compares like-to-like as much as possible when it comes to caching,
	# since each collection will have crawled in a *similar* order.
	print(f"  {'Median Time':>14} | {'Collections':<50} | Individual Times")
	print(f"  {'-' * 14} | {'-' * 50} | {'-' * 20}")
	for page in sorted(crawls[collections[0]].values(), key=lambda x: x.index):
	    print(page.url)
	    for group, members in sorted_groups:
	        # A page may be absent from a crawl (e.g. it never finished there);
	        # report on the crawls that do have it.
	        page_infos = [c[page.url] for c in members if page.url in c]
	        durations = [i.duration.total_seconds() for i in page_infos]
	        if not durations:
	            # `median()` raises on an empty list, so print a placeholder row.
	            print('  ' + ' | '.join([
	                f"{'n/a':>14}",
	                f"{group:.<50}",
	                "(page not found in this group)",
	            ]))
	            continue
	        print('  ' + ' | '.join([
	            f"{median(durations):5.1f}s +/-{max(durations) - min(durations):4.1f}",
	            f"{group:.<50}",
	            ", ".join(f"{d:5.1f}s" for d in durations),
	            # ",".join(str(i.index) for i in page_infos),
	        ]))

	# # Show a summary and relative timings of all the logs while loading a given
	# # page in a given crawl.
	# print('')
	# print(' total s | incremental s | log')
	# page = crawls["epatest-1-9-0--no-autoscroll-1"]["https://espanol.epa.gov/tri/encontrar-interpretar-y-utilizar-el-tri"]
	# last_time = page.start_time
	# for log in page.logs:
	# timestamp = log["timestamp"]
	# details = log["details"]
	# url = details.get("frameUrl", details.get("url", details.get("page")))
	# url_text = f"(url='{url}')" if url else ""
	# print(f'{(timestamp - page.start_time).total_seconds():6.2f} | {(timestamp - last_time).total_seconds():5.2f} | {log['context']}: {log['message']} {url_text}')
	# last_time = timestamp
	#!/usr/bin/env bash
	#
	# Run one browsertrix-crawler crawl using ./epatest.yaml as its config.
	#
	# Usage: epatest.sh <browsertrix-crawler version> <crawl name>
	#   e.g. epatest.sh 1.9.0 basic-1
	#
	# The collection is named "epatest--<version>--<crawl name>" with dots
	# replaced by dashes. Run from the directory that contains epatest.yaml;
	# output goes to ./crawls/ (mounted as /crawls/ in the container).
	set -eo pipefail

	# BOTH arguments are required. Testing the concatenation "${1}${2}" would
	# only catch the case where neither was given.
	if [[ -z "${1}" || -z "${2}" ]]; then
	    echo 'You must specify a browsertrix-crawler version and crawl name as arguments.' >&2
	    echo 'For example, `epatest.sh 1.9.0 basic-1`' >&2
	    exit 1
	fi

	VERSION="${1}"
	BROWSERTRIX_IMAGE="webrecorder/browsertrix-crawler:${VERSION}"
	# Dots in the version become dashes, e.g. "epatest--1-9-0--basic-1".
	COLLECTION="$(echo "epatest--${VERSION}--${2}" | tr '.' '-')"

	echo "COLLECTION='${COLLECTION}'"

	mkdir -p crawls
	# Both bind mounts use absolute host paths: older Docker releases reject (or
	# treat as a named volume) a `--volume` source that is not absolute.
	docker run \
	    --rm \
	    --attach stdout --attach stderr \
	    --volume "${PWD}/epatest.yaml:/app/config.yaml" \
	    --volume "${PWD}/crawls/:/crawls/" \
	    "${BROWSERTRIX_IMAGE}" \
	    crawl \
	    --config /app/config.yaml \
	    --collection "${COLLECTION}" \
	    --saveState always \
	    --logging debug,stats \
	    --logLevel debug,info,warn,error,fatal
	# browsertrix-crawler configuration used by epatest.sh, which mounts this
	# file into the container as /app/config.yaml.
	#
	# In-page behaviors the crawler runs on each page.
	behaviors:
	- autoscroll
	- autoplay
	- autofetch
	- siteSpecific
	# NOTE(review): presumably seconds (per browsertrix-crawler docs) — confirm.
	pageLoadTimeout: 120
	# NOTE(review): presumably a size in bytes (~8 GB) — confirm.
	rolloverSize: 8000000000
	saveStateHistory: 1
	scopeType: page
	# Pages to crawl.
	seeds:
	- https://hero.epa.gov/hero/index.cfm/search
	- https://hero.epa.gov/
	- https://hero.epa.gov/hero/index.cfm/content/transparency
	- https://hero.epa.gov/hero/index.cfm/content/assessment
	- https://hero.epa.gov/hero/index.cfm/content/basic
	- https://hero.epa.gov/hero/index.cfm/litbrowser/public
	- https://hero.epa.gov/hero/index.cfm/content/howto
	- https://espanol.epa.gov/cai/manual-informativo-sobre-el-radon
	- https://espanol.epa.gov/watersense/en-sequia
	- https://espanol.epa.gov/cai/indoor-airplus-mejores-ambientes-adentro-y-afuera
	- https://espanol.epa.gov/plomo/acciones-para-reducir-la-exposicion-al-plomo
	- https://espanol.epa.gov/espanol/terminos-e
	- https://espanol.epa.gov/espanol/explicacion-sobre-el-oxido-de-etileno-eto
	- https://espanol.epa.gov/espanol/forms/contactenos-sobre-el-sitio-epa-en-espanol-preocupaciones-ambientales-o-alguna
	- https://espanol.epa.gov/programa-fronterizo/calendario-del-programa-fronterizo
	- https://espanol.epa.gov/tri/mision-y-metas-del-programa-del-tri
	- https://espanol.epa.gov/tri/encontrar-interpretar-y-utilizar-el-tri
	- https://espanol.epa.gov/espanol/resumen-del-programa-de-wifia
	- https://espanol.epa.gov/espanol/conceptos-basicos-sobre-el-material-particulado-pm-por-sus-siglas-en-ingles
	- https://espanol.epa.gov/cai/proteja-su-vida-y-la-de-su-familia-evite-el-envenenamiento-con-monoxido-de-carbono
	# `operator` must be nested under `warcinfo`; without the indentation it
	# becomes an unrelated top-level key and `warcinfo` is left empty.
	warcinfo:
	  # NOTE(review): "[email protected]" looks like a placeholder left by the
	  # page scraper's e-mail obfuscation — restore the real address before use.
	  operator: '"Environmental Data & Governance Initiative" <[email protected]>'
	# A single worker: the log-analysis script raises an error if one page
	# starts before the previous one has finished.
	workers: 1
	#!/usr/bin/env bash
	#
	# Run the same named crawl several times against several browsertrix-crawler
	# versions, interleaving the versions within each round.
	#
	# Usage: run-many.sh <name> [runs] [version...]
	#   e.g. run-many.sh basic               # 3 runs each of 1.8.1 and 1.9.0
	#        run-many.sh basic 5 1.9.0 1.10.0
	#
	# Each crawl is started as `./epatest.sh <version> "<name>--<run number>"`,
	# so this must be run from the directory that contains epatest.sh.
	set -eo pipefail

	NAME="${1}"
	if [[ -z "${NAME}" ]]; then
	    echo 'You must specify a name, e.g. `run-many.sh basic`'
	    exit 1
	fi

	# Optional second argument: number of rounds (defaults to the original 3).
	RUNS="${2:-3}"
	if [[ ! "${RUNS}" =~ ^[1-9][0-9]*$ ]]; then
	    echo "The number of runs must be a positive integer (got '${RUNS}')." >&2
	    exit 1
	fi

	# Optional remaining arguments: versions to compare (defaults to the
	# original pair).
	if [[ $# -gt 2 ]]; then
	    VERSIONS=("${@:3}")
	else
	    VERSIONS=(1.8.1 1.9.0)
	fi

	for i in $(seq 1 "${RUNS}"); do
	    for version in "${VERSIONS[@]}"; do
	        echo '--------------------------------------------------------------------'
	        echo "Running in ${version}... (run #${i})"
	        ./epatest.sh "${version}" "${NAME}--${i}"
	        echo ''
	    done
	    # Extra blank line between rounds.
	    echo ''
	done