Created
December 7, 2025 17:02
-
-
Save mgaitan/0c4a49a16c825c1993a3ec3064035718 to your computer and use it in GitHub Desktop.
tl-parser vs pyquery vs bs4 with html5 and lxml parsers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.12" | |
| # dependencies = [ | |
| # "tl-parser==0.7.11", | |
| # "beautifulsoup4", | |
| # "lxml", | |
| # "pyquery", | |
| # "matplotlib", | |
| # ] | |
| # | |
| # [tool.uv] | |
| # #find-links = ["https://github.com/mgaitan/tl-parser/releases/expanded_assets/python-v0.7.11"] | |
| # /// | |
"""
Benchmark tl-parser against BeautifulSoup using a shared HTML sample.
You can run it with uv and the GitHub release wheels via:
uv run \
    --find-links=https://github.com/mgaitan/tl-parser/releases/expanded_assets/python-v0.7.11 \
    --with tl-parser==0.7.11 \
    scripts/bench_tl_vs_bs4.py
"""
# NOTE: the docstring now precedes the __future__ import so it is the real
# module docstring (previously it was a no-op string expression).
from __future__ import annotations

from collections import OrderedDict
from contextlib import nullcontext
from dataclasses import dataclass
from pathlib import Path
from time import perf_counter
from typing import Any, Callable, Dict, List, Sequence

import matplotlib

# Select a non-interactive backend; this must run exactly once, BEFORE the
# first `matplotlib.pyplot` import.  (A second, redundant call after the
# pyplot import was removed.)
matplotlib.use("Agg")

import matplotlib.pyplot as plt  # noqa: E402 # isort: skip
from bs4 import BeautifulSoup  # noqa: E402 # isort: skip
from pyquery import PyQuery as pq  # noqa: E402 # isort: skip
import tl  # noqa: E402 # isort: skip
# Sample document from the BeautifulSoup tutorial, repeated below to build a
# realistically sized benchmark input.
HTML_SNIPPET = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>
</html>
""".strip()

# How many copies of the snippet make up the benchmark document.
# (Duplicate assignments removed; the effective value was the last one, 150.)
HTML_REPETITIONS = 150

HTML_DOCUMENT = (
    "<!doctype html>\n" + "\n".join(HTML_SNIPPET for _ in range(HTML_REPETITIONS))
)

# Iteration counts used for averaging: full-document parses are expensive,
# individual DOM operations are cheap, hence the different counts.
# (Duplicate assignments removed; effective values were 150 and 1500.)
PARSE_ITERATIONS = 150
OP_ITERATIONS = 1500

# The chart is written next to this script.
PLOT_PATH = Path(__file__).with_name("tl_vs_bs4_benchmark.png")
# Module-level sink that keeps each benchmark result reachable so the timed
# operations cannot be skipped or garbage-collected mid-measurement.
_SINK: Any = None
def consume(value: Any) -> None:
    """Store *value* in the module-level sink (defeats dead-code elimination)."""
    global _SINK
    _SINK = value
@dataclass
class Library:
    """One HTML-parsing library under benchmark: how to parse, what to time."""

    # Human-readable label used in the results table and the plot legend.
    name: str
    # Parses an HTML string into the library's own DOM representation.
    parse: Callable[[str], Any]
    # Named micro-benchmark operations, each run against a pre-parsed DOM.
    operations: Dict[str, Callable[[Any], None]]
def build_tl_library() -> Library:
    """Build the benchmark adapter for the tl-parser library."""

    def _parse(html: str) -> Any:
        return tl.parse(html)

    def _select(dom: Any, selector: str) -> List[Any]:
        # Thin wrapper so all CSS queries go through one call site.
        return dom.query_selector(selector)

    def _title_text(dom: Any) -> None:
        consume(_first_text(_select(dom, "title")))

    def _class_lookup(dom: Any) -> None:
        consume(len(dom.get_elements_by_class_name("sister")))

    def _id_lookup(dom: Any) -> None:
        node = dom.get_element_by_id("link3")
        # `node` may be None when the id is absent; short-circuit mirrors that.
        consume(node and node.inner_text())

    def _css_query(dom: Any) -> None:
        consume(len(_select(dom, "p.story a.sister")))

    operations = OrderedDict(
        [
            ("title_text", _title_text),
            ("class_lookup", _class_lookup),
            ("id_lookup", _id_lookup),
            ("css_query", _css_query),
        ]
    )
    return Library("tl.parse", _parse, operations)
def build_bs4_library(name: str, parser: str) -> Library:
    """Build a benchmark adapter for BeautifulSoup with the given *parser*."""

    def _parse(html: str) -> Any:
        return BeautifulSoup(html, parser)

    def _title_text(soup: Any) -> None:
        consume(soup.title.get_text(strip=True))

    def _class_lookup(soup: Any) -> None:
        consume(len(soup.find_all(class_="sister")))

    def _id_lookup(soup: Any) -> None:
        node = soup.find(id="link3")
        # `find` returns None on a miss; short-circuit mirrors the original.
        consume(node and node.get_text(strip=True))

    def _css_query(soup: Any) -> None:
        consume(len(soup.select("p.story a.sister")))

    operations = OrderedDict(
        [
            ("title_text", _title_text),
            ("class_lookup", _class_lookup),
            ("id_lookup", _id_lookup),
            ("css_query", _css_query),
        ]
    )
    return Library(name, _parse, operations)
# All contenders in the benchmark; every entry exposes the same four
# operations so results can be compared metric-by-metric.
LIBRARIES = [
    build_tl_library(),
    build_bs4_library("BeautifulSoup (html.parser)", "html.parser"),
    build_bs4_library("BeautifulSoup (lxml)", "lxml"),
    # PyQuery uses lxml under the hood and offers a jQuery-like API.
    Library(
        "PyQuery",
        lambda html: pq(html),
        OrderedDict(
            [
                ("title_text", lambda doc: consume(doc("title").text())),
                ("class_lookup", lambda doc: consume(len(doc(".sister")))),
                ("id_lookup", lambda doc: consume(doc("#link3").text())),
                ("css_query", lambda doc: consume(len(doc("p.story a.sister")))),
            ]
        ),
    ),
]
def time_callable(fn: Callable[[], None], iterations: int) -> float:
    """Return the average wall-clock seconds per call of *fn* over *iterations* runs."""
    begin = perf_counter()
    for _ in range(iterations):
        fn()
    elapsed = perf_counter() - begin
    return elapsed / iterations
def benchmark() -> Dict[str, Dict[str, float]]:
    """Time parsing and every DOM operation for each library in LIBRARIES.

    Returns a mapping of metric name -> {library name -> seconds per call}.
    """
    # One row per metric: "parse" first, then the shared operation names.
    metric_names = ["parse", *LIBRARIES[0].operations]
    metrics: Dict[str, Dict[str, float]] = OrderedDict(
        (name, {}) for name in metric_names
    )
    for library in LIBRARIES:
        metrics["parse"][library.name] = time_callable(
            lambda: library.parse(HTML_DOCUMENT), PARSE_ITERATIONS
        )
        # Operations are timed against a single pre-parsed document.
        dom = library.parse(HTML_DOCUMENT)
        for op_name, op in library.operations.items():
            # Bind op/dom as defaults to avoid late-binding closure pitfalls.
            metrics[op_name][library.name] = time_callable(
                lambda op=op, dom=dom: op(dom), OP_ITERATIONS
            )
    return metrics
def print_results(results: Dict[str, Dict[str, float]]) -> None:
    """Print an aligned text table of the timings, one row per metric."""
    names = [lib.name for lib in LIBRARIES]
    header = "metric".ljust(16) + "".join(name.rjust(26) for name in names)
    print(header)
    print("-" * len(header))
    for metric, timings in results.items():
        # Seconds -> milliseconds, right-aligned under each library column.
        cells = [f"{timings[name] * 1000:>25.3f} ms" for name in names]
        print(metric.ljust(16) + "".join(cells))
def plot_results(results: Dict[str, Dict[str, float]]) -> None:
    """Render a grouped bar chart of the timings and save it to PLOT_PATH.

    Tries the xkcd comic style first; if that fails (e.g. the required font
    is missing), falls back to the default style.

    Fix: the original contained two merged copies of the plotting logic and
    drew, saved, and announced the chart twice — once unconditionally under
    xkcd and once via the fallback helper.  Only the fallback version is kept.
    """
    metrics = list(results.keys())
    libs = [lib.name for lib in LIBRARIES]
    bar_width = 0.25
    # Shift each library's bars so the group is centered on the metric tick.
    center_offset = (len(libs) - 1) / 2

    def render(use_xkcd: bool) -> None:
        # One-shot draw-and-save; style is applied via a context manager so
        # the global matplotlib state is restored afterwards.
        context = plt.xkcd() if use_xkcd else nullcontext()
        with context:
            fig, ax = plt.subplots(figsize=(10, 6))
            for lib_index, lib in enumerate(libs):
                positions = [
                    metric_index + (lib_index - center_offset) * bar_width
                    for metric_index in range(len(metrics))
                ]
                values = [results[metric][lib] * 1000 for metric in metrics]
                ax.bar(positions, values, width=bar_width, label=lib)
            ax.set_xticks(range(len(metrics)))
            ax.set_xticklabels(
                [metric.replace("_", " ") for metric in metrics], rotation=15
            )
            ax.set_ylabel("ms per operation (lower is better)")
            ax.set_title("tl.parse vs BeautifulSoup benchmarks")
            ax.legend()
            ax.grid(alpha=0.2, axis="y")
            fig.tight_layout()
            fig.savefig(PLOT_PATH, dpi=200)
            # Release the figure so repeated calls don't accumulate memory.
            plt.close(fig)

    try:
        render(True)
    except Exception as exc:
        # xkcd style needs optional fonts; retry plainly rather than fail.
        print(f"xkcd style failed ({exc}); retrying without it.")
        render(False)
    print(f"Saved plot to {PLOT_PATH}")
def main() -> None:
    """Entry point: run the benchmark, print the table, save the chart."""
    timings = benchmark()
    print_results(timings)
    plot_results(timings)
| def _first_text(elements: Sequence[Any]) -> str: | |
| if not elements: | |
| return "" | |
| node = elements[0] | |
| return node.inner_text() if hasattr(node, "inner_text") else str(node) | |
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment