Skip to content

Instantly share code, notes, and snippets.

@mgaitan
Created December 7, 2025 17:02
Show Gist options
  • Select an option

  • Save mgaitan/0c4a49a16c825c1993a3ec3064035718 to your computer and use it in GitHub Desktop.

Select an option

Save mgaitan/0c4a49a16c825c1993a3ec3064035718 to your computer and use it in GitHub Desktop.
tl-parser vs pyquery vs bs4 with html5 and lxml parsers
# /// script
# requires-python = ">=3.12"
# dependencies = [
# "tl-parser==0.7.11",
# "beautifulsoup4",
# "lxml",
# "pyquery",
# "matplotlib",
# ]
#
# [tool.uv]
# #find-links = ["https://github.com/mgaitan/tl-parser/releases/expanded_assets/python-v0.7.11"]
# ///
from __future__ import annotations
"""
Benchmark tl-parser against BeautifulSoup (html.parser and lxml backends) and PyQuery using a shared HTML sample.
You can run it with uv and the GitHub release wheels via:
uv run \
--find-links=https://github.com/mgaitan/tl-parser/releases/expanded_assets/python-v0.7.11 \
--with tl-parser==0.7.11 \
scripts/bench_tl_vs_bs4.py
"""
from collections import OrderedDict
from dataclasses import dataclass
from contextlib import nullcontext
from pathlib import Path
from time import perf_counter
from typing import Any, Callable, Dict, List, Sequence
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt # noqa: E402 # isort: skip
from bs4 import BeautifulSoup # noqa: E402 # isort: skip
from pyquery import PyQuery as pq # noqa: E402 # isort: skip
matplotlib.use("Agg")
import tl # noqa: E402 # isort: skip
# Sample HTML page used as the benchmark input; HTML_DOCUMENT is built by
# repeating this text. It contains one <title>, three <a class="sister">
# links (ids link1..link3) and two <p class="story"> paragraphs, which are the
# targets of the benchmarked queries.
HTML_SNIPPET = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>
</html>
""".strip()
# Number of copies of HTML_SNIPPET concatenated into the benchmark document.
HTML_REPETITIONS = 150

# Full benchmark input: a doctype line followed by the repeated sample markup,
# one copy per line.
HTML_DOCUMENT = "<!doctype html>\n" + "\n".join(
    HTML_SNIPPET for _ in range(HTML_REPETITIONS)
)

# How many times each library parses HTML_DOCUMENT when timing "parse".
PARSE_ITERATIONS = 150

# How many times each query operation runs against an already-parsed document.
OP_ITERATIONS = 1500

# The chart is written next to this script.
PLOT_PATH = Path(__file__).with_name("tl_vs_bs4_benchmark.png")
# Module-level slot that always holds the most recent value passed to consume().
_SINK: Any = None


def consume(value: Any) -> None:
    """Store *value* in the module-level ``_SINK``, replacing the previous one.

    Every benchmarked operation hands its result to this function --
    presumably so the computed value is not simply thrown away.
    """
    global _SINK
    _SINK = value
@dataclass
class Library:
    """One benchmark contestant: a parser plus the queries timed against it."""

    # Label used as the key in the result mappings and as the table column /
    # chart legend entry.
    name: str
    # Turns an HTML string into the library's own document object.
    parse: Callable[[str], Any]
    # Metric name -> callable that runs one query on a parsed document.
    operations: Dict[str, Callable[[Any], None]]
def build_tl_library() -> Library:
    """Assemble the ``Library`` entry that benchmarks ``tl``.

    Returns:
        A ``Library`` named ``"tl.parse"`` whose operations are registered in
        the order title_text, class_lookup, id_lookup, css_query. Each
        operation passes its result to ``consume``.
    """

    def parse_tl(html: str) -> Any:
        return tl.parse(html)

    def select(dom: Any, css: str) -> List[Any]:
        # Thin wrapper so the CSS-based operations read uniformly.
        return dom.query_selector(css)

    def title_text(dom: Any) -> None:
        consume(_first_text(select(dom, "title")))

    def class_lookup(dom: Any) -> None:
        consume(len(dom.get_elements_by_class_name("sister")))

    def id_lookup(dom: Any) -> None:
        node = dom.get_element_by_id("link3")
        # A falsy lookup result is passed through unchanged; otherwise the
        # node's text is consumed.
        consume(node and node.inner_text())

    def css_query(dom: Any) -> None:
        consume(len(select(dom, "p.story a.sister")))

    operations = OrderedDict()
    operations["title_text"] = title_text
    operations["class_lookup"] = class_lookup
    operations["id_lookup"] = id_lookup
    operations["css_query"] = css_query
    return Library("tl.parse", parse_tl, operations)
def build_bs4_library(name: str, parser: str) -> Library:
    """Assemble a ``Library`` entry that benchmarks BeautifulSoup.

    Args:
        name: Label for this entry in tables and charts.
        parser: Parser identifier forwarded as the second argument of
            ``BeautifulSoup(html, parser)``.

    Returns:
        A ``Library`` whose operations are registered in the order
        title_text, class_lookup, id_lookup, css_query. Each operation passes
        its result to ``consume``.
    """

    def parse_bs4(html: str) -> Any:
        return BeautifulSoup(html, parser)

    def title_text(soup: Any) -> None:
        consume(soup.title.get_text(strip=True))

    def class_lookup(soup: Any) -> None:
        consume(len(soup.find_all(class_="sister")))

    def id_lookup(soup: Any) -> None:
        anchor = soup.find(id="link3")
        # A falsy lookup result is passed through unchanged; otherwise the
        # stripped text of the element is consumed.
        consume(anchor and anchor.get_text(strip=True))

    def css_query(soup: Any) -> None:
        consume(len(soup.select("p.story a.sister")))

    # Each function's name doubles as its metric key.
    operations = OrderedDict(
        (fn.__name__, fn)
        for fn in (title_text, class_lookup, id_lookup, css_query)
    )
    return Library(name, parse_bs4, operations)
# Every contestant, in the order used for table columns and chart bars.
# The first entry's operations define the set of metrics that get reported.
LIBRARIES = [
    build_tl_library(),
    build_bs4_library("BeautifulSoup (html.parser)", "html.parser"),
    build_bs4_library("BeautifulSoup (lxml)", "lxml"),
    # PyQuery uses lxml under the hood and offers a jQuery-like API.
    Library(
        name="PyQuery",
        parse=lambda html: pq(html),
        operations=OrderedDict(
            title_text=lambda doc: consume(doc("title").text()),
            class_lookup=lambda doc: consume(len(doc(".sister"))),
            id_lookup=lambda doc: consume(doc("#link3").text()),
            css_query=lambda doc: consume(len(doc("p.story a.sister"))),
        ),
    ),
]
def time_callable(fn: Callable[[], None], iterations: int) -> float:
    """Return the average wall-clock seconds taken by one call of *fn*.

    Args:
        fn: Zero-argument callable to time; its return value is ignored.
        iterations: Number of back-to-back calls to average over. Must be
            at least 1.

    Returns:
        Total elapsed ``perf_counter`` time divided by *iterations*.

    Raises:
        ValueError: If *iterations* is less than 1. Without this check a
            count of 0 would fail with a bare ``ZeroDivisionError`` and a
            negative count would yield a meaningless negative average.
    """
    if iterations < 1:
        raise ValueError(f"iterations must be >= 1, got {iterations}")
    start = perf_counter()
    for _ in range(iterations):
        fn()
    return (perf_counter() - start) / iterations
def benchmark() -> Dict[str, Dict[str, float]]:
    """Time parsing and every query operation for each entry in ``LIBRARIES``.

    Returns:
        An ordered mapping ``metric name -> {library name -> seconds per
        call}``. The first metric is ``"parse"``; the remaining metric names
        come from the operations of the first library.
    """
    metric_names = ["parse", *LIBRARIES[0].operations]
    metrics = OrderedDict((metric, {}) for metric in metric_names)

    for library in LIBRARIES:

        def parse_document(lib=library):
            return lib.parse(HTML_DOCUMENT)

        metrics["parse"][library.name] = time_callable(
            parse_document, PARSE_ITERATIONS
        )

        # Queries are timed against a single document parsed outside the
        # timed region.
        dom = parse_document()
        for metric_name, operation in library.operations.items():

            def run_operation(op=operation, document=dom):
                op(document)

            metrics[metric_name][library.name] = time_callable(
                run_operation, OP_ITERATIONS
            )
    return metrics
def print_results(results: Dict[str, Dict[str, float]]) -> None:
    """Print the timings as an aligned text table, in milliseconds.

    Args:
        results: Mapping ``metric name -> {library name -> seconds per
            call}``; every metric must contain an entry for each library in
            ``LIBRARIES``.

    Raises:
        KeyError: If a metric has no timing for one of the libraries.
    """
    libs = [lib.name for lib in LIBRARIES]

    # Header and data cells share one column width, wide enough for the
    # longest library name plus a two-space gutter, so columns never run
    # together or drift apart.
    label_width = max(16, max((len(metric) for metric in results), default=0) + 2)
    column_width = max(26, max((len(name) for name in libs), default=0) + 2)

    header = "metric".ljust(label_width) + "".join(
        name.rjust(column_width) for name in libs
    )
    print(header)
    print("-" * len(header))
    for metric, values in results.items():
        cells = "".join(
            f"{values[lib] * 1000:.3f} ms".rjust(column_width) for lib in libs
        )
        print(metric.ljust(label_width) + cells)
def plot_results(results: Dict[str, Dict[str, float]]) -> None:
    """Draw the timings as a grouped bar chart and save it to ``PLOT_PATH``.

    The chart is first rendered with matplotlib's xkcd style; if that fails
    for any reason, the failure is reported and the chart is rendered once
    more with the default style.

    Args:
        results: Mapping ``metric name -> {library name -> seconds per
            call}``; every metric must contain an entry for each library in
            ``LIBRARIES``.
    """
    metrics = list(results)
    libs = [lib.name for lib in LIBRARIES]
    # Each group of bars occupies 80% of the space between two ticks, so
    # neighbouring groups stay visually separate however many libraries
    # are compared.
    bar_width = 0.8 / max(len(libs), 1)
    center_offset = (len(libs) - 1) / 2

    def render(use_xkcd: bool) -> None:
        """Render and save the chart once, optionally in xkcd style."""
        with plt.xkcd() if use_xkcd else nullcontext():
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                for lib_index, lib in enumerate(libs):
                    shift = (lib_index - center_offset) * bar_width
                    positions = [
                        metric_index + shift
                        for metric_index in range(len(metrics))
                    ]
                    # Seconds -> milliseconds for display.
                    values = [results[metric][lib] * 1000 for metric in metrics]
                    ax.bar(positions, values, width=bar_width, label=lib)
                ax.set_xticks(range(len(metrics)))
                ax.set_xticklabels(
                    [metric.replace("_", " ") for metric in metrics], rotation=15
                )
                ax.set_ylabel("ms per operation (lower is better)")
                ax.set_title("tl.parse vs BeautifulSoup benchmarks")
                ax.legend()
                ax.grid(alpha=0.2, axis="y")
                fig.tight_layout()
                fig.savefig(PLOT_PATH, dpi=200)
            finally:
                # Release the figure even when rendering fails, so the
                # fallback attempt does not leave an extra figure open.
                plt.close(fig)

    try:
        render(True)
    except Exception as exc:
        # Deliberate best-effort: any failure of the styled render falls
        # back to the plain style instead of aborting the benchmark.
        print(f"xkcd style failed ({exc}); retrying without it.")
        render(False)
    print(f"Saved plot to {PLOT_PATH}")
def main() -> None:
    """Run the benchmark, then report it as a text table and as a chart."""
    timings = benchmark()
    for report in (print_results, plot_results):
        report(timings)
def _first_text(elements: Sequence[Any]) -> str:
if not elements:
return ""
node = elements[0]
return node.inner_text() if hasattr(node, "inner_text") else str(node)
# Run the benchmark only when this file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment