Created
December 7, 2025 17:02
-
-
Save mgaitan/0c4a49a16c825c1993a3ec3064035718 to your computer and use it in GitHub Desktop.
tl-parser vs pyquery vs bs4 with html5 and lxml parsers
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.12" | |
| # dependencies = [ | |
| # "tl-parser==0.7.11", | |
| # "beautifulsoup4", | |
| # "lxml", | |
| # "pyquery", | |
| # "matplotlib", | |
| # ] | |
| # | |
| # [tool.uv] | |
| # #find-links = ["https://github.com/mgaitan/tl-parser/releases/expanded_assets/python-v0.7.11"] | |
| # /// | |
"""
Benchmark tl-parser against BeautifulSoup using a shared HTML sample.
You can run it with uv and the GitHub release wheels via:
uv run \
    --find-links=https://github.com/mgaitan/tl-parser/releases/expanded_assets/python-v0.7.11 \
    --with tl-parser==0.7.11 \
    scripts/bench_tl_vs_bs4.py
"""
# NOTE: the docstring now precedes the __future__ import so it is the real
# module docstring (previously it was a no-op string expression).
from __future__ import annotations

from collections import OrderedDict
from contextlib import nullcontext
from dataclasses import dataclass
from pathlib import Path
from time import perf_counter
from typing import Any, Callable, Dict, List, Sequence

import matplotlib

# Select a non-interactive backend; this must run exactly once, BEFORE the
# first `matplotlib.pyplot` import.  (A second, redundant call after the
# pyplot import was removed.)
matplotlib.use("Agg")

import matplotlib.pyplot as plt  # noqa: E402 # isort: skip
from bs4 import BeautifulSoup  # noqa: E402 # isort: skip
from pyquery import PyQuery as pq  # noqa: E402 # isort: skip
import tl  # noqa: E402 # isort: skip
# Sample document from the BeautifulSoup tutorial, repeated below to build a
# realistically sized benchmark input.
HTML_SNIPPET = """
<html>
<head>
<title>The Dormouse's story</title>
</head>
<body>
<p class="title">
<b>The Dormouse's story</b>
</p>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
and
<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
</body>
</html>
""".strip()

# How many copies of the snippet make up the benchmark document.
# (Duplicate assignments removed; the effective value was the last one, 150.)
HTML_REPETITIONS = 150

HTML_DOCUMENT = (
    "<!doctype html>\n" + "\n".join(HTML_SNIPPET for _ in range(HTML_REPETITIONS))
)

# Iteration counts used for averaging: full-document parses are expensive,
# individual DOM operations are cheap, hence the different counts.
# (Duplicate assignments removed; effective values were 150 and 1500.)
PARSE_ITERATIONS = 150
OP_ITERATIONS = 1500

# The chart is written next to this script.
PLOT_PATH = Path(__file__).with_name("tl_vs_bs4_benchmark.png")
# Module-level sink that keeps each benchmark result reachable so the timed
# operations cannot be skipped or garbage-collected mid-measurement.
_SINK: Any = None
def consume(value: Any) -> None:
    """Store *value* in the module-level sink (defeats dead-code elimination)."""
    global _SINK
    _SINK = value
@dataclass
class Library:
    """One HTML-parsing library under benchmark: how to parse, what to time."""

    # Human-readable label used in the results table and the plot legend.
    name: str
    # Parses an HTML string into the library's own DOM representation.
    parse: Callable[[str], Any]
    # Named micro-benchmark operations, each run against a pre-parsed DOM.
    operations: Dict[str, Callable[[Any], None]]
def build_tl_library() -> Library:
    """Build the benchmark adapter for the tl-parser library."""

    def _parse(html: str) -> Any:
        return tl.parse(html)

    def _select(dom: Any, selector: str) -> List[Any]:
        # Thin wrapper so all CSS queries go through one call site.
        return dom.query_selector(selector)

    def _title_text(dom: Any) -> None:
        consume(_first_text(_select(dom, "title")))

    def _class_lookup(dom: Any) -> None:
        consume(len(dom.get_elements_by_class_name("sister")))

    def _id_lookup(dom: Any) -> None:
        node = dom.get_element_by_id("link3")
        # `node` may be None when the id is absent; short-circuit mirrors that.
        consume(node and node.inner_text())

    def _css_query(dom: Any) -> None:
        consume(len(_select(dom, "p.story a.sister")))

    operations = OrderedDict(
        [
            ("title_text", _title_text),
            ("class_lookup", _class_lookup),
            ("id_lookup", _id_lookup),
            ("css_query", _css_query),
        ]
    )
    return Library("tl.parse", _parse, operations)
def build_bs4_library(name: str, parser: str) -> Library:
    """Build a benchmark adapter for BeautifulSoup with the given *parser*."""

    def _parse(html: str) -> Any:
        return BeautifulSoup(html, parser)

    def _title_text(soup: Any) -> None:
        consume(soup.title.get_text(strip=True))

    def _class_lookup(soup: Any) -> None:
        consume(len(soup.find_all(class_="sister")))

    def _id_lookup(soup: Any) -> None:
        node = soup.find(id="link3")
        # `find` returns None on a miss; short-circuit mirrors the original.
        consume(node and node.get_text(strip=True))

    def _css_query(soup: Any) -> None:
        consume(len(soup.select("p.story a.sister")))

    operations = OrderedDict(
        [
            ("title_text", _title_text),
            ("class_lookup", _class_lookup),
            ("id_lookup", _id_lookup),
            ("css_query", _css_query),
        ]
    )
    return Library(name, _parse, operations)
# All contenders in the benchmark; every entry exposes the same four
# operations so results can be compared metric-by-metric.
LIBRARIES = [
    build_tl_library(),
    build_bs4_library("BeautifulSoup (html.parser)", "html.parser"),
    build_bs4_library("BeautifulSoup (lxml)", "lxml"),
    # PyQuery uses lxml under the hood and offers a jQuery-like API.
    Library(
        "PyQuery",
        lambda html: pq(html),
        OrderedDict(
            [
                ("title_text", lambda doc: consume(doc("title").text())),
                ("class_lookup", lambda doc: consume(len(doc(".sister")))),
                ("id_lookup", lambda doc: consume(doc("#link3").text())),
                ("css_query", lambda doc: consume(len(doc("p.story a.sister")))),
            ]
        ),
    ),
]
def time_callable(fn: Callable[[], None], iterations: int) -> float:
    """Return the average wall-clock seconds per call of *fn* over *iterations* runs."""
    begin = perf_counter()
    for _ in range(iterations):
        fn()
    elapsed = perf_counter() - begin
    return elapsed / iterations
def benchmark() -> Dict[str, Dict[str, float]]:
    """Time parsing and every DOM operation for each library in LIBRARIES.

    Returns a mapping of metric name -> {library name -> seconds per call}.
    """
    # One row per metric: "parse" first, then the shared operation names.
    metric_names = ["parse", *LIBRARIES[0].operations]
    metrics: Dict[str, Dict[str, float]] = OrderedDict(
        (name, {}) for name in metric_names
    )
    for library in LIBRARIES:
        metrics["parse"][library.name] = time_callable(
            lambda: library.parse(HTML_DOCUMENT), PARSE_ITERATIONS
        )
        # Operations are timed against a single pre-parsed document.
        dom = library.parse(HTML_DOCUMENT)
        for op_name, op in library.operations.items():
            # Bind op/dom as defaults to avoid late-binding closure pitfalls.
            metrics[op_name][library.name] = time_callable(
                lambda op=op, dom=dom: op(dom), OP_ITERATIONS
            )
    return metrics
def print_results(results: Dict[str, Dict[str, float]]) -> None:
    """Print an aligned text table of the timings, one row per metric."""
    names = [lib.name for lib in LIBRARIES]
    header = "metric".ljust(16) + "".join(name.rjust(26) for name in names)
    print(header)
    print("-" * len(header))
    for metric, timings in results.items():
        # Seconds -> milliseconds, right-aligned under each library column.
        cells = [f"{timings[name] * 1000:>25.3f} ms" for name in names]
        print(metric.ljust(16) + "".join(cells))
def plot_results(results: Dict[str, Dict[str, float]]) -> None:
    """Render a grouped bar chart of the timings and save it to PLOT_PATH.

    Tries the xkcd comic style first; if that fails (e.g. the required font
    is missing), falls back to the default style.

    Fix: the original contained two merged copies of the plotting logic and
    drew, saved, and announced the chart twice — once unconditionally under
    xkcd and once via the fallback helper.  Only the fallback version is kept.
    """
    metrics = list(results.keys())
    libs = [lib.name for lib in LIBRARIES]
    bar_width = 0.25
    # Shift each library's bars so the group is centered on the metric tick.
    center_offset = (len(libs) - 1) / 2

    def render(use_xkcd: bool) -> None:
        # One-shot draw-and-save; style is applied via a context manager so
        # the global matplotlib state is restored afterwards.
        context = plt.xkcd() if use_xkcd else nullcontext()
        with context:
            fig, ax = plt.subplots(figsize=(10, 6))
            for lib_index, lib in enumerate(libs):
                positions = [
                    metric_index + (lib_index - center_offset) * bar_width
                    for metric_index in range(len(metrics))
                ]
                values = [results[metric][lib] * 1000 for metric in metrics]
                ax.bar(positions, values, width=bar_width, label=lib)
            ax.set_xticks(range(len(metrics)))
            ax.set_xticklabels(
                [metric.replace("_", " ") for metric in metrics], rotation=15
            )
            ax.set_ylabel("ms per operation (lower is better)")
            ax.set_title("tl.parse vs BeautifulSoup benchmarks")
            ax.legend()
            ax.grid(alpha=0.2, axis="y")
            fig.tight_layout()
            fig.savefig(PLOT_PATH, dpi=200)
            # Release the figure so repeated calls don't accumulate memory.
            plt.close(fig)

    try:
        render(True)
    except Exception as exc:
        # xkcd style needs optional fonts; retry plainly rather than fail.
        print(f"xkcd style failed ({exc}); retrying without it.")
        render(False)
    print(f"Saved plot to {PLOT_PATH}")
def main() -> None:
    """Entry point: run the benchmark, print the table, save the chart."""
    timings = benchmark()
    print_results(timings)
    plot_results(timings)
| def _first_text(elements: Sequence[Any]) -> str: | |
| if not elements: | |
| return "" | |
| node = elements[0] | |
| return node.inner_text() if hasattr(node, "inner_text") else str(node) | |
# Script entry point: run the benchmark only when executed directly.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment