Skip to content

Instantly share code, notes, and snippets.

@paulrobello
Created March 14, 2026 04:34
Show Gist options
  • Select an option

  • Save paulrobello/ddee910edcbc4d69f38ff57d19a44030 to your computer and use it in GitHub Desktop.

Select an option

Save paulrobello/ddee910edcbc4d69f38ff57d19a44030 to your computer and use it in GitHub Desktop.
Convert HTML to clean markdown optimised for LLM consumption (file, stdin, or URL)
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.11"
# dependencies = [
# "beautifulsoup4",
# "html2text",
# "httpx",
# ]
# ///
"""Convert HTML to clean markdown optimised for LLM consumption.
Reads HTML from a file, stdin, or URL, strips navigation, banners, cookie
notices, layout tables, and script/style elements, then emits clean markdown
with preserved code-fence language annotations.
Usage:
html-to-md [FILE] # convert file → stdout
html-to-md [FILE] -o out.md # convert file → file
html-to-md - # read stdin → stdout
html-to-md https://example.com # fetch URL → stdout
html-to-md https://example.com -o page.md
Options:
FILE | URL | - Input source (URL, file path, or - for stdin)
-o, --output FILE Write output to FILE instead of stdout
--url URL Base URL for resolving relative links when input is
a file or stdin (ignored when fetching a URL directly)
--no-links Strip all hyperlinks from output
--include-images Preserve image references in output
--debug, -D Print extra info to stderr
--help, -h Show this message and exit
Examples:
html-to-md page.html
html-to-md page.html -o page.md
curl https://example.com | html-to-md - --url https://example.com
html-to-md https://docs.python.org/3/library/re.html -o re-docs.md
"""
from __future__ import annotations
import argparse
import re
import sys
from urllib.parse import urljoin, urlparse
import html2text
import httpx
from bs4 import BeautifulSoup
# ── Constants (mirrored from par-fetch-mcp) ────────────────────────────────
# HTML-comment wrapper used to carry a code block's language hint through the
# html2text conversion; rewritten into ```lang fences in _html_to_markdown.
_LANG_MARKER_PREFIX = "<!--lang:"
_LANG_MARKER_SUFFIX = "-->"
# Matches class/id values typical of cookie-consent and privacy banners,
# including vendor-specific markers (OneTrust, Cookiebot).
_COOKIE_CONSENT_PATTERN = re.compile(
    r"cookie|consent|gdpr|privacy.?banner|cc-banner|onetrust|CybotCookiebot",
    re.IGNORECASE,
)
# CSS selectors for non-content page chrome removed before conversion:
# ARIA landmark roles, hidden elements, skip links, social-share widgets.
_SELECTORS_TO_REMOVE = [
    '[role="navigation"]',
    '[role="banner"]',
    '[role="contentinfo"]',
    '[role="complementary"]',
    '[role="search"]',
    '[aria-hidden="true"]',
    ".skip-to-content",
    ".skip-link",
    "[class*='skip-to']",
    "[class*='social-share']",
    "[class*='share-button']",
    "[class*='social-media']",
]
# ── Core pipeline (mirrored from par-fetch-mcp) ────────────────────────────
def _extract_code_language(tag) -> str:
for el in [tag] + tag.find_all("code"):
classes = el.get("class", [])
if isinstance(classes, str):
classes = classes.split()
for cls in classes:
for prefix in ("language-", "lang-", "highlight-"):
if cls.startswith(prefix):
return cls[len(prefix):]
return ""
def _is_layout_table(table) -> bool:
if table.get("role") == "presentation":
return True
if not table.find("th"):
rows = table.find_all("tr")
if rows and all(len(row.find_all(["td", "th"])) <= 1 for row in rows):
return True
return False
def _clean_markdown(text: str) -> str:
text = re.sub(r"\n{4,}", "\n\n\n", text)
text = re.sub(r"^[*\-+] *$", "", text, flags=re.MULTILINE)
text = re.sub(r"\[\s*\]\([^)]*\)", "", text)
text = re.sub(r"\*{1,3}\s*\*{1,3}", "", text)
text = re.sub(r"_{1,3}\s*_{1,3}", "", text)
text = re.sub(r"(^-{3,}\s*$\n?){2,}", "---\n", text, flags=re.MULTILINE)
text = re.sub(r"(^\*{3,}\s*$\n?){2,}", "---\n", text, flags=re.MULTILINE)
text = re.sub(r"^[|\-\s]+$", "", text, flags=re.MULTILINE)
text = re.sub(r"\n{4,}", "\n\n\n", text)
return text.strip()
def _html_to_markdown(
    html_content: str,
    *,
    url: str | None = None,
    include_links: bool = True,
    include_images: bool = False,
) -> str:
    """Convert an HTML document to cleaned markdown.

    Args:
        html_content: Raw HTML text.
        url: Base URL for resolving relative links; when None, links are
            left as-is.
        include_links: When False, <a>/<link> elements are removed entirely.
        include_images: When True, <img> elements are preserved.

    Returns:
        Markdown produced by html2text, with code-fence language hints
        restored and _clean_markdown applied.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Narrow to main content container if one exists
    # (only accepted when it carries a meaningful amount of text: >200 chars).
    content_root = None
    for selector in ["main", "[role='main']", "article"]:
        candidate = soup.select_one(selector)
        if candidate and len(candidate.get_text(strip=True)) > 200:
            content_root = candidate
            break
    if content_root is not None:
        # Re-root the document on the chosen container so everything outside
        # it (sidebars, footers, banners) is discarded.
        new_soup = BeautifulSoup("<div></div>", "html.parser")
        div = new_soup.find("div")
        assert div is not None
        div.append(content_root)
        soup = new_soup
    # Resolve relative URLs
    if include_links and url:
        url_attributes = ["href", "src", "action", "data", "poster", "background", "cite", "formaction"]
        for tag in soup.find_all(True):
            for attribute in url_attributes:
                if tag.has_attr(attribute):
                    attr_value = tag[attribute]
                    if isinstance(attr_value, list):
                        # Multi-valued attributes are left untouched.
                        continue
                    if attr_value.startswith("//"):
                        # Protocol-relative URL → assume https.
                        tag[attribute] = f"https:{attr_value}"
                    elif not attr_value.startswith(("http://", "https://", "mailto:", "tel:", "javascript:")):
                        tag[attribute] = urljoin(url, attr_value)
    # Remove non-content elements by tag
    elements_to_remove = [
        "head", "header", "footer", "script", "source", "style",
        "svg", "iframe", "nav", "aside", "form", "noscript", "template",
    ]
    if not include_links:
        elements_to_remove.extend(["a", "link"])
    if not include_images:
        elements_to_remove.append("img")
    for element in elements_to_remove:
        for tag in soup.find_all(element):
            tag.decompose()
    # Remove non-content elements by CSS selector
    for selector in _SELECTORS_TO_REMOVE:
        for tag in soup.select(selector):
            tag.decompose()
    # Remove cookie/consent banners (matched by class or id)
    for tag in soup.find_all(True):
        classes = " ".join(tag.get("class", []))
        tag_id = tag.get("id", "") or ""
        if _COOKIE_CONSENT_PATTERN.search(classes) or _COOKIE_CONSENT_PATTERN.search(tag_id):
            tag.decompose()
    # Unwrap layout tables (keep their content, drop the table markup)
    for table in soup.find_all("table"):
        if _is_layout_table(table):
            table.unwrap()
    # Convert separator elements to <hr>
    for element in soup.find_all(attrs={"role": "separator"}):
        hr = soup.new_tag("hr")
        element.replace_with(hr)
        hr.insert_before(soup.new_string("\n"))
        hr.insert_after(soup.new_string("\n"))
    # Extract language hints and mark code blocks with an HTML-comment
    # marker that survives the html2text pass and is rewritten below.
    for pre in soup.find_all("pre"):
        lang = _extract_code_language(pre)
        if lang:
            marker = soup.new_string(f"{_LANG_MARKER_PREFIX}{lang}{_LANG_MARKER_SUFFIX}")
            pre.insert_before(marker)
    result_html = str(soup)
    converter = html2text.HTML2Text()
    converter.ignore_links = not include_links
    converter.ignore_images = not include_images
    converter.body_width = 0  # disable hard wrapping
    converter.protect_links = True
    converter.unicode_snob = True
    converter.skip_internal_links = True
    converter.wrap_links = False
    markdown = converter.handle(result_html)
    # Fix up language-annotated fenced code blocks: <!--lang:python-->``` → ```python
    markdown = re.sub(
        rf"{re.escape(_LANG_MARKER_PREFIX)}(\w+){re.escape(_LANG_MARKER_SUFFIX)}\s*```",
        r"```\1",
        markdown,
    )
    # Fix up language-annotated indented code blocks (html2text uses 4-space indent for <pre>)
    # <!--lang:python-->\n \n code\n → ```python\ncode\n```
    def _replace_indented_block(m: re.Match) -> str:
        # Turn one marker + indented block into a fenced code block.
        lang = m.group(1)
        block = m.group(2)
        # Strip the 4-space (or tab) indent from every line; drop leading blank lines
        lines = block.rstrip("\n").split("\n")
        dedented = []
        for line in lines:
            if line.startswith("    "):
                dedented.append(line[4:])
            elif line.startswith("\t"):
                dedented.append(line[1:])
            else:
                dedented.append(line.strip())
        # Drop leading/trailing blank lines inside the block
        while dedented and not dedented[0].strip():
            dedented.pop(0)
        while dedented and not dedented[-1].strip():
            dedented.pop()
        if not dedented:
            return ""
        return f"```{lang}\n" + "\n".join(dedented) + "\n```"
    # Match marker followed by all lines starting with whitespace (covers blank indented lines)
    markdown = re.sub(
        rf"{re.escape(_LANG_MARKER_PREFIX)}(\w+){re.escape(_LANG_MARKER_SUFFIX)}\n((?:[ \t][^\n]*\n)*)",
        _replace_indented_block,
        markdown,
    )
    # Remove any remaining orphaned lang markers that didn't match either pattern
    markdown = re.sub(
        rf"{re.escape(_LANG_MARKER_PREFIX)}\w+{re.escape(_LANG_MARKER_SUFFIX)}\n*",
        "",
        markdown,
    )
    return _clean_markdown(markdown)
# ── Fetching ───────────────────────────────────────────────────────────────
def _is_url(s: str) -> bool:
parsed = urlparse(s)
return parsed.scheme in ("http", "https")
def _fetch_url(url: str, timeout: int = 10, debug: bool = False) -> str:
    """Download *url* and return the response body as text.

    A randomized desktop-browser User-Agent is sent and redirects are
    followed. Raises httpx.HTTPStatusError for non-success responses and
    httpx.RequestError for transport failures.
    """
    import random

    platform_choices = [
        ("Windows NT 10.0", "Win64; x64"),
        ("Macintosh; Apple M2 Mac OS X 14_2_1", "arm64"),
    ]
    system, arch = random.choice(platform_choices)
    base_os = system.split('; ')[0]
    user_agent = f"Mozilla/5.0 ({base_os}; {arch}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36"
    request_headers = {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
    }
    if debug:
        print(f"[html-to-md] fetching {url}", file=sys.stderr)
    client = httpx.Client(timeout=timeout, follow_redirects=True, headers=request_headers)
    with client:
        response = client.get(url)
        response.raise_for_status()
        return response.text
# ── CLI ────────────────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point: read HTML from a file, stdin, or URL; emit markdown.

    Exits with status 1 on fetch, read, or write errors.
    """
    # __doc__ is None under `python -OO`; guard before the substring test,
    # which would otherwise raise TypeError.
    doc = __doc__ or ""
    parser = argparse.ArgumentParser(
        prog="html-to-md",
        description="Convert HTML to clean markdown optimised for LLM consumption.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=doc.split("Examples:")[1].strip() if "Examples:" in doc else "",
    )
    parser.add_argument(
        "input",
        nargs="?",
        default="-",
        metavar="FILE|URL|-",
        help="HTML file, URL to fetch, or - for stdin (default: -)",
    )
    parser.add_argument(
        "-o", "--output",
        metavar="FILE",
        help="Write output to FILE instead of stdout",
    )
    parser.add_argument(
        "--url",
        metavar="URL",
        help="Base URL for resolving relative links when input is a file or stdin",
    )
    parser.add_argument(
        "--no-links",
        action="store_true",
        help="Strip all hyperlinks from output",
    )
    parser.add_argument(
        "--include-images",
        action="store_true",
        help="Preserve image references in output",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=10,
        metavar="SECS",
        help="HTTP timeout in seconds when fetching a URL (default: 10)",
    )
    parser.add_argument(
        "--debug", "-D",
        action="store_true",
        help="Print extra info to stderr",
    )
    args = parser.parse_args()

    # ── Read input ──────────────────────────────────────────────────────────
    base_url: str | None = args.url
    if args.input != "-" and _is_url(args.input):
        # Input is a URL — fetch it; it also becomes the link-resolution
        # base unless --url was given explicitly.
        base_url = base_url or args.input
        try:
            html_content = _fetch_url(args.input, timeout=args.timeout, debug=args.debug)
        except httpx.HTTPStatusError as e:
            print(f"error: HTTP {e.response.status_code} fetching {args.input}", file=sys.stderr)
            sys.exit(1)
        except httpx.RequestError as e:
            print(f"error: {e}", file=sys.stderr)
            sys.exit(1)
    elif args.input == "-":
        # Read from stdin
        if args.debug:
            print("[html-to-md] reading from stdin", file=sys.stderr)
        html_content = sys.stdin.read()
    else:
        # Read from file
        if args.debug:
            print(f"[html-to-md] reading {args.input}", file=sys.stderr)
        try:
            with open(args.input, encoding="utf-8") as fh:
                html_content = fh.read()
        except FileNotFoundError:
            print(f"error: file not found: {args.input}", file=sys.stderr)
            sys.exit(1)
        except OSError as e:
            print(f"error: {e}", file=sys.stderr)
            sys.exit(1)

    # ── Convert ─────────────────────────────────────────────────────────────
    if args.debug:
        print(f"[html-to-md] converting {len(html_content):,} bytes of HTML", file=sys.stderr)
    markdown = _html_to_markdown(
        html_content,
        url=base_url,
        include_links=not args.no_links,
        include_images=args.include_images,
    )
    if args.debug:
        print(f"[html-to-md] output: {len(markdown):,} chars", file=sys.stderr)

    # ── Write output ─────────────────────────────────────────────────────────
    if args.output:
        # An unwritable destination previously crashed with a traceback;
        # report it like the read-side errors instead.
        try:
            with open(args.output, "w", encoding="utf-8") as fh:
                fh.write(markdown)
                fh.write("\n")
        except OSError as e:
            print(f"error: {e}", file=sys.stderr)
            sys.exit(1)
        if args.debug:
            print(f"[html-to-md] written to {args.output}", file=sys.stderr)
    else:
        print(markdown)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment