paulrobello · March 14, 2026 04:34
diff --git a/html-to-md b/html-to-md
 #!/usr/bin/env -S uv run --script
 # /// script
 # requires-python = ">=3.11"
 # dependencies = [
 #   "beautifulsoup4",
 #   "html2text",
 #   "httpx",
 # ]
 # ///
 """Convert HTML to clean markdown optimised for LLM consumption.

 Reads HTML from a file, stdin, or URL, strips navigation, banners, cookie
 notices, layout tables, and script/style elements, then emits clean markdown
 with preserved code-fence language annotations.

 Usage:
  html-to-md [FILE]                  # convert file → stdout
  html-to-md [FILE] -o out.md        # convert file → file
  html-to-md -                       # read stdin → stdout
  html-to-md https://example.com     # fetch URL → stdout
  html-to-md https://example.com -o page.md

 Options:
  FILE | URL | -       Input source (URL, file path, or - for stdin)
  -o, --output FILE    Write output to FILE instead of stdout
  --url URL            Base URL for resolving relative links when input is
                       a file or stdin (ignored when fetching a URL directly)
  --no-links           Strip all hyperlinks from output
  --include-images     Preserve image references in output
  --timeout SECS       HTTP timeout in seconds when fetching a URL (default: 10)
  --debug, -D          Print extra info to stderr
  --help, -h           Show this message and exit

 Examples:
  html-to-md page.html
  html-to-md page.html -o page.md
  curl https://example.com | html-to-md - --url https://example.com
  html-to-md https://docs.python.org/3/library/re.html -o re-docs.md
 """

 from __future__ import annotations

 import argparse
 import re
 import sys
 from urllib.parse import urljoin, urlparse

 import html2text
 import httpx
 from bs4 import BeautifulSoup

 # ── Constants (mirrored from par-fetch-mcp) ────────────────────────────────

 # Literal text placed around a language name to tag a <pre> block before
 # conversion; _html_to_markdown inserts the tag ahead of the block and later
 # rewrites it into the info string of a fenced code block.
 _LANG_MARKER_PREFIX = "<!--lang:"
 _LANG_MARKER_SUFFIX = "-->"

 # Case-insensitive pattern searched against each element's joined class list
 # and its id; a hit marks the element as a cookie/consent banner to remove.
 _COOKIE_CONSENT_PATTERN = re.compile(
    r"cookie|consent|gdpr|privacy.?banner|cc-banner|onetrust|CybotCookiebot",
    re.IGNORECASE,
 )

 # CSS selectors whose matches are decomposed before conversion.
 _SELECTORS_TO_REMOVE = [
    # ARIA roles for page chrome rather than content.
    '[role="navigation"]',
    '[role="banner"]',
    '[role="contentinfo"]',
    '[role="complementary"]',
    '[role="search"]',
    # Elements explicitly hidden from assistive technology.
    '[aria-hidden="true"]',
    # Skip-to-content accessibility links.
    ".skip-to-content",
    ".skip-link",
    "[class*='skip-to']",
    # Social sharing widgets.
    "[class*='social-share']",
    "[class*='share-button']",
    "[class*='social-media']",
 ]


 # ── Core pipeline (mirrored from par-fetch-mcp) ────────────────────────────

 def _extract_code_language(tag) -> str:
    for el in [tag] + tag.find_all("code"):
        classes = el.get("class", [])
        if isinstance(classes, str):
            classes = classes.split()
        for cls in classes:
            for prefix in ("language-", "lang-", "highlight-"):
                if cls.startswith(prefix):
                    return cls[len(prefix):]
    return ""


 def _is_layout_table(table) -> bool:
    if table.get("role") == "presentation":
        return True
    if not table.find("th"):
        rows = table.find_all("tr")
        if rows and all(len(row.find_all(["td", "th"])) <= 1 for row in rows):
            return True
    return False


 def _clean_markdown(text: str) -> str:
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    text = re.sub(r"^[*\-+] *$", "", text, flags=re.MULTILINE)
    text = re.sub(r"\[\s*\]\([^)]*\)", "", text)
    text = re.sub(r"\*{1,3}\s*\*{1,3}", "", text)
    text = re.sub(r"_{1,3}\s*_{1,3}", "", text)
    text = re.sub(r"(^-{3,}\s*$\n?){2,}", "---\n", text, flags=re.MULTILINE)
    text = re.sub(r"(^\*{3,}\s*$\n?){2,}", "---\n", text, flags=re.MULTILINE)
    text = re.sub(r"^[|\-\s]+$", "", text, flags=re.MULTILINE)
    text = re.sub(r"\n{4,}", "\n\n\n", text)
    return text.strip()


 def _html_to_markdown(
    html_content: str,
    *,
    url: str | None = None,
    include_links: bool = True,
    include_images: bool = False,
 ) -> str:
    """Convert an HTML document into cleaned-up markdown.

    Steps, in order: parse with BeautifulSoup; narrow to ``<main>``,
    ``[role='main']`` or ``<article>`` when one holds more than 200 characters
    of text; make URL attributes absolute; remove non-content elements by tag
    name, by ``_SELECTORS_TO_REMOVE`` and by the cookie/consent heuristic;
    unwrap layout tables; turn ``role="separator"`` elements into ``<hr>``; tag
    ``<pre>`` blocks with their language; convert with html2text; rewrite the
    language tags as fenced code blocks; finish with ``_clean_markdown``.

    Args:
        html_content: Raw HTML text.
        url: Base URL for resolving relative URL attributes. Nothing is
            resolved when it is ``None`` or empty.
        include_links: When false, ``<a>``/``<link>`` elements are removed and
            html2text is told to ignore links.
        include_images: When false, ``<img>`` elements are removed and
            html2text is told to ignore images.

    Returns:
        The markdown text, stripped of leading and trailing whitespace.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # Narrow to the first main-content container with more than 200 characters
    # of text; shorter candidates are ignored.
    content_root = None
    for selector in ["main", "[role='main']", "article"]:
        candidate = soup.select_one(selector)
        if candidate and len(candidate.get_text(strip=True)) > 200:
            content_root = candidate
            break

    if content_root is not None:
        new_soup = BeautifulSoup("<div></div>", "html.parser")
        div = new_soup.find("div")
        assert div is not None
        div.append(content_root)
        soup = new_soup

    # Resolve relative URLs. Image sources need this too, so the step also runs
    # when links are stripped but images are kept.
    if url and (include_links or include_images):
        url_attributes = ("href", "src", "action", "data", "poster", "background", "cite", "formaction")
        absolute_prefixes = ("http://", "https://", "mailto:", "tel:", "javascript:")
        for tag in soup.find_all(True):
            for attribute in url_attributes:
                attr_value = tag.get(attribute)
                # Missing attributes and multi-valued (list) ones are skipped.
                if not isinstance(attr_value, str):
                    continue
                if attr_value.startswith("//"):
                    tag[attribute] = f"https:{attr_value}"
                elif not attr_value.startswith(absolute_prefixes):
                    tag[attribute] = urljoin(url, attr_value)

    # Remove non-content elements by tag
    elements_to_remove = [
        "head", "header", "footer", "script", "source", "style",
        "svg", "iframe", "nav", "aside", "form", "noscript", "template",
    ]
    if not include_links:
        # NOTE(review): decomposing <a> also discards the anchor text, not
        # just the URL - confirm that is the intended meaning of --no-links.
        elements_to_remove.extend(["a", "link"])
    if not include_images:
        elements_to_remove.append("img")
    for element in elements_to_remove:
        for tag in soup.find_all(element):
            tag.decompose()

    # Remove non-content elements by CSS selector
    for selector in _SELECTORS_TO_REMOVE:
        for tag in soup.select(selector):
            tag.decompose()

    # Remove cookie/consent banners. Matches are collected first and destroyed
    # afterwards: decompose() also wipes every descendant, and reading the
    # attributes of a wiped tag later in the same pass fails.
    consent_banners = []
    for tag in soup.find_all(True):
        # A "cookie" class on the document root describes the page, not a
        # banner; removing it would delete all content.
        if tag.name in ("html", "body"):
            continue
        class_attr = tag.get("class") or []
        classes = class_attr if isinstance(class_attr, str) else " ".join(class_attr)
        tag_id = tag.get("id") or ""
        if _COOKIE_CONSENT_PATTERN.search(classes) or _COOKIE_CONSENT_PATTERN.search(tag_id):
            consent_banners.append(tag)
    for tag in consent_banners:
        # Nested matches are already gone once their ancestor was removed.
        if not getattr(tag, "decomposed", False):
            tag.decompose()

    # Unwrap layout tables
    for table in soup.find_all("table"):
        if _is_layout_table(table):
            table.unwrap()

    # Convert separator elements to <hr>
    for element in soup.find_all(attrs={"role": "separator"}):
        hr = soup.new_tag("hr")
        element.replace_with(hr)
        hr.insert_before(soup.new_string("\n"))
        hr.insert_after(soup.new_string("\n"))

    # Extract language hints and mark code blocks
    for pre in soup.find_all("pre"):
        lang = _extract_code_language(pre)
        if lang:
            marker = soup.new_string(f"{_LANG_MARKER_PREFIX}{lang}{_LANG_MARKER_SUFFIX}")
            pre.insert_before(marker)

    result_html = str(soup)

    converter = html2text.HTML2Text()
    converter.ignore_links = not include_links
    converter.ignore_images = not include_images
    converter.body_width = 0
    converter.protect_links = True
    converter.unicode_snob = True
    converter.skip_internal_links = True
    converter.wrap_links = False

    markdown = converter.handle(result_html)

    marker_prefix = re.escape(_LANG_MARKER_PREFIX)
    marker_suffix = re.escape(_LANG_MARKER_SUFFIX)
    # Language names are not always \w+: c++, c#, objective-c, asp.net ...
    lang_name = r"[\w+#.\-]+"

    # Fix up language-annotated fenced code blocks: <!--lang:python-->``` → ```python
    markdown = re.sub(
        rf"{marker_prefix}({lang_name}){marker_suffix}\s*```",
        r"```\1",
        markdown,
    )

    # Fix up language-annotated indented code blocks (html2text uses 4-space indent for <pre>)
    # <!--lang:python-->\n    \n    code\n → ```python\ncode\n```
    def _replace_indented_block(m: re.Match) -> str:
        lang = m.group(1)
        block = m.group(2)
        # Strip the 4-space (or tab) indent from every line
        dedented = []
        for line in block.rstrip("\n").split("\n"):
            if line.startswith("    "):
                dedented.append(line[4:])
            elif line.startswith("\t"):
                dedented.append(line[1:])
            else:
                dedented.append(line.strip())
        # Drop leading/trailing blank lines inside the block
        start, end = 0, len(dedented)
        while start < end and not dedented[start].strip():
            start += 1
        while end > start and not dedented[end - 1].strip():
            end -= 1
        body = dedented[start:end]
        if not body:
            return ""
        return f"```{lang}\n" + "\n".join(body) + "\n```"

    # Match marker followed by all lines starting with whitespace (covers blank indented lines)
    markdown = re.sub(
        rf"{marker_prefix}({lang_name}){marker_suffix}\n((?:[ \t][^\n]*\n)*)",
        _replace_indented_block,
        markdown,
    )

    # Remove any remaining orphaned lang markers, whatever text they carry, so
    # none can leak into the output.
    markdown = re.sub(
        rf"{marker_prefix}[^\n]*?{marker_suffix}\n*",
        "",
        markdown,
    )

    return _clean_markdown(markdown)


 # ── Fetching ───────────────────────────────────────────────────────────────

 def _is_url(s: str) -> bool:
    parsed = urlparse(s)
    return parsed.scheme in ("http", "https")


 def _fetch_url(url: str, timeout: int = 10, debug: bool = False) -> str:
    """Download *url* and return the response body as text.

    The request carries a Chrome-style User-Agent built from one of two
    randomly chosen platform descriptions, follows redirects, and raises
    through ``raise_for_status`` on an error status.

    Args:
        url: Address to fetch.
        timeout: Value handed to ``httpx.Client`` as its timeout.
        debug: When true, announce the fetch on stderr.

    Returns:
        The decoded response text.
    """
    from random import choice

    os_name, platform = choice(
        (
            ("Windows NT 10.0", "Win64; x64"),
            ("Macintosh; Apple M2 Mac OS X 14_2_1", "arm64"),
        )
    )
    # Only the part of the OS description before the first "; " is used.
    ua = f"Mozilla/5.0 ({os_name.split('; ')[0]}; {platform}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36"

    if debug:
        print(f"[html-to-md] fetching {url}", file=sys.stderr)

    client_options = {
        "timeout": timeout,
        "follow_redirects": True,
        "headers": {
            "User-Agent": ua,
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
        },
    }
    with httpx.Client(**client_options) as client:
        response = client.get(url)
        response.raise_for_status()
        body = response.text
    return body


 # ── CLI ────────────────────────────────────────────────────────────────────

 def _build_parser() -> argparse.ArgumentParser:
    """Build the argument parser for the html-to-md command line."""
    # __doc__ is None when docstrings are stripped (python -OO).
    module_doc = __doc__ or ""
    parser = argparse.ArgumentParser(
        prog="html-to-md",
        description="Convert HTML to clean markdown optimised for LLM consumption.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=module_doc.split("Examples:")[1].strip() if "Examples:" in module_doc else "",
    )
    parser.add_argument(
        "input",
        nargs="?",
        default="-",
        metavar="FILE|URL|-",
        help="HTML file, URL to fetch, or - for stdin (default: -)",
    )
    parser.add_argument(
        "-o", "--output",
        metavar="FILE",
        help="Write output to FILE instead of stdout",
    )
    parser.add_argument(
        "--url",
        metavar="URL",
        help="Base URL for resolving relative links when input is a file or stdin",
    )
    parser.add_argument(
        "--no-links",
        action="store_true",
        help="Strip all hyperlinks from output",
    )
    parser.add_argument(
        "--include-images",
        action="store_true",
        help="Preserve image references in output",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=10,
        metavar="SECS",
        help="HTTP timeout in seconds when fetching a URL (default: 10)",
    )
    parser.add_argument(
        "--debug", "-D",
        action="store_true",
        help="Print extra info to stderr",
    )
    return parser


 def _read_input(args: argparse.Namespace) -> tuple[str, str | None]:
    """Load the HTML selected by ``args.input``.

    Returns:
        ``(html, base_url)``. ``base_url`` is ``args.url``; when the input is
        itself a URL and ``--url`` was not given, it is that URL.

    On a failed fetch or file read, prints an ``error:`` line to stderr and
    exits with status 1.
    """
    base_url: str | None = args.url

    if args.input == "-":
        if args.debug:
            print("[html-to-md] reading from stdin", file=sys.stderr)
        return sys.stdin.read(), base_url

    if _is_url(args.input):
        base_url = base_url or args.input
        try:
            return _fetch_url(args.input, timeout=args.timeout, debug=args.debug), base_url
        except httpx.HTTPStatusError as e:
            print(f"error: HTTP {e.response.status_code} fetching {args.input}", file=sys.stderr)
            sys.exit(1)
        except httpx.RequestError as e:
            print(f"error: {e}", file=sys.stderr)
            sys.exit(1)

    if args.debug:
        print(f"[html-to-md] reading {args.input}", file=sys.stderr)
    try:
        # errors="replace": a page saved in a legacy encoding should still
        # convert instead of aborting with UnicodeDecodeError.
        with open(args.input, encoding="utf-8", errors="replace") as fh:
            return fh.read(), base_url
    except FileNotFoundError:
        print(f"error: file not found: {args.input}", file=sys.stderr)
        sys.exit(1)
    except OSError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)


 def main() -> None:
    """Run the html-to-md command line.

    Parses the arguments, loads HTML from a URL, stdin or a file, converts it
    with ``_html_to_markdown`` and writes the markdown to ``--output`` (with a
    trailing newline) or prints it to stdout. Input and output failures are
    reported as a single ``error:`` line on stderr with exit status 1.
    """
    args = _build_parser().parse_args()
    html_content, base_url = _read_input(args)

    if args.debug:
        print(f"[html-to-md] converting {len(html_content):,} bytes of HTML", file=sys.stderr)

    markdown = _html_to_markdown(
        html_content,
        url=base_url,
        include_links=not args.no_links,
        include_images=args.include_images,
    )

    if args.debug:
        print(f"[html-to-md] output: {len(markdown):,} chars", file=sys.stderr)

    if not args.output:
        print(markdown)
        return

    # An unwritable destination is a user error, reported like input errors.
    try:
        with open(args.output, "w", encoding="utf-8") as fh:
            fh.write(markdown)
            fh.write("\n")
    except OSError as e:
        print(f"error: {e}", file=sys.stderr)
        sys.exit(1)
    if args.debug:
        print(f"[html-to-md] written to {args.output}", file=sys.stderr)


 # Run the CLI only when the file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env -S uv run --script
	# /// script
	# requires-python = ">=3.11"
	# dependencies = [
	# "beautifulsoup4",
	# "html2text",
	# "httpx",
	# ]
	# ///
	"""Convert HTML to clean markdown optimised for LLM consumption.

	Reads HTML from a file, stdin, or URL, strips navigation, banners, cookie
	notices, layout tables, and script/style elements, then emits clean markdown
	with preserved code-fence language annotations.

	Usage:
	html-to-md [FILE] # convert file → stdout
	html-to-md [FILE] -o out.md # convert file → file
	html-to-md - # read stdin → stdout
	html-to-md https://example.com # fetch URL → stdout
	html-to-md https://example.com -o page.md

	Options:
	FILE | URL | - Input source (URL, file path, or - for stdin)
	-o, --output FILE Write output to FILE instead of stdout
	--url URL Base URL for resolving relative links when input is
	a file or stdin (ignored when fetching a URL directly)
	--no-links Strip all hyperlinks from output
	--include-images Preserve image references in output
	--debug, -D Print extra info to stderr
	--help, -h Show this message and exit

	Examples:
	html-to-md page.html
	html-to-md page.html -o page.md
	curl https://example.com | html-to-md - --url https://example.com
	html-to-md https://docs.python.org/3/library/re.html -o re-docs.md
	"""

	from __future__ import annotations

	import argparse
	import re
	import sys
	from urllib.parse import urljoin, urlparse

	import html2text
	import httpx
	from bs4 import BeautifulSoup

	# ── Constants (mirrored from par-fetch-mcp) ────────────────────────────────

	_LANG_MARKER_PREFIX = "<!--lang:"
	_LANG_MARKER_SUFFIX = "-->"

	_COOKIE_CONSENT_PATTERN = re.compile(
	r"cookie\|consent\|gdpr\|privacy.?banner\|cc-banner\|onetrust\|CybotCookiebot",
	re.IGNORECASE,
	)

	_SELECTORS_TO_REMOVE = [
	'[role="navigation"]',
	'[role="banner"]',
	'[role="contentinfo"]',
	'[role="complementary"]',
	'[role="search"]',
	'[aria-hidden="true"]',
	".skip-to-content",
	".skip-link",
	"[class*='skip-to']",
	"[class*='social-share']",
	"[class*='share-button']",
	"[class*='social-media']",
	]


	# ── Core pipeline (mirrored from par-fetch-mcp) ────────────────────────────

	def _extract_code_language(tag) -> str:
	for el in [tag] + tag.find_all("code"):
	classes = el.get("class", [])
	if isinstance(classes, str):
	classes = classes.split()
	for cls in classes:
	for prefix in ("language-", "lang-", "highlight-"):
	if cls.startswith(prefix):
	return cls[len(prefix):]
	return ""


	def _is_layout_table(table) -> bool:
	if table.get("role") == "presentation":
	return True
	if not table.find("th"):
	rows = table.find_all("tr")
	if rows and all(len(row.find_all(["td", "th"])) <= 1 for row in rows):
	return True
	return False


	def _clean_markdown(text: str) -> str:
	text = re.sub(r"\n{4,}", "\n\n\n", text)
	text = re.sub(r"^[\-+] $", "", text, flags=re.MULTILINE)
	text = re.sub(r"\[\s\]\([^)]\)", "", text)
	text = re.sub(r"\{1,3}\s\*{1,3}", "", text)
	text = re.sub(r"_{1,3}\s*_{1,3}", "", text)
	text = re.sub(r"(^-{3,}\s*$\n?){2,}", "---\n", text, flags=re.MULTILINE)
	text = re.sub(r"(^\{3,}\s$\n?){2,}", "---\n", text, flags=re.MULTILINE)
	text = re.sub(r"^[\|\-\s]+$", "", text, flags=re.MULTILINE)
	text = re.sub(r"\n{4,}", "\n\n\n", text)
	return text.strip()


	def _html_to_markdown(
	    html_content: str,
	    *,
	    url: str | None = None,
	    include_links: bool = True,
	    include_images: bool = False,
	) -> str:
	    """Convert an HTML document into cleaned-up markdown.

	    Steps, in order: parse with BeautifulSoup; narrow to ``<main>``,
	    ``[role='main']`` or ``<article>`` when one holds more than 200 characters
	    of text; make URL attributes absolute; remove non-content elements by tag
	    name, by ``_SELECTORS_TO_REMOVE`` and by the cookie/consent heuristic;
	    unwrap layout tables; turn ``role="separator"`` elements into ``<hr>``; tag
	    ``<pre>`` blocks with their language; convert with html2text; rewrite the
	    language tags as fenced code blocks; finish with ``_clean_markdown``.

	    Args:
	        html_content: Raw HTML text.
	        url: Base URL for resolving relative URL attributes. Nothing is
	            resolved when it is ``None`` or empty.
	        include_links: When false, ``<a>``/``<link>`` elements are removed and
	            html2text is told to ignore links.
	        include_images: When false, ``<img>`` elements are removed and
	            html2text is told to ignore images.

	    Returns:
	        The markdown text, stripped of leading and trailing whitespace.
	    """
	    soup = BeautifulSoup(html_content, "html.parser")

	    # Narrow to the first main-content container with more than 200 characters
	    # of text; shorter candidates are ignored.
	    content_root = None
	    for selector in ["main", "[role='main']", "article"]:
	        candidate = soup.select_one(selector)
	        if candidate and len(candidate.get_text(strip=True)) > 200:
	            content_root = candidate
	            break

	    if content_root is not None:
	        new_soup = BeautifulSoup("<div></div>", "html.parser")
	        div = new_soup.find("div")
	        assert div is not None
	        div.append(content_root)
	        soup = new_soup

	    # Resolve relative URLs. Image sources need this too, so the step also runs
	    # when links are stripped but images are kept.
	    if url and (include_links or include_images):
	        url_attributes = ("href", "src", "action", "data", "poster", "background", "cite", "formaction")
	        absolute_prefixes = ("http://", "https://", "mailto:", "tel:", "javascript:")
	        for tag in soup.find_all(True):
	            for attribute in url_attributes:
	                attr_value = tag.get(attribute)
	                # Missing attributes and multi-valued (list) ones are skipped.
	                if not isinstance(attr_value, str):
	                    continue
	                if attr_value.startswith("//"):
	                    tag[attribute] = f"https:{attr_value}"
	                elif not attr_value.startswith(absolute_prefixes):
	                    tag[attribute] = urljoin(url, attr_value)

	    # Remove non-content elements by tag
	    elements_to_remove = [
	        "head", "header", "footer", "script", "source", "style",
	        "svg", "iframe", "nav", "aside", "form", "noscript", "template",
	    ]
	    if not include_links:
	        # NOTE(review): decomposing <a> also discards the anchor text, not
	        # just the URL - confirm that is the intended meaning of --no-links.
	        elements_to_remove.extend(["a", "link"])
	    if not include_images:
	        elements_to_remove.append("img")
	    for element in elements_to_remove:
	        for tag in soup.find_all(element):
	            tag.decompose()

	    # Remove non-content elements by CSS selector
	    for selector in _SELECTORS_TO_REMOVE:
	        for tag in soup.select(selector):
	            tag.decompose()

	    # Remove cookie/consent banners. Matches are collected first and destroyed
	    # afterwards: decompose() also wipes every descendant, and reading the
	    # attributes of a wiped tag later in the same pass fails.
	    consent_banners = []
	    for tag in soup.find_all(True):
	        # A "cookie" class on the document root describes the page, not a
	        # banner; removing it would delete all content.
	        if tag.name in ("html", "body"):
	            continue
	        class_attr = tag.get("class") or []
	        classes = class_attr if isinstance(class_attr, str) else " ".join(class_attr)
	        tag_id = tag.get("id") or ""
	        if _COOKIE_CONSENT_PATTERN.search(classes) or _COOKIE_CONSENT_PATTERN.search(tag_id):
	            consent_banners.append(tag)
	    for tag in consent_banners:
	        # Nested matches are already gone once their ancestor was removed.
	        if not getattr(tag, "decomposed", False):
	            tag.decompose()

	    # Unwrap layout tables
	    for table in soup.find_all("table"):
	        if _is_layout_table(table):
	            table.unwrap()

	    # Convert separator elements to <hr>
	    for element in soup.find_all(attrs={"role": "separator"}):
	        hr = soup.new_tag("hr")
	        element.replace_with(hr)
	        hr.insert_before(soup.new_string("\n"))
	        hr.insert_after(soup.new_string("\n"))

	    # Extract language hints and mark code blocks
	    for pre in soup.find_all("pre"):
	        lang = _extract_code_language(pre)
	        if lang:
	            marker = soup.new_string(f"{_LANG_MARKER_PREFIX}{lang}{_LANG_MARKER_SUFFIX}")
	            pre.insert_before(marker)

	    result_html = str(soup)

	    converter = html2text.HTML2Text()
	    converter.ignore_links = not include_links
	    converter.ignore_images = not include_images
	    converter.body_width = 0
	    converter.protect_links = True
	    converter.unicode_snob = True
	    converter.skip_internal_links = True
	    converter.wrap_links = False

	    markdown = converter.handle(result_html)

	    marker_prefix = re.escape(_LANG_MARKER_PREFIX)
	    marker_suffix = re.escape(_LANG_MARKER_SUFFIX)
	    # Language names are not always \w+: c++, c#, objective-c, asp.net ...
	    lang_name = r"[\w+#.\-]+"

	    # Fix up language-annotated fenced code blocks: <!--lang:python-->``` → ```python
	    markdown = re.sub(
	        rf"{marker_prefix}({lang_name}){marker_suffix}\s*```",
	        r"```\1",
	        markdown,
	    )

	    # Fix up language-annotated indented code blocks (html2text uses 4-space indent for <pre>)
	    # <!--lang:python-->\n    \n    code\n → ```python\ncode\n```
	    def _replace_indented_block(m: re.Match) -> str:
	        lang = m.group(1)
	        block = m.group(2)
	        # Strip the 4-space (or tab) indent from every line
	        dedented = []
	        for line in block.rstrip("\n").split("\n"):
	            if line.startswith("    "):
	                dedented.append(line[4:])
	            elif line.startswith("\t"):
	                dedented.append(line[1:])
	            else:
	                dedented.append(line.strip())
	        # Drop leading/trailing blank lines inside the block
	        start, end = 0, len(dedented)
	        while start < end and not dedented[start].strip():
	            start += 1
	        while end > start and not dedented[end - 1].strip():
	            end -= 1
	        body = dedented[start:end]
	        if not body:
	            return ""
	        return f"```{lang}\n" + "\n".join(body) + "\n```"

	    # Match marker followed by all lines starting with whitespace (covers blank indented lines)
	    markdown = re.sub(
	        rf"{marker_prefix}({lang_name}){marker_suffix}\n((?:[ \t][^\n]*\n)*)",
	        _replace_indented_block,
	        markdown,
	    )

	    # Remove any remaining orphaned lang markers, whatever text they carry, so
	    # none can leak into the output.
	    markdown = re.sub(
	        rf"{marker_prefix}[^\n]*?{marker_suffix}\n*",
	        "",
	        markdown,
	    )

	    return _clean_markdown(markdown)


	# ── Fetching ───────────────────────────────────────────────────────────────

	def _is_url(s: str) -> bool:
	parsed = urlparse(s)
	return parsed.scheme in ("http", "https")


	def _fetch_url(url: str, timeout: int = 10, debug: bool = False) -> str:
	    """Download *url* and return the response body as text.

	    The request carries a Chrome-style User-Agent built from one of two
	    randomly chosen platform descriptions, follows redirects, and raises
	    through ``raise_for_status`` on an error status.

	    Args:
	        url: Address to fetch.
	        timeout: Value handed to ``httpx.Client`` as its timeout.
	        debug: When true, announce the fetch on stderr.

	    Returns:
	        The decoded response text.
	    """
	    import random

	    os_list = [
	        ("Windows NT 10.0", "Win64; x64"),
	        ("Macintosh; Apple M2 Mac OS X 14_2_1", "arm64"),
	    ]
	    os_name, platform = random.choice(os_list)
	    # Only the part of the OS description before the first "; " is used.
	    ua = f"Mozilla/5.0 ({os_name.split('; ')[0]}; {platform}) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.6261.128 Safari/537.36"

	    headers = {
	        "User-Agent": ua,
	        # The trailing wildcard must read "*/*"; a bare "/" is not a media range.
	        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	        "Accept-Language": "en-US,en;q=0.5",
	    }
	    if debug:
	        print(f"[html-to-md] fetching {url}", file=sys.stderr)

	    with httpx.Client(timeout=timeout, follow_redirects=True, headers=headers) as client:
	        response = client.get(url)
	        response.raise_for_status()
	        return response.text


	# ── CLI ────────────────────────────────────────────────────────────────────

	def _build_parser() -> argparse.ArgumentParser:
	    """Build the argument parser for the html-to-md command line."""
	    # __doc__ is None when docstrings are stripped (python -OO).
	    module_doc = __doc__ or ""
	    parser = argparse.ArgumentParser(
	        prog="html-to-md",
	        description="Convert HTML to clean markdown optimised for LLM consumption.",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog=module_doc.split("Examples:")[1].strip() if "Examples:" in module_doc else "",
	    )
	    parser.add_argument(
	        "input",
	        nargs="?",
	        default="-",
	        metavar="FILE|URL|-",
	        help="HTML file, URL to fetch, or - for stdin (default: -)",
	    )
	    parser.add_argument(
	        "-o", "--output",
	        metavar="FILE",
	        help="Write output to FILE instead of stdout",
	    )
	    parser.add_argument(
	        "--url",
	        metavar="URL",
	        help="Base URL for resolving relative links when input is a file or stdin",
	    )
	    parser.add_argument(
	        "--no-links",
	        action="store_true",
	        help="Strip all hyperlinks from output",
	    )
	    parser.add_argument(
	        "--include-images",
	        action="store_true",
	        help="Preserve image references in output",
	    )
	    parser.add_argument(
	        "--timeout",
	        type=int,
	        default=10,
	        metavar="SECS",
	        help="HTTP timeout in seconds when fetching a URL (default: 10)",
	    )
	    parser.add_argument(
	        "--debug", "-D",
	        action="store_true",
	        help="Print extra info to stderr",
	    )
	    return parser


	def _read_input(args: argparse.Namespace) -> tuple[str, str | None]:
	    """Load the HTML selected by ``args.input``.

	    Returns:
	        ``(html, base_url)``. ``base_url`` is ``args.url``; when the input is
	        itself a URL and ``--url`` was not given, it is that URL.

	    On a failed fetch or file read, prints an ``error:`` line to stderr and
	    exits with status 1.
	    """
	    base_url: str | None = args.url

	    if args.input == "-":
	        if args.debug:
	            print("[html-to-md] reading from stdin", file=sys.stderr)
	        return sys.stdin.read(), base_url

	    if _is_url(args.input):
	        base_url = base_url or args.input
	        try:
	            return _fetch_url(args.input, timeout=args.timeout, debug=args.debug), base_url
	        except httpx.HTTPStatusError as e:
	            print(f"error: HTTP {e.response.status_code} fetching {args.input}", file=sys.stderr)
	            sys.exit(1)
	        except httpx.RequestError as e:
	            print(f"error: {e}", file=sys.stderr)
	            sys.exit(1)

	    if args.debug:
	        print(f"[html-to-md] reading {args.input}", file=sys.stderr)
	    try:
	        # errors="replace": a page saved in a legacy encoding should still
	        # convert instead of aborting with UnicodeDecodeError.
	        with open(args.input, encoding="utf-8", errors="replace") as fh:
	            return fh.read(), base_url
	    except FileNotFoundError:
	        print(f"error: file not found: {args.input}", file=sys.stderr)
	        sys.exit(1)
	    except OSError as e:
	        print(f"error: {e}", file=sys.stderr)
	        sys.exit(1)


	def main() -> None:
	    """Run the html-to-md command line.

	    Parses the arguments, loads HTML from a URL, stdin or a file, converts it
	    with ``_html_to_markdown`` and writes the markdown to ``--output`` (with a
	    trailing newline) or prints it to stdout. Input and output failures are
	    reported as a single ``error:`` line on stderr with exit status 1.
	    """
	    args = _build_parser().parse_args()
	    html_content, base_url = _read_input(args)

	    if args.debug:
	        print(f"[html-to-md] converting {len(html_content):,} bytes of HTML", file=sys.stderr)

	    markdown = _html_to_markdown(
	        html_content,
	        url=base_url,
	        include_links=not args.no_links,
	        include_images=args.include_images,
	    )

	    if args.debug:
	        print(f"[html-to-md] output: {len(markdown):,} chars", file=sys.stderr)

	    if not args.output:
	        print(markdown)
	        return

	    # An unwritable destination is a user error, reported like input errors.
	    try:
	        with open(args.output, "w", encoding="utf-8") as fh:
	            fh.write(markdown)
	            fh.write("\n")
	    except OSError as e:
	        print(f"error: {e}", file=sys.stderr)
	        sys.exit(1)
	    if args.debug:
	        print(f"[html-to-md] written to {args.output}", file=sys.stderr)


	# Run the CLI only when the file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found