Skip to content

Instantly share code, notes, and snippets.

@vincenthsu
Created January 27, 2026 03:49
Show Gist options
  • Select an option

  • Save vincenthsu/7e36ef3a61cade517c10f0693c2a2cfb to your computer and use it in GitHub Desktop.

Select an option

Save vincenthsu/7e36ef3a61cade517c10f0693c2a2cfb to your computer and use it in GitHub Desktop.
Convert a website folder to PDF for LLM ingestion.
#!/usr/bin/env python3
"""
Site to PDF Converter
=====================

Convert the contents of a ``site`` folder into a single PDF file, making it
convenient for an LLM to read.

Usage:
    python site_to_pdf.py [site_folder] [output_file]

Dependencies:
    pip install beautifulsoup4 weasyprint
"""
import os
import sys
import argparse
from pathlib import Path
from datetime import datetime

# Fail fast with an installation hint when a third-party dependency is
# missing, instead of crashing later with a bare ImportError.
try:
    from bs4 import BeautifulSoup
except ImportError:
    print("請安裝 beautifulsoup4: pip install beautifulsoup4")
    sys.exit(1)

try:
    from weasyprint import HTML, CSS
except ImportError:
    print("請安裝 weasyprint: pip install weasyprint")
    sys.exit(1)
def get_html_files(site_folder: Path) -> list[Path]:
    """Recursively collect every HTML file under *site_folder*.

    ``404.html`` pages are excluded.  The site's top-level ``index.html``
    is ordered first; all remaining files follow in path order.
    """
    candidates = [
        p for p in site_folder.rglob("*.html") if p.name != "404.html"
    ]

    def _order(p: Path):
        # Bucket 0 is reserved for the root index.html so it always leads.
        is_root_index = p.name == "index.html" and p.parent == site_folder
        return (0 if is_root_index else 1, str(p))

    return sorted(candidates, key=_order)
def extract_content(html_file: Path) -> dict:
    """Extract the title and main content from a single HTML file.

    Returns a dict with keys:
        ``title``   -- text of the <title> tag, or "" when absent
        ``content`` -- an HTML fragment (string) with navigation chrome,
                       scripts and styles stripped out
        ``path``    -- the source file path as a string
    """
    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    # Page title, if present.
    title = ""
    title_tag = soup.find("title")
    if title_tag is not None:
        title = title_tag.get_text(strip=True)

    # Try to locate the main content region (usually <main>, <article>,
    # or a well-known theme-specific class/id).
    content = None
    content_selectors = [
        "main",
        "article",
        ".md-content",  # MkDocs Material
        ".content",
        "#content",
        ".document",
        ".body",
    ]
    for selector in content_selectors:
        if selector.startswith("."):
            content = soup.find(class_=selector[1:])
        elif selector.startswith("#"):
            content = soup.find(id=selector[1:])
        else:
            content = soup.find(selector)
        # BUGFIX: compare against None explicitly. A bs4 Tag with no
        # children is falsy (Tag truthiness follows len(tag.contents)),
        # so a bare `if content:` would skip a genuinely matched but
        # empty region and fall through to a wrong selector.
        if content is not None:
            break

    # Fall back to <body> when no dedicated content region was found.
    if content is None:
        content = soup.find("body")

    if content is not None:
        # Drop elements that only add noise to the PDF.
        for tag in content.find_all(["script", "style", "nav", "header", "footer", "aside"]):
            tag.decompose()
        # Remove MkDocs Material navigation chrome by class name.
        for nav_class in ["md-sidebar", "md-header", "md-footer", "md-tabs", "md-search"]:
            for tag in content.find_all(class_=nav_class):
                tag.decompose()
        content_html = str(content)
    else:
        content_html = "<p>無法提取內容</p>"

    return {
        "title": title,
        "content": content_html,
        "path": str(html_file),
    }
def create_combined_html(pages: list[dict], site_folder: Path) -> str:
    """Build one combined HTML document from the extracted pages.

    The document consists of a cover page, a table of contents linking to
    each page section, and one section per page.  ``pages`` is a list of
    dicts as produced by ``extract_content``; ``site_folder`` is used to
    render each page's path relative to the site root.
    """
    import html as _html  # stdlib; escape user-derived text

    # Table of contents.  BUGFIX: escape titles/paths -- they originate
    # from scraped documents and may contain &, <, > which would otherwise
    # break the markup (page content itself is intentionally raw HTML).
    toc_items = []
    for i, page in enumerate(pages):
        label = _html.escape(page["title"] or page["path"])
        toc_items.append(f'<li><a href="#page-{i}">{label}</a></li>')
    toc_html = f'<ul>{"".join(toc_items)}</ul>'

    # Per-page sections.
    pages_html = []
    for i, page in enumerate(pages):
        relative_path = Path(page["path"]).relative_to(site_folder)
        pages_html.append(f'''
<section id="page-{i}" class="page-section">
<div class="page-header">
<h1>{_html.escape(page["title"])}</h1>
<p class="page-path">📄 {_html.escape(str(relative_path))}</p>
</div>
<div class="page-content">
{page["content"]}
</div>
</section>
<div class="page-break"></div>
''')

    # Stylesheet kept as a plain string so no brace-doubling is needed.
    style = '''
@page {
    size: A3;
    margin: 2cm;
}
body {
    font-family: "Noto Sans TC", "Microsoft JhengHei", sans-serif;
    font-size: 11pt;
    line-height: 1.6;
    color: #333;
    max-width: 100%;
}
h1, h2, h3, h4, h5, h6 {
    color: #2c3e50;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
}
h1 {
    font-size: 20pt;
    border-bottom: 2px solid #3498db;
    padding-bottom: 0.3em;
}
h2 {
    font-size: 16pt;
    border-bottom: 1px solid #bdc3c7;
    padding-bottom: 0.2em;
}
h3 {
    font-size: 14pt;
}
code {
    font-family: "Source Code Pro", "Consolas", monospace;
    background-color: #f4f4f4;
    padding: 0.2em 0.4em;
    border-radius: 3px;
    font-size: 10pt;
}
pre {
    background-color: #f8f8f8;
    border: 1px solid #ddd;
    border-radius: 5px;
    padding: 1em;
    overflow-x: auto;
    font-size: 9pt;
    line-height: 1.4;
}
pre code {
    background-color: transparent;
    padding: 0;
}
table {
    border-collapse: collapse;
    width: 100%;
    margin: 1em 0;
    font-size: 10pt;
}
th, td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
}
th {
    background-color: #3498db;
    color: white;
}
tr:nth-child(even) {
    background-color: #f9f9f9;
}
.cover-page {
    text-align: center;
    padding-top: 30%;
}
.cover-page h1 {
    font-size: 28pt;
    border: none;
}
.cover-page .date {
    margin-top: 2em;
    color: #7f8c8d;
}
.toc {
    page-break-after: always;
}
.toc h2 {
    font-size: 18pt;
}
.toc ul {
    list-style-type: none;
    padding-left: 0;
}
.toc li {
    padding: 0.3em 0;
    border-bottom: 1px dotted #ddd;
}
.toc a {
    color: #2980b9;
    text-decoration: none;
}
.page-section {
    page-break-inside: avoid;
}
.page-header {
    background-color: #ecf0f1;
    padding: 1em;
    margin-bottom: 1em;
    border-left: 4px solid #3498db;
}
.page-path {
    color: #7f8c8d;
    font-size: 9pt;
    margin: 0.5em 0 0 0;
}
.page-break {
    page-break-after: always;
}
img {
    max-width: 100%;
    height: auto;
}
a {
    color: #2980b9;
}
blockquote {
    border-left: 4px solid #3498db;
    margin: 1em 0;
    padding-left: 1em;
    color: #555;
}
ul, ol {
    margin: 0.5em 0;
    padding-left: 2em;
}
li {
    margin: 0.3em 0;
}
'''

    # Full document: cover page, TOC, then all page sections.
    html = f'''<!DOCTYPE html>
<html lang="zh-TW">
<head>
<meta charset="UTF-8">
<title>Site Documentation</title>
<style>
{style}
</style>
</head>
<body>
<!-- 封面 -->
<div class="cover-page">
<h1>📚 Site Documentation</h1>
<p>網站文件合集</p>
<p class="date">生成日期: {datetime.now().strftime("%Y-%m-%d %H:%M")}</p>
<p>共 {len(pages)} 頁文件</p>
</div>
<div class="page-break"></div>
<!-- 目錄 -->
<div class="toc">
<h2>📋 目錄 Table of Contents</h2>
{toc_html}
</div>
<!-- 各頁面內容 -->
{"".join(pages_html)}
</body>
</html>'''
    return html
def main():
    """Command-line entry point: scan a site folder and write a PDF."""
    arg_parser = argparse.ArgumentParser(
        description="將 site 資料夾網站內容轉換成 PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
範例:
  python site_to_pdf.py site output.pdf
  python site_to_pdf.py ./site ./documentation.pdf
""",
    )
    arg_parser.add_argument(
        "site_folder",
        nargs="?",
        default="site",
        help="網站資料夾路徑 (預設: site)",
    )
    arg_parser.add_argument(
        "output_file",
        nargs="?",
        default="site_documentation.pdf",
        help="輸出 PDF 檔名 (預設: site_documentation.pdf)",
    )
    arg_parser.add_argument(
        "--html-only",
        action="store_true",
        help="只輸出合併的 HTML 檔案,不轉換成 PDF",
    )
    opts = arg_parser.parse_args()

    site_folder = Path(opts.site_folder).resolve()
    output_file = Path(opts.output_file).resolve()

    if not site_folder.exists():
        print(f"❌ 錯誤: 找不到資料夾 {site_folder}")
        sys.exit(1)

    print(f"📂 掃描資料夾: {site_folder}")

    # Collect every HTML file under the site root.
    html_files = get_html_files(site_folder)
    if not html_files:
        print("❌ 錯誤: 找不到任何 HTML 檔案")
        sys.exit(1)
    print(f"📄 找到 {len(html_files)} 個 HTML 檔案")

    # Extract content from each file; a single bad file only logs a
    # failure line and is skipped rather than aborting the whole run.
    print("📖 正在提取內容...")
    pages = []
    for html_file in html_files:
        try:
            pages.append(extract_content(html_file))
            print(f"  ✓ {html_file.relative_to(site_folder)}")
        except Exception as e:
            print(f"  ✗ {html_file.relative_to(site_folder)}: {e}")

    print("🔧 正在建立文件...")
    merged_html = create_combined_html(pages, site_folder)

    if opts.html_only:
        # HTML-only mode: write the combined document and stop.
        html_path = output_file.with_suffix(".html")
        with open(html_path, "w", encoding="utf-8") as f:
            f.write(merged_html)
        print(f"✅ HTML 已儲存至: {html_path}")
    else:
        print("📝 正在轉換成 PDF...")
        try:
            # base_url lets WeasyPrint resolve relative resources (images
            # etc.) against the site folder.
            HTML(string=merged_html, base_url=str(site_folder)).write_pdf(output_file)
            print(f"✅ PDF 已儲存至: {output_file}")
            print(f"📊 檔案大小: {output_file.stat().st_size / 1024:.1f} KB")
        except Exception as e:
            print(f"❌ PDF 轉換失敗: {e}")
            # Best effort: keep the merged HTML so the run is not wasted.
            fallback = output_file.with_suffix(".html")
            with open(fallback, "w", encoding="utf-8") as f:
                f.write(merged_html)
            print(f"💾 已儲存 HTML 備份: {fallback}")
            sys.exit(1)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment