Created
January 27, 2026 03:49
-
-
Save vincenthsu/7e36ef3a61cade517c10f0693c2a2cfb to your computer and use it in GitHub Desktop.
Convert a website folder to PDF for LLM ingestion.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Site to PDF Converter | |
| ===================== | |
| 將 site 資料夾中的網站內容轉換成單一 PDF 檔案,方便給 LLM 閱讀。 | |
| 使用方法: | |
| python site_to_pdf.py [site_folder] [output_file] | |
| 依賴套件: | |
| pip install beautifulsoup4 weasyprint | |
| """ | |
| import os | |
| import sys | |
| import argparse | |
| from pathlib import Path | |
| from datetime import datetime | |
| try: | |
| from bs4 import BeautifulSoup | |
| except ImportError: | |
| print("請安裝 beautifulsoup4: pip install beautifulsoup4") | |
| sys.exit(1) | |
| try: | |
| from weasyprint import HTML, CSS | |
| except ImportError: | |
| print("請安裝 weasyprint: pip install weasyprint") | |
| sys.exit(1) | |
def get_html_files(site_folder: Path) -> list[Path]:
    """Recursively collect the HTML files under ``site_folder``.

    Args:
        site_folder: Root directory of the generated site.

    Returns:
        The matching files, with the root-level ``index.html`` first and
        every other file ordered by its path string. Entries named
        ``404.html`` are skipped at any depth, and directories whose name
        happens to end in ``.html`` are ignored.
    """

    def sort_key(path: Path) -> tuple[int, str]:
        # Rank 0 is reserved for the site's landing page; everything else
        # shares rank 1 and falls back to plain path-string ordering.
        is_root_index = path.name == "index.html" and path.parent == site_folder
        return (0 if is_root_index else 1, str(path))

    html_files = [
        path
        for path in site_folder.rglob("*.html")
        # rglob matches on the name only, so it can also yield directories;
        # those cannot be opened as pages later on.
        if path.is_file() and path.name != "404.html"
    ]
    return sorted(html_files, key=sort_key)
def extract_content(html_file: Path) -> dict:
    """Extract the title and the main content markup from one HTML file.

    Args:
        html_file: Path of the HTML file to read; it is decoded as UTF-8.

    Returns:
        A dict with three string entries: ``"title"`` (text of the
        ``<title>`` tag, or ``""`` when there is none), ``"content"`` (the
        cleaned markup of the main content element, or a placeholder
        paragraph when no element could be found) and ``"path"``
        (``str(html_file)``).

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    from urllib.parse import urljoin

    with open(html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    title_tag = soup.find("title")
    title = title_tag.get_text(strip=True) if title_tag else ""

    # Candidate content containers in priority order; the first match wins.
    content_selectors = [
        "main",
        "article",
        ".md-content",  # MkDocs Material
        ".content",
        "#content",
        ".document",
        ".body",
    ]
    content = None
    for selector in content_selectors:
        if selector.startswith("."):
            content = soup.find(class_=selector[1:])
        elif selector.startswith("#"):
            content = soup.find(id=selector[1:])
        else:
            content = soup.find(selector)
        if content:
            break

    # No dedicated content container: fall back to the whole body.
    if not content:
        content = soup.find("body")

    if content:
        # Drop elements that carry no document content.
        for tag in content.find_all(["script", "style", "nav", "header", "footer", "aside"]):
            tag.decompose()

        # Drop navigation chrome identified by these class names.
        for nav_class in ["md-sidebar", "md-header", "md-footer", "md-tabs", "md-search"]:
            for tag in content.find_all(class_=nav_class):
                tag.decompose()

        # The fragment is later merged into one document that is rendered
        # against a single base URL, so a page-relative ``src`` would no
        # longer resolve against this page's own folder. Pin each one to an
        # absolute file URI now; urljoin returns URLs that carry a different
        # scheme (http:, https:, data:, ...) unchanged.
        page_base = html_file.resolve().parent.as_uri() + "/"
        for tag in content.find_all(src=True):
            src = tag["src"].strip()
            if src:
                tag["src"] = urljoin(page_base, src)

        content_html = str(content)
    else:
        content_html = "<p>無法提取內容</p>"

    return {
        "title": title,
        "content": content_html,
        "path": str(html_file),
    }
def create_combined_html(pages: list[dict], site_folder: Path) -> str:
    """Build the single merged HTML document from the extracted pages.

    The document consists of a cover page, a table of contents linking to
    ``#page-<index>`` anchors, and one ``<section>`` per page.

    Args:
        pages: Page dicts with the string keys ``"title"``, ``"content"``
            and ``"path"``. ``"content"`` is inserted as raw markup;
            ``"title"`` and the path are treated as plain text and
            HTML-escaped.
        site_folder: Site root; each page path is displayed relative to it.

    Returns:
        The complete HTML document as a string.

    Raises:
        ValueError: If a page path is not located under ``site_folder``.
    """
    from html import escape

    # Kept outside the f-string below so the CSS braces need no doubling.
    stylesheet = """
@page {
    size: A3;
    margin: 2cm;
}
body {
    font-family: "Noto Sans TC", "Microsoft JhengHei", sans-serif;
    font-size: 11pt;
    line-height: 1.6;
    color: #333;
    max-width: 100%;
}
h1, h2, h3, h4, h5, h6 {
    color: #2c3e50;
    margin-top: 1.5em;
    margin-bottom: 0.5em;
}
h1 {
    font-size: 20pt;
    border-bottom: 2px solid #3498db;
    padding-bottom: 0.3em;
}
h2 {
    font-size: 16pt;
    border-bottom: 1px solid #bdc3c7;
    padding-bottom: 0.2em;
}
h3 {
    font-size: 14pt;
}
code {
    font-family: "Source Code Pro", "Consolas", monospace;
    background-color: #f4f4f4;
    padding: 0.2em 0.4em;
    border-radius: 3px;
    font-size: 10pt;
}
pre {
    background-color: #f8f8f8;
    border: 1px solid #ddd;
    border-radius: 5px;
    padding: 1em;
    overflow-x: auto;
    font-size: 9pt;
    line-height: 1.4;
}
pre code {
    background-color: transparent;
    padding: 0;
}
table {
    border-collapse: collapse;
    width: 100%;
    margin: 1em 0;
    font-size: 10pt;
}
th, td {
    border: 1px solid #ddd;
    padding: 8px;
    text-align: left;
}
th {
    background-color: #3498db;
    color: white;
}
tr:nth-child(even) {
    background-color: #f9f9f9;
}
.cover-page {
    text-align: center;
    padding-top: 30%;
}
.cover-page h1 {
    font-size: 28pt;
    border: none;
}
.cover-page .date {
    margin-top: 2em;
    color: #7f8c8d;
}
.toc {
    page-break-after: always;
}
.toc h2 {
    font-size: 18pt;
}
.toc ul {
    list-style-type: none;
    padding-left: 0;
}
.toc li {
    padding: 0.3em 0;
    border-bottom: 1px dotted #ddd;
}
.toc a {
    color: #2980b9;
    text-decoration: none;
}
.page-section {
    page-break-inside: avoid;
}
.page-header {
    background-color: #ecf0f1;
    padding: 1em;
    margin-bottom: 1em;
    border-left: 4px solid #3498db;
}
.page-path {
    color: #7f8c8d;
    font-size: 9pt;
    margin: 0.5em 0 0 0;
}
.page-break {
    page-break-after: always;
}
img {
    max-width: 100%;
    height: auto;
}
a {
    color: #2980b9;
}
blockquote {
    border-left: 4px solid #3498db;
    margin: 1em 0;
    padding-left: 1em;
    color: #555;
}
ul, ol {
    margin: 0.5em 0;
    padding-left: 2em;
}
li {
    margin: 0.3em 0;
}
"""

    toc_items = []
    pages_html = []
    for i, page in enumerate(pages):
        relative_path = str(Path(page["path"]).relative_to(site_folder))
        # Titles and paths are plain text: escape them so characters such as
        # "<" or "&" cannot break the surrounding markup. Pages without a
        # <title> are labelled by their relative path in both the table of
        # contents and the section header.
        heading = escape(page["title"] or relative_path)
        shown_path = escape(relative_path)

        toc_items.append(f'<li><a href="#page-{i}">{heading}</a></li>')
        pages_html.append(f'''
<section id="page-{i}" class="page-section">
<div class="page-header">
<h1>{heading}</h1>
<p class="page-path">📄 {shown_path}</p>
</div>
<div class="page-content">
{page["content"]}
</div>
</section>
<div class="page-break"></div>
''')

    toc_html = f'<ul>{"".join(toc_items)}</ul>'
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M")

    return f'''<!DOCTYPE html>
<html lang="zh-TW">
<head>
<meta charset="UTF-8">
<title>Site Documentation</title>
<style>{stylesheet}</style>
</head>
<body>
<!-- Cover page -->
<div class="cover-page">
<h1>📚 Site Documentation</h1>
<p>網站文件合集</p>
<p class="date">生成日期: {generated_at}</p>
<p>共 {len(pages)} 頁文件</p>
</div>
<div class="page-break"></div>
<!-- Table of contents -->
<div class="toc">
<h2>📋 目錄 Table of Contents</h2>
{toc_html}
</div>
<!-- Page sections -->
{"".join(pages_html)}
</body>
</html>'''
def main():
    """Command-line entry point: merge a site folder into one PDF or HTML file.

    Parses ``site_folder``, ``output_file`` and ``--html-only`` from the
    command line, collects and extracts the HTML pages, builds the combined
    document and writes it out. When the PDF conversion fails, the combined
    HTML is saved next to the requested output as a fallback.

    Exits with status 1 when the site folder is not a directory, when no
    HTML file is found, when no page could be extracted, or when the PDF
    conversion fails.
    """
    parser = argparse.ArgumentParser(
        description="將 site 資料夾網站內容轉換成 PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
範例:
python site_to_pdf.py site output.pdf
python site_to_pdf.py ./site ./documentation.pdf
"""
    )
    parser.add_argument(
        "site_folder",
        nargs="?",
        default="site",
        help="網站資料夾路徑 (預設: site)"
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="site_documentation.pdf",
        help="輸出 PDF 檔名 (預設: site_documentation.pdf)"
    )
    parser.add_argument(
        "--html-only",
        action="store_true",
        help="只輸出合併的 HTML 檔案,不轉換成 PDF"
    )
    args = parser.parse_args()

    site_folder = Path(args.site_folder).resolve()
    output_file = Path(args.output_file).resolve()

    # is_dir() rather than exists(): a regular file is not a usable site root.
    if not site_folder.is_dir():
        print(f"❌ 錯誤: 找不到資料夾 {site_folder}")
        sys.exit(1)

    print(f"📂 掃描資料夾: {site_folder}")

    # Collect every HTML file.
    html_files = get_html_files(site_folder)
    if not html_files:
        print("❌ 錯誤: 找不到任何 HTML 檔案")
        sys.exit(1)
    print(f"📄 找到 {len(html_files)} 個 HTML 檔案")

    # Extract the content; a page that fails is reported and skipped so one
    # broken file does not abort the whole run.
    print("📖 正在提取內容...")
    pages = []
    for html_file in html_files:
        try:
            page = extract_content(html_file)
            pages.append(page)
            print(f" ✓ {html_file.relative_to(site_folder)}")
        except Exception as e:
            print(f" ✗ {html_file.relative_to(site_folder)}: {e}")

    # Without a single extracted page the output would be an empty shell.
    if not pages:
        print("❌ 錯誤: 無法從任何 HTML 檔案提取內容")
        sys.exit(1)

    # Build the combined HTML.
    print("🔧 正在建立文件...")
    combined_html = create_combined_html(pages, site_folder)

    # Ensure the destination folder exists so neither the main output nor
    # the fallback HTML write fails on a missing directory.
    output_file.parent.mkdir(parents=True, exist_ok=True)

    if args.html_only:
        html_output = output_file.with_suffix(".html")
        with open(html_output, "w", encoding="utf-8") as f:
            f.write(combined_html)
        print(f"✅ HTML 已儲存至: {html_output}")
    else:
        # Convert to PDF.
        print("📝 正在轉換成 PDF...")
        try:
            HTML(string=combined_html, base_url=str(site_folder)).write_pdf(output_file)
            print(f"✅ PDF 已儲存至: {output_file}")
            print(f"📊 檔案大小: {output_file.stat().st_size / 1024:.1f} KB")
        except Exception as e:
            print(f"❌ PDF 轉換失敗: {e}")
            # Keep the combined HTML so the extraction work is not lost.
            html_backup = output_file.with_suffix(".html")
            with open(html_backup, "w", encoding="utf-8") as f:
                f.write(combined_html)
            print(f"💾 已儲存 HTML 備份: {html_backup}")
            sys.exit(1)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment