vincenthsu · January 27, 2026 03:49
diff --git a/site_to_pdf.py b/site_to_pdf.py
 #!/usr/bin/env python3

 """
 Site to PDF Converter
 =====================
 將 site 資料夾中的網站內容轉換成單一 PDF 檔案，方便給 LLM 閱讀。

 使用方法:
    python site_to_pdf.py [site_folder] [output_file]

 依賴套件:
    pip install beautifulsoup4 weasyprint
 """

 import os
 import sys
 import argparse
 from pathlib import Path
 from datetime import datetime

 try:
    from bs4 import BeautifulSoup
 except ImportError:
    print("請安裝 beautifulsoup4: pip install beautifulsoup4")
    sys.exit(1)

 try:
    from weasyprint import HTML, CSS
 except ImportError:
    print("請安裝 weasyprint: pip install weasyprint")
    sys.exit(1)


 def get_html_files(site_folder: Path) -> list[Path]:
    """Recursively collect the HTML pages under ``site_folder``.

    Every ``404.html`` is skipped, as is any ``*.html`` match that is not a
    regular file (for example a directory named ``foo.html``), because it
    cannot be read as a page later on.

    Args:
        site_folder: Root directory of the generated site.

    Returns:
        The page paths, with the root-level ``index.html`` first and all
        other pages ordered by their string path.
    """
    html_files = [
        candidate
        for candidate in site_folder.rglob("*.html")
        if candidate.name != "404.html" and candidate.is_file()
    ]

    def sort_key(path: Path) -> tuple[int, str]:
        # Rank 0 pins the site's landing page to the front; every other page
        # shares rank 1 and is ordered by its path string.
        is_root_index = path.name == "index.html" and path.parent == site_folder
        return (0 if is_root_index else 1, str(path))

    return sorted(html_files, key=sort_key)


 def extract_content(html_file: Path) -> dict:
    """Parse one HTML file and return its title and main content markup.

    The content element is the first match from a fixed, ordered list of
    lookups (``<main>``, ``<article>``, several class/id names) with
    ``<body>`` as the last resort. Script, style and navigation-like elements
    are removed from that element before it is serialized.

    Args:
        html_file: Path of the HTML file to read; it is decoded as UTF-8.

    Returns:
        A dict with three keys: ``"title"`` (text of the ``<title>`` tag, or
        ``""``), ``"content"`` (HTML string of the cleaned element, or a
        placeholder paragraph when no element was found) and ``"path"``
        (``str(html_file)``).
    """
    with open(html_file, "r", encoding="utf-8") as handle:
        soup = BeautifulSoup(handle.read(), "html.parser")

    title_node = soup.find("title")
    title = title_node.get_text(strip=True) if title_node else ""

    # Lookups are tried top to bottom; each entry is the keyword arguments
    # for one ``soup.find`` call.
    lookups = [
        {"name": "main"},
        {"name": "article"},
        {"class_": "md-content"},  # MkDocs Material
        {"class_": "content"},
        {"id": "content"},
        {"class_": "document"},
        {"class_": "body"},
        {"name": "body"},  # fallback when no dedicated content area exists
    ]

    content = None
    for query in lookups:
        content = soup.find(**query)
        if content:
            break

    if not content:
        content_html = "<p>無法提取內容</p>"
    else:
        # Drop elements that carry no document content.
        noise_tags = ["script", "style", "nav", "header", "footer", "aside"]
        for unwanted in content.find_all(noise_tags):
            unwanted.decompose()

        # Drop navigation chrome identified by class name.
        chrome_classes = ("md-sidebar", "md-header", "md-footer", "md-tabs", "md-search")
        for css_class in chrome_classes:
            for unwanted in content.find_all(class_=css_class):
                unwanted.decompose()

        content_html = str(content)

    return {"title": title, "content": content_html, "path": str(html_file)}


 def create_combined_html(pages: list[dict], site_folder: Path) -> str:
    """Build one printable HTML document from the extracted pages.

    The document consists of a cover page, a table of contents linking to
    ``#page-<index>`` anchors, and one ``<section>`` per page followed by a
    page break.

    Titles and paths are HTML-escaped before interpolation so characters such
    as ``&`` or ``<`` cannot corrupt the markup. Page content is inserted
    verbatim because it is already HTML. A page without a title is labelled
    with its site-relative path in both the table of contents and its heading.

    Args:
        pages: Dicts with the keys ``"title"``, ``"content"`` and ``"path"``.
        site_folder: Site root; every page path must be located inside it.

    Returns:
        The complete HTML document as a string.

    Raises:
        ValueError: If a page path is not inside ``site_folder`` (raised by
            ``Path.relative_to``).
    """
    # Function-scope import so this block needs no new module-level import.
    from html import escape

    toc_items = []
    pages_html = []
    for i, page in enumerate(pages):
        relative_path = escape(str(Path(page["path"]).relative_to(site_folder)))
        # Untitled pages fall back to the relative path rather than an empty
        # heading or an absolute filesystem path.
        heading = escape(page["title"]) if page["title"] else relative_path

        toc_items.append(f'<li><a href="#page-{i}">{heading}</a></li>')
        pages_html.append(f'''
        <section id="page-{i}" class="page-section">
            <div class="page-header">
                <h1>{heading}</h1>
                <p class="page-path">📄 {relative_path}</p>
            </div>
            <div class="page-content">
                {page["content"]}
            </div>
        </section>
        <div class="page-break"></div>
        ''')

    toc_html = f'<ul>{"".join(toc_items)}</ul>'
    sections_html = "".join(pages_html)
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M")

    # Literal CSS braces are doubled because the template is an f-string.
    return f'''<!DOCTYPE html>
 <html lang="zh-TW">
 <head>
    <meta charset="UTF-8">
    <title>Site Documentation</title>
    <style>
        @page {{
            size: A3;
            margin: 2cm;
        }}

        body {{
            font-family: "Noto Sans TC", "Microsoft JhengHei", sans-serif;
            font-size: 11pt;
            line-height: 1.6;
            color: #333;
            max-width: 100%;
        }}

        h1, h2, h3, h4, h5, h6 {{
            color: #2c3e50;
            margin-top: 1.5em;
            margin-bottom: 0.5em;
        }}

        h1 {{
            font-size: 20pt;
            border-bottom: 2px solid #3498db;
            padding-bottom: 0.3em;
        }}

        h2 {{
            font-size: 16pt;
            border-bottom: 1px solid #bdc3c7;
            padding-bottom: 0.2em;
        }}

        h3 {{
            font-size: 14pt;
        }}

        code {{
            font-family: "Source Code Pro", "Consolas", monospace;
            background-color: #f4f4f4;
            padding: 0.2em 0.4em;
            border-radius: 3px;
            font-size: 10pt;
        }}

        pre {{
            background-color: #f8f8f8;
            border: 1px solid #ddd;
            border-radius: 5px;
            padding: 1em;
            overflow-x: auto;
            font-size: 9pt;
            line-height: 1.4;
        }}

        pre code {{
            background-color: transparent;
            padding: 0;
        }}

        table {{
            border-collapse: collapse;
            width: 100%;
            margin: 1em 0;
            font-size: 10pt;
        }}

        th, td {{
            border: 1px solid #ddd;
            padding: 8px;
            text-align: left;
        }}

        th {{
            background-color: #3498db;
            color: white;
        }}

        tr:nth-child(even) {{
            background-color: #f9f9f9;
        }}

        .cover-page {{
            text-align: center;
            padding-top: 30%;
        }}

        .cover-page h1 {{
            font-size: 28pt;
            border: none;
        }}

        .cover-page .date {{
            margin-top: 2em;
            color: #7f8c8d;
        }}

        .toc {{
            page-break-after: always;
        }}

        .toc h2 {{
            font-size: 18pt;
        }}

        .toc ul {{
            list-style-type: none;
            padding-left: 0;
        }}

        .toc li {{
            padding: 0.3em 0;
            border-bottom: 1px dotted #ddd;
        }}

        .toc a {{
            color: #2980b9;
            text-decoration: none;
        }}

        .page-section {{
            page-break-inside: avoid;
        }}

        .page-header {{
            background-color: #ecf0f1;
            padding: 1em;
            margin-bottom: 1em;
            border-left: 4px solid #3498db;
        }}

        .page-path {{
            color: #7f8c8d;
            font-size: 9pt;
            margin: 0.5em 0 0 0;
        }}

        .page-break {{
            page-break-after: always;
        }}

        img {{
            max-width: 100%;
            height: auto;
        }}

        a {{
            color: #2980b9;
        }}

        blockquote {{
            border-left: 4px solid #3498db;
            margin: 1em 0;
            padding-left: 1em;
            color: #555;
        }}

        ul, ol {{
            margin: 0.5em 0;
            padding-left: 2em;
        }}

        li {{
            margin: 0.3em 0;
        }}
    </style>
 </head>
 <body>
    <!-- 封面 -->
    <div class="cover-page">
        <h1>📚 Site Documentation</h1>
        <p>網站文件合集</p>
        <p class="date">生成日期: {generated_at}</p>
        <p>共 {len(pages)} 頁文件</p>
    </div>
    <div class="page-break"></div>

    <!-- 目錄 -->
    <div class="toc">
        <h2>📋 目錄 Table of Contents</h2>
        {toc_html}
    </div>

    <!-- 各頁面內容 -->
    {sections_html}
 </body>
 </html>'''


 def main():
    """Command-line entry point: merge a site folder into one PDF or HTML file.

    Parses the arguments, collects the HTML files under the site folder,
    extracts each page, builds the combined document and writes it out.
    With ``--html-only`` the combined HTML is written next to the requested
    output name (suffix replaced by ``.html``); otherwise it is rendered to
    PDF, and the HTML is saved as a backup if rendering fails.

    The process exits with status 1 when the site folder is missing or is not
    a directory, when no HTML file is found, when no page could be extracted,
    or when PDF rendering fails.
    """
    parser = argparse.ArgumentParser(
        description="將 site 資料夾網站內容轉換成 PDF",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
 範例:
    python site_to_pdf.py site output.pdf
    python site_to_pdf.py ./site ./documentation.pdf
        """
    )
    parser.add_argument(
        "site_folder",
        nargs="?",
        default="site",
        help="網站資料夾路徑 (預設: site)"
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default="site_documentation.pdf",
        help="輸出 PDF 檔名 (預設: site_documentation.pdf)"
    )
    parser.add_argument(
        "--html-only",
        action="store_true",
        help="只輸出合併的 HTML 檔案，不轉換成 PDF"
    )

    args = parser.parse_args()

    site_folder = Path(args.site_folder).resolve()
    output_file = Path(args.output_file).resolve()

    if not site_folder.exists():
        print(f"❌ 錯誤: 找不到資料夾 {site_folder}")
        sys.exit(1)

    # A regular file passes exists() but can never contain pages.
    if not site_folder.is_dir():
        print(f"❌ 錯誤: {site_folder} 不是資料夾")
        sys.exit(1)

    print(f"📂 掃描資料夾: {site_folder}")

    html_files = get_html_files(site_folder)

    if not html_files:
        print("❌ 錯誤: 找不到任何 HTML 檔案")
        sys.exit(1)

    print(f"📄 找到 {len(html_files)} 個 HTML 檔案")

    # Extraction is best-effort per file: a page that fails is reported and
    # skipped so one bad file does not abort the whole run.
    print("📖 正在提取內容...")
    pages = []
    for html_file in html_files:
        try:
            page = extract_content(html_file)
            pages.append(page)
            print(f"  ✓ {html_file.relative_to(site_folder)}")
        except Exception as e:
            print(f"  ✗ {html_file.relative_to(site_folder)}: {e}")

    # Without this guard a run where every page failed would still produce a
    # document containing only the cover and an empty table of contents.
    if not pages:
        print("❌ 錯誤: 所有 HTML 檔案都無法提取內容")
        sys.exit(1)

    print("🔧 正在建立文件...")
    combined_html = create_combined_html(pages, site_folder)

    if args.html_only:
        html_output = output_file.with_suffix(".html")
        with open(html_output, "w", encoding="utf-8") as f:
            f.write(combined_html)
        print(f"✅ HTML 已儲存至: {html_output}")
    else:
        print("📝 正在轉換成 PDF...")
        try:
            HTML(string=combined_html, base_url=str(site_folder)).write_pdf(output_file)
        except Exception as e:
            print(f"❌ PDF 轉換失敗: {e}")
            # Keep the merged HTML so the extraction work is not lost.
            html_backup = output_file.with_suffix(".html")
            with open(html_backup, "w", encoding="utf-8") as f:
                f.write(combined_html)
            print(f"💾 已儲存 HTML 備份: {html_backup}")
            sys.exit(1)
        else:
            # Reported outside the try body so a failure here is not
            # mislabelled as a conversion failure.
            print(f"✅ PDF 已儲存至: {output_file}")
            print(f"📊 檔案大小: {output_file.stat().st_size / 1024:.1f} KB")


 # Run the converter only when this file is executed as a script, not when it
 # is imported as a module.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3

	"""
	Site to PDF Converter
	=====================
	將 site 資料夾中的網站內容轉換成單一 PDF 檔案，方便給 LLM 閱讀。

	使用方法:
	python site_to_pdf.py [site_folder] [output_file]

	依賴套件:
	pip install beautifulsoup4 weasyprint
	"""

	import os
	import sys
	import argparse
	from pathlib import Path
	from datetime import datetime

	try:
	from bs4 import BeautifulSoup
	except ImportError:
	print("請安裝 beautifulsoup4: pip install beautifulsoup4")
	sys.exit(1)

	try:
	from weasyprint import HTML, CSS
	except ImportError:
	print("請安裝 weasyprint: pip install weasyprint")
	sys.exit(1)


	def get_html_files(site_folder: Path) -> list[Path]:
	    """Recursively collect the HTML pages under ``site_folder``.

	    Every ``404.html`` is skipped, as is any ``*.html`` match that is not a
	    regular file (for example a directory named ``foo.html``), because it
	    cannot be read as a page later on.

	    Args:
	        site_folder: Root directory of the generated site.

	    Returns:
	        The page paths, with the root-level ``index.html`` first and all
	        other pages ordered by their string path.
	    """
	    html_files = [
	        candidate
	        for candidate in site_folder.rglob("*.html")
	        if candidate.name != "404.html" and candidate.is_file()
	    ]

	    def sort_key(path: Path) -> tuple[int, str]:
	        # Rank 0 pins the site's landing page to the front; every other page
	        # shares rank 1 and is ordered by its path string.
	        is_root_index = path.name == "index.html" and path.parent == site_folder
	        return (0 if is_root_index else 1, str(path))

	    return sorted(html_files, key=sort_key)


	def extract_content(html_file: Path) -> dict:
	    """Parse one HTML file and return its title and main content markup.

	    The content element is the first match from a fixed, ordered list of
	    lookups (``<main>``, ``<article>``, several class/id names) with
	    ``<body>`` as the last resort. Script, style and navigation-like elements
	    are removed from that element before it is serialized.

	    Args:
	        html_file: Path of the HTML file to read; it is decoded as UTF-8.

	    Returns:
	        A dict with three keys: ``"title"`` (text of the ``<title>`` tag, or
	        ``""``), ``"content"`` (HTML string of the cleaned element, or a
	        placeholder paragraph when no element was found) and ``"path"``
	        (``str(html_file)``).
	    """
	    with open(html_file, "r", encoding="utf-8") as handle:
	        soup = BeautifulSoup(handle.read(), "html.parser")

	    title_node = soup.find("title")
	    title = title_node.get_text(strip=True) if title_node else ""

	    # Lookups are tried top to bottom; each entry is the keyword arguments
	    # for one ``soup.find`` call.
	    lookups = [
	        {"name": "main"},
	        {"name": "article"},
	        {"class_": "md-content"},  # MkDocs Material
	        {"class_": "content"},
	        {"id": "content"},
	        {"class_": "document"},
	        {"class_": "body"},
	        {"name": "body"},  # fallback when no dedicated content area exists
	    ]

	    content = None
	    for query in lookups:
	        content = soup.find(**query)
	        if content:
	            break

	    if not content:
	        content_html = "<p>無法提取內容</p>"
	    else:
	        # Drop elements that carry no document content.
	        noise_tags = ["script", "style", "nav", "header", "footer", "aside"]
	        for unwanted in content.find_all(noise_tags):
	            unwanted.decompose()

	        # Drop navigation chrome identified by class name.
	        chrome_classes = ("md-sidebar", "md-header", "md-footer", "md-tabs", "md-search")
	        for css_class in chrome_classes:
	            for unwanted in content.find_all(class_=css_class):
	                unwanted.decompose()

	        content_html = str(content)

	    return {"title": title, "content": content_html, "path": str(html_file)}


	def create_combined_html(pages: list[dict], site_folder: Path) -> str:
	    """Build one printable HTML document from the extracted pages.

	    The document consists of a cover page, a table of contents linking to
	    ``#page-<index>`` anchors, and one ``<section>`` per page followed by a
	    page break.

	    Titles and paths are HTML-escaped before interpolation so characters such
	    as ``&`` or ``<`` cannot corrupt the markup. Page content is inserted
	    verbatim because it is already HTML. A page without a title is labelled
	    with its site-relative path in both the table of contents and its heading.

	    Args:
	        pages: Dicts with the keys ``"title"``, ``"content"`` and ``"path"``.
	        site_folder: Site root; every page path must be located inside it.

	    Returns:
	        The complete HTML document as a string.

	    Raises:
	        ValueError: If a page path is not inside ``site_folder`` (raised by
	            ``Path.relative_to``).
	    """
	    # Function-scope import so this block needs no new module-level import.
	    from html import escape

	    toc_items = []
	    pages_html = []
	    for i, page in enumerate(pages):
	        relative_path = escape(str(Path(page["path"]).relative_to(site_folder)))
	        # Untitled pages fall back to the relative path rather than an empty
	        # heading or an absolute filesystem path.
	        heading = escape(page["title"]) if page["title"] else relative_path

	        toc_items.append(f'<li><a href="#page-{i}">{heading}</a></li>')
	        pages_html.append(f'''
	        <section id="page-{i}" class="page-section">
	            <div class="page-header">
	                <h1>{heading}</h1>
	                <p class="page-path">📄 {relative_path}</p>
	            </div>
	            <div class="page-content">
	                {page["content"]}
	            </div>
	        </section>
	        <div class="page-break"></div>
	        ''')

	    toc_html = f'<ul>{"".join(toc_items)}</ul>'
	    sections_html = "".join(pages_html)
	    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M")

	    # Literal CSS braces are doubled because the template is an f-string.
	    return f'''<!DOCTYPE html>
	<html lang="zh-TW">
	<head>
	    <meta charset="UTF-8">
	    <title>Site Documentation</title>
	    <style>
	        @page {{
	            size: A3;
	            margin: 2cm;
	        }}

	        body {{
	            font-family: "Noto Sans TC", "Microsoft JhengHei", sans-serif;
	            font-size: 11pt;
	            line-height: 1.6;
	            color: #333;
	            max-width: 100%;
	        }}

	        h1, h2, h3, h4, h5, h6 {{
	            color: #2c3e50;
	            margin-top: 1.5em;
	            margin-bottom: 0.5em;
	        }}

	        h1 {{
	            font-size: 20pt;
	            border-bottom: 2px solid #3498db;
	            padding-bottom: 0.3em;
	        }}

	        h2 {{
	            font-size: 16pt;
	            border-bottom: 1px solid #bdc3c7;
	            padding-bottom: 0.2em;
	        }}

	        h3 {{
	            font-size: 14pt;
	        }}

	        code {{
	            font-family: "Source Code Pro", "Consolas", monospace;
	            background-color: #f4f4f4;
	            padding: 0.2em 0.4em;
	            border-radius: 3px;
	            font-size: 10pt;
	        }}

	        pre {{
	            background-color: #f8f8f8;
	            border: 1px solid #ddd;
	            border-radius: 5px;
	            padding: 1em;
	            overflow-x: auto;
	            font-size: 9pt;
	            line-height: 1.4;
	        }}

	        pre code {{
	            background-color: transparent;
	            padding: 0;
	        }}

	        table {{
	            border-collapse: collapse;
	            width: 100%;
	            margin: 1em 0;
	            font-size: 10pt;
	        }}

	        th, td {{
	            border: 1px solid #ddd;
	            padding: 8px;
	            text-align: left;
	        }}

	        th {{
	            background-color: #3498db;
	            color: white;
	        }}

	        tr:nth-child(even) {{
	            background-color: #f9f9f9;
	        }}

	        .cover-page {{
	            text-align: center;
	            padding-top: 30%;
	        }}

	        .cover-page h1 {{
	            font-size: 28pt;
	            border: none;
	        }}

	        .cover-page .date {{
	            margin-top: 2em;
	            color: #7f8c8d;
	        }}

	        .toc {{
	            page-break-after: always;
	        }}

	        .toc h2 {{
	            font-size: 18pt;
	        }}

	        .toc ul {{
	            list-style-type: none;
	            padding-left: 0;
	        }}

	        .toc li {{
	            padding: 0.3em 0;
	            border-bottom: 1px dotted #ddd;
	        }}

	        .toc a {{
	            color: #2980b9;
	            text-decoration: none;
	        }}

	        .page-section {{
	            page-break-inside: avoid;
	        }}

	        .page-header {{
	            background-color: #ecf0f1;
	            padding: 1em;
	            margin-bottom: 1em;
	            border-left: 4px solid #3498db;
	        }}

	        .page-path {{
	            color: #7f8c8d;
	            font-size: 9pt;
	            margin: 0.5em 0 0 0;
	        }}

	        .page-break {{
	            page-break-after: always;
	        }}

	        img {{
	            max-width: 100%;
	            height: auto;
	        }}

	        a {{
	            color: #2980b9;
	        }}

	        blockquote {{
	            border-left: 4px solid #3498db;
	            margin: 1em 0;
	            padding-left: 1em;
	            color: #555;
	        }}

	        ul, ol {{
	            margin: 0.5em 0;
	            padding-left: 2em;
	        }}

	        li {{
	            margin: 0.3em 0;
	        }}
	    </style>
	</head>
	<body>
	    <!-- 封面 -->
	    <div class="cover-page">
	        <h1>📚 Site Documentation</h1>
	        <p>網站文件合集</p>
	        <p class="date">生成日期: {generated_at}</p>
	        <p>共 {len(pages)} 頁文件</p>
	    </div>
	    <div class="page-break"></div>

	    <!-- 目錄 -->
	    <div class="toc">
	        <h2>📋 目錄 Table of Contents</h2>
	        {toc_html}
	    </div>

	    <!-- 各頁面內容 -->
	    {sections_html}
	</body>
	</html>'''


	def main():
	    """Command-line entry point: merge a site folder into one PDF or HTML file.

	    Parses the arguments, collects the HTML files under the site folder,
	    extracts each page, builds the combined document and writes it out.
	    With ``--html-only`` the combined HTML is written next to the requested
	    output name (suffix replaced by ``.html``); otherwise it is rendered to
	    PDF, and the HTML is saved as a backup if rendering fails.

	    The process exits with status 1 when the site folder is missing or is not
	    a directory, when no HTML file is found, when no page could be extracted,
	    or when PDF rendering fails.
	    """
	    parser = argparse.ArgumentParser(
	        description="將 site 資料夾網站內容轉換成 PDF",
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	        epilog="""
	範例:
	    python site_to_pdf.py site output.pdf
	    python site_to_pdf.py ./site ./documentation.pdf
	        """
	    )
	    parser.add_argument(
	        "site_folder",
	        nargs="?",
	        default="site",
	        help="網站資料夾路徑 (預設: site)"
	    )
	    parser.add_argument(
	        "output_file",
	        nargs="?",
	        default="site_documentation.pdf",
	        help="輸出 PDF 檔名 (預設: site_documentation.pdf)"
	    )
	    parser.add_argument(
	        "--html-only",
	        action="store_true",
	        help="只輸出合併的 HTML 檔案，不轉換成 PDF"
	    )

	    args = parser.parse_args()

	    site_folder = Path(args.site_folder).resolve()
	    output_file = Path(args.output_file).resolve()

	    if not site_folder.exists():
	        print(f"❌ 錯誤: 找不到資料夾 {site_folder}")
	        sys.exit(1)

	    # A regular file passes exists() but can never contain pages.
	    if not site_folder.is_dir():
	        print(f"❌ 錯誤: {site_folder} 不是資料夾")
	        sys.exit(1)

	    print(f"📂 掃描資料夾: {site_folder}")

	    html_files = get_html_files(site_folder)

	    if not html_files:
	        print("❌ 錯誤: 找不到任何 HTML 檔案")
	        sys.exit(1)

	    print(f"📄 找到 {len(html_files)} 個 HTML 檔案")

	    # Extraction is best-effort per file: a page that fails is reported and
	    # skipped so one bad file does not abort the whole run.
	    print("📖 正在提取內容...")
	    pages = []
	    for html_file in html_files:
	        try:
	            page = extract_content(html_file)
	            pages.append(page)
	            print(f" ✓ {html_file.relative_to(site_folder)}")
	        except Exception as e:
	            print(f" ✗ {html_file.relative_to(site_folder)}: {e}")

	    # Without this guard a run where every page failed would still produce a
	    # document containing only the cover and an empty table of contents.
	    if not pages:
	        print("❌ 錯誤: 所有 HTML 檔案都無法提取內容")
	        sys.exit(1)

	    print("🔧 正在建立文件...")
	    combined_html = create_combined_html(pages, site_folder)

	    if args.html_only:
	        html_output = output_file.with_suffix(".html")
	        with open(html_output, "w", encoding="utf-8") as f:
	            f.write(combined_html)
	        print(f"✅ HTML 已儲存至: {html_output}")
	    else:
	        print("📝 正在轉換成 PDF...")
	        try:
	            HTML(string=combined_html, base_url=str(site_folder)).write_pdf(output_file)
	        except Exception as e:
	            print(f"❌ PDF 轉換失敗: {e}")
	            # Keep the merged HTML so the extraction work is not lost.
	            html_backup = output_file.with_suffix(".html")
	            with open(html_backup, "w", encoding="utf-8") as f:
	                f.write(combined_html)
	            print(f"💾 已儲存 HTML 備份: {html_backup}")
	            sys.exit(1)
	        else:
	            # Reported outside the try body so a failure here is not
	            # mislabelled as a conversion failure.
	            print(f"✅ PDF 已儲存至: {output_file}")
	            print(f"📊 檔案大小: {output_file.stat().st_size / 1024:.1f} KB")


	# Run the converter only when this file is executed as a script, not when it
	# is imported as a module.
	if __name__ == "__main__":
	    main()
No results found