Created
September 18, 2025 07:51
-
-
Save talentestors/ada2b348e34c5bcf0321167348a49cf1 to your computer and use it in GitHub Desktop.
Crawl wiki images and articles, and convert wiki articles to Markdown format
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # /// script | |
| # requires-python = ">=3.12" | |
| # dependencies = [ | |
| # "beautifulsoup4", | |
| # "markdownify", | |
| # "opencc-python-reimplemented", | |
| # "requests", | |
| # "wikipedia", | |
| # ] | |
| # /// | |
| # from: https://github.com/Vyenkor/wiki-exporter-md | |
| # uv run wiki_gui.py | |
| import wikipedia | |
| from markdownify import markdownify as md | |
| from opencc import OpenCC | |
| from bs4 import BeautifulSoup | |
| import requests | |
| import os | |
| import threading | |
| import tkinter as tk | |
| from tkinter import messagebox | |
| import re | |
| # 繁体→简体转换器 | |
| cc = OpenCC('t2s') | |
| # 正则模式 | |
| REF_HEADING_RE = re.compile(r'^#{1,6}\s*(参考文献|References)', re.IGNORECASE) | |
| # 匹配脚注引用,如 [[7]](#cite_note-...) | |
| CITE_RE = re.compile(r"\[\[(\d+)\]\]\(#cite_note-[^)]+\)") | |
| # 匹配普通外部链接 | |
| LINK_RE = re.compile(r"\[([^\]]+)\]\((?!#)[^)]+\)") | |
| # 匹配内部锚点链接(#开头) | |
| ANCHOR_RE = re.compile(r"\[([^\]]+)\]\(#[^)]+\)") | |
| def fetch_content(source: str, by_title: bool, lang: str): | |
| """按词条或URL获取页面标题和HTML内容""" | |
| wikipedia.set_lang(lang) | |
| if by_title: | |
| page = wikipedia.page(source) | |
| return page.title, page.html() | |
| resp = requests.get(source) | |
| resp.raise_for_status() | |
| soup = BeautifulSoup(resp.text, 'html.parser') | |
| title = soup.find(id='firstHeading').get_text() | |
| html = str(soup.find('div', id='mw-content-text')) | |
| return title, html | |
| def process_html(title: str, html: str, lang: str, output_dir: str = 'output') -> str: | |
| """ | |
| 清理HTML,下载并重命名图片,转换为Markdown,处理链接与脚注 | |
| """ | |
| soup = BeautifulSoup(html, 'html.parser') | |
| for span in soup.select('.mw-editsection'): | |
| span.decompose() | |
| # 下载并本地化图片 | |
| img_dir = os.path.join(output_dir, 'images') | |
| os.makedirs(img_dir, exist_ok=True) | |
| for idx, img in enumerate(soup.find_all('img'), start=1): | |
| src = img.get('src') or img.get('data-src') | |
| if not src: | |
| continue | |
| url = src if src.startswith('http') else ( | |
| 'https:' + src if src.startswith('//') else f'https://{lang}.wikipedia.org' + src | |
| ) | |
| ext = os.path.splitext(url.split('?')[0])[1] | |
| new_name = f"{title.replace('/', '_')}_{idx}{ext}" | |
| local_rel = os.path.join('images', new_name) | |
| full_path = os.path.join(output_dir, local_rel) | |
| try: | |
| r = requests.get(url, timeout=10) | |
| r.raise_for_status() | |
| with open(full_path, 'wb') as f: | |
| f.write(r.content) | |
| img['src'] = local_rel | |
| except Exception: | |
| continue | |
| # 转为Markdown并简体化 | |
| text = md(str(soup)) | |
| if lang == 'zh': | |
| text = cc.convert(text) | |
| # 分离正文与参考文献 | |
| lines = text.splitlines() | |
| for i, line in enumerate(lines): | |
| if REF_HEADING_RE.match(line): | |
| main, ref = '\n'.join(lines[:i]), '\n'.join(lines[i:]) | |
| break | |
| else: | |
| main, ref = text, '' | |
| # 去除红链及“页面不存在”提示 | |
| main = re.sub(r"\[([^\]]+)\]\([^)]*redlink=1[^)]*\)", '', main) | |
| main = re.sub(r"&action=edit&redlink=1[^\s]*", '', main) | |
| main = re.sub(r"(页面不存在)", '', main) | |
| # 图片链接转换为 ![[词条_序号]] | |
| main = re.sub( | |
| r"!\[([^\]]+?)\]\(images/([^\)]+?)\)", | |
| lambda m: f"![[{title.replace('/', '_')}_{m.group(2).split('_')[-1].split('.')[0]}]]", | |
| main | |
| ) | |
| # 脚注引用转换,如 [[7]](...) -> $^{7}$ | |
| main = CITE_RE.sub(lambda m: f"$^{{{m.group(1)}}}$", main) | |
| # 移除所有内部锚点链接,转换为加粗文本 | |
| main = ANCHOR_RE.sub(lambda m: f"**{m.group(1)}**", main) | |
| # 外部链接转换为加粗文本 | |
| main = LINK_RE.sub(lambda m: f"**{m.group(1)}**", main) | |
| # 参考文献区块处理 | |
| if ref: | |
| ref = re.sub(r"\[([^\]]+)\]\([^)]*redlink=1[^)]*\)", '', ref) | |
| ref = CITE_RE.sub(lambda m: f"$^{{{m.group(1)}}}$", ref) | |
| ref = ANCHOR_RE.sub(lambda m: f"**{m.group(1)}**", ref) | |
| ref = LINK_RE.sub(lambda m: f"**{m.group(1)}**", ref) | |
| # 写入Markdown文件 | |
| safe_title = title.replace('/', '_') | |
| os.makedirs(output_dir, exist_ok=True) | |
| path = os.path.join(output_dir, f"{safe_title}.md") | |
| with open(path, 'w', encoding='utf-8') as f: | |
| f.write(f"# {title}\n\n{main}\n{ref}") | |
| return path | |
| def run_export(entry: str, mode: str, lang: str): | |
| """ | |
| 启动导出流程 | |
| """ | |
| by_title = (mode == 'title') | |
| title, html = fetch_content(entry, by_title, lang) | |
| return process_html(title, html, lang) | |
| def on_fetch(): | |
| """GUI 触发:读取输入并启动后台任务""" | |
| entry = entry_input.get().strip() | |
| if not entry: | |
| messagebox.showwarning('输入错误', '请填写词条或URL。') | |
| return | |
| btn_fetch.config(state='disabled') | |
| status_label.config(text='⏳ 处理中…') | |
| def update_status(status: str): | |
| """在主线程更新GUI状态""" | |
| status_label.config(text=status) | |
| btn_fetch.config(state='normal') | |
| def task(): | |
| try: | |
| out = run_export(entry, mode_var.get(), lang_var.get()) | |
| status = f'✅ 成功:{out}' | |
| except Exception as e: | |
| status = f'❌ 错误:{e}' | |
| root.after(0, lambda: update_status(status)) | |
| threading.Thread(target=task, daemon=True).start() | |
| # GUI 界面初始化 | |
| root = tk.Tk() | |
| root.title('维基导出工具') | |
| root.geometry('500x260') | |
| root.resizable(False, False) | |
| frame = tk.Frame(root, padx=15, pady=15) | |
| frame.pack(fill=tk.BOTH, expand=True) | |
| mode_var = tk.StringVar(value='title') | |
| tk.Radiobutton(frame, text='按词条搜索', variable=mode_var, value='title').grid(row=0, column=0, sticky='w') | |
| tk.Radiobutton(frame, text='按页面URL', variable=mode_var, value='url').grid(row=0, column=1, sticky='w') | |
| tk.Label(frame, text='词条或URL:').grid(row=1, column=0, pady=8, sticky='e') | |
| entry_input = tk.Entry(frame, width=40) | |
| entry_input.grid(row=1, column=1, columnspan=2, pady=8) | |
| lang_var = tk.StringVar(value='zh') | |
| tk.Label(frame, text='语言:').grid(row=2, column=0, sticky='e') | |
| tk.Radiobutton(frame, text='中文', variable=lang_var, value='zh').grid(row=2, column=1) | |
| tk.Radiobutton(frame, text='English', variable=lang_var, value='en').grid(row=2, column=2) | |
| btn_fetch = tk.Button(frame, text='开始导出', width=12, command=on_fetch) | |
| btn_fetch.grid(row=3, column=1, pady=15) | |
| status_label = tk.Label(frame, text='', wraplength=460, justify='left') | |
| status_label.grid(row=4, column=0, columnspan=3) | |
| root.mainloop() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment