Apple Dictionary Extractor
Parses AppleDict-style XML (entries wrapped in <article> tags containing <key>, <synonym>, and a CDATA <definition> block) and writes each entry's headword, grammar, main definition, phrases, and derivatives to structured JSON.
import json
import re
import sys

from bs4 import BeautifulSoup
from tqdm import tqdm


def strip_html_tags(html: str) -> str:
    """Remove HTML tags for plain text extraction."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n", strip=True)
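
# Example (illustrative): strip_html_tags("<b>hi</b> <i>there</i>") -> "hi\nthere"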


def extract_sections(definition_html: str):
    """Extract headword, grammar, main, phrases, derivatives from AppleDict HTML."""
    soup = BeautifulSoup(definition_html, "html.parser")
    result = {}
    # Headword
    hw = soup.find(class_="hw")
    result["headword"] = hw.get_text(" ", strip=True) if hw else None
    # Grammar (all .posg and .pos, plus inflections)
    grammar = []
    for g in soup.select(".posg, .pos"):
        text = g.get_text(" ", strip=True)
        if text and text not in grammar:
            grammar.append(text)
    # Add inflections (e.g. -и, род. мн. пе́сен)
    inflections = []
    for inf in soup.select(".infg, .inf, .sy"):
        text = inf.get_text(" ", strip=True)
        if text and text not in inflections:
            inflections.append(text)
    grammar_line = ", ".join(inflections + grammar)
    result["grammar"] = grammar_line if grammar_line else None
    # Main definition (all .df under .msDict)
    main_defs = []
    for d in soup.select(".msDict .df"):
        main_defs.append(d.get_text(" ", strip=True))
    # Add examples (all .ex under .msDict, including those nested in .eg;
    # deduplicate so nested examples are not collected twice)
    examples = []
    for ex in soup.select(".msDict .ex"):
        text = ex.get_text(" ", strip=True)
        if text and text not in examples:
            examples.append(text)
    # Add all |-separated short examples (e.g. | Массовая п.)
    for lbl in soup.select(".msDict .lbl"):
        text = lbl.get_text(" ", strip=True)
        if text and text not in examples:
            examples.append(text)
    # Compose main definition: first sense plus the collected examples
    main_lines = []
    if main_defs:
        main_lines.append(main_defs[0])
    if examples:
        main_lines.extend(examples)
    result["main_definition"] = "\n".join(main_lines).strip() if main_lines else None
    # Phrases (under .t_phrases)
    phrases = []
    for block in soup.select(".t_phrases .subEntry"):
        label = block.find(class_="l")
        meaning = block.find(class_="df")
        if label and meaning:
            phrases.append(f"{label.get_text(' ', strip=True)}\n— {meaning.get_text(' ', strip=True)}")
    result["phrases"] = "\n".join(phrases) if phrases else None
    # Derivatives (under .t_derivatives)
    derivatives = []
    for block in soup.select(".t_derivatives .subEntry"):
        parts = []
        # e.g. "уменьш." or "прил."
        reg = block.find(class_="reg")
        if reg:
            parts.append(reg.get_text(" ", strip=True))
        label = block.find(class_="l")
        if label:
            parts.append(label.get_text(" ", strip=True))
        inf = block.find(class_="inf")
        if inf:
            parts.append(inf.get_text(" ", strip=True))
        pos = block.find(class_="pos")
        if pos:
            parts.append(pos.get_text(" ", strip=True))
        # Example for derivative
        eg = block.find(class_="eg")
        if eg:
            parts.append(eg.get_text(" ", strip=True))
        line = ", ".join([p for p in parts if p])
        if line:
            derivatives.append(line)
    result["derivatives"] = "\n".join(derivatives) if derivatives else None
    # Remove empty fields
    return {k: v for k, v in result.items() if v}
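
# Example (illustrative markup; the class names mirror the selectors above):
#   extract_sections(
#       '<span class="hw">пе́сня</span>'
#       '<span class="posg"><span class="pos">ж.</span></span>'
#       '<div class="msDict"><span class="df">Словесно-музыкальное произведение для пения.</span></div>'
#   )
#   -> {'headword': 'пе́сня', 'grammar': 'ж.',
#       'main_definition': 'Словесно-музыкальное произведение для пения.'}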


def extract_entries_to_json(xml_path: str, output_json_path: str):
    """Extract entries from AppleDict-like XML with <article> tags and save as pretty-printed JSON."""
    with open(xml_path, "r", encoding="utf-8") as f:
        xml = f.read()
    chunks = xml.split("<article>")
    entries = []
    for chunk in tqdm(chunks[1:], desc="Extracting entries", unit="entry"):
        full_entry = "<article>" + chunk
        closing_tag_index = full_entry.find("</article>")
        if closing_tag_index == -1:
            continue
        full_entry = full_entry[: closing_tag_index + len("</article>")]
        key_match = re.search(r"<key>(.*?)</key>", full_entry)
        key = key_match.group(1) if key_match else None
        synonyms = re.findall(r"<synonym>(.*?)</synonym>", full_entry)
        # Extract definition HTML from the CDATA block
        definition_match = re.search(
            r'<definition(?:\s+type="(.*?)")?.*?<!\[CDATA\[(.*?)\]\]>',
            full_entry,
            re.DOTALL,
        )
        definition_html = definition_match.group(2).strip() if definition_match else None
        # Extract sections using the HTML parser
        sections = extract_sections(definition_html) if definition_html else {}
        entry = {
            "key": key,
            "synonyms": synonyms,
            **sections,
        }
        # Remove empty fields
        entry = {k: v for k, v in entry.items() if v}
        entries.append(entry)
    with open(output_json_path, "w", encoding="utf-8") as out_file:
        json.dump(entries, out_file, ensure_ascii=False, indent=2)
    print(f"✅ Extracted {len(entries)} entries to: {output_json_path}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python extract_appledict_entries.py input.xml output.json")
        sys.exit(1)
    input_xml = sys.argv[1]
    output_json = sys.argv[2]
    extract_entries_to_json(input_xml, output_json)
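
A quick end-to-end check, assuming the script is saved as extract_appledict_entries.py and given a minimal XML file in the <article>/<key>/<definition> shape the parser expects. The dictionary markup and the <dictionary> wrapper here are illustrative, not taken from a real AppleDict source:

    from extract_appledict_entries import extract_entries_to_json

    sample_xml = """<dictionary>
    <article>
    <key>песня</key>
    <synonym>песнь</synonym>
    <definition type="full"><![CDATA[
    <span class="hw">пе́сня</span>
    <span class="posg"><span class="pos">ж.</span></span>
    <div class="msDict"><span class="df">Словесно-музыкальное произведение для пения.</span></div>
    ]]></definition>
    </article>
    </dictionary>"""

    with open("sample.xml", "w", encoding="utf-8") as f:
        f.write(sample_xml)

    extract_entries_to_json("sample.xml", "sample.json")
    # sample.json now holds one entry:
    # [{"key": "песня", "synonyms": ["песнь"], "headword": "пе́сня",
    #   "grammar": "ж.", "main_definition": "Словесно-музыкальное произведение для пения."}]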