Skip to content

Instantly share code, notes, and snippets.

@yukiarimo
Created November 20, 2025 22:58
Show Gist options
  • Select an option

  • Save yukiarimo/98d849cc327418b5865a0cd8ecc8e53e to your computer and use it in GitHub Desktop.

Select an option

Save yukiarimo/98d849cc327418b5865a0cd8ecc8e53e to your computer and use it in GitHub Desktop.
Apple Dictionary Extractor
import json
import re
import sys
from tqdm import tqdm
from html import unescape
from bs4 import BeautifulSoup
def strip_html_tags(html: str) -> str:
    """Strip all markup from *html* and return the newline-joined visible text."""
    return BeautifulSoup(html, "html.parser").get_text(separator="\n", strip=True)
def extract_sections(definition_html: str):
    """Pull headword, grammar, main definition, phrases and derivatives
    out of an AppleDict entry's HTML body.

    Returns a dict containing only the sections that were actually found
    (empty/missing sections are dropped).
    """
    soup = BeautifulSoup(definition_html, "html.parser")

    def text_of(node):
        # Flatten a tag to single-space-separated, stripped text.
        return node.get_text(" ", strip=True)

    def unique_texts(selector):
        # Collect non-empty texts for a selector, preserving order, no duplicates.
        seen = []
        for node in soup.select(selector):
            t = text_of(node)
            if t and t not in seen:
                seen.append(t)
        return seen

    result = {}

    # Headword: first element carrying class "hw".
    hw = soup.find(class_="hw")
    result["headword"] = text_of(hw) if hw else None

    # Grammar line: inflections (.infg/.inf/.sy) first, then POS tags (.posg/.pos).
    pos_tags = unique_texts(".posg, .pos")
    inflections = unique_texts(".infg, .inf, .sy")
    result["grammar"] = ", ".join(inflections + pos_tags) or None

    # Main definition: only the FIRST .df under .msDict, followed by examples.
    definitions = [text_of(d) for d in soup.select(".msDict .df")]
    examples = [text_of(e) for e in soup.select(".msDict .ex")]
    examples += [text_of(e) for e in soup.select(".msDict .eg .ex")]
    # |-separated short examples (labels), deduplicated against examples.
    for lbl in soup.select(".msDict .lbl"):
        t = text_of(lbl)
        if t and t not in examples:
            examples.append(t)
    body_lines = definitions[:1] + examples
    result["main_definition"] = "\n".join(body_lines).strip() or None

    # Phrases: each .t_phrases subentry rendered as "label\n— meaning".
    phrase_lines = []
    for entry in soup.select(".t_phrases .subEntry"):
        lab = entry.find(class_="l")
        df = entry.find(class_="df")
        if lab and df:
            phrase_lines.append(f"{text_of(lab)}\n— {text_of(df)}")
    result["phrases"] = "\n".join(phrase_lines) if phrase_lines else None

    # Derivatives: comma-joined register / label / inflection / POS / example.
    derivative_lines = []
    for entry in soup.select(".t_derivatives .subEntry"):
        pieces = []
        for cls in ("reg", "l", "inf", "pos", "eg"):
            node = entry.find(class_=cls)
            if node:
                pieces.append(text_of(node))
        line = ", ".join(p for p in pieces if p)
        if line:
            derivative_lines.append(line)
    result["derivatives"] = "\n".join(derivative_lines) if derivative_lines else None

    # Drop any section that came out empty.
    return {key: value for key, value in result.items() if value}
def extract_entries_to_json(xml_path: str, output_json_path: str):
    """Extract entries from an AppleDict-like XML file and save them as JSON.

    The file is split on literal ``<article>`` tags; each article yields an
    entry dict with ``key``, ``synonyms`` and the sections produced by
    :func:`extract_sections`. Empty fields are removed before writing.

    Args:
        xml_path: Path of the input XML file (UTF-8).
        output_json_path: Path of the JSON file to write.
    """
    # Compile the patterns once, outside the per-article loop.
    key_re = re.compile(r"<key>(.*?)</key>")
    synonym_re = re.compile(r"<synonym>(.*?)</synonym>")
    # Group 1: optional type attribute (unused); group 2: the CDATA payload.
    definition_re = re.compile(
        r'<definition(?:\s+type="(.*?)")?.*?<!\[CDATA\[(.*?)\]\]>', re.DOTALL
    )

    with open(xml_path, "r", encoding="utf-8") as f:
        xml = f.read()

    chunks = xml.split("<article>")
    entries = []
    # chunks[0] is the preamble before the first <article>; skip it.
    for chunk in tqdm(chunks[1:], desc="Extracting entries", unit="entry"):
        full_entry = "<article>" + chunk
        closing_tag_index = full_entry.find("</article>")
        if closing_tag_index == -1:
            # Malformed fragment without a closing tag — skip it.
            continue
        full_entry = full_entry[: closing_tag_index + len("</article>")]

        key_match = key_re.search(full_entry)
        key = key_match.group(1) if key_match else None
        synonyms = synonym_re.findall(full_entry)

        # Definition body lives inside a CDATA block.
        definition_match = definition_re.search(full_entry)
        definition_html = definition_match.group(2).strip() if definition_match else None

        # Parse the HTML body into named sections (empty dict if no body).
        sections = extract_sections(definition_html) if definition_html else {}
        entry = {
            "key": key,
            "synonyms": synonyms,
            **sections,
        }
        # Remove empty fields (None key, empty synonym list, blank sections).
        entry = {k: v for k, v in entry.items() if v}
        entries.append(entry)

    with open(output_json_path, "w", encoding="utf-8") as out_file:
        json.dump(entries, out_file, ensure_ascii=False, indent=2)
    print(f"✅ Extracted {len(entries)} entries to: {output_json_path}")
if __name__ == "__main__":
    # CLI entry point: expects exactly an input XML path and an output JSON path.
    if len(sys.argv) != 3:
        print("Usage: python extract_appledict_entries.py input.xml output.json")
        sys.exit(1)
    extract_entries_to_json(sys.argv[1], sys.argv[2])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment