Apple Dictionary Extractor
Parses AppleDict-style XML (entries wrapped in <article> tags containing <key>, <synonym>, and a CDATA <definition> block) and writes each entry's headword, grammar, main definition, phrases, and derivatives to structured JSON.
import json
import re
import sys

from bs4 import BeautifulSoup
from tqdm import tqdm


def strip_html_tags(html: str) -> str:
    """Remove HTML tags for plain text extraction."""
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n", strip=True)
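
# Example (illustrative): strip_html_tags("<b>hi</b> <i>there</i>") -> "hi\nthere"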


def extract_sections(definition_html: str):
    """Extract headword, grammar, main, phrases, derivatives from AppleDict HTML."""
    soup = BeautifulSoup(definition_html, "html.parser")
    result = {}
    # Headword
    hw = soup.find(class_="hw")
    result["headword"] = hw.get_text(" ", strip=True) if hw else None
    # Grammar (all .posg and .pos, plus inflections)
    grammar = []
    for g in soup.select(".posg, .pos"):
        text = g.get_text(" ", strip=True)
        if text and text not in grammar:
            grammar.append(text)
    # Add inflections (e.g. -и, род. мн. пе́сен)
    inflections = []
    for inf in soup.select(".infg, .inf, .sy"):
        text = inf.get_text(" ", strip=True)
        if text and text not in inflections:
            inflections.append(text)
    grammar_line = ", ".join(inflections + grammar)
    result["grammar"] = grammar_line if grammar_line else None
    # Main definition (all .df under .msDict)
    main_defs = []
    for d in soup.select(".msDict .df"):
        main_defs.append(d.get_text(" ", strip=True))
    # Add examples (all .ex under .msDict, including those nested in .eg;
    # deduplicate so nested examples are not collected twice)
    examples = []
    for ex in soup.select(".msDict .ex"):
        text = ex.get_text(" ", strip=True)
        if text and text not in examples:
            examples.append(text)
    # Add all |-separated short examples (e.g. | Массовая п.)
    for lbl in soup.select(".msDict .lbl"):
        text = lbl.get_text(" ", strip=True)
        if text and text not in examples:
            examples.append(text)
    # Compose main definition: first sense plus the collected examples
    main_lines = []
    if main_defs:
        main_lines.append(main_defs[0])
    if examples:
        main_lines.extend(examples)
    result["main_definition"] = "\n".join(main_lines).strip() if main_lines else None
    # Phrases (under .t_phrases)
    phrases = []
    for block in soup.select(".t_phrases .subEntry"):
        label = block.find(class_="l")
        meaning = block.find(class_="df")
        if label and meaning:
            phrases.append(f"{label.get_text(' ', strip=True)}\n— {meaning.get_text(' ', strip=True)}")
    result["phrases"] = "\n".join(phrases) if phrases else None
    # Derivatives (under .t_derivatives)
    derivatives = []
    for block in soup.select(".t_derivatives .subEntry"):
        parts = []
        # e.g. "уменьш." or "прил."
        reg = block.find(class_="reg")
        if reg:
            parts.append(reg.get_text(" ", strip=True))
        label = block.find(class_="l")
        if label:
            parts.append(label.get_text(" ", strip=True))
        inf = block.find(class_="inf")
        if inf:
            parts.append(inf.get_text(" ", strip=True))
        pos = block.find(class_="pos")
        if pos:
            parts.append(pos.get_text(" ", strip=True))
        # Example for derivative
        eg = block.find(class_="eg")
        if eg:
            parts.append(eg.get_text(" ", strip=True))
        line = ", ".join([p for p in parts if p])
        if line:
            derivatives.append(line)
    result["derivatives"] = "\n".join(derivatives) if derivatives else None
    # Remove empty fields
    return {k: v for k, v in result.items() if v}
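
# Example (illustrative markup; the class names mirror the selectors above):
#   extract_sections(
#       '<span class="hw">пе́сня</span>'
#       '<span class="posg"><span class="pos">ж.</span></span>'
#       '<div class="msDict"><span class="df">Словесно-музыкальное произведение для пения.</span></div>'
#   )
#   -> {'headword': 'пе́сня', 'grammar': 'ж.',
#       'main_definition': 'Словесно-музыкальное произведение для пения.'}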


def extract_entries_to_json(xml_path: str, output_json_path: str):
    """Extract entries from AppleDict-like XML with <article> tags and save as pretty-printed JSON."""
    with open(xml_path, "r", encoding="utf-8") as f:
        xml = f.read()
    chunks = xml.split("<article>")
    entries = []
    for chunk in tqdm(chunks[1:], desc="Extracting entries", unit="entry"):
        full_entry = "<article>" + chunk
        closing_tag_index = full_entry.find("</article>")
        if closing_tag_index == -1:
            continue
        full_entry = full_entry[: closing_tag_index + len("</article>")]
        key_match = re.search(r"<key>(.*?)</key>", full_entry)
        key = key_match.group(1) if key_match else None
        synonyms = re.findall(r"<synonym>(.*?)</synonym>", full_entry)
        # Extract definition HTML from the CDATA block
        definition_match = re.search(
            r'<definition(?:\s+type="(.*?)")?.*?<!\[CDATA\[(.*?)\]\]>',
            full_entry,
            re.DOTALL,
        )
        definition_html = definition_match.group(2).strip() if definition_match else None
        # Extract sections using the HTML parser
        sections = extract_sections(definition_html) if definition_html else {}
        entry = {
            "key": key,
            "synonyms": synonyms,
            **sections,
        }
        # Remove empty fields
        entry = {k: v for k, v in entry.items() if v}
        entries.append(entry)
    with open(output_json_path, "w", encoding="utf-8") as out_file:
        json.dump(entries, out_file, ensure_ascii=False, indent=2)
    print(f"✅ Extracted {len(entries)} entries to: {output_json_path}")


if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python extract_appledict_entries.py input.xml output.json")
        sys.exit(1)
    input_xml = sys.argv[1]
    output_json = sys.argv[2]
    extract_entries_to_json(input_xml, output_json)
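
A quick end-to-end check, assuming the script is saved as extract_appledict_entries.py and given a minimal XML file in the <article>/<key>/<definition> shape the parser expects. The dictionary markup and the <dictionary> wrapper here are illustrative, not taken from a real AppleDict source:

    from extract_appledict_entries import extract_entries_to_json

    sample_xml = """<dictionary>
    <article>
    <key>песня</key>
    <synonym>песнь</synonym>
    <definition type="full"><![CDATA[
    <span class="hw">пе́сня</span>
    <span class="posg"><span class="pos">ж.</span></span>
    <div class="msDict"><span class="df">Словесно-музыкальное произведение для пения.</span></div>
    ]]></definition>
    </article>
    </dictionary>"""

    with open("sample.xml", "w", encoding="utf-8") as f:
        f.write(sample_xml)

    extract_entries_to_json("sample.xml", "sample.json")
    # sample.json now holds one entry:
    # [{"key": "песня", "synonyms": ["песнь"], "headword": "пе́сня",
    #   "grammar": "ж.", "main_definition": "Словесно-музыкальное произведение для пения."}]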