Created
July 27, 2025 12:41
-
-
Save skrungly/912a825ec032df3c17ce89d4321d0ad0 to your computer and use it in GitHub Desktop.
A script to generate an Anki deck from the NTNU NoW1 (Norwegian on the Web 1) online resources.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import time | |
| import json | |
| from html.parser import HTMLParser | |
| from pathlib import Path | |
| import genanki | |
| import requests | |
# Vocabulary page for one chapter; "{chapter}" is substituted per request.
VOCAB_URL = "https://www.ntnu.edu/now/{chapter}/vocabulary"

# Chapters to scrape: 1 through 10, both ends included.
CHAPTERS = [number for number in range(1, 11)]

# Pause, in seconds, after each audio file that is actually downloaded.
AUDIO_DL_RATE = 1

# Maps the value of a row's "C" column to the tag used on the note.
# The None entry covers rows that have no "C" value at all.
WORD_TYPES = {
    "n": "Noun",
    "v": "Verb",
    "a": "Adjective",
    "d": "Adverb",
    "p": "Preposition",
    None: "Other",
}
# Anki note type for words that have a pronunciation clip. Besides the
# reading and spelling cards it adds a listening card driven by the
# "Audio" field.
audio_model = genanki.Model(
    1653378496,
    "Comprehensive Model",
    fields=[
        {"name": field_name}
        for field_name in ("Norsk", "English", "Audio")
    ],
    templates=[
        {
            # Norwegian shown, English revealed.
            "name": "Reading",
            "qfmt": "«{{Norsk}}»",
            "afmt": "{{FrontSide}} <hr id=answer> {{English}}",
        },
        {
            # English shown, Norwegian typed in; the clip plays on the answer.
            "name": "Spelling",
            "qfmt": '"{{English}}" {{type:Norsk}}',
            "afmt": '"{{English}}" <hr id=answer> {{type:Norsk}} {{Audio}}',
        },
        {
            # Clip only, English revealed.
            "name": "Listening",
            "qfmt": "{{Audio}}",
            "afmt": "{{Audio}} <hr id=answer> {{English}}",
        },
    ],
)
# Anki note type for words without a pronunciation clip: the same reading
# and spelling cards as the audio model, but no "Audio" field.
simple_model = genanki.Model(
    1169572294,
    "Simple Model",
    fields=[
        {"name": field_name}
        for field_name in ("Norsk", "English")
    ],
    templates=[
        {
            # Norwegian shown, English revealed.
            "name": "Reading",
            "qfmt": "«{{Norsk}}»",
            "afmt": "{{FrontSide}} <hr id=answer> {{English}}",
        },
        {
            # English shown, Norwegian typed in.
            "name": "Spelling",
            "qfmt": '"{{English}}" {{type:Norsk}}',
            "afmt": '"{{English}}" <hr id=answer> {{type:Norsk}}',
        },
    ],
)
class BasicNote(genanki.Note):
    """Note whose GUID depends only on its first two fields.

    Notes in this script are built with the Norwegian text first and the
    English text second, so further fields (such as audio) can be added
    later without changing the GUID.
    """

    @property
    def guid(self):
        """Return the GUID computed from the first two fields."""
        norsk, english = self.fields[0], self.fields[1]
        return genanki.guid_for(norsk, english)
class VocabTableParser(HTMLParser):
    """Collect vocabulary rows from the ``<tbody>`` of a vocabulary page.

    Only markup inside ``<tbody>`` is inspected. ``<th>`` cells define the
    column names and every later row of ``<td>`` cells becomes a dict mapping
    column name to cell text. The ``src`` of an ``<audio>`` tag is stored
    under the ``"Audio"`` key. Finished rows are appended to ``word_list``;
    rows that produced no data (such as the header row) are not recorded.

    Cell text is stored exactly as received (not stripped). If a cell yields
    several non-blank text chunks, the last one wins.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self.parsing_table = False

        self.headers = []
        self.parsing_header = False
        # text collected for the <th> currently being parsed
        self._header_text = ""

        self.skip_data = False
        self.current_word = {}
        self.word_list = []
        self.column = 0

    def handle_starttag(self, tag, attrs):
        """Track table/row/cell state and pick up audio sources."""
        if tag == "tbody":
            self.parsing_table = True

        if not self.parsing_table:
            return

        if tag == "tr":
            self.column = 0

        elif tag == "th":
            self.parsing_header = True
            self._header_text = ""

        elif tag == "audio":
            for attr, value in attrs:
                if attr == "src":
                    self.current_word["Audio"] = value
                    # ignore any text nested inside the <audio> element
                    self.skip_data = True
                    break

    def handle_endtag(self, tag):
        """Advance the column counter and finish headers and rows."""
        if tag == "tbody":
            self.parsing_table = False

        if not self.parsing_table:
            return

        if tag == "th":
            if self.parsing_header:
                # Record the header when the cell closes rather than when
                # text arrives: an empty <th></th> produces no text event at
                # all, and a header split into several chunks must still
                # yield exactly one column name.
                # the unlabelled column contains prefixes like "å" and "en"
                self.headers.append(self._header_text.strip() or "Prefix")
            self.column += 1
            self.parsing_header = False

        elif tag == "td":
            self.column += 1

        elif tag == "audio":
            self.skip_data = False

        elif tag == "tr":
            if self.current_word:
                self.word_list.append(self.current_word)
                self.current_word = {}

    def handle_data(self, data):
        """Store text under the current column, or collect header text."""
        if not self.parsing_table:
            return

        if self.parsing_header:
            self._header_text += data
            return

        if not data.strip() or self.skip_data:
            return

        # A cell beyond the known headers has no column name to file it
        # under; drop it instead of failing with an IndexError.
        if self.column >= len(self.headers):
            return

        column_name = self.headers[self.column]
        self.current_word[column_name] = data
def fetch_vocab_list(vocab_path=None, timeout=30):
    """Return the vocabulary rows for every chapter in ``CHAPTERS``.

    If ``vocab_path`` is given and the file exists, its JSON content is
    returned and no request is made. Otherwise each chapter page is
    downloaded and parsed with ``VocabTableParser``, and the combined list is
    written to ``vocab_path`` (when given) before being returned.

    Args:
        vocab_path: Optional location of the JSON cache, as a ``Path`` or a
            string.
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        A list of dicts, one per vocabulary row.

    Raises:
        requests.HTTPError: If a chapter page responds with an error status.
    """
    if vocab_path:
        # accept plain strings as well as Path objects
        vocab_path = Path(vocab_path)

    if vocab_path and vocab_path.exists():
        print(f"using existing data from {vocab_path}")
        # explicit encoding: the platform default is not UTF-8 everywhere
        with open(vocab_path, encoding="utf-8") as vocab_file:
            return json.load(vocab_file)

    vocab_list = []
    print(f"fetching terms for {len(CHAPTERS)} chapters...")

    for chapter in CHAPTERS:
        response = requests.get(
            VOCAB_URL.format(chapter=chapter),
            timeout=timeout,
        )
        response.raise_for_status()

        # a fresh parser per page so headers and rows never leak across pages
        parser = VocabTableParser()
        parser.feed(response.text)

        print(f"fetched {len(parser.word_list)} terms for chapter {chapter}")
        vocab_list.extend(parser.word_list)

    if vocab_path:
        with open(vocab_path, "w", encoding="utf-8") as vocab_file:
            # keep letters such as "å" readable in the cache file
            json.dump(vocab_list, vocab_file, indent=2, ensure_ascii=False)

    return vocab_list
def fetch_audio(audio_url, timeout=30):
    """Return the local path of the audio file behind ``audio_url``.

    The file name is the part of the URL after its last ``/`` and the file is
    kept in the ``audio`` directory relative to the working directory. A file
    that already exists there is reused without any request. Otherwise it is
    downloaded, saved, and the function sleeps for ``AUDIO_DL_RATE`` seconds
    to throttle consecutive downloads.

    Args:
        audio_url: URL of the audio file.
        timeout: Seconds to wait on the HTTP request before giving up, so a
            stalled connection cannot hang the script indefinitely.

    Returns:
        The ``Path`` of the local file, or ``None`` if the server did not
        answer with status 200.
    """
    audio_name = audio_url.rsplit("/", 1)[1]
    audio_path = Path("audio") / audio_name
    print(f"looking for {audio_name}... ", end="")

    if audio_path.exists():
        print("cached")
        return audio_path

    response = requests.get(audio_url, timeout=timeout)
    if response.status_code != 200:
        print("not found")
        return None

    # the cache directory does not exist on a first run; create it on demand
    audio_path.parent.mkdir(parents=True, exist_ok=True)
    with open(audio_path, "wb") as audio_file:
        audio_file.write(response.content)

    print("fetched")
    time.sleep(AUDIO_DL_RATE)
    return audio_path
def generate_anki_package(word_list):
    """Build the "Norwegian on the Web 1" Anki package from vocabulary rows.

    Each row becomes one note. The Norwegian field is the word, preceded by
    its prefix when the row has one. Rows without an English term, and rows
    whose English term contains "proper name", are skipped and listed at the
    end. A note uses ``audio_model`` when an audio file could be obtained for
    the row and ``simple_model`` otherwise.

    Args:
        word_list: Iterable of dicts as produced by ``fetch_vocab_list``.
            Every row must have "Word" and "TXT"; "Prefix", "English", "C"
            and "Audio" are optional.

    Returns:
        A ``genanki.Package`` holding the deck and the audio files it uses.
    """
    deck = genanki.Deck(1220042721, "Norwegian on the Web 1")
    package = genanki.Package(deck)

    skipped_words = []

    for word_data in word_list:
        prefix = word_data.get("Prefix")
        word = word_data["Word"]
        norsk_field = f"{prefix} {word}" if prefix else word

        english_field = word_data.get("English")
        if not english_field:
            print(f"no english term found for {word!r}")
            skipped_words.append(word_data)
            continue

        if "proper name" in english_field:
            print(f"skipping proper name {word!r}")
            skipped_words.append(word_data)
            continue

        # An unlisted word-class code is tagged "Other", like a missing one,
        # instead of aborting the whole run with a KeyError.
        word_type = WORD_TYPES.get(word_data.get("C"), WORD_TYPES[None])
        initial_text = word_data["TXT"]
        chapter = initial_text[:2]
        note_tags = [word_type, initial_text, chapter]

        # Rows without an audio clip carry no "Audio" key; they get a
        # simple-model note rather than raising a KeyError.
        audio_url = word_data.get("Audio")
        audio_path = fetch_audio(audio_url) if audio_url else None

        if audio_path:
            audio_field = f"[sound:{audio_path.name}]"
            package.media_files.append(audio_path)

            note = BasicNote(
                model=audio_model,
                tags=note_tags,
                fields=[norsk_field, english_field, audio_field]
            )

        else:
            note = BasicNote(
                model=simple_model,
                tags=note_tags,
                fields=[norsk_field, english_field]
            )

        deck.add_note(note)

    print("done! skipped words:", json.dumps(skipped_words, indent=2))
    return package
if __name__ == "__main__":
    # vocab.json is the scrape cache: read when present, written otherwise
    words = fetch_vocab_list(Path("vocab.json"))
    anki_package = generate_anki_package(words)
    anki_package.write_to_file("now1.apkg")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment