Skip to content

Instantly share code, notes, and snippets.

@skrungly
Created July 27, 2025 12:41
Show Gist options
  • Select an option

  • Save skrungly/912a825ec032df3c17ce89d4321d0ad0 to your computer and use it in GitHub Desktop.

Select an option

Save skrungly/912a825ec032df3c17ce89d4321d0ad0 to your computer and use it in GitHub Desktop.
A script to generate an Anki deck from the NTNU NoW1 (Norwegian on the Web 1) online resources.
import time
import json
from html.parser import HTMLParser
from pathlib import Path
import genanki
import requests
# vocabulary page of one chapter; {chapter} is filled in with the chapter number
VOCAB_URL = "https://www.ntnu.edu/now/{chapter}/vocabulary"

# chapters to scrape: 1 through 10 inclusive
CHAPTERS = [*range(1, 11)]

# pause after every real audio download, in seconds
AUDIO_DL_RATE = 1

# maps the code stored under a word's "C" key to the tag put on its note;
# a word with no "C" entry at all is looked up under None
WORD_TYPES = {
    "n": "Noun",
    "v": "Verb",
    "a": "Adjective",
    "d": "Adverb",
    "p": "Preposition",
    None: "Other",
}
# anki note type for words that come with an audio clip: every note yields
# three cards (reading, spelling and listening)
audio_model = genanki.Model(
    1653378496,
    "Comprehensive Model",
    fields=[{"name": field} for field in ("Norsk", "English", "Audio")],
    templates=[
        dict(
            name="Reading",
            qfmt="«{{Norsk}}»",
            afmt="{{FrontSide}} <hr id=answer> {{English}}",
        ),
        dict(
            name="Spelling",
            qfmt='"{{English}}" {{type:Norsk}}',
            afmt='"{{English}}" <hr id=answer> {{type:Norsk}} {{Audio}}',
        ),
        dict(
            name="Listening",
            qfmt="{{Audio}}",
            afmt="{{Audio}} <hr id=answer> {{English}}",
        ),
    ],
)
# anki note type for words without an audio clip: only the reading and
# spelling cards, and no Audio field
simple_model = genanki.Model(
    1169572294,
    "Simple Model",
    fields=[{"name": field} for field in ("Norsk", "English")],
    templates=[
        dict(
            name="Reading",
            qfmt="«{{Norsk}}»",
            afmt="{{FrontSide}} <hr id=answer> {{English}}",
        ),
        dict(
            name="Spelling",
            qfmt='"{{English}}" {{type:Norsk}}',
            afmt='"{{English}}" <hr id=answer> {{type:Norsk}}',
        ),
    ],
)
# a Note whose GUID depends on nothing but its first two fields, which
# leaves room to add further fields later without the GUID changing.
class BasicNote(genanki.Note):
    """Note identified by its first two fields only."""

    @property
    def guid(self):
        first, second = self.fields[0], self.fields[1]
        return genanki.guid_for(first, second)
class VocabTableParser(HTMLParser):
    """Collect the rows of a vocabulary table into a list of dicts.

    Only markup inside ``<tbody>`` is looked at. Each ``<th>`` cell names a
    column, and each later ``<tr>`` that produced any data becomes one dict
    in ``word_list``, keyed by those column names. The ``src`` of an
    ``<audio>`` tag is stored under the extra key ``"Audio"``.

    Attributes:
        headers: column names in the order their ``<th>`` cells were closed.
        word_list: one dict per parsed row.

    Raises:
        IndexError: from ``handle_data`` when text shows up in a cell for
            which no header has been registered.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.parsing_table = False
        self.headers = []
        self.parsing_header = False
        self.skip_data = False
        self.current_word = {}
        self.word_list = []
        self.column = 0
        # text chunks seen so far inside the <th> that is currently open
        self._header_chunks = []

    def handle_starttag(self, tag, attrs):
        """Track table, row, header and audio openings."""
        if tag == "tbody":
            self.parsing_table = True

        if not self.parsing_table:
            return

        if tag == "tr":
            self.column = 0
        elif tag == "th":
            self.parsing_header = True
            self._header_chunks = []
        elif tag == "audio":
            for attr, value in attrs:
                if attr == "src":
                    self.current_word["Audio"] = value
                    # text inside <audio> is ignored until the tag closes
                    self.skip_data = True
                    break

    def handle_endtag(self, tag):
        """Advance the column counter and finish headers and rows."""
        if tag == "tbody":
            self.parsing_table = False

        if not self.parsing_table:
            return

        if tag == "th":
            if self.parsing_header:
                # register the header when the cell closes, so that a cell
                # with no text at all still claims its column; the
                # unlabelled column contains prefixes like "å" and "en"
                label = "".join(self._header_chunks).strip()
                self.headers.append(label or "Prefix")
            self.column += 1
            self.parsing_header = False
        elif tag == "td":
            self.column += 1
        elif tag == "audio":
            self.skip_data = False
        elif tag == "tr":
            if self.current_word:
                self.word_list.append(self.current_word)
            self.current_word = {}

    def handle_data(self, data):
        """Store cell text under the header of the current column."""
        if not self.parsing_table:
            return

        if self.parsing_header:
            self._header_chunks.append(data)
            return

        if not data.strip() or self.skip_data:
            return

        column_name = self.headers[self.column]
        # the text is stored as received, without stripping
        self.current_word[column_name] = data
def fetch_vocab_list(vocab_path=None, timeout=30):
    """Return the vocabulary of all chapters, using a JSON cache if given.

    Args:
        vocab_path: optional ``Path`` of a JSON cache file. If it exists its
            content is returned and nothing is downloaded; otherwise the
            freshly scraped list is written to it.
        timeout: seconds to wait for the server on each chapter request.

    Returns:
        A list of word dicts as produced by ``VocabTableParser``, or
        whatever the cache file contains.

    Raises:
        requests.HTTPError: when a chapter page answers with an error status.
    """
    if vocab_path and vocab_path.exists():
        print(f"using existing data from {vocab_path}")
        # explicit encoding: the locale default is not UTF-8 everywhere
        with open(vocab_path, encoding="utf-8") as vocab_file:
            return json.load(vocab_file)

    vocab_list = []
    print(f"fetching terms for {len(CHAPTERS)} chapters...")

    for chapter in CHAPTERS:
        # without a timeout a stalled connection would block forever
        response = requests.get(
            VOCAB_URL.format(chapter=chapter), timeout=timeout
        )
        response.raise_for_status()

        parser = VocabTableParser()
        parser.feed(response.text)
        print(f"fetched {len(parser.word_list)} terms for chapter {chapter}")
        vocab_list.extend(parser.word_list)

    if vocab_path:
        with open(vocab_path, "w", encoding="utf-8") as vocab_file:
            json.dump(vocab_list, vocab_file, indent=2)

    return vocab_list
def fetch_audio(audio_url, timeout=30):
    """Download one audio clip into ``audio/`` unless it is already there.

    Args:
        audio_url: URL of the clip; the text after its last "/" is used as
            the local file name.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The ``Path`` of the clip below ``audio/``, or None when the server
        answers with anything other than HTTP 200.
    """
    audio_name = audio_url.rsplit("/", 1)[1]
    audio_path = Path("audio") / audio_name
    print(f"looking for {audio_name}... ", end="")

    if audio_path.exists():
        print("cached")
        return audio_path

    # without a timeout a stalled connection would block forever
    response = requests.get(audio_url, timeout=timeout)
    if response.status_code != 200:
        print("not found")
        return None

    # nothing else in this script creates the audio directory, so make sure
    # it exists before the first clip is written
    audio_path.parent.mkdir(parents=True, exist_ok=True)
    with open(audio_path, "wb") as audio_file:
        audio_file.write(response.content)
    print("fetched")

    # only real downloads are rate-limited; cached clips returned above
    time.sleep(AUDIO_DL_RATE)
    return audio_path
def generate_anki_package(word_list):
    """Build the Anki package for a list of scraped vocabulary entries.

    Each entry becomes one note. Entries without an English translation, or
    whose translation contains "proper name", are skipped and listed at the
    end. Notes whose audio clip could be fetched use ``audio_model``; all
    others use ``simple_model``.

    Args:
        word_list: dicts with the keys "Word" and "TXT", and optionally
            "Prefix", "English", "C" and "Audio".

    Returns:
        The ``genanki.Package`` holding the deck and the fetched audio files.

    Raises:
        KeyError: when a non-skipped entry has no "Word" or "TXT" key.
    """
    deck = genanki.Deck(1220042721, "Norwegian on the Web 1")
    package = genanki.Package(deck)
    skipped_words = []

    for word_data in word_list:
        prefix = word_data.get("Prefix")
        word = word_data["Word"]
        norsk_field = f"{prefix} {word}" if prefix else word

        english_field = word_data.get("English")
        if not english_field:
            print(f"no english term found for {word!r}")
            skipped_words.append(word_data)
            continue

        if "proper name" in english_field:
            print(f"skipping proper name {word!r}")
            skipped_words.append(word_data)
            continue

        # unknown class codes fall back to "Other" instead of raising
        word_type = WORD_TYPES.get(word_data.get("C"), WORD_TYPES[None])
        initial_text = word_data["TXT"]
        # presumably the chapter is encoded in the first two characters of
        # the text label; verify against the scraped data
        chapter = initial_text[:2]
        note_tags = [word_type, initial_text, chapter]

        # rows without an <audio> tag carry no "Audio" key at all; those
        # get the simple note type rather than a KeyError
        audio_url = word_data.get("Audio")
        audio_path = fetch_audio(audio_url) if audio_url else None

        if audio_path:
            audio_field = f"[sound:{audio_path.name}]"
            package.media_files.append(audio_path)
            note = BasicNote(
                model=audio_model,
                tags=note_tags,
                fields=[norsk_field, english_field, audio_field],
            )
        else:
            note = BasicNote(
                model=simple_model,
                tags=note_tags,
                fields=[norsk_field, english_field],
            )

        deck.add_note(note)

    print("done! skipped words:", json.dumps(skipped_words, indent=2))
    return package
if __name__ == "__main__":
    # vocab.json doubles as a cache: it is read when present, written otherwise
    words = fetch_vocab_list(Path("vocab.json"))
    generate_anki_package(words).write_to_file("now1.apkg")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment