skrungly · July 27, 2025 12:41
diff --git a/generate.py b/generate.py
 import time
 import json
 from html.parser import HTMLParser
 from pathlib import Path

 import genanki
 import requests

 # Vocabulary page of one chapter; "{chapter}" is filled in with str.format().
 VOCAB_URL = "https://www.ntnu.edu/now/{chapter}/vocabulary"
 # Chapter numbers to fetch: 1 to 10 (inclusive).
 CHAPTERS = [*range(1, 11)]
 # Pause after each downloaded audio file, in seconds.
 AUDIO_DL_RATE = 1

 # Readable tag for each code found under a word's "C" key; the None entry
 # is the one looked up when a word has no "C" key.
 WORD_TYPES = {
     **dict(n="Noun", v="Verb", a="Adjective", d="Adverb", p="Preposition"),
     None: "Other",
 }

 # Anki model for words that come with a recording: reading, spelling and
 # listening cards built from the Norsk / English / Audio fields.
 audio_model = genanki.Model(
     model_id=1653378496,
     name="Comprehensive Model",
     fields=[{"name": label} for label in ("Norsk", "English", "Audio")],
     templates=[
         dict(
             name="Reading",
             qfmt="«{{Norsk}}»",
             afmt="{{FrontSide}} <hr id=answer> {{English}}",
         ),
         dict(
             name="Spelling",
             qfmt='"{{English}}" {{type:Norsk}}',
             afmt='"{{English}}" <hr id=answer> {{type:Norsk}} {{Audio}}',
         ),
         dict(
             name="Listening",
             qfmt="{{Audio}}",
             afmt="{{Audio}} <hr id=answer> {{English}}",
         ),
     ],
 )

 # Anki model for words without a recording: reading and spelling cards
 # built from the Norsk / English fields only.
 simple_model = genanki.Model(
     model_id=1169572294,
     name="Simple Model",
     fields=[{"name": label} for label in ("Norsk", "English")],
     templates=[
         dict(
             name="Reading",
             qfmt="«{{Norsk}}»",
             afmt="{{FrontSide}} <hr id=answer> {{English}}",
         ),
         dict(
             name="Spelling",
             qfmt='"{{English}}" {{type:Norsk}}',
             afmt='"{{English}}" <hr id=answer> {{type:Norsk}}',
         ),
     ],
 )


 # Note type whose GUID depends on the first two fields only, which leaves
 # room to add further fields later without the GUID changing.
 class BasicNote(genanki.Note):
     """A genanki note identified by its first two fields."""

     @property
     def guid(self):
         """Return the GUID computed from fields 0 and 1."""
         first_field = self.fields[0]
         second_field = self.fields[1]
         return genanki.guid_for(first_field, second_field)


 class VocabTableParser(HTMLParser):
     """Collect the rows of an HTML vocabulary table as dictionaries.

     Only markup between ``<tbody>`` and ``</tbody>`` is looked at. The text
     of each ``<th>`` cell becomes a column name in ``headers``; any other
     text is stored in ``current_word`` under the name of the column it was
     found in, and the ``src`` of an ``<audio>`` tag is stored under
     ``"Audio"``. A non-empty ``current_word`` is appended to ``word_list``
     whenever a ``</tr>`` is reached.
     """

     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         # True while between <tbody> and </tbody>.
         self.parsing_table = False

         # Column names, one per closed <th> cell, in document order.
         self.headers = []
         self.parsing_header = False
         # Text chunks collected for the <th> cell that is currently open.
         self._header_parts = []
         # True inside an <audio> element that carries a src attribute, so
         # that the element's own text is not recorded as cell data.
         self.skip_data = False

         self.current_word = {}
         self.word_list = []

         # Index of the current cell within the current row.
         self.column = 0

     def handle_starttag(self, tag, attrs):
         """Track the opening of the table body, rows, headers and audio."""
         if tag == "tbody":
             self.parsing_table = True

         if not self.parsing_table:
             return

         if tag == "tr":
             self.column = 0

         elif tag == "th":
             self.parsing_header = True
             self._header_parts = []

         elif tag == "audio":
             for attr, value in attrs:
                 if attr == "src":
                     self.current_word["Audio"] = value
                     self.skip_data = True
                     break

     def handle_endtag(self, tag):
         """Advance the column counter and finish header cells and rows."""
         if tag == "tbody":
             self.parsing_table = False

         if not self.parsing_table:
             return

         if tag == "th":
             if self.parsing_header:
                 # Name the column only once the cell is complete: an empty
                 # <th></th> never triggers handle_data, and a cell with
                 # nested markup triggers it several times. Either way the
                 # cell must yield exactly one header, otherwise every later
                 # column is read under the wrong name.
                 label = "".join(self._header_parts).strip()
                 # the unlabelled column contains prefixes like "å" and "en"
                 self.headers.append(label or "Prefix")
                 self._header_parts = []

             self.column += 1
             self.parsing_header = False

         elif tag == "td":
             self.column += 1

         elif tag == "audio":
             self.skip_data = False

         elif tag == "tr":
             if self.current_word:
                 self.word_list.append(self.current_word)
                 self.current_word = {}

     def handle_data(self, data):
         """Buffer header text, or store cell text under its column name."""
         if not self.parsing_table:
             return

         if self.parsing_header:
             self._header_parts.append(data)
             return

         if not data.strip() or self.skip_data:
             return

         # NOTE: raises IndexError when a row has more cells than headers.
         column_name = self.headers[self.column]
         self.current_word[column_name] = data


 def fetch_vocab_list(vocab_path=None, timeout=30):
     """Return the vocabulary entries of every chapter in CHAPTERS.

     When *vocab_path* is given and already exists, its JSON content is
     returned and nothing is downloaded. Otherwise each chapter page is
     requested, parsed with VocabTableParser, and the combined list is
     written to *vocab_path* (if given) before it is returned.

     Args:
         vocab_path: optional path object (it must offer ``exists()``, as
             pathlib.Path does) of the JSON cache file.
         timeout: seconds to wait on each HTTP request before giving up.

     Raises:
         requests.HTTPError: a chapter page answered with an error status.
         requests.Timeout: a request exceeded *timeout*.
     """
     if vocab_path and vocab_path.exists():
         print(f"using existing data from {vocab_path}")
         # JSON is UTF-8 by convention; do not rely on the locale default.
         with open(vocab_path, encoding="utf-8") as vocab_file:
             return json.load(vocab_file)

     vocab_list = []

     print(f"fetching terms for {len(CHAPTERS)} chapters...")
     for chapter in CHAPTERS:
         # Without a timeout, requests waits indefinitely on a stalled server.
         response = requests.get(
             VOCAB_URL.format(chapter=chapter), timeout=timeout
         )
         response.raise_for_status()

         # A fresh parser per page keeps the chapters' headers and rows apart.
         parser = VocabTableParser()
         parser.feed(response.text)

         print(f"fetched {len(parser.word_list)} terms for chapter {chapter}")
         vocab_list.extend(parser.word_list)

     if vocab_path:
         with open(vocab_path, "w", encoding="utf-8") as vocab_file:
             json.dump(vocab_list, vocab_file, indent=2)

     return vocab_list


 def fetch_audio(audio_url, timeout=30):
     """Return the local path of the audio file behind *audio_url*.

     The file is stored in the relative ``audio`` directory under the last
     path segment of the URL. An existing file is reused without any
     request; otherwise the URL is downloaded, written to disk, and the
     function sleeps for AUDIO_DL_RATE seconds before returning.

     Args:
         audio_url: URL of the clip; it must contain at least one "/".
         timeout: seconds to wait on the HTTP request before giving up.

     Returns:
         The pathlib.Path of the file, or None when the server did not
         answer with status 200.
     """
     audio_name = audio_url.rsplit("/", 1)[1]
     audio_path = Path("audio") / audio_name

     print(f"looking for {audio_name}... ", end="")

     if audio_path.exists():
         print("cached")
         return audio_path

     # Without a timeout, requests waits indefinitely on a stalled server.
     response = requests.get(audio_url, timeout=timeout)
     if response.status_code != 200:
         print("not found")
         return None

     # On a fresh checkout the target directory may not exist yet; opening
     # the file for writing would then fail with FileNotFoundError.
     audio_path.parent.mkdir(parents=True, exist_ok=True)
     audio_path.write_bytes(response.content)

     print("fetched")
     time.sleep(AUDIO_DL_RATE)
     return audio_path


 def generate_anki_package(word_list):
     """Build the "Norwegian on the Web 1" Anki package from *word_list*.

     Each entry is a dict that must have the keys "Word" and "TXT" and may
     have "Prefix", "English", "C" and "Audio". Entries without an English
     term, or whose English term contains "proper name", are skipped and
     reported at the end. Every other entry becomes one note: with
     audio_model when its audio file is available, with simple_model
     otherwise.

     Returns:
         The genanki.Package holding the deck and the audio files used.

     Raises:
         KeyError: an entry that is not skipped lacks "Word" or "TXT".
     """
     deck = genanki.Deck(1220042721, "Norwegian on the Web 1")
     package = genanki.Package(deck)
     skipped_words = []

     for word_data in word_list:
         prefix = word_data.get("Prefix")
         word = word_data["Word"]

         norsk_field = f"{prefix} {word}" if prefix else word
         english_field = word_data.get("English")

         if not english_field:
             print(f"no english term found for {word!r}")
             skipped_words.append(word_data)
             continue

         if "proper name" in english_field:
             print(f"skipping proper name {word!r}")
             skipped_words.append(word_data)
             continue

         # A code that is not listed in WORD_TYPES is tagged like a word
         # without any code instead of aborting the run with a KeyError.
         word_type = WORD_TYPES.get(word_data.get("C"), WORD_TYPES[None])
         initial_text = word_data["TXT"]
         chapter = initial_text[:2]
         note_tags = [word_type, initial_text, chapter]

         # Rows without an <audio> tag have no "Audio" key; they take the
         # same no-audio route as words whose download failed.
         audio_url = word_data.get("Audio")
         audio_path = fetch_audio(audio_url) if audio_url else None

         if audio_path:
             audio_field = f"[sound:{audio_path.name}]"
             package.media_files.append(audio_path)
             note = BasicNote(
                 model=audio_model,
                 tags=note_tags,
                 fields=[norsk_field, english_field, audio_field],
             )

         else:
             note = BasicNote(
                 model=simple_model,
                 tags=note_tags,
                 fields=[norsk_field, english_field],
             )

         deck.add_note(note)

     print("done! skipped words:", json.dumps(skipped_words, indent=2))
     return package


 if __name__ == "__main__":
     # vocab.json is reused when it exists; otherwise the fetch creates it.
     vocab_list = fetch_vocab_list(vocab_path=Path("vocab.json"))
     package = generate_anki_package(word_list=vocab_list)
     package.write_to_file("now1.apkg")
	import time
	import json
	from html.parser import HTMLParser
	from pathlib import Path

	import genanki
	import requests

	# Vocabulary page of one chapter; "{chapter}" is filled in with str.format().
	VOCAB_URL = "https://www.ntnu.edu/now/{chapter}/vocabulary"
	# Chapter numbers to fetch: 1 to 10 (inclusive).
	CHAPTERS = [*range(1, 11)]
	# Pause after each downloaded audio file, in seconds.
	AUDIO_DL_RATE = 1

	# Readable tag for each code found under a word's "C" key; the None entry
	# is the one looked up when a word has no "C" key.
	WORD_TYPES = {
	    **dict(n="Noun", v="Verb", a="Adjective", d="Adverb", p="Preposition"),
	    None: "Other",
	}

	# Anki model for words that come with a recording: reading, spelling and
	# listening cards built from the Norsk / English / Audio fields.
	audio_model = genanki.Model(
	    model_id=1653378496,
	    name="Comprehensive Model",
	    fields=[{"name": label} for label in ("Norsk", "English", "Audio")],
	    templates=[
	        dict(
	            name="Reading",
	            qfmt="«{{Norsk}}»",
	            afmt="{{FrontSide}} <hr id=answer> {{English}}",
	        ),
	        dict(
	            name="Spelling",
	            qfmt='"{{English}}" {{type:Norsk}}',
	            afmt='"{{English}}" <hr id=answer> {{type:Norsk}} {{Audio}}',
	        ),
	        dict(
	            name="Listening",
	            qfmt="{{Audio}}",
	            afmt="{{Audio}} <hr id=answer> {{English}}",
	        ),
	    ],
	)

	# Anki model for words without a recording: reading and spelling cards
	# built from the Norsk / English fields only.
	simple_model = genanki.Model(
	    model_id=1169572294,
	    name="Simple Model",
	    fields=[{"name": label} for label in ("Norsk", "English")],
	    templates=[
	        dict(
	            name="Reading",
	            qfmt="«{{Norsk}}»",
	            afmt="{{FrontSide}} <hr id=answer> {{English}}",
	        ),
	        dict(
	            name="Spelling",
	            qfmt='"{{English}}" {{type:Norsk}}',
	            afmt='"{{English}}" <hr id=answer> {{type:Norsk}}',
	        ),
	    ],
	)


	# Note type whose GUID depends on the first two fields only, which leaves
	# room to add further fields later without the GUID changing.
	class BasicNote(genanki.Note):
	    """A genanki note identified by its first two fields."""

	    # The nesting below restores the indentation that this copy had lost;
	    # without it the class and the property have no body.
	    @property
	    def guid(self):
	        """Return the GUID computed from fields 0 and 1."""
	        return genanki.guid_for(self.fields[0], self.fields[1])


	class VocabTableParser(HTMLParser):
	    """Collect the rows of an HTML vocabulary table as dictionaries.

	    Only markup between ``<tbody>`` and ``</tbody>`` is looked at. The text
	    of each ``<th>`` cell becomes a column name in ``headers``; any other
	    text is stored in ``current_word`` under the name of the column it was
	    found in, and the ``src`` of an ``<audio>`` tag is stored under
	    ``"Audio"``. A non-empty ``current_word`` is appended to ``word_list``
	    whenever a ``</tr>`` is reached.
	    """

	    # The signature forwards arbitrary positional and keyword arguments;
	    # the damaged form "args, *kwargs" made VocabTableParser() fail.
	    def __init__(self, *args, **kwargs):
	        super().__init__(*args, **kwargs)

	        # True while between <tbody> and </tbody>.
	        self.parsing_table = False

	        # Column names, one per closed <th> cell, in document order.
	        self.headers = []
	        self.parsing_header = False
	        # Text chunks collected for the <th> cell that is currently open.
	        self._header_parts = []
	        # True inside an <audio> element that carries a src attribute, so
	        # that the element's own text is not recorded as cell data.
	        self.skip_data = False

	        self.current_word = {}
	        self.word_list = []

	        # Index of the current cell within the current row.
	        self.column = 0

	    def handle_starttag(self, tag, attrs):
	        """Track the opening of the table body, rows, headers and audio."""
	        if tag == "tbody":
	            self.parsing_table = True

	        if not self.parsing_table:
	            return

	        if tag == "tr":
	            self.column = 0

	        elif tag == "th":
	            self.parsing_header = True
	            self._header_parts = []

	        elif tag == "audio":
	            for attr, value in attrs:
	                if attr == "src":
	                    self.current_word["Audio"] = value
	                    self.skip_data = True
	                    break

	    def handle_endtag(self, tag):
	        """Advance the column counter and finish header cells and rows."""
	        if tag == "tbody":
	            self.parsing_table = False

	        if not self.parsing_table:
	            return

	        if tag == "th":
	            if self.parsing_header:
	                # Name the column only once the cell is complete: an empty
	                # <th></th> never triggers handle_data, and a cell with
	                # nested markup triggers it several times. Either way the
	                # cell must yield exactly one header, otherwise every later
	                # column is read under the wrong name.
	                label = "".join(self._header_parts).strip()
	                # the unlabelled column contains prefixes like "å" and "en"
	                self.headers.append(label or "Prefix")
	                self._header_parts = []

	            self.column += 1
	            self.parsing_header = False

	        elif tag == "td":
	            self.column += 1

	        elif tag == "audio":
	            self.skip_data = False

	        elif tag == "tr":
	            if self.current_word:
	                self.word_list.append(self.current_word)
	                self.current_word = {}

	    def handle_data(self, data):
	        """Buffer header text, or store cell text under its column name."""
	        if not self.parsing_table:
	            return

	        if self.parsing_header:
	            self._header_parts.append(data)
	            return

	        if not data.strip() or self.skip_data:
	            return

	        # NOTE: raises IndexError when a row has more cells than headers.
	        column_name = self.headers[self.column]
	        self.current_word[column_name] = data


	def fetch_vocab_list(vocab_path=None, timeout=30):
	    """Return the vocabulary entries of every chapter in CHAPTERS.

	    When *vocab_path* is given and already exists, its JSON content is
	    returned and nothing is downloaded. Otherwise each chapter page is
	    requested, parsed with VocabTableParser, and the combined list is
	    written to *vocab_path* (if given) before it is returned.

	    Args:
	        vocab_path: optional path object (it must offer ``exists()``, as
	            pathlib.Path does) of the JSON cache file.
	        timeout: seconds to wait on each HTTP request before giving up.

	    Raises:
	        requests.HTTPError: a chapter page answered with an error status.
	        requests.Timeout: a request exceeded *timeout*.
	    """
	    if vocab_path and vocab_path.exists():
	        print(f"using existing data from {vocab_path}")
	        # JSON is UTF-8 by convention; do not rely on the locale default.
	        with open(vocab_path, encoding="utf-8") as vocab_file:
	            return json.load(vocab_file)

	    vocab_list = []

	    print(f"fetching terms for {len(CHAPTERS)} chapters...")
	    for chapter in CHAPTERS:
	        # Without a timeout, requests waits indefinitely on a stalled server.
	        response = requests.get(
	            VOCAB_URL.format(chapter=chapter), timeout=timeout
	        )
	        response.raise_for_status()

	        # A fresh parser per page keeps the chapters' headers and rows apart.
	        parser = VocabTableParser()
	        parser.feed(response.text)

	        print(f"fetched {len(parser.word_list)} terms for chapter {chapter}")
	        vocab_list.extend(parser.word_list)

	    if vocab_path:
	        with open(vocab_path, "w", encoding="utf-8") as vocab_file:
	            json.dump(vocab_list, vocab_file, indent=2)

	    return vocab_list


	def fetch_audio(audio_url, timeout=30):
	    """Return the local path of the audio file behind *audio_url*.

	    The file is stored in the relative ``audio`` directory under the last
	    path segment of the URL. An existing file is reused without any
	    request; otherwise the URL is downloaded, written to disk, and the
	    function sleeps for AUDIO_DL_RATE seconds before returning.

	    Args:
	        audio_url: URL of the clip; it must contain at least one "/".
	        timeout: seconds to wait on the HTTP request before giving up.

	    Returns:
	        The pathlib.Path of the file, or None when the server did not
	        answer with status 200.
	    """
	    audio_name = audio_url.rsplit("/", 1)[1]
	    audio_path = Path("audio") / audio_name

	    print(f"looking for {audio_name}... ", end="")

	    if audio_path.exists():
	        print("cached")
	        return audio_path

	    # Without a timeout, requests waits indefinitely on a stalled server.
	    response = requests.get(audio_url, timeout=timeout)
	    if response.status_code != 200:
	        print("not found")
	        return None

	    # On a fresh checkout the target directory may not exist yet; opening
	    # the file for writing would then fail with FileNotFoundError.
	    audio_path.parent.mkdir(parents=True, exist_ok=True)
	    audio_path.write_bytes(response.content)

	    print("fetched")
	    time.sleep(AUDIO_DL_RATE)
	    return audio_path


	def generate_anki_package(word_list):
	    """Build the "Norwegian on the Web 1" Anki package from *word_list*.

	    Each entry is a dict that must have the keys "Word" and "TXT" and may
	    have "Prefix", "English", "C" and "Audio". Entries without an English
	    term, or whose English term contains "proper name", are skipped and
	    reported at the end. Every other entry becomes one note: with
	    audio_model when its audio file is available, with simple_model
	    otherwise.

	    Returns:
	        The genanki.Package holding the deck and the audio files used.

	    Raises:
	        KeyError: an entry that is not skipped lacks "Word" or "TXT".
	    """
	    deck = genanki.Deck(1220042721, "Norwegian on the Web 1")
	    package = genanki.Package(deck)
	    skipped_words = []

	    for word_data in word_list:
	        prefix = word_data.get("Prefix")
	        word = word_data["Word"]

	        norsk_field = f"{prefix} {word}" if prefix else word
	        english_field = word_data.get("English")

	        if not english_field:
	            print(f"no english term found for {word!r}")
	            skipped_words.append(word_data)
	            continue

	        if "proper name" in english_field:
	            print(f"skipping proper name {word!r}")
	            skipped_words.append(word_data)
	            continue

	        # A code that is not listed in WORD_TYPES is tagged like a word
	        # without any code instead of aborting the run with a KeyError.
	        word_type = WORD_TYPES.get(word_data.get("C"), WORD_TYPES[None])
	        initial_text = word_data["TXT"]
	        chapter = initial_text[:2]
	        note_tags = [word_type, initial_text, chapter]

	        # Rows without an <audio> tag have no "Audio" key; they take the
	        # same no-audio route as words whose download failed.
	        audio_url = word_data.get("Audio")
	        audio_path = fetch_audio(audio_url) if audio_url else None

	        if audio_path:
	            audio_field = f"[sound:{audio_path.name}]"
	            package.media_files.append(audio_path)
	            note = BasicNote(
	                model=audio_model,
	                tags=note_tags,
	                fields=[norsk_field, english_field, audio_field],
	            )

	        else:
	            note = BasicNote(
	                model=simple_model,
	                tags=note_tags,
	                fields=[norsk_field, english_field],
	            )

	        deck.add_note(note)

	    print("done! skipped words:", json.dumps(skipped_words, indent=2))
	    return package


	if __name__ == "__main__":
	    # The statements below are the body of the guard; this copy had lost
	    # the indentation that says so.
	    # vocab.json is reused when it exists; otherwise the fetch creates it.
	    vocab_path = Path("vocab.json")
	    vocab_list = fetch_vocab_list(vocab_path)

	    package = generate_anki_package(vocab_list)
	    package.write_to_file("now1.apkg")
No results found