Skip to content

Instantly share code, notes, and snippets.

@FlyingFathead
Last active November 15, 2024 18:06
Show Gist options
  • Select an option

  • Save FlyingFathead/aa4b292c95eb0a7dbbfdcf8476596885 to your computer and use it in GitHub Desktop.

Select an option

Save FlyingFathead/aa4b292c95eb0a7dbbfdcf8476596885 to your computer and use it in GitHub Desktop.
LLM letter counter (Finnish GPT model)
#!/usr/bin/env python3
# finnish_word_letter_counter.py
"""
A training set generator to assist a Finnish GPT-2 model in counting letters in a given word,
including reasoning steps. The script generates data in a specific format suitable for training
the model to perform letter counting tasks with explanations.
(c) FlyingFathead 2024
"""
import json
import argparse
import sys
import unicodedata
import random
# -------------------------------
# Global Configuration Variables
# -------------------------------
# These defaults are overwritten in main() from the corresponding
# command-line flags (see the `global` statement there), so the values
# below only apply when a flag is not supplied.
# Delimiter to separate examples
DELIMITER = "<delimiter>"
# Dialogue prefix (optional; emitted verbatim before every example)
DIALOGUE_PREFIX = "\n<dialogi>\n"
# Question prefix ("|k|" marks the question line in the training format)
QUESTION_PREFIX = "|k| "
# Question template — Finnish for: 'How many "{letter}" letters are in the word "{word}"?'
QUESTION_TEMPLATE = 'Montako "{kirjain}"-kirjainta on sanassa "{sana}"?'
# Thinking step prefix (emoji marker for the reasoning section)
THINKING_STEP_PREFIX = "🤔💭 "
# Answer prefix ("|v|" marks the answer line in the training format)
ANSWER_PREFIX = "|v| "
# -------------------------------
def is_alphabetic(char):
    """Return True if `char` is alphabetic per its Unicode category (any 'L*' class).

    NOTE(review): this helper is never called anywhere in the visible file —
    confirm whether it is dead code before removing.
    """
    # Unicode major category 'L' covers Lu/Ll/Lt/Lm/Lo — all letter classes,
    # which correctly includes Finnish ä/ö/å unlike str.isascii-based checks.
    return unicodedata.category(char).startswith('L')
def count_and_highlight(word, char):
    """Count occurrences of `char` in `word` (case-insensitive) and highlight them.

    Args:
        word: The word to scan.
        char: A single character to look for (compared via .lower()).

    Returns:
        A tuple ``(count, highlighted_word, positions)`` where
        - count: number of case-insensitive matches,
        - highlighted_word: `word` with each match wrapped in square brackets,
          e.g. 'ki[s][s]a',
        - positions: list of 1-based match positions as *strings* (ready for
          ', '.join in the caller).
    """
    positions = []
    pieces = []  # build via list + join instead of quadratic str +=
    target = char.lower()
    for idx, c in enumerate(word, 1):  # 1-based positions for human-readable output
        if c.lower() == target:
            positions.append(str(idx))
            pieces.append(f'[{c}]')
        else:
            pieces.append(c)
    return len(positions), ''.join(pieces), positions
def get_kirjain_case(char, count):
    """Return Finnish grammatical forms of 'kirjain' (letter) appropriate for `count`.

    Finnish inflects the counted noun: one occurrence uses singular forms,
    any other count (including 0) uses plural/partitive forms.

    Args:
        char: The letter being counted. Currently unused — kept for interface
              compatibility with existing callers.
        count: Number of occurrences found.

    Returns:
        Dict with keys 'kirjain_nominative', 'kirjain_genitive',
        'kappale_kirjaimia', and 'kertaa_kirjainta'.
    """
    if count == 1:
        # Singular forms
        return {
            'kirjain_nominative': 'kirjain',
            'kirjain_genitive': 'kirjaimen',
            'kappale_kirjaimia': 'kappale "kirjain"-kirjainta',
            'kertaa_kirjainta': 'kertaa',
        }
    # Plural forms (also used for count == 0)
    return {
        'kirjain_nominative': 'kirjainta',
        'kirjain_genitive': 'kirjainten',
        'kappale_kirjaimia': 'kappaletta "kirjain"-kirjaimia',
        'kertaa_kirjainta': 'kertaa',
    }
def generate_training_data(lexicon, include_nonexistent, delimiter, question_template):
    """Generate letter-counting training examples with reasoning steps.

    For each word a target character is picked at random from the word's own
    letters (or from the Finnish alphabet if the word is empty), then a
    question / chain-of-thought / answer example is formatted.

    Args:
        lexicon: Iterable of words to generate examples for.
        include_nonexistent: Currently unused — kept for interface
            compatibility. NOTE(review): presumably meant to force some
            examples with a letter absent from the word; confirm intent.
        delimiter: String emitted before each example.
        question_template: Format string with '{kirjain}' and '{sana}'
            placeholders for the question line.

    Returns:
        List of formatted training-example strings.
    """
    training_data = []
    finnish_alphabet = list("aäbcdefghijklmnoöpqrsšzžtuvywxå")
    for word in lexicon:
        # Randomly select a character from the word, or from the Finnish
        # alphabet when the word has no characters at all.
        unique_chars = set(word.lower())
        if unique_chars:
            char = random.choice(list(unique_chars))
        else:
            char = random.choice(finnish_alphabet)
        count, highlighted_word, positions = count_and_highlight(word, char)
        # Grammatical forms appropriate for this count
        kirjain_case = get_kirjain_case(char, count)
        # Build the reasoning ("thinking") steps
        thinking_steps = []
        thinking_steps.append(f"{THINKING_STEP_PREFIX}Käyttäjä näemmä kysyy, montako \"{char}\"-kirjainta on sanassa \"{word}\".")
        thinking_steps.append("Puretaanpa ensin sana kirjaimiksi:")
        char_list = [f"{idx+1}. {c}" for idx, c in enumerate(word)]
        thinking_steps.append(' '.join(char_list))
        if count > 0:
            positions_str = ', '.join(positions)
            if count == 1:
                once_str = "yhden kerran (1 kpl)"
            else:
                once_str = f"tasan {count} kertaa"
            thinking_steps.append(f"💡Näköjään kirjain \"{char}\" esiintyi kirjain {positions_str} kohdalla {once_str}.")
            thinking_steps.append(f"Eli, kun otamme sanan \"{word}\" ja puramme sen osiin, voimme korostaa [ ]-hakasuluilla kirjaimet: \"{highlighted_word}\".")
            thinking_steps.append(f"Sanassa \"{word}\" on siis yhteensä {count} kappaletta \"{char}\"-kirjaimia.")
        else:
            thinking_steps.append(f"💡Näköjään kirjain \"{char}\" ei esiinny sanassa \"{word}\" lainkaan.")
            thinking_steps.append(f"Sanassa \"{word}\" ei ole yhtään \"{char}\"-kirjainta.")
        # Build the final answer
        if count > 0:
            if count == 1:
                answer = f"Sanassa \"{word}\" esiintyy kirjain \"{char}\" yhteensä yhden kerran (1 kpl). Kirjain korostettuna sanassa hakasulkein: {highlighted_word}"
            else:
                answer = f"Sanassa \"{word}\" esiintyy kirjain \"{char}\" yhteensä {count} {kirjain_case['kertaa_kirjainta']}. Kirjaimet korostettuna hakasulkein: {highlighted_word}"
        else:
            answer = f"Sanassa \"{word}\" ei esiinny \"{char}\"-kirjainta kertaakaan."
        # Replace placeholders with appropriate grammatical forms.
        # NOTE(review): these substrings do not appear to occur literally in
        # any thinking step, so both replaces look like no-ops — kept to
        # preserve behavior; confirm before removing.
        if count > 0:
            thinking_steps = [step.replace("kirjaimen", kirjain_case['kirjain_genitive']) for step in thinking_steps]
            thinking_steps = [step.replace('kappaletta "kirjain"-kirjaimia', kirjain_case['kappale_kirjaimia']) for step in thinking_steps]
        # Assemble the full training example. BUGFIX: use the
        # `question_template` parameter instead of the global
        # QUESTION_TEMPLATE, which silently ignored the caller's template.
        training_example = f"{delimiter}{DIALOGUE_PREFIX if DIALOGUE_PREFIX else ''}\n{QUESTION_PREFIX}{question_template.format(kirjain=char, sana=word)}\n"
        training_example += ' '.join(thinking_steps) + "\n"
        training_example += f"{ANSWER_PREFIX}{answer}\n"
        training_data.append(training_example)
    return training_data
def main():
# Set up argument parser
parser = argparse.ArgumentParser(description='Generate character count and reasoning data for Finnish words.')
parser.add_argument('--num_words', type=int, default=1000, help='Number of words to process from the lexicon')
parser.add_argument('--word', type=str, help='A specific word to process')
parser.add_argument('--output_file', type=str, help='File to save the generated training data')
parser.add_argument('--delimiter', type=str, default=DELIMITER, help='Delimiter to separate examples')
parser.add_argument('--question_template', type=str, default='Montako "{kirjain}"-kirjainta on sanassa "{sana}"?', help='Template for the question line')
parser.add_argument('--dialogue_prefix', type=str, default=DIALOGUE_PREFIX, help='Dialogue prefix to add before each example')
parser.add_argument('--question_prefix', type=str, default=QUESTION_PREFIX, help='Prefix for the question line')
parser.add_argument('--thinking_step_prefix', type=str, default=THINKING_STEP_PREFIX, help='Prefix for the thinking step')
parser.add_argument('--answer_prefix', type=str, default=ANSWER_PREFIX, help='Prefix for the answer line')
args = parser.parse_args()
# Update global variables if command-line arguments are provided
global DELIMITER, DIALOGUE_PREFIX, QUESTION_PREFIX, QUESTION_TEMPLATE, THINKING_STEP_PREFIX, ANSWER_PREFIX
DELIMITER = args.delimiter
DIALOGUE_PREFIX = args.dialogue_prefix
QUESTION_PREFIX = args.question_prefix
QUESTION_TEMPLATE = args.question_template
THINKING_STEP_PREFIX = args.thinking_step_prefix
ANSWER_PREFIX = args.answer_prefix
if args.word:
lexicon = [args.word]
else:
# For demonstration, using a small set of Finnish words. Replace with a Finnish corpus as needed.
lexicon = ['kissa', 'koira', 'pöytä', 'mökkiläinen', 'äiti', 'jyväskylä', 'suomi', 'käärme', 'sähkö', 'järvi']
# If you have a larger Finnish lexicon, you can load it here and slice according to args.num_words
lexicon = lexicon[:args.num_words]
training_data = generate_training_data(
lexicon,
include_nonexistent=True,
delimiter=DELIMITER,
question_template=args.question_template
)
# Output the generated training data
if args.output_file:
try:
with open(args.output_file, 'w', encoding='utf-8') as f:
for example in training_data:
f.write(example + "\n")
except IOError as e:
print(f"Error writing to file {args.output_file}: {e}", file=sys.stderr)
else:
for example in training_data:
print(example)
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment