LLM letter counter (Finnish GPT model)
#!/usr/bin/env python3
# finnish_word_letter_counter.py
"""
A training set generator to assist a Finnish GPT-2 model in counting letters in a given word,
including reasoning steps. The script generates data in a specific format suitable for training
the model to perform letter counting tasks with explanations.
(c) FlyingFathead 2024
"""

import json
import argparse
import sys
import unicodedata
import random

# -------------------------------
# Global Configuration Variables
# -------------------------------

# Delimiter to separate examples
DELIMITER = "<delimiter>"

# Dialogue prefix (optional)
DIALOGUE_PREFIX = "\n<dialogi>\n"

# Question prefix
QUESTION_PREFIX = "|k| "

# Question template
QUESTION_TEMPLATE = 'Montako "{kirjain}"-kirjainta on sanassa "{sana}"?'

# Thinking step prefix
THINKING_STEP_PREFIX = "🤔💭 "

# Answer prefix
ANSWER_PREFIX = "|v| "

# -------------------------------

def is_alphabetic(char):
    """Check if a character is alphabetic according to Unicode standard."""
    return unicodedata.category(char).startswith('L')

def count_and_highlight(word, char):
    """Count occurrences of `char` in `word` (case-insensitive) and highlight them."""
    word_chars = list(word)
    positions = []
    highlighted_word = ''
    for idx, c in enumerate(word_chars, 1):
        if c.lower() == char.lower():
            positions.append(str(idx))
            highlighted_word += f'[{c}]'
        else:
            highlighted_word += c
    count = len(positions)
    return count, highlighted_word, positions
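
# Example (illustrative, not part of the original gist): positions are 1-based and
# matched letters are wrapped in square brackets.
#   count_and_highlight("kissa", "s") -> (2, 'ki[s][s]a', ['3', '4'])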
def get_kirjain_case(char, count):
    """
    Return the appropriate form of 'kirjain' and related words based on the count.
    """
    if count == 1:
        # Singular forms
        kirjain_nominative = 'kirjain'
        kirjain_genitive = 'kirjaimen'
        kappale_kirjaimia = 'kappale "kirjain"-kirjainta'
        kertaa_kirjainta = 'kertaa'
    else:
        # Plural forms
        kirjain_nominative = 'kirjainta'
        kirjain_genitive = 'kirjainten'
        kappale_kirjaimia = 'kappaletta "kirjain"-kirjaimia'
        kertaa_kirjainta = 'kertaa'
    return {
        'kirjain_nominative': kirjain_nominative,
        'kirjain_genitive': kirjain_genitive,
        'kappale_kirjaimia': kappale_kirjaimia,
        'kertaa_kirjainta': kertaa_kirjainta
    }
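
# Example (illustrative): the singular form is selected only when count == 1, e.g.
#   get_kirjain_case('s', 1)['kappale_kirjaimia'] == 'kappale "kirjain"-kirjainta'
#   get_kirjain_case('s', 3)['kappale_kirjaimia'] == 'kappaletta "kirjain"-kirjaimia'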
def generate_training_data(lexicon, include_nonexistent, delimiter, question_template):
    """Generate training data with character counts and reasoning steps for each word in `lexicon`."""
    # NOTE: `include_nonexistent` is accepted for future use but is not referenced below.
    training_data = []
    finnish_alphabet = list("aäbcdefghijklmnoöpqrsšzžtuvywxå")
    for word in lexicon:
        # Randomly select a character from the word or from the Finnish alphabet
        unique_chars = set(word.lower())
        if unique_chars:
            char = random.choice(list(unique_chars))
        else:
            char = random.choice(finnish_alphabet)
        count, highlighted_word, positions = count_and_highlight(word, char)
        total_count = len(word)
        # Get appropriate cases based on count
        kirjain_case = get_kirjain_case(char, count)
        # Build the thinking steps
        thinking_steps = []
        thinking_steps.append(f"{THINKING_STEP_PREFIX}Käyttäjä näemmä kysyy, montako \"{char}\"-kirjainta on sanassa \"{word}\".")
        thinking_steps.append("Puretaanpa ensin sana kirjaimiksi:")
        char_list = [f"{idx+1}. {c}" for idx, c in enumerate(word)]
        thinking_steps.append(' '.join(char_list))
        if count > 0:
            positions_str = ', '.join(positions)
            if count == 1:
                once_str = "yhden kerran (1 kpl)"
            else:
                once_str = f"tasan {count} kertaa"
            thinking_steps.append(f"💡Näköjään kirjain \"{char}\" esiintyi kirjain {positions_str} kohdalla {once_str}.")
            thinking_steps.append(f"Eli, kun otamme sanan \"{word}\" ja puramme sen osiin, voimme korostaa [ ]-hakasuluilla kirjaimet: \"{highlighted_word}\".")
            thinking_steps.append(f"Sanassa \"{word}\" on siis yhteensä {count} kappaletta \"{char}\"-kirjaimia.")
        else:
            thinking_steps.append(f"💡Näköjään kirjain \"{char}\" ei esiinny sanassa \"{word}\" lainkaan.")
            thinking_steps.append(f"Sanassa \"{word}\" ei ole yhtään \"{char}\"-kirjainta.")
        # Build the final answer
        if count > 0:
            if count == 1:
                # Singular forms with "yhden kerran (1 kpl)"
                answer = f"Sanassa \"{word}\" esiintyy kirjain \"{char}\" yhteensä yhden kerran (1 kpl). Kirjain korostettuna sanassa hakasulkein: {highlighted_word}"
            else:
                # Plural forms
                answer = f"Sanassa \"{word}\" esiintyy kirjain \"{char}\" yhteensä {count} {kirjain_case['kertaa_kirjainta']}. Kirjaimet korostettuna hakasulkein: {highlighted_word}"
        else:
            answer = f"Sanassa \"{word}\" ei esiinny \"{char}\"-kirjainta kertaakaan."
        # Replace placeholders with appropriate grammatical forms
        if count > 0:
            # Adjust 'kirjaimen' in the reasoning steps
            thinking_steps = [step.replace("kirjaimen", kirjain_case['kirjain_genitive']) for step in thinking_steps]
            # Adjust 'kappaletta "kirjain"-kirjaimia'
            thinking_steps = [step.replace('kappaletta "kirjain"-kirjaimia', kirjain_case['kappale_kirjaimia']) for step in thinking_steps]
        # Build the full training example with optional dialogue prefix;
        # use the `question_template` passed in as a parameter rather than the global.
        training_example = f"{delimiter}{DIALOGUE_PREFIX if DIALOGUE_PREFIX else ''}\n{QUESTION_PREFIX}{question_template.format(kirjain=char, sana=word)}\n"
        training_example += ' '.join(thinking_steps) + "\n"
        training_example += f"{ANSWER_PREFIX}{answer}\n"
        training_data.append(training_example)
    return training_data
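
# Illustrative sketch of one generated example with the default prefixes (derived from the
# string building above; word "kissa", letter "s"):
#   <delimiter>
#   <dialogi>
#
#   |k| Montako "s"-kirjainta on sanassa "kissa"?
#   🤔💭 Käyttäjä näemmä kysyy, montako "s"-kirjainta on sanassa "kissa". ... (reasoning steps joined on one line)
#   |v| Sanassa "kissa" esiintyy kirjain "s" yhteensä 2 kertaa. Kirjaimet korostettuna hakasulkein: ki[s][s]a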
def main():
    # Declare the globals up front: referencing them (e.g. as argparse defaults) before the
    # `global` statement in the same function would be a SyntaxError in Python 3.
    global DELIMITER, DIALOGUE_PREFIX, QUESTION_PREFIX, QUESTION_TEMPLATE, THINKING_STEP_PREFIX, ANSWER_PREFIX

    # Set up argument parser
    parser = argparse.ArgumentParser(description='Generate character count and reasoning data for Finnish words.')
    parser.add_argument('--num_words', type=int, default=1000, help='Number of words to process from the lexicon')
    parser.add_argument('--word', type=str, help='A specific word to process')
    parser.add_argument('--output_file', type=str, help='File to save the generated training data')
    parser.add_argument('--delimiter', type=str, default=DELIMITER, help='Delimiter to separate examples')
    parser.add_argument('--question_template', type=str, default=QUESTION_TEMPLATE, help='Template for the question line')
    parser.add_argument('--dialogue_prefix', type=str, default=DIALOGUE_PREFIX, help='Dialogue prefix to add before each example')
    parser.add_argument('--question_prefix', type=str, default=QUESTION_PREFIX, help='Prefix for the question line')
    parser.add_argument('--thinking_step_prefix', type=str, default=THINKING_STEP_PREFIX, help='Prefix for the thinking step')
    parser.add_argument('--answer_prefix', type=str, default=ANSWER_PREFIX, help='Prefix for the answer line')
    args = parser.parse_args()

    # Update global variables if command-line arguments are provided
    DELIMITER = args.delimiter
    DIALOGUE_PREFIX = args.dialogue_prefix
    QUESTION_PREFIX = args.question_prefix
    QUESTION_TEMPLATE = args.question_template
    THINKING_STEP_PREFIX = args.thinking_step_prefix
    ANSWER_PREFIX = args.answer_prefix

    if args.word:
        lexicon = [args.word]
    else:
        # For demonstration, using a small set of Finnish words. Replace with a Finnish corpus as needed.
        lexicon = ['kissa', 'koira', 'pöytä', 'mökkiläinen', 'äiti', 'jyväskylä', 'suomi', 'käärme', 'sähkö', 'järvi']
        # If you have a larger Finnish lexicon, you can load it here and slice according to args.num_words
        lexicon = lexicon[:args.num_words]

    training_data = generate_training_data(
        lexicon,
        include_nonexistent=True,
        delimiter=DELIMITER,
        question_template=args.question_template
    )

    # Output the generated training data
    if args.output_file:
        try:
            with open(args.output_file, 'w', encoding='utf-8') as f:
                for example in training_data:
                    f.write(example + "\n")
        except IOError as e:
            print(f"Error writing to file {args.output_file}: {e}", file=sys.stderr)
    else:
        for example in training_data:
            print(example)

if __name__ == "__main__":
    main()
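
A minimal usage sketch, not part of the original gist: besides running the script from the command line, the generator can be called programmatically, assuming the file is saved as finnish_word_letter_counter.py on the import path.

# Hypothetical usage sketch; the module name is assumed from the header comment above.
from finnish_word_letter_counter import generate_training_data, DELIMITER, QUESTION_TEMPLATE

examples = generate_training_data(
    ['kissa', 'sähkö'],
    include_nonexistent=True,
    delimiter=DELIMITER,
    question_template=QUESTION_TEMPLATE,
)
for example in examples:
    print(example)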