Created
January 25, 2026 23:06
-
-
Save eadmaster/17476c9184ae7311f1b993b675c3b736 to your computer and use it in GitHub Desktop.
jstringstxt2csv.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import sys | |
def translate_with_sugoi_bulk(
    raw_list,
    model_path='/home/user/.cache/huggingface/hub/models--entai2965--sugoi-v4-ja-en-ctranslate2/snapshots/71d67eb8e73ec2f5aaefc0689e03a4eb843d3a2b',
    device='cpu',
    beam_size=5,
):
    """Translate a batch of Japanese strings to English with the Sugoi v4 model.

    Model: https://huggingface.co/entai2965/sugoi-v4-ja-en-ctranslate2

    Args:
        raw_list: iterable of source strings to translate.
        model_path: directory of the CTranslate2 model; the SentencePiece
            models are expected under ``<model_path>/spm``.
        device: device passed to ``ctranslate2.Translator`` ('cpu' or 'cuda').
        beam_size: beam width passed to ``translate_batch``; 1 selects greedy
            search, which is faster
            (https://github.com/OpenNMT/CTranslate2/blob/master/docs/decoding.md#greedy-search).

    Returns:
        A list with one translated string per input, in input order, with
        any '<unk>' markers removed.

    Raises:
        RuntimeError: if the translator returns a different number of
            results than inputs.
    """
    texts = list(raw_list)
    if not texts:
        # Nothing to translate: skip the (expensive) model load entirely.
        return []

    # Imported lazily so the rest of the script works without these packages.
    import ctranslate2
    import sentencepiece

    # NOTE: the translator and tokenizers are re-created on every call.
    sentencepiece_model_path = model_path + '/spm'
    translator = ctranslate2.Translator(model_path, device=device)
    tokenizer_for_source_language = sentencepiece.SentencePieceProcessor(
        sentencepiece_model_path + '/spm.ja.nopretok.model')
    tokenizer_for_target_language = sentencepiece.SentencePieceProcessor(
        sentencepiece_model_path + '/spm.en.nopretok.model')

    tokenized_batch = [
        tokenizer_for_source_language.encode(text, out_type=str)
        for text in texts
    ]

    # https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
    results = translator.translate_batch(source=tokenized_batch, beam_size=beam_size)
    if len(results) != len(texts):
        # Explicit check instead of assert, which is stripped under -O.
        raise RuntimeError(
            'translator returned %d results for %d inputs' % (len(results), len(texts)))

    # Keep only the best hypothesis of each result and drop unknown-token markers.
    return [
        tokenizer_for_target_language.decode(result.hypotheses[0]).replace('<unk>', '')
        for result in results
    ]
def is_sjis_single_byte(char):
    """Tell whether *char* encodes to exactly one byte in Shift-JIS (cp932).

    Halfwidth forms such as U+FF61 (code point 65377) count as single-byte.
    Text that cp932 cannot encode at all yields False.
    """
    try:
        encoded = char.encode('cp932')
    except UnicodeEncodeError:
        # Not representable in cp932, so certainly not a 1-byte character.
        return False
    return len(encoded) == 1
def _strip_single_byte_edges(text):
    """Drop leading and trailing characters that are single-byte in cp932."""
    start = 0
    end = len(text)
    while start < end and is_sjis_single_byte(text[start]):
        start += 1
    while end > start and is_sjis_single_byte(text[end - 1]):
        end -= 1
    return text[start:end]


def _flush_entries(entries, output_file):
    """Translate the buffered text entries and write all entries in input order."""
    texts = [payload for needs_translation, payload in entries if needs_translation]
    # Only call the translator when there is something to translate.
    translated = iter(translate_with_sugoi_bulk(texts)) if texts else iter(())
    for needs_translation, payload in entries:
        if needs_translation:
            output_file.write('%s\t%s\n' % (payload, next(translated)))
        else:
            output_file.write(payload + '\n')
    entries.clear()


def output_translated_file(input_file_str, output_file):
    """Convert a jstrings text dump into tab-separated source/translation rows.

    Each input line is expected to be ``<hex address> <text>``. Characters that
    are single-byte in cp932 are trimmed from both ends of the text, and lines
    that end up empty are skipped. The remaining texts are translated in
    batches of ``BULK_SIZE`` and written as ``<text>\\t<translation>``.
    Blank lines and lines starting with ``#`` or ``;`` are copied through
    unchanged, keeping their position relative to the translated rows.
    Lines without a valid hexadecimal address are reported on stderr and
    skipped.

    Args:
        input_file_str: whole content of the dump as one string.
        output_file: writable text file object; it is closed before returning.
    """
    BULK_SIZE = 100
    entries = []  # (needs_translation, payload) pairs, in input order
    pending_texts = 0

    for line in input_file_str.splitlines():
        if line.strip() == "" or line.startswith(("#", ";")):
            # keep empty lines and comments
            entries.append((False, line))
            continue

        try:
            address, text = line.split(" ", 1)
            int(address, 16)  # validation only: the address is not emitted
        except ValueError:
            # missing separator or invalid address; report on stderr so the
            # diagnostic never ends up inside the generated output
            print("bad line:" + line, file=sys.stderr)
            continue

        text = _strip_single_byte_edges(text)
        if not text:
            continue

        entries.append((True, text))
        pending_texts += 1
        if pending_texts >= BULK_SIZE:
            _flush_entries(entries, output_file)
            pending_texts = 0

    # Always flush the final (possibly partial) batch.
    _flush_entries(entries, output_file)
    output_file.close()
if __name__ == "__main__":
    # Command-line entry point: read the dump from a file, URL or stdin and
    # write the translated table to the output file (stdout by default).
    import argparse
    import sys

    parser = argparse.ArgumentParser(description='converts jstrings txt dumps into csv with optional transaltion')
    parser.add_argument('infile', nargs='?', default="-", help="input file, defaults to stdin if unspecified. Supports passing urls.")
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file, defaults to stdout if unspecified")
    args = parser.parse_args()

    if args.infile == "-":
        sys.stderr.write("reading from stdin...\n")
        input_file_str = sys.stdin.read()
    elif args.infile.startswith(("http://", "ftp://", "https://")):  # TODO: proper URL validation
        from urllib.request import urlopen
        # The response yields bytes; decode them instead of re-opening the
        # URL as if it were a local path.
        with urlopen(args.infile) as response:
            input_file_str = response.read().decode("utf-8", errors="ignore")
    else:
        with open(args.infile) as infile:
            input_file_str = infile.read()

    output_translated_file(input_file_str, args.outfile)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment