Skip to content

Instantly share code, notes, and snippets.

@eadmaster
Created January 25, 2026 23:06
Show Gist options
  • Select an option

  • Save eadmaster/17476c9184ae7311f1b993b675c3b736 to your computer and use it in GitHub Desktop.

Select an option

Save eadmaster/17476c9184ae7311f1b993b675c3b736 to your computer and use it in GitHub Desktop.
jstringstxt2csv.py
import sys
def translate_with_sugoi_bulk(raw_list,
                              model_path='/home/user/.cache/huggingface/hub/models--entai2965--sugoi-v4-ja-en-ctranslate2/snapshots/71d67eb8e73ec2f5aaefc0689e03a4eb843d3a2b',
                              device='cpu'):
    """Translate a batch of Japanese strings to English with Sugoi v4 via CTranslate2.

    Model card: https://huggingface.co/entai2965/sugoi-v4-ja-en-ctranslate2

    Args:
        raw_list: list of Japanese source strings.
        model_path: directory of the CTranslate2 model; its 'spm' subdirectory
            must hold the SentencePiece models (previously hard-coded).
        device: 'cpu' or 'cuda'.

    Returns:
        List of translated English strings, same length and order as raw_list.

    Raises:
        RuntimeError: if the translator returns a different number of results
            than inputs (was an `assert`, which is stripped under -O).
    """
    import ctranslate2
    import sentencepiece

    sentencepiece_model_path = model_path + '/spm'

    # load translator and per-language tokenizers
    translator = ctranslate2.Translator(model_path, device=device)
    tokenizer_for_source_language = sentencepiece.SentencePieceProcessor(
        sentencepiece_model_path + '/spm.ja.nopretok.model')
    tokenizer_for_target_language = sentencepiece.SentencePieceProcessor(
        sentencepiece_model_path + '/spm.en.nopretok.model')

    # tokenize the whole batch up front
    tokenized_batch = [tokenizer_for_source_language.encode(text, out_type=str)
                       for text in raw_list]

    # https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html#ctranslate2.Translator.translate_batch
    # beam_size=1 would be faster (greedy search) at some quality cost:
    # https://github.com/OpenNMT/CTranslate2/blob/master/docs/decoding.md#greedy-search
    translated_batch = translator.translate_batch(source=tokenized_batch, beam_size=5)  # disable_unk=True
    if len(translated_batch) != len(raw_list):
        raise RuntimeError("translation result count does not match input count")

    # decode the best hypothesis of each result, dropping any <unk> markers
    return [tokenizer_for_target_language.decode(result.hypotheses[0]).replace('<unk>', '')
            for result in translated_batch]
def is_sjis_single_byte(char):
    """Return True when `char` encodes to exactly one byte in Shift-JIS (cp932).

    Characters that cannot be represented in cp932 at all are reported as
    multi-byte (False). Note that halfwidth punctuation such as '｡' (65377)
    is correctly treated as single-byte.
    """
    try:
        encoded = char.encode('cp932')
    except UnicodeEncodeError:
        # not representable in Shift-JIS at all
        return False
    return len(encoded) == 1
def output_translated_file(input_file_str, output_file):
    """Parse a jstrings dump, translate the Japanese text, and write TSV pairs.

    Each input line is expected to be "<hex-address> <text>". Empty lines and
    comment lines (starting with '#' or ';') are passed through unchanged.
    Single-byte (ASCII/halfwidth) characters are stripped from both ends of
    the text; the surviving Japanese strings are translated in bulk batches
    and written as "<original>\t<translation>\n". Closes output_file when done.

    Fixes over the original version:
      - kept lines are written with their newline (splitlines() strips it);
      - the line that fills the bulk buffer is no longer dropped;
      - any lines still buffered at end of input are translated too;
      - narrow exception handling instead of a bare except.
    """
    BULK_SIZE = 100

    def _flush(buffer):
        # translate the buffered lines and emit "<jap>\t<eng>" rows
        translated_lines = translate_with_sugoi_bulk(buffer)
        for jap_line, translated_line in zip(buffer, translated_lines):
            output_file.write('%s\t%s\n' % (jap_line, translated_line))
        buffer.clear()

    curr_lines_list = []
    for line in input_file_str.splitlines():
        if line.strip() == "" or line.startswith("#") or line.startswith(";"):
            # keep empty lines and comments (restore the stripped newline)
            output_file.write(line + "\n")
            continue
        try:
            address, text = line.split(" ", 1)
            addr_int = int(address, 16)
        except ValueError:
            # missing separator or invalid hex address
            print("bad line:" + line)
            continue
        # strip 1-byte chars from the beginning, tracking the address
        # (or ord(text[0])==0x88a1 — special 2-byte control code decoded as kanji)
        while len(text) > 0 and is_sjis_single_byte(text[0]):
            addr_int += 1
            text = text[1:]
        # strip 1-byte chars from the end
        while len(text) > 0 and is_sjis_single_byte(text[-1]):
            text = text[:-1]
        if len(text) == 0:
            continue
        # fix: append before checking fullness so the triggering line is kept
        curr_lines_list.append(text)
        if len(curr_lines_list) >= BULK_SIZE:
            _flush(curr_lines_list)
    # fix: translate whatever is still buffered at end of input
    if curr_lines_list:
        _flush(curr_lines_list)
    output_file.close()
if __name__ == "__main__":
    import argparse, sys
    parser = argparse.ArgumentParser(description='converts jstrings txt dumps into csv with optional translation')
    parser.add_argument('infile', nargs='?', default="-", help="input file, defaults to stdin if unspecified. Supports passing urls.")
    parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file, defaults to stdout if unspecified")
    args = parser.parse_args()

    if args.infile == "-":
        infile = sys.stdin
        sys.stderr.write("reading from stdin...\n")
    elif args.infile.startswith(("http://", "ftp://", "https://")):  # TODO: proper URL validation
        from urllib.request import urlopen
        import io
        # fix: wrap the binary URL stream in a text reader. The original code
        # re-opened the URL *string* as a local file path, discarding the
        # urlopen() stream, which cannot work.
        infile = io.TextIOWrapper(urlopen(args.infile), encoding="utf-8", errors="ignore")
    else:
        # explicit encoding for consistency with the URL branch
        infile = open(args.infile, encoding="utf-8", errors="ignore")

    input_file_str = infile.read()
    output_translated_file(input_file_str, args.outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment