eadmaster · January 25, 2026 23:06
diff --git a/jstringstxt2csv.py b/jstringstxt2csv.py

 import sys

   

 def translate_with_sugoi_bulk(raw_list):
 	"""Translate a batch of Japanese strings to English with the Sugoi v4 model.

 	Every string is tokenized with the Japanese SentencePiece model, the whole
 	batch is translated in a single ``translate_batch`` call (beam size 5), and
 	the first hypothesis of each result is decoded with the English
 	SentencePiece model. Literal ``<unk>`` markers are removed from the output.

 	Args:
 		raw_list: list of Japanese strings to translate.

 	Returns:
 		A new list with one translated string per input, in the same order.
 	"""
 	# https://huggingface.co/entai2965/sugoi-v4-ja-en-ctranslate2
 	if not raw_list:
 		# nothing to translate: do not pay for loading the models
 		return []

 	translator, source_tokenizer, target_tokenizer = _load_sugoi_models()

 	#tokenize batch
 	tokenized_batch = [source_tokenizer.encode(text, out_type=str) for text in raw_list]

 	#translate
 	#https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
 	#beam_size=1 (greedy search) is faster: https://github.com/OpenNMT/CTranslate2/blob/master/docs/decoding.md#greedy-search
 	results = translator.translate_batch(source=tokenized_batch, beam_size=5)  #disable_unk=True
 	assert len(raw_list) == len(results)

 	#decode the best hypothesis of every result
 	return [target_tokenizer.decode(result.hypotheses[0]).replace('<unk>', '') for result in results]


 def _load_sugoi_models():
 	"""Return (translator, ja tokenizer, en tokenizer), loading them only on the first call."""
 	cached = getattr(_load_sugoi_models, '_cached', None)
 	if cached is not None:
 		return cached

 	import ctranslate2
 	import sentencepiece

 	#set defaults
 	#model_path='sugoi-v4-ja-en-ctranslate2'
 	model_path = '/home/user/.cache/huggingface/hub/models--entai2965--sugoi-v4-ja-en-ctranslate2/snapshots/71d67eb8e73ec2f5aaefc0689e03a4eb843d3a2b'
 	sentencepiece_model_path = model_path + '/spm'

 	device = 'cpu'
 	#device='cuda'

 	#load models
 	translator = ctranslate2.Translator(model_path, device=device)
 	source_tokenizer = sentencepiece.SentencePieceProcessor(sentencepiece_model_path + '/spm.ja.nopretok.model')
 	target_tokenizer = sentencepiece.SentencePieceProcessor(sentencepiece_model_path + '/spm.en.nopretok.model')

 	# keep the loaded objects on the function so later batches reuse them
 	cached = (translator, source_tokenizer, target_tokenizer)
 	_load_sugoi_models._cached = cached
 	return cached
 	

 def is_sjis_single_byte(char):
    """
    Tell whether ``char`` encodes to exactly one byte in Shift-JIS (cp932).

    Half-width forms such as '｡' (65377) are single-byte in cp932 and give
    True; text that cp932 cannot encode at all gives False.
    """
    try:
        encoded = char.encode('cp932')
    except UnicodeEncodeError:
        # not representable in cp932, so certainly not a single-byte char
        return False
    return len(encoded) == 1
        
        
        
 def output_translated_file(input_file_str, output_file):
 	"""Translate a jstrings text dump and write it as tab-separated rows.

 	Each data line is expected to be ``<hex address> <text>``. Characters that
 	are single-byte in Shift-JIS are stripped from both ends of the text; lines
 	left empty by that are skipped. The remaining texts are translated in
 	batches of ``BULK_SIZE`` and written as ``<japanese>\\t<english>``.
 	Blank lines and lines starting with ``#`` or ``;`` are copied through, and
 	the output keeps the order of the input.

 	Args:
 		input_file_str: full content of the dump as one string.
 		output_file: writable text file object; it is closed before returning.

 	Lines without a valid hexadecimal address are reported on stderr (so they
 	cannot end up inside the output when it is stdout) and skipped. The address
 	is only validated; it is not written to the output.
 	"""
 	BULK_SIZE = 100

 	# (is_text, value) pairs in input order; comments are buffered together with
 	# the texts so that they are not written ahead of earlier, still pending rows
 	entries = []
 	pending_texts = 0

 	for line in input_file_str.splitlines():
 		if line.strip() == "" or line.startswith(("#", ";")):
 			# keep empty lines and comments
 			entries.append((False, line))
 			continue

 		try:
 			address, text = line.split(" ", 1)
 			int(address, 16)
 		except ValueError:
 			# missing separator or invalid address
 			print("bad line:" + line, file=sys.stderr)
 			continue

 		# strip 1-byte chars from the beginning and from the end
 		start = 0
 		end = len(text)
 		while start < end and is_sjis_single_byte(text[start]):
 			start += 1
 		while end > start and is_sjis_single_byte(text[end - 1]):
 			end -= 1
 		text = text[start:end]

 		if not text:
 			continue

 		entries.append((True, text))
 		pending_texts += 1

 		if pending_texts >= BULK_SIZE:
 			# bulk list is full
 			_write_translated_batch(entries, output_file)
 			entries.clear()
 			pending_texts = 0

 	# write whatever is left after the last line
 	_write_translated_batch(entries, output_file)

 	output_file.close()


 def _write_translated_batch(entries, output_file):
 	"""Translate the buffered texts in one call and write all entries in input order."""
 	texts = [value for is_text, value in entries if is_text]
 	# skip the translator call when only comments/blank lines are buffered
 	translations = iter(translate_with_sugoi_bulk(texts) if texts else [])
 	for is_text, value in entries:
 		if is_text:
 			output_file.write('%s\t%s\n' % (value, next(translations)))
 		else:
 			# splitlines() removed the line ending, so put it back
 			output_file.write(value + '\n')
 # end


 if __name__ == "__main__":
 	# Command-line entry point: read the dump from stdin, a URL or a local
 	# file, then hand the text to output_translated_file().
 	import argparse
 	import sys
 	parser = argparse.ArgumentParser(description='converts jstrings txt dumps into csv with optional transaltion')
 	parser.add_argument('infile', nargs='?', default="-", help="input file, defaults to stdin if unspecified. Supports passing urls.")
 	parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file, defaults to stdout if unspecified")
 	args = parser.parse_args()

 	if args.infile == "-":
 		sys.stderr.write("reading from stdin...\n")
 		input_file_str = sys.stdin.read()
 	elif args.infile.startswith(("http://", "ftp://", "https://")):  # TODO: proper URL validation
 		from urllib.request import urlopen
 		# the response yields bytes: decode them here instead of trying to
 		# open() the URL as if it were a local path
 		with urlopen(args.infile) as response:
 			input_file_str = response.read().decode("utf-8", errors="ignore")
 	else:
 		with open(args.infile) as infile:
 			input_file_str = infile.read()

 	output_translated_file(input_file_str, args.outfile)

	import sys



	def translate_with_sugoi_bulk(raw_list):
		"""Translate a batch of Japanese strings to English with the Sugoi v4 model.

		Every string is tokenized with the Japanese SentencePiece model, the whole
		batch is translated in a single ``translate_batch`` call (beam size 5), and
		the first hypothesis of each result is decoded with the English
		SentencePiece model. Literal ``<unk>`` markers are removed from the output.

		Args:
			raw_list: list of Japanese strings to translate.

		Returns:
			A new list with one translated string per input, in the same order.
		"""
		# https://huggingface.co/entai2965/sugoi-v4-ja-en-ctranslate2
		if not raw_list:
			# nothing to translate: do not pay for loading the models
			return []

		translator, source_tokenizer, target_tokenizer = _load_sugoi_models()

		#tokenize batch
		tokenized_batch = [source_tokenizer.encode(text, out_type=str) for text in raw_list]

		#translate
		#https://opennmt.net/CTranslate2/python/ctranslate2.Translator.html?#ctranslate2.Translator.translate_batch
		#beam_size=1 (greedy search) is faster: https://github.com/OpenNMT/CTranslate2/blob/master/docs/decoding.md#greedy-search
		results = translator.translate_batch(source=tokenized_batch, beam_size=5) #disable_unk=True
		assert len(raw_list) == len(results)

		#decode the best hypothesis of every result
		return [target_tokenizer.decode(result.hypotheses[0]).replace('<unk>', '') for result in results]


	def _load_sugoi_models():
		"""Return (translator, ja tokenizer, en tokenizer), loading them only on the first call."""
		cached = getattr(_load_sugoi_models, '_cached', None)
		if cached is not None:
			return cached

		import ctranslate2
		import sentencepiece

		#set defaults
		#model_path='sugoi-v4-ja-en-ctranslate2'
		model_path = '/home/user/.cache/huggingface/hub/models--entai2965--sugoi-v4-ja-en-ctranslate2/snapshots/71d67eb8e73ec2f5aaefc0689e03a4eb843d3a2b'
		sentencepiece_model_path = model_path + '/spm'

		device = 'cpu'
		#device='cuda'

		#load models
		translator = ctranslate2.Translator(model_path, device=device)
		source_tokenizer = sentencepiece.SentencePieceProcessor(sentencepiece_model_path + '/spm.ja.nopretok.model')
		target_tokenizer = sentencepiece.SentencePieceProcessor(sentencepiece_model_path + '/spm.en.nopretok.model')

		# keep the loaded objects on the function so later batches reuse them
		cached = (translator, source_tokenizer, target_tokenizer)
		_load_sugoi_models._cached = cached
		return cached


	def is_sjis_single_byte(char):
		"""
		Checks if a character occupies exactly 1 byte in Shift-JIS (cp932).
		This correctly identifies '｡' (65377) as a single-byte character.

		Returns False when the text cannot be encoded in cp932 at all.
		"""
		try:
			return len(char.encode('cp932')) == 1
		except UnicodeEncodeError:
			return False



	def output_translated_file(input_file_str, output_file):
		"""Translate a jstrings text dump and write it as tab-separated rows.

		Each data line is expected to be ``<hex address> <text>``. Characters that
		are single-byte in Shift-JIS are stripped from both ends of the text; lines
		left empty by that are skipped. The remaining texts are translated in
		batches of ``BULK_SIZE`` and written as ``<japanese>\\t<english>``.
		Blank lines and lines starting with ``#`` or ``;`` are copied through, and
		the output keeps the order of the input.

		Args:
			input_file_str: full content of the dump as one string.
			output_file: writable text file object; it is closed before returning.

		Lines without a valid hexadecimal address are reported on stderr (so they
		cannot end up inside the output when it is stdout) and skipped. The address
		is only validated; it is not written to the output.
		"""
		BULK_SIZE = 100

		# (is_text, value) pairs in input order; comments are buffered together with
		# the texts so that they are not written ahead of earlier, still pending rows
		entries = []
		pending_texts = 0

		for line in input_file_str.splitlines():
			if line.strip() == "" or line.startswith(("#", ";")):
				# keep empty lines and comments
				entries.append((False, line))
				continue

			try:
				address, text = line.split(" ", 1)
				int(address, 16)
			except ValueError:
				# missing separator or invalid address
				print("bad line:" + line, file=sys.stderr)
				continue

			# strip 1-byte chars from the beginning and from the end
			start = 0
			end = len(text)
			while start < end and is_sjis_single_byte(text[start]):
				start += 1
			while end > start and is_sjis_single_byte(text[end - 1]):
				end -= 1
			text = text[start:end]

			if not text:
				continue

			entries.append((True, text))
			pending_texts += 1

			if pending_texts >= BULK_SIZE:
				# bulk list is full
				_write_translated_batch(entries, output_file)
				entries.clear()
				pending_texts = 0

		# write whatever is left after the last line
		_write_translated_batch(entries, output_file)

		output_file.close()


	def _write_translated_batch(entries, output_file):
		"""Translate the buffered texts in one call and write all entries in input order."""
		texts = [value for is_text, value in entries if is_text]
		# skip the translator call when only comments/blank lines are buffered
		translations = iter(translate_with_sugoi_bulk(texts) if texts else [])
		for is_text, value in entries:
			if is_text:
				output_file.write('%s\t%s\n' % (value, next(translations)))
			else:
				# splitlines() removed the line ending, so put it back
				output_file.write(value + '\n')
	# end


	if __name__ == "__main__":
		# Command-line entry point: read the dump from stdin, a URL or a local
		# file, then hand the text to output_translated_file().
		import argparse
		import sys
		parser = argparse.ArgumentParser(description='converts jstrings txt dumps into csv with optional transaltion')
		parser.add_argument('infile', nargs='?', default="-", help="input file, defaults to stdin if unspecified. Supports passing urls.")
		parser.add_argument('outfile', nargs='?', type=argparse.FileType('w'), default=sys.stdout, help="output file, defaults to stdout if unspecified")
		args = parser.parse_args()

		if args.infile == "-":
			sys.stderr.write("reading from stdin...\n")
			input_file_str = sys.stdin.read()
		elif args.infile.startswith(("http://", "ftp://", "https://")): # TODO: proper URL validation
			from urllib.request import urlopen
			# the response yields bytes: decode them here instead of trying to
			# open() the URL as if it were a local path
			with urlopen(args.infile) as response:
				input_file_str = response.read().decode("utf-8", errors="ignore")
		else:
			with open(args.infile) as infile:
				input_file_str = infile.read()

		output_translated_file(input_file_str, args.outfile)
No results found