# import sys
# !{sys.executable} -m pip install yattag
import xml .sax
import pandas as pd
import matplotlib .pyplot as plt
import csv
class TMXFileHandler(xml.sax.ContentHandler):
    """SAX handler that collects segment pairs from a TMX document.

    For every ``<tu>`` element, the text of its first two ``<seg>`` elements
    is appended to ``rows`` as ``{srcLang: first, destLang: second}``.
    Units with fewer than two ``<seg>`` elements are skipped.

    Args:
        srcLang: Dictionary key for the first segment of each unit. It is
            replaced by the ``srclang`` attribute of ``<header>`` when the
            document declares one.
        destLang: Dictionary key for the second segment of each unit.
        rows: List that receives one dict per translation unit (mutated in
            place).
    """

    def __init__(self, srcLang, destLang, rows):
        super().__init__()
        self.rows = rows
        self.inside_translation_unit_tag = False
        # Completed segment texts of the translation unit being parsed.
        self.text_content = []
        self.srcLang = srcLang
        self.destLang = destLang
        # True while the parser is between <seg> and </seg>.
        self._inside_seg = False
        # Raw character chunks of the segment currently being read.
        self._seg_chunks = []

    def startElement(self, tag, attributes):
        """Track entry into ``header``, ``tu`` and ``seg`` elements."""
        if tag == "header":
            # Keep the constructor value when the header has no srclang.
            self.srcLang = attributes.get("srclang", self.srcLang)
        elif tag == "tu":
            self.inside_translation_unit_tag = True
            self.text_content = []
        elif tag == "seg" and self.inside_translation_unit_tag:
            self._inside_seg = True
            self._seg_chunks = []

    def endElement(self, tag):
        """Finish a segment, or emit a row when a translation unit closes."""
        if tag == "seg" and self._inside_seg:
            self._inside_seg = False
            # Always record the segment, even when empty, so that the
            # source/target positions stay aligned.
            self.text_content.append("".join(self._seg_chunks).strip())
            self._seg_chunks = []
        elif tag == "tu":
            self.inside_translation_unit_tag = False
            if len(self.text_content) >= 2:
                self.rows.append({
                    self.srcLang: self.text_content[0],
                    self.destLang: self.text_content[1],
                })
            self.text_content = []
        elif tag == "body":
            self.text_content = []

    def characters(self, content):
        """Buffer character data that belongs to the current ``<seg>``.

        The parser may report one run of text in several calls, so chunks
        are only joined once the segment closes.
        """
        if self._inside_seg:
            self._seg_chunks.append(content)
def read_rows(path="./translations_backup/en-pt_1.tmx", src_lang="en", dest_lang="pt"):
    """Parse a TMX file and return its translation units as a list of dicts.

    Args:
        path: TMX file to parse. Defaults to the backup file this script was
            written against.
        src_lang: Initial source-language key handed to ``TMXFileHandler``.
        dest_lang: Target-language key handed to ``TMXFileHandler``.

    Returns:
        The list that ``TMXFileHandler`` filled while parsing, one dict per
        translation unit.
    """
    rows = []
    handler = TMXFileHandler(src_lang, dest_lang, rows)
    xml.sax.parse(path, handler)
    print("Finished XML")
    return rows
def make_df(rows):
    """Build a DataFrame from ``rows``, then empty ``rows`` in place.

    Args:
        rows: List of row dicts. The same list object is left empty when the
            function returns.

    Returns:
        A ``pandas.DataFrame`` constructed from ``rows``.
    """
    print("Making DF...")
    frame = pd.DataFrame(rows)

    print("Cleaning up rows...")
    # Clear the caller's list itself rather than rebinding the local name.
    del rows[:]

    print("Completing...")
    return frame
def make_csv(rows, filename, *, write_header=False):
    """Write ``rows`` to ``filename`` as CSV, then empty ``rows`` in place.

    Args:
        rows: List of dicts that share the same keys. The keys of the first
            row define the column order. The list is left empty afterwards.
        filename: Path of the CSV file to create or overwrite.
        write_header: When true, emit the field names as the first line.
            Off by default, so existing callers keep getting a header-less
            file.
    """
    print("Making CSV...")
    # An empty input produces an empty file instead of failing on rows[0].
    fieldnames = list(rows[0]) if rows else []
    # newline="" lets the csv module control line endings; UTF-8 keeps
    # non-ASCII text independent of the platform's default encoding.
    with open(filename, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames)
        if write_header and fieldnames:
            writer.writeheader()
        writer.writerows(rows)
    print("Cleaning up rows...")
    rows[:] = []
    print("Completing...")
# Parse the TMX file and load the resulting rows into a DataFrame.
df = make_df(read_rows())
Finished XML
Making DF...
Cleaning up rows...
Completing...
# Parse the TMX file again and dump the resulting rows to 'test.csv'.
make_csv (read_rows (), 'test.csv' )
Finished XML
Making CSV...
Cleaning up rows...
Completing...
en 26805614
pt 26805614
dtype: int64
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
#    en                                                pt
# 0  Pyke to plumber command.                          Pyke para o Comando dos Encanadores.
# 1  Current location, sector zed niner.               Localização actual: sector 09.
# 2  Approaching location of unidentified spacecraft.  A aproximar-me da localização da nave não iden...
# 3  Copy that, pyke.                                  Entendido, Pyke.
# 4  Do you have visual?                               Consegues vê-la?
# Write a DataFrame to a TMX file
from yattag import Doc
from datetime import datetime
def convert_to_tmx(srcLang, destLang, df):
    """Serialize the translation pairs in ``df`` as a single TMX 1.4 document.

    Args:
        srcLang: Source language code; written to the header and to the
            ``xml:lang`` attribute of the first ``<tuv>`` of every unit.
        destLang: Target language code; written to the ``xml:lang`` attribute
            of the second ``<tuv>`` of every unit.
        df: Frame holding one column per language. The columns named
            ``srcLang`` / ``destLang`` are used when present; otherwise the
            historical ``'en'`` / ``'pt'`` columns are read.

    Returns:
        The whole document as a string.
    """
    # Imported here because the file only imports ``datetime`` itself.
    from datetime import timezone

    src_col = srcLang if srcLang in df else 'en'
    dest_col = destLang if destLang in df else 'pt'

    doc, tag, text = Doc().tagtext()
    # No spaces inside the quoted values: version=" 1.0" is not a valid
    # XML declaration.
    doc.asis('<?xml version="1.0" encoding="UTF-8"?>')
    # Pass the version as a real attribute, not a pre-formatted string.
    with tag('tmx', version="1.4"):
        with tag('header'):
            # TMX dates use the ISO 8601 basic form in UTC.
            doc.attr(creationdate=datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ"))
            doc.attr(srclang=srcLang)
            doc.attr(adminlang=srcLang)
            doc.attr(('o-tmf', "Unknown"))
            doc.attr(segtype="sentence")
            doc.attr(creationtool="Pandas")
            doc.attr(creationtoolversion="1.0")
            doc.attr(datatype="PlainText")
        with tag('body'):
            for x, y in zip(df[src_col], df[dest_col]):
                with tag('tu'):
                    with tag('tuv'):
                        doc.attr(('xml:lang', srcLang))
                        with tag('seg'):
                            text(x)
                    with tag('tuv'):
                        doc.attr(('xml:lang', destLang))
                        with tag('seg'):
                            text(y)
    return doc.getvalue()
# Write a DataFrame to several TMX files (split into chunks)
from yattag import Doc
from datetime import datetime
def _render_tmx_document(srcLang, destLang, pairs):
    """Return one TMX 1.4 document (as a string) holding ``pairs``."""
    # Imported here because the file only imports ``datetime`` itself.
    from datetime import timezone

    doc, tag, text = Doc().tagtext()
    # No spaces inside the quoted values: version=" 1.0" is not a valid
    # XML declaration.
    doc.asis('<?xml version="1.0" encoding="UTF-8"?>')
    # Pass the version as a real attribute, not a pre-formatted string.
    with tag('tmx', version="1.4"):
        with tag('header'):
            # TMX dates use the ISO 8601 basic form in UTC.
            doc.attr(creationdate=datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ"))
            doc.attr(srclang=srcLang)
            doc.attr(adminlang=srcLang)
            doc.attr(('o-tmf', "Unknown"))
            doc.attr(segtype="sentence")
            doc.attr(creationtool="Pandas")
            doc.attr(creationtoolversion="1.0")
            doc.attr(datatype="PlainText")
        with tag('body'):
            for x, y in pairs:
                with tag('tu'):
                    with tag('tuv'):
                        doc.attr(('xml:lang', srcLang))
                        with tag('seg'):
                            text(x)
                    with tag('tuv'):
                        doc.attr(('xml:lang', destLang))
                        with tag('seg'):
                            text(y)
    return doc.getvalue()


def convert_to_tmx(srcLang, destLang, df, number_of_splits):
    """Yield the rows of ``df`` as a series of TMX 1.4 documents.

    The frame is cut into ``number_of_splits`` chunks of
    ``len(df) // number_of_splits`` rows each, followed by one extra chunk
    with the remaining rows. Chunks without rows are not emitted, so at most
    ``number_of_splits + 1`` documents are produced and an empty frame
    produces none.

    Args:
        srcLang: Source language code; written to the header and to the
            ``xml:lang`` attribute of the first ``<tuv>`` of every unit.
        destLang: Target language code; written to the ``xml:lang`` attribute
            of the second ``<tuv>`` of every unit.
        df: Frame holding one column per language. The columns named
            ``srcLang`` / ``destLang`` are used when present; otherwise the
            historical ``'en'`` / ``'pt'`` columns are read.
        number_of_splits: Number of equally sized chunks; must be >= 1.

    Yields:
        One TMX document per non-empty chunk, as a string.

    Raises:
        ValueError: If ``number_of_splits`` is smaller than 1.
    """
    if number_of_splits < 1:
        raise ValueError("number_of_splits must be a positive integer")

    total_rows = df.shape[0]
    rows_in_each_file = total_rows // number_of_splits
    print(0, total_rows, rows_in_each_file)

    src_col = srcLang if srcLang in df else 'en'
    dest_col = destLang if destLang in df else 'pt'

    # Index number_of_splits is the trailing chunk that takes the remainder.
    for index in range(number_of_splits + 1):
        start = index * rows_in_each_file
        if index == number_of_splits:
            stop = total_rows
        else:
            stop = start + rows_in_each_file
        if start >= stop:
            continue
        # Positional slicing works whatever index labels the frame carries.
        chunk = df.iloc[start:stop]
        yield _render_tmx_document(
            srcLang, destLang, zip(chunk[src_col], chunk[dest_col])
        )
0 Pyke to plumber command.
1 Current location, sector zed niner.
2 Approaching location of unidentified spacecraft.
3 Copy that, pyke.
4 Do you have visual?
...
26805609 You gonna get me a cape that says "Super Dad"?
26805610 Human behavior is less like a chicken and more...
26805611 No matter how far an albatross flies, it alway...
26805612 Unless it flies too close to the water and get...
26805613 But, still, I had to let her pass
Name: en, Length: 26805614, dtype: object
# Trial run: split the first 99993 rows and write one file per document.
df1 = df.head(99993)
splits = convert_to_tmx("en", "pt", df1, 10)
for i, split in enumerate(splits):
    # The documents declare UTF-8, so write them with that encoding rather
    # than the platform default.
    with open("./test/en-pt-cleaned_" + str(i) + ".tmx", "w", encoding="utf-8") as f:
        f.write(split)
0 99993 9999
# Full run: renumber the index 0..n-1, then split the whole frame and write
# one file per document.
df = df.reset_index(drop=True)
splits = convert_to_tmx("en", "pt", df, 1000)
for i, split in enumerate(splits):
    # The documents declare UTF-8, so write them with that encoding rather
    # than the platform default.
    with open("./test/en-pt-cleaned_" + str(i) + ".tmx", "w", encoding="utf-8") as f:
        f.write(split)
0 26805614 26805
# NOTE(review): this call passes three arguments and writes the return value
# directly, which matches the single-document convert_to_tmx(srcLang,
# destLang, df) variant, not the four-argument generator variant; make sure
# the single-document definition is the one in scope when this runs.
# The document declares UTF-8, so write it with that encoding rather than the
# platform default.
with open("en-pt-cleaned.tmx", "w", encoding="utf-8") as f:
    f.write(convert_to_tmx("en", "pt", df_raw_sorted_src_length))