Skip to content

Instantly share code, notes, and snippets.

@Navaneethsen
Created August 13, 2021 16:11
Show Gist options
  • Select an option

  • Save Navaneethsen/8df0f3200f98849eba1f1a615de1a495 to your computer and use it in GitHub Desktop.

Select an option

Save Navaneethsen/8df0f3200f98849eba1f1a615de1a495 to your computer and use it in GitHub Desktop.
TMX Parser - SAX - Convert to Dataframe

TMX Parser - Create DF

# import sys
# !{sys.executable} -m pip install yattag
import xml.sax
import pandas as pd
import matplotlib.pyplot as plt
import csv
class TMXFileHandler(xml.sax.ContentHandler):
    """SAX handler that collects segment pairs from the ``<tu>`` elements of a TMX file.

    For every ``<tu>`` element the text of its first two ``<seg>`` elements is
    appended to ``rows`` as ``{srcLang: first_text, destLang: second_text}``.

    Args:
        srcLang: Dict key used for the first segment of each unit. It is
            replaced by the ``srclang`` attribute of the ``<header>`` element
            when the file provides one.
        destLang: Dict key used for the second segment of each unit.
        rows: List that receives one dict per translation unit. It is mutated
            in place.

    Attributes:
        skipped_units: Number of ``<tu>`` elements that were ignored because
            they did not contain two non-empty segments.
    """

    def __init__(self, srcLang, destLang, rows):
        super().__init__()
        self.rows = rows
        self.inside_translation_unit_tag = False
        # Completed segment texts of the <tu> currently being read.
        self.text_content = []
        self.srcLang = srcLang
        self.destLang = destLang
        self.skipped_units = 0
        # None while outside a <seg>; otherwise the text chunks received so far.
        self._segment_chunks = None

    def startElement(self, tag, attributes):
        """Track entry into ``<header>``, ``<tu>`` and ``<seg>`` elements."""
        if tag == "header":
            # Keep the constructor value when the header has no srclang.
            self.srcLang = attributes.get('srclang', self.srcLang)
        elif tag == "tu":
            self.inside_translation_unit_tag = True
            self.text_content = []
        elif tag == "seg" and self.inside_translation_unit_tag:
            self._segment_chunks = []

    def endElement(self, tag):
        """Finish a segment, or emit a row when a translation unit closes."""
        if tag == 'seg' and self._segment_chunks is not None:
            # Join first, strip afterwards: stripping each chunk separately
            # would drop spaces that sit on a chunk boundary.
            self.text_content.append(''.join(self._segment_chunks).strip())
            self._segment_chunks = None
        elif tag == 'tu':
            self.inside_translation_unit_tag = False
            segments = self.text_content
            self.text_content = []
            if len(segments) >= 2 and segments[0] and segments[1]:
                self.rows.append({
                    self.srcLang: segments[0],
                    self.destLang: segments[1],
                })
            else:
                # Incomplete unit: skip it instead of aborting the whole parse.
                self.skipped_units += 1
        elif tag == 'body':
            self.text_content = []

    def characters(self, content):
        """Buffer text that belongs to the ``<seg>`` currently open.

        A parser may report one text node through several calls, so the
        chunks are only combined once the segment ends.
        """
        if self._segment_chunks is not None:
            self._segment_chunks.append(content)
def read_rows(path="./translations_backup/en-pt_1.tmx", srcLang='en', destLang='pt'):
    """Parse a TMX file with ``TMXFileHandler`` and return the collected rows.

    Args:
        path: TMX file to parse. Defaults to the file this notebook was
            written against.
        srcLang: Source-language key handed to ``TMXFileHandler``.
        destLang: Target-language key handed to ``TMXFileHandler``.

    Returns:
        The list of row dicts filled in by the handler during parsing.
    """
    rows = []
    handler = TMXFileHandler(srcLang, destLang, rows)
    xml.sax.parse(path, handler)
    print("Finished XML")
    return rows
def make_df(rows):
    """Build a DataFrame from ``rows`` and then empty ``rows`` in place.

    Args:
        rows: List of row dicts. It is cleared before returning, so the
            caller's list is empty afterwards.

    Returns:
        A ``pandas.DataFrame`` constructed from the rows.
    """
    print("Making DF...")
    frame = pd.DataFrame(rows)
    print("Cleaning up rows...")
    # Drop the list's contents in place now that the frame has been built.
    del rows[:]
    print("Completing...")
    return frame
def make_csv(rows, filename, write_header=False):
    """Write ``rows`` to a CSV file and then empty ``rows`` in place.

    Args:
        rows: List of dicts. The keys of the first dict define the column
            order. The list is cleared before returning.
        filename: Path of the CSV file to create or overwrite.
        write_header: When true, emit the column names as the first line.
            Defaults to ``False`` so the output stays header-less as before.
    """
    print("Making CSV...")
    # An empty list has no first row to take the field names from.
    fieldnames = list(rows[0].keys()) if rows else []
    # newline='' lets the csv module control line endings itself; the explicit
    # encoding keeps non-ASCII text independent of the platform default.
    with open(filename, 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames)
        if write_header and fieldnames:
            writer.writeheader()
        writer.writerows(rows)
    print("Cleaning up rows...")
    rows[:] = []
    print("Completing...")
# Parse the TMX file and load the resulting rows into a DataFrame.
df = make_df(read_rows())
Finished XML
Making DF...
Cleaning up rows...
Completing...
# Parse the TMX file again and dump the rows to test.csv; make_csv empties
# the row list once the file has been written.
make_csv(read_rows(), 'test.csv')
Finished XML
Making CSV...
Cleaning up rows...
Completing...
df.count()
en    26805614
pt    26805614
dtype: int64
df.head()
<style scoped> .dataframe tbody tr th:only-of-type { vertical-align: middle; }
.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}
</style>
en pt
0 Pyke to plumber command. Pyke para o Comando dos Encanadores.
1 Current location, sector zed niner. Localização actual: sector 09.
2 Approaching location of unidentified spacecraft. A aproximar-me da localização da nave não iden...
3 Copy that, pyke. Entendido, Pyke.
4 Do you have visual? Consegues vê-la?

Write Dataframe to TMX File

from yattag import Doc
from datetime import datetime

def convert_to_tmx(srcLang, destLang, df):
    """Serialize the sentence pairs of ``df`` as a single TMX 1.4 document.

    Args:
        srcLang: Source language code. Written to the header ``srclang`` and
            ``adminlang`` attributes and to ``xml:lang`` of the first
            ``<tuv>`` of every unit.
        destLang: Target language code. Written to ``xml:lang`` of the second
            ``<tuv>`` of every unit.
        df: DataFrame holding the source text in a column named ``srcLang``
            and the target text in a column named ``destLang``. When such a
            column is absent, the ``'en'`` / ``'pt'`` column is used instead,
            as earlier versions always did.

    Returns:
        The complete TMX document as a string.
    """
    # Function-scope import: the file only imports ``datetime`` itself.
    from datetime import timezone

    src_column = srcLang if srcLang in df.columns else 'en'
    dest_column = destLang if destLang in df.columns else 'pt'

    doc, tag, text = Doc().tagtext()
    doc.asis("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>")

    with tag('tmx', version="1.4"):
        with tag('header'):
            # TMX dates use the ISO 8601 basic form YYYYMMDDThhmmssZ (UTC).
            doc.attr(creationdate=datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ"))
            doc.attr(srclang=srcLang)
            doc.attr(adminlang=srcLang)
            doc.attr(('o-tmf', "Unknown"))
            doc.attr(segtype="sentence")
            doc.attr(creationtool="Pandas")
            doc.attr(creationtoolversion="1.0")
            doc.attr(datatype="PlainText")

        with tag('body'):
            for src_text, dest_text in zip(df[src_column], df[dest_column]):
                with tag('tu'):
                    with tag('tuv'):
                        doc.attr(('xml:lang', srcLang))
                        with tag('seg'):
                            text(src_text)
                    with tag('tuv'):
                        doc.attr(('xml:lang', destLang))
                        with tag('seg'):
                            text(dest_text)

    return doc.getvalue()

Write Dataframe to TMX File with splits

from yattag import Doc
from datetime import datetime

def _render_tmx_chunk(srcLang, destLang, src_values, dest_values):
    """Return one TMX 1.4 document containing the given parallel segments."""
    # Function-scope import: the file only imports ``datetime`` itself.
    from datetime import timezone

    doc, tag, text = Doc().tagtext()
    doc.asis("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>")

    with tag('tmx', version="1.4"):
        with tag('header'):
            # TMX dates use the ISO 8601 basic form YYYYMMDDThhmmssZ (UTC).
            doc.attr(creationdate=datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ"))
            doc.attr(srclang=srcLang)
            doc.attr(adminlang=srcLang)
            doc.attr(('o-tmf', "Unknown"))
            doc.attr(segtype="sentence")
            doc.attr(creationtool="Pandas")
            doc.attr(creationtoolversion="1.0")
            doc.attr(datatype="PlainText")

        with tag('body'):
            for src_text, dest_text in zip(src_values, dest_values):
                with tag('tu'):
                    with tag('tuv'):
                        doc.attr(('xml:lang', srcLang))
                        with tag('seg'):
                            text(src_text)
                    with tag('tuv'):
                        doc.attr(('xml:lang', destLang))
                        with tag('seg'):
                            text(dest_text)

    return doc.getvalue()


def convert_to_tmx(srcLang, destLang, df, number_of_splits):
    """Yield the rows of ``df`` as a series of TMX 1.4 documents.

    The rows are cut into ``number_of_splits`` chunks of
    ``len(df) // number_of_splits`` rows each, followed by one extra chunk
    with the remaining rows. Chunks that would contain no rows are not
    yielded.

    Args:
        srcLang: Source language code written to each document.
        destLang: Target language code written to each document.
        df: DataFrame holding the source text in a column named ``srcLang``
            and the target text in a column named ``destLang``. When such a
            column is absent, the ``'en'`` / ``'pt'`` column is used instead,
            as earlier versions always did.
        number_of_splits: Number of equally sized chunks; must be at least 1.

    Yields:
        One TMX document string per non-empty chunk, in row order.

    Raises:
        ValueError: If ``number_of_splits`` is smaller than 1.
    """
    if number_of_splits < 1:
        raise ValueError("number_of_splits must be at least 1")

    src_column = srcLang if srcLang in df.columns else 'en'
    dest_column = destLang if destLang in df.columns else 'pt'
    src_values = df[src_column]
    dest_values = df[dest_column]

    total_rows = df.shape[0]
    rows_in_each_file = total_rows // number_of_splits

    print(0, total_rows, rows_in_each_file)

    # One pass more than number_of_splits: the last one picks up the remainder.
    for chunk_index in range(number_of_splits + 1):
        start = chunk_index * rows_in_each_file
        if chunk_index == number_of_splits:
            stop = total_rows
        else:
            stop = start + rows_in_each_file
        if start >= stop:
            continue

        # Positional slicing, so the result does not depend on the frame
        # having a default 0..n-1 index.
        yield _render_tmx_chunk(
            srcLang,
            destLang,
            src_values.iloc[start:stop],
            dest_values.iloc[start:stop],
        )
df['en']
0                                    Pyke to plumber command.
1                         Current location, sector zed niner.
2            Approaching location of unidentified spacecraft.
3                                            Copy that, pyke.
4                                         Do you have visual?
                                  ...
26805609       You gonna get me a cape that says "Super Dad"?
26805610    Human behavior is less like a chicken and more...
26805611    No matter how far an albatross flies, it alway...
26805612    Unless it flies too close to the water and get...
26805613                    But, still, I had to let her pass
Name: en, Length: 26805614, dtype: object
df1 = df.head(99993)

splits = convert_to_tmx("en", "pt",  df1, 10)

# Write every generated TMX document to its own numbered file.
for i, split in enumerate(splits):
    # The documents declare encoding="UTF-8", so write them as UTF-8
    # rather than with the platform's default encoding.
    with open("./test/en-pt-cleaned_" + str(i) + ".tmx", "w", encoding="utf-8") as f:
        f.write(split)
0 99993 9999
# df1 = df.head(100000)

df = df.reset_index(drop=True)
splits = convert_to_tmx("en", "pt",  df, 1000)

# Write every generated TMX document to its own numbered file.
for i, split in enumerate(splits):
    # The documents declare encoding="UTF-8", so write them as UTF-8
    # rather than with the platform's default encoding.
    with open("./test/en-pt-cleaned_" + str(i) + ".tmx", "w", encoding="utf-8") as f:
        f.write(split)
0 26805614 26805
# NOTE(review): this call uses the three-argument convert_to_tmx; the
# four-argument generator of the same name replaces it if that cell has been
# run more recently - confirm which definition is active.
# The document declares encoding="UTF-8", so write the file as UTF-8.
with open("en-pt-cleaned.tmx", "w", encoding="utf-8") as f:
    f.write(convert_to_tmx("en", "pt",  df_raw_sorted_src_length))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment