@dobrosketchkun
Last active November 22, 2023 22:44
import pymorphy2
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
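# Note (an assumption about your NLTK version): newer NLTK releases may also
# require the 'punkt_tab' resource for word_tokenize; if a LookupError
# mentions it, run nltk.download('punkt_tab') as well.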
one_letter_correspondences = {
    'А': 'A',  'Б': 'B',  'В': 'V',  'Г': 'G',  'Д': 'D',  'Е': 'E',
    'Ё': 'Jo', 'Ж': 'X',  'З': 'Z',  'И': 'I',  'Й': 'J',  'К': 'K',
    'Л': 'L',  'М': 'M',  'Н': 'N',  'О': 'O',  'П': 'P',  'Р': 'R',
    'С': 'S',  'Т': 'T',  'У': 'U',  'Ф': 'F',  'Х': 'H',  'Ц': 'Ts',
    'Ч': 'C',  'Ш': 'W',  'Щ': 'Wq', 'Ъ': '',   'Ы': 'Y',  'Ь': 'Q',
    'Э': 'Je', 'Ю': 'Ju', 'Я': 'Ja',
    'а': 'a',  'б': 'b',  'в': 'v',  'г': 'g',  'д': 'd',  'е': 'e',
    'ё': 'jo', 'ж': 'x',  'з': 'z',  'и': 'i',  'й': 'j',  'к': 'k',
    'л': 'l',  'м': 'm',  'н': 'n',  'о': 'o',  'п': 'p',  'р': 'r',
    'с': 's',  'т': 't',  'у': 'u',  'ф': 'f',  'х': 'h',  'ц': 'ts',
    'ч': 'c',  'ш': 'w',  'щ': 'wq', 'ъ': '',   'ы': 'y',  'ь': 'q',
    'э': 'je', 'ю': 'ju', 'я': 'ja',
}
mult_letter_correspondences = {
    'жи': 'xy',  'Жи': 'Xy',  'ЖИ': 'XY',
    'ши': 'wy',  'Ши': 'Wy',  'ШИ': 'WY',
    'чю': 'cu',  'Чю': 'Cu',  'ЧЮ': 'CU',
    'цы': 'tsy', 'Цы': 'Tsy', 'ЦЫ': 'TSY',
    'ци': 'tsy', 'Ци': 'Tsy', 'ЦИ': 'TSY',
    'шю': 'wu',  'Шю': 'Wu',  'ШЮ': 'WU',
    'щю': 'wqu', 'Щю': 'Wqu', 'ЩЮ': 'WQU',
}
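# Note the two-pass design: these digraphs are replaced before the
# single-letter pass, e.g. 'щю' -> 'wqu' via the table above, whereas
# letter-by-letter substitution alone would give 'wq' + 'ju' = 'wqju'.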
def prepare_text(text):
    morph = pymorphy2.MorphAnalyzer()
    new_text = []
    for word in word_tokenize(text, language='russian'):
        parsed_word = morph.parse(word)[0]
        # Feminine nouns ending in a soft sign after a hushing consonant
        # (мышь, ночь) drop the final "ь".
        if parsed_word.tag.gender == "femn":
            if (parsed_word.word.endswith("ь") and len(parsed_word.word) >= 2
                    and parsed_word.word[-2] in ['ч', 'щ', 'ж', 'ш', 'ц']):
                new_text.append(word[:-1])
            else:
                new_text.append(word)
        elif parsed_word.tag.POS == 'INFN':
            # Reflexive infinitive ending "ться" is respelled "тся".
            new_text.append(word.replace("ться", "тся"))
        else:
            new_text.append(word)
    result = " ".join(new_text)
    # Normalize "цы/жы/шы" to "ци/жи/ши" and "шю" to "шу".
    result = re.sub(r"(ц|ж|ш)ы", r"\1и", result)
    result = re.sub(r"(ш)ю", r"\1у", result)
    # Re-attach punctuation that the tokenizer split off.
    result = re.sub(r"\s+([.,!?])", r"\1", result)
    return result
def is_russian_letter(char):
    # True if the character falls within the Unicode Cyrillic block.
    russian_range = (0x0400, 0x04FF)
    code_point = ord(char)
    return russian_range[0] <= code_point <= russian_range[1]
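# For example, is_russian_letter('ж') is True and is_russian_letter('w') is
# False. Note the range covers the whole Cyrillic block, so non-Russian
# Cyrillic letters (e.g. Ukrainian 'ї') also pass the check.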
def multi_letters_repl(text):
    for key, value in mult_letter_correspondences.items():
        text = text.replace(key, value)
    return text
def process_text(text):
    processed_text = ""
    for i in range(len(text)):
        char = text[i]
        if char.isalpha() and is_russian_letter(char):
            if char.islower():
                processed_text += one_letter_correspondences[char]
            else:
                # Decide whether a neighbouring letter is uppercase, so that
                # multi-letter mappings ('Щ' -> 'Wq') become all-caps ('WQ')
                # inside all-caps words.
                if i == 0:
                    neighbour_upper = i + 1 < len(text) and text[i + 1].isupper()
                elif i == len(text) - 1:
                    neighbour_upper = text[i - 1].isupper()
                else:
                    neighbour_upper = text[i - 1].isupper() or text[i + 1].isupper()
                if neighbour_upper:
                    processed_text += one_letter_correspondences[char].upper()
                else:
                    processed_text += one_letter_correspondences[char]
        else:
            processed_text += char
    return processed_text
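# Case-handling example: process_text('Щи') -> 'Wqi', while
# process_text('ЩИ') -> 'WQI'; an uppercase neighbour switches the
# multi-letter mapping to its all-caps form.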
def final_processing(text):
    processed_text = prepare_text(text)
    processed_text = multi_letters_repl(processed_text)
    processed_text = process_text(processed_text)
    return processed_text
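# A minimal usage sketch (the sample phrase is an illustrative input, not
# from the original gist):
# >>> final_processing('Щука и ёж')
# 'Wquka i jox'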
################################################################################
from html.entities import name2codepoint
from html.parser import HTMLParser
from zipfile import ZipFile
from math import ceil, log
import argparse
import shutil
import string
import os
#################################
file_path = '/content/91317318.epub'
#################################
class MyHTMLParser(HTMLParser):
    # Records the document as a flat event list in the global `data_html`:
    # start tags with their attributes, end tags, and text nodes.
    def handle_starttag(self, tag, attrs):
        global data_html
        attributes = []
        for attr in attrs:
            attributes.append(attr)
        data_html.append((("Start tag:", tag), ("attr:", attributes)))

    def handle_endtag(self, tag):
        global data_html
        data_html.append(("End tag:", tag))

    def handle_data(self, data):
        global data_html
        data_html.append(("Data:", data))

    def handle_comment(self, data):
        # Comments are dropped.
        pass

    # With the default convert_charrefs=True, the two handlers below are not
    # called; entities arrive already decoded in handle_data.
    def handle_entityref(self, name):
        c = chr(name2codepoint[name])

    def handle_charref(self, name):
        if name.startswith('x'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))

    def handle_decl(self, data):
        # The doctype is dropped here and re-added via `first_tags` below.
        pass
# def bolding(text):
#     parts = re.findall(r'\w+|[^\s\w]+', text)
#     new_text = ''
#     for part in parts:
#         if part in string.punctuation or part in string.digits:
#             new_text += part
#         else:
#             if len(part) <= 3:
#                 new_part = f"<b>{part[0]}</b>"
#                 new_part += ''.join(part[1:])
#                 new_text += ' ' + new_part
#             else:
#                 point = ceil(log(len(part), 2))
#                 new_part = f"<b>{part[0:point]}</b>"
#                 new_part += ''.join(part[point:])
#                 new_text += ' ' + new_part
#     return new_text
####################################
# parser = argparse.ArgumentParser()
# parser.add_argument("epubfile", help="put a path to your epub file in here")
# args = parser.parse_args()
# file_path = args.epubfile
file_name = os.path.basename(file_path)
epub_path = os.getcwd() + '/rulat_' + file_name
unzip_path_folder = file_name + '_zip/'
unzip_path = os.getcwd() + '/' + unzip_path_folder
print("Unzipping", file_name)
try:
    with ZipFile(file_path, 'r') as zipObj:
        zipObj.extractall(unzip_path)
except FileNotFoundError:
    # Fall back to treating file_path as relative to the working directory.
    with ZipFile(os.getcwd() + '/' + file_path, 'r') as zipObj:
        zipObj.extractall(unzip_path)
print('Preparing rulat translation...')
####################################
first_tags = """<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>\n"""
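# HTMLParser does not replay the XML prolog or doctype (handle_decl is a
# no-op), so this stub is prepended to every rewritten file below.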
htmls = []
# r=root, d=directories, f=files
for r, d, f in os.walk(unzip_path):
    for hfile in f:
        if hfile.endswith('html'):  # matches both .html and .xhtml
            htmls.append(os.path.join(r, hfile))
for html in htmls:
    with open(html, 'r', encoding='utf-8') as f:
        html_data = f.read()
    data_html = []
    parser = MyHTMLParser()
    parser.feed(html_data)
    full_html = ''
    for html_part in data_html:
        if html_part[0] == 'Data:':
            # full_html += f"<b>{html_part[1]}</b>"
            # full_html += bolding(html_part[1])
            # final_processing already runs prepare_text internally, so a
            # single call transliterates the text node.
            full_html += final_processing(html_part[1])
        if len(html_part) == 2 and html_part[0][0] == 'Start tag:':
            tag = '<' + html_part[0][1]
            full_attr = []
            for attr in html_part[1][1]:
                if attr[1] is None:
                    full_attr.append(attr[0])  # boolean attribute, no value
                else:
                    full_attr.append(attr[0] + f'="{attr[1]}"')
            # HTML attributes are space-separated.
            full_attr = ' '.join(full_attr)
            if not full_attr:
                tag += '>'
            else:
                tag += ' ' + full_attr + '>'
            full_html += tag
        if html_part[0] == 'End tag:':
            tag = f"</{html_part[1]}>"
            full_html += tag
    full_html = first_tags + full_html
    with open(html, 'w', encoding='utf-8') as f:
        f.write(full_html)
####################################
os.chdir(unzip_path)
shutil.make_archive(epub_path, 'zip', './')
# make_archive appends ".zip"; strip it so the output is rulat_<name>.
os.rename(epub_path + '.zip', epub_path)
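# Optional sanity check (a sketch, not part of the original flow): confirm
# the repacked file still opens as a zip archive.
# with ZipFile(epub_path) as z:
#     print(z.namelist()[:5])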
################################################################################
# Quick demo of the pipeline on a sample sentence (an illustrative input).
text = 'Ночь тиха. Жизнь хороша!'
processed_text = prepare_text(text)
processed_text = multi_letters_repl(processed_text)
processed_text = process_text(processed_text)
print(text)
print(processed_text)
# Expected output (roughly): Noc tiha. Xyznq horowa!