# rulat: transliterate the Russian text of an EPUB into a custom Latin
# orthography. Requires pymorphy2 and nltk (pip install pymorphy2 nltk).
import pymorphy2
import re
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt')

# Per-letter Cyrillic -> Latin correspondences. The hard sign maps to nothing;
# the soft sign maps to 'q'/'Q'.
one_letter_correspondences = {
    'А': 'A',
    'Б': 'B',
    'В': 'V',
    'Г': 'G',
    'Д': 'D',
    'Е': 'E',
    'Ё': 'Jo',
    'Ж': 'X',
    'З': 'Z',
    'И': 'I',
    'Й': 'J',
    'К': 'K',
    'Л': 'L',
    'М': 'M',
    'Н': 'N',
    'О': 'O',
    'П': 'P',
    'Р': 'R',
    'С': 'S',
    'Т': 'T',
    'У': 'U',
    'Ф': 'F',
    'Х': 'H',
    'Ц': 'Ts',
    'Ч': 'C',
    'Ш': 'W',
    'Щ': 'Wq',
    'Ъ': '',
    'Ы': 'Y',
    'Ь': 'Q',
    'Э': 'Je',
    'Ю': 'Ju',
    'Я': 'Ja',
    'а': 'a',
    'б': 'b',
    'в': 'v',
    'г': 'g',
    'д': 'd',
    'е': 'e',
    'ё': 'jo',
    'ж': 'x',
    'з': 'z',
    'и': 'i',
    'й': 'j',
    'к': 'k',
    'л': 'l',
    'м': 'm',
    'н': 'n',
    'о': 'o',
    'п': 'p',
    'р': 'r',
    'с': 's',
    'т': 't',
    'у': 'u',
    'ф': 'f',
    'х': 'h',
    'ц': 'ts',
    'ч': 'c',
    'ш': 'w',
    'щ': 'wq',
    'ъ': '',
    'ы': 'y',
    'ь': 'q',
    'э': 'je',
    'ю': 'ju',
    'я': 'ja'
}

# Letter clusters that must be replaced before the per-letter pass, so the
# cluster spellings win over letter-by-letter transliteration.
mult_letter_correspondences = {
    'жи': 'xy',
    'Жи': 'Xy',
    'ши': 'wy',
    'Ши': 'Wy',
    'чю': 'cu',
    'Чю': 'Cu',
    'цы': 'tsy',
    'Цы': 'Tsy',
    'ци': 'tsy',
    'Ци': 'Tsy',
    'шю': 'wu',
    'Шю': 'Wu',
    'щю': 'wqu',
    'Щю': 'Wqu',
    'ЖИ': 'XY',
    'ШИ': 'WY',
    'ЧЮ': 'CU',
    'ЦЫ': 'TSY',
    'ЦИ': 'TSY',
    'ШЮ': 'WU',
    'ЩЮ': 'WQU',
}
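# Illustrative note (not part of the original comments): multi_letters_repl()
# applies this table before the per-letter pass, so cluster spellings win.
# For example, assuming the tables above: 'жи' letter-by-letter would give
# 'xi', but the cluster maps to 'xy', so final_processing('жизнь') is
# expected to yield 'xyznq'.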
def prepare_text(text):
    """Normalise the Russian text before transliteration."""
    morph = pymorphy2.MorphAnalyzer()
    new_text = []
    for word in word_tokenize(text, language='russian'):
        parsed_word = morph.parse(word)[0]
        lower_word = parsed_word.word  # pymorphy2 returns the lowercased form
        # Feminine nouns like "мышь"/"ночь": drop the mute trailing soft sign
        # after a hushing consonant or "ц". (The length guard avoids an
        # IndexError on a bare "ь" token.)
        if (parsed_word.tag.gender == "femn"
                and len(lower_word) > 1
                and lower_word.endswith("ь")
                and lower_word[-2] in "чщжшц"):
            new_text.append(word[:-1])
        # Infinitives: respell reflexive "-ться" as "-тся".
        elif parsed_word.tag.POS == 'INFN':
            new_text.append(word.replace("ться", "тся"))
        else:
            new_text.append(word)
    result = " ".join(new_text)
    # Normalise nonstandard spellings ("жы/шы/цы" -> "жи/ши/ци", "шю" -> "шу")
    # so the cluster table above can catch them, then re-attach punctuation
    # that the tokenizer split off.
    result = re.sub(r"([цжш])ы", r"\1и", result)
    result = re.sub(r"(ш)ю", r"\1у", result)
    result = re.sub(r"\s+([.,!?])", r"\1", result)
    return result
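# Expected behaviour, illustrative (assumes pymorphy2's stock dictionaries
# tag these words as usual):
#   prepare_text('ночь улыбаться')  ->  'ноч улыбатся'
#   (soft sign dropped after 'ч'; reflexive '-ться' respelled '-тся')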
def is_russian_letter(char):
    # Russian letters sit at U+0410..U+044F, plus Ё (U+0401) and ё (U+0451).
    # Checking this exact subset rather than the whole Cyrillic block
    # (U+0400..U+04FF) keeps process_text() from raising a KeyError on
    # non-Russian Cyrillic letters.
    code_point = ord(char)
    return 0x0410 <= code_point <= 0x044F or code_point in (0x0401, 0x0451)
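# e.g. is_russian_letter('ж') -> True, is_russian_letter('w') -> False,
# and non-Russian Cyrillic such as Ukrainian 'є' -> False.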
def multi_letters_repl(text):
    # Apply the cluster table before the per-letter pass.
    for key, value in mult_letter_correspondences.items():
        text = text.replace(key, value)
    return text
def process_text(text):
    """Transliterate letter by letter, preserving the case pattern."""
    processed_text = ""
    for i, char in enumerate(text):
        if not is_russian_letter(char):
            processed_text += char
            continue
        latin = one_letter_correspondences[char]
        if char.islower():
            processed_text += latin
        else:
            # Uppercase letter: if a neighbouring letter is also uppercase
            # (an all-caps word), uppercase the whole multi-letter output;
            # otherwise keep it title-cased ('Ц' -> 'Ts', not 'TS').
            prev_upper = i > 0 and text[i - 1].isupper()
            next_upper = i + 1 < len(text) and text[i + 1].isupper()
            processed_text += latin.upper() if (prev_upper or next_upper) else latin
    return processed_text
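# Case handling, illustrative: process_text('Щука') -> 'Wquka' (title case),
# while process_text('ЩУКА') -> 'WQUKA', because an uppercase neighbour
# promotes the whole multi-letter output to uppercase.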
def final_processing(text):
    processed_text = prepare_text(text)
    processed_text = multi_letters_repl(processed_text)
    processed_text = process_text(processed_text)
    return processed_text
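# Minimal usage sketch (assumes the nltk 'punkt' model was downloaded above
# and pymorphy2 can analyse the words; the output shown is the expected
# result, not a recorded one):
#   print(final_processing('Жизнь хороша!'))   # -> 'Xyznq horowa!'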
################################################################################

from html.entities import name2codepoint
from html.parser import HTMLParser
from zipfile import ZipFile
from math import ceil, log
import argparse
import shutil
import string
import re
import os

#################################
file_path = '/content/91317318.epub'  # set this to your EPUB file
#################################
class MyHTMLParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        global data_html
        attributes = []
        for attr in attrs:
            attributes.append(attr)
        data_html.append((("Start tag:", tag), ("attr:", attributes)))

    def handle_endtag(self, tag):
        global data_html
        data_html.append(("End tag:", tag))

    def handle_data(self, data):
        global data_html
        data_html.append(("Data:", data))

    def handle_comment(self, data):
        pass

    # With the default convert_charrefs=True these two callbacks are not
    # invoked (entities arrive already decoded via handle_data), but record
    # the decoded character anyway rather than silently dropping it.
    def handle_entityref(self, name):
        data_html.append(("Data:", chr(name2codepoint[name])))

    def handle_charref(self, name):
        if name.startswith('x'):
            data_html.append(("Data:", chr(int(name[1:], 16))))
        else:
            data_html.append(("Data:", chr(int(name))))

    def handle_decl(self, data):
        pass
# def bolding(text):
#     parts = re.findall(r'\w+|[^\s\w]+', text)
#     new_text = ''
#     for part in parts:
#         if part in string.punctuation or part in string.digits:
#             new_text += part
#         else:
#             if len(part) <= 3:
#                 new_part = f"<b>{part[0]}</b>"
#                 new_part += ''.join(part[1:])
#             else:
#                 point = ceil(log(len(part), 2))
#                 new_part = f"<b>{part[0:point]}</b>"
#                 new_part += ''.join(part[point:])
#             new_text += ' ' + new_part
#     return new_text

####################################
# parser = argparse.ArgumentParser()
# parser.add_argument("epubfile", help="put a path to your epub file in here")
# args = parser.parse_args()
# file_path = args.epubfile
file_name = os.path.basename(file_path)
epub_path = os.getcwd() + '/rulat_' + file_name
unzip_path_folder = file_name + '_zip/'
unzip_path = os.getcwd() + '/' + unzip_path_folder

print("Unzipping", file_name)
# An EPUB is a ZIP container; extract it so the XHTML files can be edited.
try:
    with ZipFile(file_path, 'r') as zipObj:
        zipObj.extractall(unzip_path)
except FileNotFoundError:
    # Fall back to treating file_path as relative to the working directory.
    with ZipFile(os.getcwd() + '/' + file_path, 'r') as zipObj:
        zipObj.extractall(unzip_path)
print('Preparing rulat translation...')

####################################
first_tags = """<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>\n"""

htmls = []
# r=root, d=directories, f=files
for r, d, f in os.walk(unzip_path):
    for hfile in f:
        if hfile.endswith('html'):
            htmls.append(os.path.join(r, hfile))
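# Note: EPUB content documents usually end in '.html' or '.xhtml'; the
# endswith('html') check above matches both.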
for html in htmls:
    with open(html, 'r', encoding='utf-8') as f:
        html_data = f.read()

    data_html = []
    parser = MyHTMLParser()
    parser.feed(html_data)

    full_html = ''
    for html_part in data_html:
        if html_part[0] == 'Data:':
            # final_processing() already calls prepare_text(), so feed it
            # the raw text node directly.
            full_html += final_processing(html_part[1])
        if len(html_part) == 2 and html_part[0][0] == 'Start tag:':
            tag = '<' + html_part[0][1]
            full_attr = []
            for attr in html_part[1][1]:
                if attr[1] is None:
                    full_attr.append(attr[0])  # boolean attribute
                else:
                    full_attr.append(attr[0] + f'="{attr[1]}"')
            full_attr = ' '.join(full_attr)  # attributes are space-separated
            if full_attr:
                tag += ' ' + full_attr
            tag += '>'
            full_html += tag
        if html_part[0] == 'End tag:':
            full_html += f"</{html_part[1]}>"

    full_html = first_tags + full_html
    with open(html, 'w', encoding='utf-8') as f:
        f.write(full_html)
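# For example, a recorded start tag (('Start tag:', 'p'), ('attr:', [('class', 'x')]))
# is re-serialised by the branch above as '<p class="x">'.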
####################################
# Re-zip the edited tree and strip the '.zip' suffix to get the new EPUB.
os.chdir(unzip_path)
shutil.make_archive(epub_path, 'zip', './')
os.rename(epub_path + '.zip', epub_path)
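# Usage sketch: either set file_path above, or re-enable the argparse block
# and save this file under a name of your choosing (e.g. rulat.py, a
# hypothetical name), then run:
#   python rulat.py mybook.epub
# The transliterated EPUB is written as 'rulat_<original file name>' in the
# directory the script was started from.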