@dobrosketchkun
Last active November 22, 2023 22:44
import pymorphy2
import re
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
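# Note (an assumption about your NLTK version): newer NLTK releases may also
# require the 'punkt_tab' resource for word_tokenize; if a LookupError
# mentions it, run nltk.download('punkt_tab') as well.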
one_letter_correspondences = {
    'А': 'A',  'Б': 'B',  'В': 'V',  'Г': 'G',  'Д': 'D',  'Е': 'E',
    'Ё': 'Jo', 'Ж': 'X',  'З': 'Z',  'И': 'I',  'Й': 'J',  'К': 'K',
    'Л': 'L',  'М': 'M',  'Н': 'N',  'О': 'O',  'П': 'P',  'Р': 'R',
    'С': 'S',  'Т': 'T',  'У': 'U',  'Ф': 'F',  'Х': 'H',  'Ц': 'Ts',
    'Ч': 'C',  'Ш': 'W',  'Щ': 'Wq', 'Ъ': '',   'Ы': 'Y',  'Ь': 'Q',
    'Э': 'Je', 'Ю': 'Ju', 'Я': 'Ja',
    'а': 'a',  'б': 'b',  'в': 'v',  'г': 'g',  'д': 'd',  'е': 'e',
    'ё': 'jo', 'ж': 'x',  'з': 'z',  'и': 'i',  'й': 'j',  'к': 'k',
    'л': 'l',  'м': 'm',  'н': 'n',  'о': 'o',  'п': 'p',  'р': 'r',
    'с': 's',  'т': 't',  'у': 'u',  'ф': 'f',  'х': 'h',  'ц': 'ts',
    'ч': 'c',  'ш': 'w',  'щ': 'wq', 'ъ': '',   'ы': 'y',  'ь': 'q',
    'э': 'je', 'ю': 'ju', 'я': 'ja',
}
mult_letter_correspondences = {
    'жи': 'xy',  'Жи': 'Xy',  'ЖИ': 'XY',
    'ши': 'wy',  'Ши': 'Wy',  'ШИ': 'WY',
    'чю': 'cu',  'Чю': 'Cu',  'ЧЮ': 'CU',
    'цы': 'tsy', 'Цы': 'Tsy', 'ЦЫ': 'TSY',
    'ци': 'tsy', 'Ци': 'Tsy', 'ЦИ': 'TSY',
    'шю': 'wu',  'Шю': 'Wu',  'ШЮ': 'WU',
    'щю': 'wqu', 'Щю': 'Wqu', 'ЩЮ': 'WQU',
}
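# Note the two-pass design: these digraphs are replaced before the
# single-letter pass, e.g. 'щю' -> 'wqu' via the table above, whereas
# letter-by-letter substitution alone would give 'wq' + 'ju' = 'wqju'.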
def prepare_text(text):
    morph = pymorphy2.MorphAnalyzer()
    new_text = []
    for word in word_tokenize(text, language='russian'):
        parsed_word = morph.parse(word)[0]
        # Feminine nouns ending in a soft sign after a hushing consonant
        # (мышь, ночь) drop the final "ь".
        if parsed_word.tag.gender == "femn":
            if (parsed_word.word.endswith("ь") and len(parsed_word.word) >= 2
                    and parsed_word.word[-2] in ['ч', 'щ', 'ж', 'ш', 'ц']):
                new_text.append(word[:-1])
            else:
                new_text.append(word)
        elif parsed_word.tag.POS == 'INFN':
            # Reflexive infinitive ending "ться" is respelled "тся".
            new_text.append(word.replace("ться", "тся"))
        else:
            new_text.append(word)
    result = " ".join(new_text)
    # Normalize "цы/жы/шы" to "ци/жи/ши" and "шю" to "шу".
    result = re.sub(r"(ц|ж|ш)ы", r"\1и", result)
    result = re.sub(r"(ш)ю", r"\1у", result)
    # Re-attach punctuation that the tokenizer split off.
    result = re.sub(r"\s+([.,!?])", r"\1", result)
    return result
def is_russian_letter(char):
    # True if the character falls within the Unicode Cyrillic block.
    russian_range = (0x0400, 0x04FF)
    code_point = ord(char)
    return russian_range[0] <= code_point <= russian_range[1]
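# For example, is_russian_letter('ж') is True and is_russian_letter('w') is
# False. Note the range covers the whole Cyrillic block, so non-Russian
# Cyrillic letters (e.g. Ukrainian 'ї') also pass the check.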
def multi_letters_repl(text):
    for key, value in mult_letter_correspondences.items():
        text = text.replace(key, value)
    return text
def process_text(text):
    processed_text = ""
    for i in range(len(text)):
        char = text[i]
        if char.isalpha() and is_russian_letter(char):
            if char.islower():
                processed_text += one_letter_correspondences[char]
            else:
                # Decide whether a neighbouring letter is uppercase, so that
                # multi-letter mappings ('Щ' -> 'Wq') become all-caps ('WQ')
                # inside all-caps words.
                if i == 0:
                    neighbour_upper = i + 1 < len(text) and text[i + 1].isupper()
                elif i == len(text) - 1:
                    neighbour_upper = text[i - 1].isupper()
                else:
                    neighbour_upper = text[i - 1].isupper() or text[i + 1].isupper()
                if neighbour_upper:
                    processed_text += one_letter_correspondences[char].upper()
                else:
                    processed_text += one_letter_correspondences[char]
        else:
            processed_text += char
    return processed_text
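# Case-handling example: process_text('Щи') -> 'Wqi', while
# process_text('ЩИ') -> 'WQI'; an uppercase neighbour switches the
# multi-letter mapping to its all-caps form.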
def final_processing(text):
    processed_text = prepare_text(text)
    processed_text = multi_letters_repl(processed_text)
    processed_text = process_text(processed_text)
    return processed_text
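# A minimal usage sketch (the sample phrase is an illustrative input, not
# from the original gist):
# >>> final_processing('Щука и ёж')
# 'Wquka i jox'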
################################################################################
from html.entities import name2codepoint
from html.parser import HTMLParser
from zipfile import ZipFile
from math import ceil, log
import argparse
import shutil
import string
import os
#################################
file_path = '/content/91317318.epub'
#################################
class MyHTMLParser(HTMLParser):
    # Records the document as a flat event list in the global `data_html`:
    # start tags with their attributes, end tags, and text nodes.
    def handle_starttag(self, tag, attrs):
        global data_html
        attributes = []
        for attr in attrs:
            attributes.append(attr)
        data_html.append((("Start tag:", tag), ("attr:", attributes)))

    def handle_endtag(self, tag):
        global data_html
        data_html.append(("End tag:", tag))

    def handle_data(self, data):
        global data_html
        data_html.append(("Data:", data))

    def handle_comment(self, data):
        # Comments are dropped.
        pass

    # With the default convert_charrefs=True, the two handlers below are not
    # called; entities arrive already decoded in handle_data.
    def handle_entityref(self, name):
        c = chr(name2codepoint[name])

    def handle_charref(self, name):
        if name.startswith('x'):
            c = chr(int(name[1:], 16))
        else:
            c = chr(int(name))

    def handle_decl(self, data):
        # The doctype is dropped here and re-added via `first_tags` below.
        pass
# def bolding(text):
#     parts = re.findall(r'\w+|[^\s\w]+', text)
#     new_text = ''
#     for part in parts:
#         if part in string.punctuation or part in string.digits:
#             new_text += part
#         else:
#             if len(part) <= 3:
#                 new_part = f"<b>{part[0]}</b>"
#                 new_part += ''.join(part[1:])
#                 new_text += ' ' + new_part
#             else:
#                 point = ceil(log(len(part), 2))
#                 new_part = f"<b>{part[0:point]}</b>"
#                 new_part += ''.join(part[point:])
#                 new_text += ' ' + new_part
#     return new_text
####################################
# parser = argparse.ArgumentParser()
# parser.add_argument("epubfile", help="put a path to your epub file in here")
# args = parser.parse_args()
# file_path = args.epubfile
file_name = os.path.basename(file_path)
epub_path = os.getcwd() + '/rulat_' + file_name
unzip_path_folder = file_name + '_zip/'
unzip_path = os.getcwd() + '/' + unzip_path_folder
print("Unzipping", file_name)
try:
    with ZipFile(file_path, 'r') as zipObj:
        zipObj.extractall(unzip_path)
except FileNotFoundError:
    # Fall back to treating file_path as relative to the working directory.
    with ZipFile(os.getcwd() + '/' + file_path, 'r') as zipObj:
        zipObj.extractall(unzip_path)
print('Preparing rulat translation...')
####################################
first_tags = """<?xml version='1.0' encoding='utf-8'?>
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.1//EN' 'http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd'>\n"""
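# HTMLParser does not replay the XML prolog or doctype (handle_decl is a
# no-op), so this stub is prepended to every rewritten file below.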
htmls = []
# r=root, d=directories, f=files
for r, d, f in os.walk(unzip_path):
    for hfile in f:
        if hfile.endswith('html'):  # matches both .html and .xhtml
            htmls.append(os.path.join(r, hfile))
for html in htmls:
    with open(html, 'r', encoding='utf-8') as f:
        html_data = f.read()
    data_html = []
    parser = MyHTMLParser()
    parser.feed(html_data)
    full_html = ''
    for html_part in data_html:
        if html_part[0] == 'Data:':
            # full_html += f"<b>{html_part[1]}</b>"
            # full_html += bolding(html_part[1])
            # final_processing already runs prepare_text internally, so a
            # single call transliterates the text node.
            full_html += final_processing(html_part[1])
        if len(html_part) == 2 and html_part[0][0] == 'Start tag:':
            tag = '<' + html_part[0][1]
            full_attr = []
            for attr in html_part[1][1]:
                if attr[1] is None:
                    full_attr.append(attr[0])  # boolean attribute, no value
                else:
                    full_attr.append(attr[0] + f'="{attr[1]}"')
            # HTML attributes are space-separated.
            full_attr = ' '.join(full_attr)
            if not full_attr:
                tag += '>'
            else:
                tag += ' ' + full_attr + '>'
            full_html += tag
        if html_part[0] == 'End tag:':
            tag = f"</{html_part[1]}>"
            full_html += tag
    full_html = first_tags + full_html
    with open(html, 'w', encoding='utf-8') as f:
        f.write(full_html)
####################################
os.chdir(unzip_path)
shutil.make_archive(epub_path, 'zip', './')
# make_archive appends ".zip"; strip it so the output is rulat_<name>.
os.rename(epub_path + '.zip', epub_path)
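# Optional sanity check (a sketch, not part of the original flow): confirm
# the repacked file still opens as a zip archive.
# with ZipFile(epub_path) as z:
#     print(z.namelist()[:5])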
################################################################################
# Quick demo of the pipeline on a sample sentence (an illustrative input).
text = 'Ночь тиха. Жизнь хороша!'
processed_text = prepare_text(text)
processed_text = multi_letters_repl(processed_text)
processed_text = process_text(processed_text)
print(text)
print(processed_text)
# Expected output (roughly): Noc tiha. Xyznq horowa!