Created
January 10, 2019 15:54
-
-
Save Shide/d29efd51703b7ff0d4db909f0c467357 to your computer and use it in GitHub Desktop.
Text converter between unicode and str.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class EnDecoder(object):
    """Text casting helper.

    Converts between byte strings (``str`` on Python 2, ``bytes`` on
    Python 3) and text strings (``unicode`` on Python 2, ``str`` on
    Python 3), trying several encodings and Unicode normalization forms.
    """

    DEFAULT_ENCODING_TYPES = ['ascii', 'utf-8']  # [+aggressive .. -aggressive]
    DEFAULT_NORMALIZE_FORMS = ['NFKD', 'NFD', 'NFKC', 'NFC']  # [-aggressive .. +aggressive]

    @staticmethod
    def _decode(txt, encoding_types=None):
        """
        Decode a byte string with the first encoding that succeeds.

        :param txt: Byte string to decode.
        :param encoding_types: Encoding name or list/tuple of encoding names
            ('ascii', 'utf-8', etc). Defaults to `DEFAULT_ENCODING_TYPES`.
        :return: Tuple `(decoded text, encoding used)`.
        :raises UnicodeDecodeError: If none of the encodings can decode `txt`.
        """
        encoding_types = encoding_types or EnDecoder.DEFAULT_ENCODING_TYPES
        if not isinstance(encoding_types, (list, tuple)):
            encoding_types = [encoding_types]
        for enc in encoding_types:
            try:
                txt_unicode = txt.decode(enc)
                used_enc = enc
                break
            except (UnicodeDecodeError, UnicodeEncodeError):
                # On Python 2, calling .decode() on a `unicode` object implicitly
                # encodes first, hence UnicodeEncodeError is also possible here.
                pass
        else:  # If it reaches here, it has not been possible to decode
            # UnicodeDecodeError requires (encoding, object, start, end, reason);
            # the object must be a byte string.
            raw = txt if isinstance(txt, bytes) else b''
            raise UnicodeDecodeError(
                str(', '.join(encoding_types)), raw, 0, len(raw),
                'Unable to decode the text by none of the encoding types specified.')
        return txt_unicode, used_enc

    @staticmethod
    def _encode(txt, encoding_types=None, normalize_forms=None):
        """
        Normalize and encode a text string.

        Tries every (encoding, normalize form) combination in order and keeps
        the first one that succeeds. Characters that the encoding cannot
        represent are dropped (the 'ignore' error handler is used).

        :param txt: Text string (`unicode`).
        :param encoding_types: Encoding name or list/tuple of encoding names.
            Defaults to `DEFAULT_ENCODING_TYPES`.
        :param normalize_forms: Normalize form or list/tuple of normalize forms.
            Defaults to `DEFAULT_NORMALIZE_FORMS`.
        :return: Tuple `(encoded byte string, encoding used, normalize form used)`.
        :raises UnicodeEncodeError: If no combination works, or if a non-empty
            text is reduced to an empty byte string.
        """
        # Local imports keep this helper self-contained.
        import itertools
        import unicodedata

        encoding_types = encoding_types or EnDecoder.DEFAULT_ENCODING_TYPES
        normalize_forms = normalize_forms or EnDecoder.DEFAULT_NORMALIZE_FORMS
        if not isinstance(encoding_types, (list, tuple)):
            encoding_types = [encoding_types]
        if not isinstance(normalize_forms, (list, tuple)):
            normalize_forms = [normalize_forms]
        for encoding_t, normalize_f in itertools.product(encoding_types, normalize_forms):
            try:
                txt_str = unicodedata.normalize(normalize_f, txt).encode(encoding_t, 'ignore')
                encoding_used, normalize_used = encoding_t, normalize_f
                break
            except UnicodeEncodeError:
                pass
        else:
            # UnicodeEncodeError requires (encoding, object, start, end, reason).
            raise UnicodeEncodeError(
                str(', '.join(encoding_types)), txt, 0, len(txt),
                'Unable to encode and normalize by any combination of '
                'encoding types and normalize forms specified.')
        # An empty result is only an error when the input had content, i.e.
        # every character was dropped by the 'ignore' handler.
        if txt and not txt_str:
            raise UnicodeEncodeError(
                str(', '.join(encoding_types)), txt, 0, len(txt),
                'Encoding combinations resulted on an empty encoding.')
        return txt_str, encoding_used, normalize_used

    @staticmethod
    def to_string(txt):
        """
        Convert the text into an ASCII byte string.

        Byte-string input is decoded first; the text is then normalized and
        encoded as ASCII, dropping characters that have no ASCII form.

        :param txt: Text or byte string.
        :return: Byte string (`str` on Python 2, `bytes` on Python 3).
        :raises UnicodeDecodeError: If a byte-string input cannot be decoded.
        :raises UnicodeEncodeError: If nothing of a non-empty text survives
            the ASCII encoding.
        """
        txt_unicode = txt
        if isinstance(txt, bytes):  # `bytes is str` on Python 2
            txt_unicode, _ = EnDecoder._decode(txt)
        txt_str, _, _ = EnDecoder._encode(txt_unicode, encoding_types=['ascii'])  # ENCODING -> RESULT: STR
        return txt_str

    @staticmethod
    def to_unicode(txt):
        """
        Convert the text into a normalized text string of type `unicode`.

        Byte-string input is decoded first; the text is then normalized,
        encoded, and decoded back with the same encoding.

        :param txt: Text or byte string.
        :return: Text string (`unicode` on Python 2, `str` on Python 3).
        :raises UnicodeDecodeError: If a byte-string input cannot be decoded.
        """
        txt_unicode = txt
        if isinstance(txt, bytes):  # `bytes is str` on Python 2
            txt_unicode, _ = EnDecoder._decode(txt)
        txt_str, encoding_used, _ = EnDecoder._encode(txt_unicode, encoding_types=['utf-8', 'ascii'])
        txt_unicode, _ = EnDecoder._decode(txt_str, encoding_types=[encoding_used])
        return txt_unicode
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
A useful helper for encoding / decoding tasks.
Feel free to copy/modify.