Skip to content

Instantly share code, notes, and snippets.

@Shide
Created January 10, 2019 15:54
Show Gist options
  • Select an option

  • Save Shide/d29efd51703b7ff0d4db909f0c467357 to your computer and use it in GitHub Desktop.

Select an option

Save Shide/d29efd51703b7ff0d4db909f0c467357 to your computer and use it in GitHub Desktop.
Text converter between unicode and str.
class EnDecoder(object):
    """
    Text casting helper between byte strings and unicode text.

    "Byte string" means `str` on Python 2 and `bytes` on Python 3;
    "unicode text" means `unicode` on Python 2 and `str` on Python 3.
    """

    # Encodings tried in order when none are given. [+aggressive .. -aggressive]
    DEFAULT_ENCODING_TYPES = ['ascii', 'utf-8']
    # Normalization forms tried in order when none are given. [-aggressive .. +aggressive]
    DEFAULT_NORMALIZE_FORMS = ['NFKD', 'NFD', 'NFKC', 'NFC']

    @staticmethod
    def _decode(txt, encoding_types=None):
        """
        Decode a byte string with the first of the given encodings that works.

        :param txt: Byte string to decode.
        :param encoding_types: Encoding name or list/tuple of encoding names
            ('ascii', 'utf-8', etc). Defaults to `DEFAULT_ENCODING_TYPES`.
        :return: Tuple `(decoded unicode text, encoding that was used)`.
        :raises UnicodeDecodeError: If none of the encodings can decode `txt`.
            (On Python 2 a unicode input may fail in the implicit encode step,
            in which case a `UnicodeEncodeError` is raised instead.)
        """
        encoding_types = encoding_types or EnDecoder.DEFAULT_ENCODING_TYPES
        if not isinstance(encoding_types, (list, tuple)):
            encoding_types = [encoding_types]

        last_error = None
        for enc in encoding_types:
            try:
                return txt.decode(enc), enc
            except (UnicodeDecodeError, UnicodeEncodeError) as err:
                last_error = err

        # Unicode errors need (encoding, object, start, end, reason); building
        # them from a bare message would raise TypeError instead. Reuse the
        # details of the last failure and attach the summary as the reason.
        raise type(last_error)(
            last_error.encoding, last_error.object,
            last_error.start, last_error.end,
            'Unable to decode the text by none of the encoding types specified.')

    @staticmethod
    def _encode(txt, encoding_types=None, normalize_forms=None):
        """
        Normalize and encode unicode text.

        Combinations of encoding and normalization form are tried in order
        (encodings in the outer loop) and the first one that succeeds is used.
        Characters the encoding cannot represent are dropped ('ignore').

        :param txt: Unicode text.
        :param encoding_types: Encoding name or list/tuple of encoding names.
            Defaults to `DEFAULT_ENCODING_TYPES`.
        :param normalize_forms: Normalization form or list/tuple of forms.
            Defaults to `DEFAULT_NORMALIZE_FORMS`.
        :return: Tuple `(byte string, encoding used, normalization form used)`.
        :raises UnicodeEncodeError: If no combination can encode the text, or
            if the encoded result is empty.
        """
        # Imported here so the class works even if the module does not
        # import these at file level.
        import itertools
        import unicodedata

        encoding_types = encoding_types or EnDecoder.DEFAULT_ENCODING_TYPES
        normalize_forms = normalize_forms or EnDecoder.DEFAULT_NORMALIZE_FORMS
        if not isinstance(encoding_types, (list, tuple)):
            encoding_types = [encoding_types]
        if not isinstance(normalize_forms, (list, tuple)):
            normalize_forms = [normalize_forms]

        last_error = None
        for encoding_t, normalize_f in itertools.product(encoding_types, normalize_forms):
            try:
                txt_str = unicodedata.normalize(normalize_f, txt).encode(encoding_t, 'ignore')
            except UnicodeEncodeError as err:
                last_error = err
                continue
            if not txt_str:
                # Every character was dropped by 'ignore' (or the input was
                # empty): report it rather than return an empty byte string.
                raise UnicodeEncodeError(
                    str(encoding_t), txt, 0, len(txt),
                    'Encoding combinations resulted on an empty encoding.')
            return txt_str, encoding_t, normalize_f

        raise UnicodeEncodeError(
            last_error.encoding, last_error.object,
            last_error.start, last_error.end,
            'Unable to encode and normalize by any combination of '
            'encoding types and normalize forms specified.')

    @staticmethod
    def to_string(txt):
        """
        Convert the text into an ASCII byte string.

        Byte-string input is decoded first; the text is then normalized and
        encoded as ASCII, dropping whatever ASCII cannot represent.

        :param txt: Byte string or unicode text.
        :return: ASCII byte string (`str` on Python 2, `bytes` on Python 3).
        :raises UnicodeDecodeError: If a byte-string input cannot be decoded.
        :raises UnicodeEncodeError: If the ASCII result is empty.
        """
        txt_unicode = txt
        # `bytes` is `str` on Python 2, so this check is valid on both versions.
        if isinstance(txt, bytes):
            txt_unicode, _ = EnDecoder._decode(txt)
        txt_str, _, _ = EnDecoder._encode(txt_unicode, encoding_types=['ascii'])
        return txt_str

    @staticmethod
    def to_unicode(txt):
        """
        Convert the text into normalized unicode text.

        Byte-string input is decoded first; the text is then normalized,
        encoded and decoded again with the same encoding.

        :param txt: Byte string or unicode text.
        :return: Unicode text (`unicode` on Python 2, `str` on Python 3).
        :raises UnicodeDecodeError: If a byte-string input cannot be decoded.
        :raises UnicodeEncodeError: If the encoded intermediate is empty.
        """
        txt_unicode = txt
        # `bytes` is `str` on Python 2, so this check is valid on both versions.
        if isinstance(txt, bytes):
            txt_unicode, _ = EnDecoder._decode(txt)
        txt_str, encoding_used, _ = EnDecoder._encode(txt_unicode, encoding_types=['utf-8', 'ascii'])
        txt_unicode, _ = EnDecoder._decode(txt_str, encoding_types=[encoding_used])
        return txt_unicode
@Shide
Copy link
Author

Shide commented Jan 10, 2019

A useful helper for encoding / decoding tasks.

Feel free to copy/modify.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment