Last active
March 8, 2024 19:40
-
-
Save yzqzss/d1e93357c250a41c85748160e8c572c0 to your computer and use it in GitHub Desktop.
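Handle UnicodeDecodeError in wikiteam3's monkey-patched `requests.Response.text`: decode strictly first, and fall back to `errors="replace"` only when the share of newly introduced U+FFFD replacement characters is within tolerance (1%); otherwise re-raise.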
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- a/wikiteam3/utils/monkey_patch.py | |
| +++ b/wikiteam3/utils/monkey_patch.py | |
| @@ -1,4 +1,5 @@ | |
| import ssl | |
| +import sys | |
| import time | |
| from typing import Optional | |
| import warnings | |
| @@ -26,7 +27,30 @@ def mod_requests_text(requests: requests): # type: ignore | |
| else: | |
| content = _self.content | |
| - return content.decode(encoding) | |
| + try: | |
| + return content.decode(encoding, errors="strict") | |
| + except UnicodeDecodeError as e: | |
| + FFFD_CHAR = u'�' | |
| + FFFD_TOLERANCE = 0.01 # from CLI arg --requests-text-FFFD-tolerance ? | |
| + print('UnicodeDecodeError:', e) | |
| + ignore_text = content.decode(encoding, errors='ignore') | |
| + FFFDs_in_ignore_text = ignore_text.count(FFFD_CHAR) | |
| + replace_text = content.decode(encoding, errors='replace') | |
| + FFFDs_in_replace_text = replace_text.count(FFFD_CHAR) | |
| + | |
| + bad_FFFDs = FFFDs_in_replace_text - FFFDs_in_ignore_text | |
| + bad_FFFDs_ratio = bad_FFFDs / len(replace_text) | |
| + | |
| + if bad_FFFDs_ratio > FFFD_TOLERANCE: | |
| + print('ERROR: Bad \\ufffd too many', bad_FFFDs_ratio, 'tolerance', FFFD_TOLERANCE, file=sys.stderr) | |
| + raise e | |
| + | |
| + warnings.warn( | |
| + message=f"found bad \\ufffd, but tolerable. {bad_FFFDs} bad FFFDs in {len(replace_text)} chars ({bad_FFFDs_ratio:.5})", | |
| + category=UserWarning | |
| + ) | |
| + return replace_text | |
| + | |
| requests.Response.text = property(new_text) # type: ignore |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment