Skip to content

Instantly share code, notes, and snippets.

@yzqzss
Last active March 8, 2024 19:40
Show Gist options
  • Select an option

  • Save yzqzss/d1e93357c250a41c85748160e8c572c0 to your computer and use it in GitHub Desktop.

Select an option

Save yzqzss/d1e93357c250a41c85748160e8c572c0 to your computer and use it in GitHub Desktop.
handle UnicodeDecodeError
--- a/wikiteam3/utils/monkey_patch.py
+++ b/wikiteam3/utils/monkey_patch.py
@@ -1,4 +1,5 @@
import ssl
+import sys
import time
from typing import Optional
import warnings
@@ -26,7 +27,30 @@ def mod_requests_text(requests: requests): # type: ignore
else:
content = _self.content
- return content.decode(encoding)
+ try:
+ return content.decode(encoding, errors="strict")
+ except UnicodeDecodeError as e:
+ FFFD_CHAR = u'�'
+ FFFD_TOLERANCE = 0.01 # from CLI arg --requests-text-FFFD-tolerance ?
+ print('UnicodeDecodeError:', e)
+ ignore_text = content.decode(encoding, errors='ignore')
+ FFFDs_in_ignore_text = ignore_text.count(FFFD_CHAR)
+ replace_text = content.decode(encoding, errors='replace')
+ FFFDs_in_replace_text = replace_text.count(FFFD_CHAR)
+
+ bad_FFFDs = FFFDs_in_replace_text - FFFDs_in_ignore_text
+ bad_FFFDs_ratio = bad_FFFDs / len(replace_text)
+
+ if bad_FFFDs_ratio > FFFD_TOLERANCE:
+ print('ERROR: Bad \\ufffd too many', bad_FFFDs_ratio, 'tolerance', FFFD_TOLERANCE, file=sys.stderr)
+ raise e
+
+ warnings.warn(
+ message=f"found bad \\ufffd, but tolerable. {bad_FFFDs} bad FFFDs in {len(replace_text)} chars ({bad_FFFDs_ratio:.5})",
+ category=UserWarning
+ )
+ return replace_text
+
requests.Response.text = property(new_text) # type: ignore
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment