Last active
November 24, 2020 20:35
-
-
Save seanhuggins1/f6a9d30d40379d93f7b93f6c80e5de05 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| def testBit(int_type, offset): | |
| mask = 1 << offset | |
| return(int_type & mask) | |
def checkUTF(arr):
    """Return True if ``arr`` is a structurally well-formed UTF-8 sequence.

    Each element of ``arr`` is treated as one byte value. A character is
    accepted when its lead byte matches one of the patterns below and is
    followed by the required number of ``10xxxxxx`` continuation bytes:

        0xxxxxxx                              (1 byte)
        110xxxxx 10xxxxxx                     (2 bytes)
        1110xxxx 10xxxxxx 10xxxxxx            (3 bytes)
        11110xxx 10xxxxxx 10xxxxxx 10xxxxxx   (4 bytes)

    Only these bit patterns are checked; overlong encodings, surrogate
    code points and values above U+10FFFF are not rejected.

    Args:
        arr: Indexable sequence of integers (byte values).

    Returns:
        True when every character is well formed (an empty sequence is
        valid), False on a stray continuation byte, an invalid lead byte,
        a truncated character, or a malformed continuation byte.
    """
    length = len(arr)
    i = 0
    while i < length:
        lead = arr[i]

        # Top bit clear: a complete one-byte character.
        if not (lead & 0x80):
            i += 1
            continue

        # Classify the lead byte by its run of leading one-bits.
        if (lead & 0xE0) == 0xC0:
            trailing = 1
        elif (lead & 0xF0) == 0xE0:
            trailing = 2
        elif (lead & 0xF8) == 0xF0:
            trailing = 3
        else:
            # Either a continuation byte (10xxxxxx) in lead position or a
            # byte with five or more leading ones (11111xxx).
            return False

        for _ in range(trailing):
            i += 1
            if i == length:
                # Input ended in the middle of a multi-byte character.
                return False
            # A continuation byte must have bit 7 set AND bit 6 clear.
            # The previous check tested bit 7 twice, which was always
            # true and rejected every multi-byte character.
            if (arr[i] & 0xC0) != 0x80:
                return False

        # Step past the last byte of this character.
        i += 1
    return True
# Sample input: ASCII text with a stray 0xFF byte (255) right after "Hell".
# Iterating a bytes literal yields the same list of integer byte values.
arr = list(b"Hell\xffo I am a valid UTF-8 string")
print(checkUTF(arr))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment