Skip to content

Instantly share code, notes, and snippets.

@seanhuggins1
Last active November 24, 2020 20:35
Show Gist options
  • Select an option

  • Save seanhuggins1/f6a9d30d40379d93f7b93f6c80e5de05 to your computer and use it in GitHub Desktop.

Select an option

Save seanhuggins1/f6a9d30d40379d93f7b93f6c80e5de05 to your computer and use it in GitHub Desktop.
def testBit(int_type, offset):
mask = 1 << offset
return(int_type & mask)
def checkUTF(arr):
    """Return True if *arr* is a structurally well-formed UTF-8 byte sequence.

    *arr* is an indexable, sliceable sequence of integers (for example a
    ``list`` of ints or a ``bytes`` object). Each element is treated as one
    byte; only its bits 3-7 are inspected.

    The check is purely structural:

    * ``0xxxxxxx``            -> single-byte character
    * ``110xxxxx`` + 1 byte   -> two-byte character
    * ``1110xxxx`` + 2 bytes  -> three-byte character
    * ``11110xxx`` + 3 bytes  -> four-byte character

    where every following byte must look like ``10xxxxxx``. A stray
    continuation byte, a lead byte of the form ``11111xxx``, or a sequence
    cut short by the end of the input makes the result False. Overlong
    encodings, surrogates and code points above U+10FFFF are NOT rejected.

    An empty sequence is considered valid.
    """
    total = len(arr)
    i = 0
    while i < total:
        lead = arr[i]

        # '0xxxxxxx': plain one-byte character.
        if not lead & 0x80:
            i += 1
            continue

        # '10xxxxxx': a continuation byte cannot start a character.
        if not lead & 0x40:
            return False

        # Count how many continuation bytes the lead byte announces.
        if not lead & 0x20:
            # '110xxxxx'
            trailing = 1
        elif not lead & 0x10:
            # '1110xxxx'
            trailing = 2
        elif not lead & 0x08:
            # '11110xxx'
            trailing = 3
        else:
            # '11111xxx' is never a valid lead byte.
            return False

        end = i + 1 + trailing
        if end > total:
            # Input ends before the announced continuation bytes arrive.
            return False

        # Every continuation byte must have its top two bits equal to '10'.
        # (The original tested bit 7 twice, which rejected every multi-byte
        # character; the second test has to look at bit 6.)
        for follow in arr[i + 1:end]:
            if (follow & 0xC0) != 0x80:
                return False

        i = end
    return True
# Demo input: the ASCII text "Hello I am a valid UTF-8 string" with a stray
# 0xFF byte inserted after "Hell". 0xFF matches none of the accepted lead-byte
# patterns, so the check below prints False.
arr = list(b"Hell\xffo I am a valid UTF-8 string")
print(checkUTF(arr))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment