Easy-to-understand UTF-8 string class.
More details in the rawsourcecode.io post.
Easy-to-understand UTF-8 string class.
More details in the rawsourcecode.io post.
| #include "Utf8String.h" | |
namespace
{
    /** Length, in bytes, that a leading byte announces for its UTF-8 sequence. */
    enum class EUtf8SequenceSize : uint8_t
    {
        One, Two, Three, Four, Invalid
    };

    /**
     * Classifies a byte as the leading byte of a UTF-8 sequence.
     *
     * @param Utf8Char Byte to classify.
     * @return The sequence length announced by the byte, or Invalid when the byte can never
     *         start a well-formed sequence (continuation bytes 0x80-0xbf, the always-overlong
     *         leads 0xc0/0xc1, and 0xf5-0xff which would encode values above U+10FFFF or the
     *         five/six byte forms that were removed from UTF-8).
     */
    [[nodiscard]] EUtf8SequenceSize SizeOfUtf8Sequence(const unsigned char& Utf8Char)
    {
        if (Utf8Char <= 0x7f) // 0b0111'1111, plain ASCII
        {
            return EUtf8SequenceSize::One;
        }
        if (Utf8Char <= 0xc1) // 0x80-0xbf are continuation bytes, 0xc0/0xc1 can only encode overlong forms
        {
            return EUtf8SequenceSize::Invalid;
        }
        if (Utf8Char <= 0xdf) // 0b1101'1111
        {
            return EUtf8SequenceSize::Two;
        }
        if (Utf8Char <= 0xef) // 0b1110'1111
        {
            return EUtf8SequenceSize::Three;
        }
        if (Utf8Char <= 0xf4) // 0xf4 is the last lead byte that can stay within U+10FFFF
        {
            return EUtf8SequenceSize::Four;
        }
        return EUtf8SequenceSize::Invalid;
    }

    /** True when the byte has the 0b10xx'xxxx shape of a UTF-8 continuation byte (never true for 0). */
    [[nodiscard]] bool IsUtf8ContinuationByte(unsigned char Byte)
    {
        return (Byte & 0b1100'0000) == 0b1000'0000;
    }

    /**
     * Moves the cursor onto the null terminator and reports "end of processing".
     * Leaving the cursor on a non-zero byte after a failure would make loops of the form
     * `while (*Cursor != 0) { NextCodepointFromUtf8Sequence(Cursor); }` spin forever.
     */
    char32_t AbortUtf8Decoding(const unsigned char*& Utf8Sequence)
    {
        while (*Utf8Sequence != 0)
        {
            ++Utf8Sequence;
        }
        return 0;
    }

    /**
     * Decodes the UTF-8 sequence the cursor points at and advances the cursor past it.
     *
     * @param Utf8Sequence Cursor into a null-terminated byte string; updated in place.
     * @return The decoded codepoint, or 0 when the cursor is at the terminator or the sequence is
     *         ill-formed (bad lead byte, missing/invalid continuation byte, overlong encoding,
     *         UTF-16 surrogate, or value above U+10FFFF). After a 0 result the cursor always
     *         points at the null terminator, so decoding cannot be resumed past bad input.
     */
    [[nodiscard]] char32_t NextCodepointFromUtf8Sequence(const unsigned char*& Utf8Sequence)
    {
        const unsigned char FirstByte = *Utf8Sequence;
        if (FirstByte == 0)
        {
            return 0;
        }

        char32_t CodePoint = 0;
        char32_t SmallestAllowed = 0; // Anything below this would be an overlong encoding
        int32_t NumContinuationBytes = 0;
        switch (SizeOfUtf8Sequence(FirstByte))
        {
        case EUtf8SequenceSize::One:
            ++Utf8Sequence; // Point to the start of the next UTF8 sequence
            return FirstByte;
        case EUtf8SequenceSize::Two:
            CodePoint = FirstByte & 0b0001'1111;
            SmallestAllowed = 0x80;
            NumContinuationBytes = 1;
            break;
        case EUtf8SequenceSize::Three:
            CodePoint = FirstByte & 0b0000'1111;
            SmallestAllowed = 0x800;
            NumContinuationBytes = 2;
            break;
        case EUtf8SequenceSize::Four:
            CodePoint = FirstByte & 0b0000'0111;
            SmallestAllowed = 0x1'0000;
            NumContinuationBytes = 3;
            break;
        case EUtf8SequenceSize::Invalid:
        default:
            return AbortUtf8Decoding(Utf8Sequence);
        }

        const unsigned char* Cursor = Utf8Sequence + 1;
        for (int32_t Index = 0; Index < NumContinuationBytes; ++Index, ++Cursor)
        {
            // The terminator is not a continuation byte, so this also stops us reading past the end
            if (!IsUtf8ContinuationByte(*Cursor))
            {
                return AbortUtf8Decoding(Utf8Sequence);
            }
            CodePoint = (CodePoint << 6) | static_cast<char32_t>(*Cursor & 0b0011'1111);
        }

        const bool bIsOverlong = CodePoint < SmallestAllowed;
        const bool bIsSurrogate = CodePoint >= 0xd800 && CodePoint <= 0xdfff;
        const bool bIsOutOfRange = CodePoint > 0x10'ffff;
        if (bIsOverlong || bIsSurrogate || bIsOutOfRange)
        {
            return AbortUtf8Decoding(Utf8Sequence);
        }

        Utf8Sequence = Cursor; // Point to the start of the next UTF8 sequence
        return CodePoint;
    }
}
| Utf8String::Utf8String(const char* Str) | |
| : Data(Str) | |
| {} | |
| int32_t Utf8String::Len() const | |
| { | |
| return static_cast<int32_t>(Data.size()); | |
| } | |
| int32_t Utf8String::CodePointsLen() const | |
| { | |
| if (Len() == 0) | |
| { | |
| return 0; | |
| } | |
| int32_t TotalCodePoints = 0; | |
| const unsigned char* Utf8Str = GetRawData(); | |
| while (NextCodepointFromUtf8Sequence(Utf8Str)) | |
| { | |
| ++TotalCodePoints; | |
| } | |
| return TotalCodePoints; | |
| } | |
| bool Utf8String::IsMultiByte() const | |
| { | |
| const unsigned char* Utf8Str = GetRawData(); | |
| while (*Utf8Str != 0) | |
| { | |
| char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str); | |
| if (UnicodeCodePoint >= 0x1'0000) | |
| { | |
| return true; | |
| } | |
| } | |
| return false; | |
| } | |
| const char* Utf8String::operator*() const | |
| { | |
| return Data.c_str(); | |
| } | |
| std::u32string Utf8String::ToUtf32() const | |
| { | |
| std::u32string Utf32Output; | |
| if (Len() == 0) | |
| { | |
| return Utf32Output; | |
| } | |
| const unsigned char* Utf8Str = GetRawData(); | |
| while (*Utf8Str != 0) | |
| { | |
| char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str); | |
| Utf32Output.push_back(UnicodeCodePoint); | |
| } | |
| return Utf32Output; | |
| } | |
| std::u16string Utf8String::ToUtf16() const | |
| { | |
| // UTF8: https://en.wikipedia.org/wiki/UTF-8 | |
| // UTF16: https://en.wikipedia.org/wiki/UTF-16 | |
| std::u16string Utf16Output; | |
| if (Len() == 0) | |
| { | |
| return Utf16Output; | |
| } | |
| // https://stackoverflow.com/questions/73758747/looking-for-the-description-of-the-algorithm-to-convert-utf8-to-utf16 | |
| const unsigned char* Utf8Str = GetRawData(); | |
| while (*Utf8Str != 0) | |
| { | |
| char32_t UnicodeCodePoint = NextCodepointFromUtf8Sequence(Utf8Str); | |
| if (UnicodeCodePoint < 0x1'0000) // 0b0001'0000'0000'0000'0000 | |
| { | |
| Utf16Output.push_back(UnicodeCodePoint); | |
| } | |
| else | |
| { | |
| UnicodeCodePoint -= 0x1'0000; | |
| char16_t HighSurrogate = 0xd800 + ((UnicodeCodePoint >> 10) & 0x3FF); // 0x3FF == 0b0011'1111'1111 | |
| char16_t LowSurrogate = 0xdc00 + (UnicodeCodePoint & 0x3FF); | |
| Utf16Output.push_back(HighSurrogate); | |
| Utf16Output.push_back(LowSurrogate); | |
| } | |
| } | |
| return Utf16Output; | |
| } | |
| std::wstring Utf8String::ToWide() const | |
| { | |
| std::wstring WideOutput; | |
| if (Len() == 0) | |
| { | |
| return WideOutput; | |
| } | |
| if constexpr (sizeof(wchar_t) == 4) | |
| { | |
| std::u32string Utf32String = ToUtf32(); | |
| WideOutput.reserve(Utf32String.size()); | |
| for (const char32_t& Char : Utf32String) | |
| { | |
| WideOutput.push_back(Char); | |
| } | |
| return WideOutput; | |
| } | |
| else if constexpr (sizeof(wchar_t) == 2) | |
| { | |
| std::u16string Utf16String = ToUtf16(); | |
| WideOutput.reserve(Utf16String.size()); | |
| for (const char16_t& Char : Utf16String) | |
| { | |
| WideOutput.push_back(Char); | |
| } | |
| return WideOutput; | |
| } | |
| else if constexpr (sizeof(wchar_t) == 1) | |
| { | |
| WideOutput.reserve(Data.size()); | |
| for (const char& Char : Data) | |
| { | |
| WideOutput.push_back(Char); | |
| } | |
| return WideOutput; | |
| } | |
| static_assert(sizeof(wchar_t) == 1 || sizeof(wchar_t) == 2 || sizeof(wchar_t) == 4, "Unexpected wchar_t size"); | |
| } | |
| const unsigned char* Utf8String::GetRawData() const | |
| { | |
| return reinterpret_cast<const unsigned char*>(Data.c_str()); | |
| } |
| #pragma once | |
| #include <cstdint> | |
| #include <string> | |
/**
 * UTF-8 encoded string.
 * Stores the bytes in a std::string and offers conversions to UTF-16, UTF-32 and wchar_t strings.
 */
class Utf8String
{
public:
    /** Construct a UTF-8 string from a null-terminated C string */
    explicit Utf8String(const char* Str);

    /** Num of char elements (bytes) in the string, not the number of Unicode codepoints */
    [[nodiscard]] int32_t Len() const;

    /** Num of Unicode codepoints */
    [[nodiscard]] int32_t CodePointsLen() const;

    /** If true, then this string contains codepoints outside the ASCII range i.e. [0, 127] which require multiple bytes to be encoded */
    [[nodiscard]] bool IsMultiByte() const;

    /** Null terminated UTF8 string */
    [[nodiscard]] const char* operator*() const;

    /** Converts to a string of UTF32 or UCS-4, where each element is equivalent to a Unicode codepoint */
    [[nodiscard]] std::u32string ToUtf32() const;

    /** Converts to a string of UTF16 */
    [[nodiscard]] std::u16string ToUtf16() const;

    /**
     * Returns the UTF16 representation of this string if the platform size of wchar_t is 2,
     * Returns the UTF32 representation of this string if the platform size of wchar_t is 4,
     * this is mostly intended for usage in some APIs that require it like Win32,
     * but it's not safe in a cross-platform environment as the size can be different,
     * e.g. 2 bytes in Win, 4 bytes in Unix, so avoid as much as possible unless you know what you're doing.
     */
    [[nodiscard]] std::wstring ToWide() const;

private:
    /** The data as an unsigned char for bitwise manipulation */
    [[nodiscard]] const unsigned char* GetRawData() const;

private:
    /** Owned storage for the UTF-8 bytes */
    std::string Data;
};