From e7396120918f70c2fdb4ae80a27d0de2f23f2a46 Mon Sep 17 00:00:00 2001 From: cyzhang Date: Thu, 30 Dec 2021 11:42:02 +0800 Subject: [PATCH 1/2] delete 'utf_helper' and integrate into 'utf' Signed-off-by: cyzhang --- libpandabase/tests/utf_test.cpp | 11 ++++++ libpandabase/utils/utf.cpp | 63 +++++++++++++++++++++++++-------- libpandabase/utils/utf.h | 32 ++++++++++++++--- 3 files changed, 88 insertions(+), 18 deletions(-) diff --git a/libpandabase/tests/utf_test.cpp b/libpandabase/tests/utf_test.cpp index 9883664b74..bf7b45a0c1 100644 --- a/libpandabase/tests/utf_test.cpp +++ b/libpandabase/tests/utf_test.cpp @@ -215,6 +215,17 @@ TEST(Utf, ConvertRegionUtf16ToMUtf8) out[out.size() - 1] = '\0'; EXPECT_EQ(out, res); } + + // convert to unmodified mutf-8 + { + const std::vector in {0xdf06, 0x33, 0x0000, 0xd801, 0xdc37}; + const std::vector res {0xed, 0xbc, 0x86, 0x33, 0xf0, 0x90, 0x90, 0xb7, 0x00}; + std::vector out(res.size()); + size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size()-1, 0, false); + EXPECT_EQ(sz, 8); + out[out.size() - 1] = '\0'; + EXPECT_EQ(out, res); + } } TEST(Utf, CompareMUtf8ToMUtf8) diff --git a/libpandabase/utils/utf.cpp b/libpandabase/utils/utf.cpp index 8214beb39b..8d6acb21c7 100644 --- a/libpandabase/utils/utf.cpp +++ b/libpandabase/utils/utf.cpp @@ -94,16 +94,9 @@ std::pair ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t return {pair, CONST_4}; } -static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) -{ - uint32_t codePoint = d0 - HI_SURROGATE_MIN; - codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH); - codePoint |= d1 - LO_SURROGATE_MIN; - codePoint += LO_SUPPLEMENTS_MIN; - return codePoint; -} -constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1) + +constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1, bool modify) { // When the first utf16 code is in 0xd800-0xdfff and the second utf16 code is 0, // it is a single code point, and it needs to be represented by three MUTF8 code. @@ -115,7 +108,10 @@ constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1) } if (d0 == 0) { - return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}}; + if (modify) { + return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}}; + } + return {0, {}}; } if (d0 <= MUTF8_1B_MAX) { return {1, {static_cast(d0)}}; @@ -142,6 +138,43 @@ constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1) return {CONST_4, {ch0, ch1, ch2, ch3}}; } +bool IsValidUTF8(const std::vector &data) +{ + uint32_t length = data.size(); + switch (length) { + case 1: + if (data.at(0) >= MASK1) { + return false; + } + break; + case CONST_2: + if ((data.at(0) & MUTF8_3B_FIRST) != MUTF8_2B_FIRST) { + return false; + } + break; + case CONST_3: + if ((data.at(0) & MUTF8_4B_FIRST) != MUTF8_3B_FIRST) { + return false; + } + break; + case CONST_4: + if ((data.at(0) & MUTF8_5B_FIRST) != MUTF8_4B_FIRST) { + return false; + } + break; + default: + UNREACHABLE(); + break; + } + + for (uint32_t i = 1; i < length; i++) { + if ((data.at(i) & MUTF8_2B_FIRST) != MASK1) { + return false; + } + } + return true; +} + bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in) { while (*mutf8_in != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) @@ -154,7 +187,7 @@ bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in) } size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, - size_t start) + size_t start, bool modify) { size_t mutf8_pos = 0; if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) { @@ -165,7 +198,7 @@ size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, s // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) - MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code); + MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code, modify); if (mutf8_pos + ch.n > mutf8_len) { break; } @@ -348,7 +381,7 @@ size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len) return res; } -size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length) +size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length, bool modify) { size_t res = 1; // zero byte // When the utf16 data length is only 1 and the code is in 0xd800-0xdfff, @@ -362,7 +395,9 @@ size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length) for (uint32_t i = 0; i < length; ++i) { // NOLINTNEXTLINE(bugprone-branch-clone) if (mutf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) - res += CONST_2; // special case for U+0000 => C0 80 + if (modify) { + res += CONST_2; // special case for U+0000 => C0 80 + } } else if (mutf16[i] <= MUTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) res += 1; } else if (mutf16[i] <= MUTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) diff --git a/libpandabase/utils/utf.h b/libpandabase/utils/utf.h index d60d2c2d02..0f1eac3915 100644 --- a/libpandabase/utils/utf.h +++ b/libpandabase/utils/utf.h @@ -67,7 +67,14 @@ constexpr uint8_t MUTF8_3B_THIRD = 0x80; constexpr uint8_t MUTF8_4B_FIRST = 0xf0; -std::pair ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = 4); +constexpr uint8_t MUTF8_5B_FIRST = 0xf8; // Illegal UTF-8 bits, add for invalid UTF-8 check. + +constexpr uint8_t MUTF8_MAX_COUNTS = 4; +constexpr uint8_t UTF16_MAX_COUNTS = 2; + +std::pair ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = MUTF8_MAX_COUNTS); + +bool IsValidUTF8(const std::vector &data); bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in); @@ -77,7 +84,7 @@ size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, s size_t start); size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, - size_t start); + size_t start, bool modify = true); int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2); @@ -91,7 +98,7 @@ size_t MUtf8ToUtf16Size(const uint8_t *mutf8); size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len); -size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length); +size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length, bool modify = true); size_t Mutf8Size(const uint8_t *mutf8); @@ -131,13 +138,30 @@ struct Mutf8Less { } }; -static inline std::pair SplitUtf16Pair(uint32_t pair) +static inline constexpr std::pair SplitUtf16Pair(uint32_t pair) { constexpr size_t P1_MASK = 0xffff; constexpr size_t P2_SHIFT = 16; return {pair >> P2_SHIFT, pair & P1_MASK}; } +static inline constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) +{ + uint32_t codePoint = d0 - utf::HI_SURROGATE_MIN; + codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH); + codePoint |= d1 - utf::LO_SURROGATE_MIN; + codePoint += utf::LO_SUPPLEMENTS_MIN; + return codePoint; +} + +static inline constexpr uint32_t DecodeUtf16Pair(uint32_t pair) +{ + auto [lead, trail] = utf::SplitUtf16Pair(pair); + uint32_t codePoint = (lead - U16_LEAD) << (PAIR_ELEMENT_WIDTH - DATA_WIDTH); + codePoint |= trail - U16_TAIL; + return codePoint; +} + } // namespace panda::utf #endif // PANDA_LIBPANDABASE_UTILS_UTF_H_ -- Gitee From 49818150b02d6d7bdf52ee0a66e2914c97cb8c2b Mon Sep 17 00:00:00 2001 From: cyzhang Date: Thu, 30 Dec 2021 14:32:41 +0800 Subject: [PATCH 2/2] supplement utf Signed-off-by: cyzhang --- libpandabase/utils/utf.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/libpandabase/utils/utf.cpp b/libpandabase/utils/utf.cpp index 8d6acb21c7..ab3b765be2 100644 --- a/libpandabase/utils/utf.cpp +++ b/libpandabase/utils/utf.cpp @@ -94,8 +94,6 @@ std::pair ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t return {pair, CONST_4}; } - - constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1, bool modify) { // When the first utf16 code is in 0xd800-0xdfff and the second utf16 code is 0, -- Gitee