From 847a1207f07d0ab0e6ade7e996039fc213bd0ffc Mon Sep 17 00:00:00 2001 From: c00513733 Date: Thu, 23 Sep 2021 10:51:27 +0800 Subject: [PATCH] fixed cf6fea9 from https://gitee.com/chenqy930/ark_runtime_core/pulls/16 Encode StringItem with ASCII tag in length Signed-off-by: c00513733 --- libpandafile/file-inl.h | 4 ++- libpandafile/file.h | 7 ++-- libpandafile/file_items.cpp | 19 ++++++++-- libpandafile/file_items.h | 1 + libpandafile/literal_data_accessor-inl.h | 3 +- runtime/coretypes/string.cpp | 44 +++++++++++++++++++----- runtime/include/coretypes/string.h | 8 +++++ runtime/string_table.cpp | 34 +++++++++--------- runtime/string_table.h | 7 ++-- 9 files changed, 93 insertions(+), 34 deletions(-) diff --git a/libpandafile/file-inl.h b/libpandafile/file-inl.h index 6727055bfc..64211fd5ef 100644 --- a/libpandafile/file-inl.h +++ b/libpandafile/file-inl.h @@ -27,7 +27,9 @@ inline File::StringData File::GetStringData(EntityId id) const StringData str_data {}; auto sp = GetSpanFromId(id); - str_data.utf16_length = panda_file::helpers::ReadULeb128(&sp); + auto tag_utf16_length = panda_file::helpers::ReadULeb128(&sp); + str_data.utf16_length = tag_utf16_length >> 1U; + str_data.is_ascii = static_cast(tag_utf16_length & 1U); str_data.data = sp.data(); return str_data; diff --git a/libpandafile/file.h b/libpandafile/file.h index 50a25740bd..cfa15a734c 100644 --- a/libpandafile/file.h +++ b/libpandafile/file.h @@ -72,8 +72,11 @@ public: }; struct StringData { - uint32_t utf16_length; - const uint8_t *data; + StringData(uint32_t len, const uint8_t *d) : utf16_length(len), data(d), is_ascii(false) {} + StringData() = default; + uint32_t utf16_length; // NOLINT(misc-non-private-member-variables-in-classes) + const uint8_t *data; // NOLINT(misc-non-private-member-variables-in-classes) + bool is_ascii; // NOLINT(misc-non-private-member-variables-in-classes) }; // NOLINTNEXTLINE(cppcoreguidelines-special-member-functions, hicpp-special-member-functions) diff --git a/libpandafile/file_items.cpp b/libpandafile/file_items.cpp index cbc8cf634b..7f3e3b5604 100644 --- a/libpandafile/file_items.cpp +++ b/libpandafile/file_items.cpp @@ -18,6 +18,7 @@ #include "macros.h" #include "utils/bit_utils.h" #include "utils/leb128.h" +#include "utils/utf.h" #include #include @@ -78,18 +79,32 @@ StringItem::StringItem(std::string str) : str_(std::move(str)) { str_.push_back(0); utf16_length_ = utf::MUtf8ToUtf16Size(utf::CStringAsMutf8(str_.data())); + is_ascii_ = 1; + + for (auto c : str_) { + if (static_cast(c) > utf::MUTF8_1B_MAX) { + is_ascii_ = 0; + break; + } + } } size_t StringItem::CalculateSize() const { - return leb128::UnsignedEncodingSize(utf16_length_) + str_.size(); + return leb128::UnsignedEncodingSize((utf16_length_ << 1U) | is_ascii_) + str_.size(); } bool StringItem::Write(Writer *writer) { ASSERT(GetOffset() == writer->GetOffset()); - if (!writer->WriteUleb128(utf16_length_)) { + constexpr size_t max_string_length = 0x7fffffffU; + if (utf16_length_ > max_string_length) { + LOG(ERROR, PANDAFILE) << "Writing StringItem with size greater than 0x7fffffffU is not supported!"; + return false; + } + + if (!writer->WriteUleb128((utf16_length_ << 1U) | is_ascii_)) { return false; } diff --git a/libpandafile/file_items.h b/libpandafile/file_items.h index 077d7388d7..27a09082b6 100644 --- a/libpandafile/file_items.h +++ b/libpandafile/file_items.h @@ -363,6 +363,7 @@ public: private: std::string str_; size_t utf16_length_ {0}; + size_t is_ascii_ {0}; }; class AnnotationItem; diff --git a/libpandafile/literal_data_accessor-inl.h b/libpandafile/literal_data_accessor-inl.h index e79d8d6faf..d683f872ca 100644 --- a/libpandafile/literal_data_accessor-inl.h +++ b/libpandafile/literal_data_accessor-inl.h @@ -55,8 +55,7 @@ inline void LiteralDataAccessor::EnumerateLiteralVals(File::EntityId id, const C value = static_cast(helpers::Read(&sp)); break; case LiteralTag::STRING: { - auto offset = static_cast(helpers::Read(&sp)); - value = panda_file_.GetStringData(File::EntityId(offset)); + value = static_cast(helpers::Read(&sp)); break; } case LiteralTag::METHOD: diff --git a/runtime/coretypes/string.cpp b/runtime/coretypes/string.cpp index 17efb3b34b..d99b3032d6 100644 --- a/runtime/coretypes/string.cpp +++ b/runtime/coretypes/string.cpp @@ -71,9 +71,8 @@ String *String::CreateFromString(String *str, LanguageContext ctx, PandaVM *vm) /* static */ String *String::CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length, - LanguageContext ctx, PandaVM *vm, bool movable) + bool can_be_compressed, LanguageContext ctx, PandaVM *vm, bool movable) { - bool can_be_compressed = compressed_strings_enabled ? utf::IsMUtf8OnlySingleBytes(mutf8_data) : false; auto string = AllocStringObject(utf16_length, can_be_compressed, ctx, vm, movable); if (string == nullptr) { return nullptr; @@ -101,7 +100,15 @@ String *String::CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, String *String::CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm, bool movable) { - return CreateFromMUtf8(mutf8_data, utf::Mutf8Size(mutf8_data), utf16_length, ctx, vm, movable); + bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data); + return CreateFromMUtf8(mutf8_data, utf::Mutf8Size(mutf8_data), utf16_length, can_be_compressed, ctx, vm, movable); +} + +/* static */ +String *String::CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed, + LanguageContext ctx, PandaVM *vm, bool movable) +{ + return CreateFromMUtf8(mutf8_data, utf::Mutf8Size(mutf8_data), utf16_length, can_be_compressed, ctx, vm, movable); } /* static */ @@ -109,7 +116,8 @@ String *String::CreateFromMUtf8(const uint8_t *mutf8_data, LanguageContext ctx, { size_t mutf8_length = utf::Mutf8Size(mutf8_data); size_t utf16_length = utf::MUtf8ToUtf16Size(mutf8_data, mutf8_length); - return CreateFromMUtf8(mutf8_data, mutf8_length, utf16_length, ctx, vm, movable); + bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data); + return CreateFromMUtf8(mutf8_data, mutf8_length, utf16_length, can_be_compressed, ctx, vm, movable); } /* static */ @@ -397,6 +405,12 @@ bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_leng return is_compressed; } +/* static */ +bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data) +{ + return compressed_strings_enabled ? utf::IsMUtf8OnlySingleBytes(mutf8_data) : false; +} + /* static */ bool String::CanBeCompressedUtf16(const uint16_t *utf16_data, uint32_t utf16_length, uint16_t non) { @@ -451,18 +465,27 @@ bool String::StringsAreEqual(String *str1, String *str2) /* static */ bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length) +{ + if (str1->GetLength() != utf16_length) { + return false; + } + return StringsAreEqualMUtf8(str1, mutf8_data, utf16_length, CanBeCompressedMUtf8(mutf8_data)); +} + +/* static */ +bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length, + bool can_be_compressed) { bool result = true; if (str1->GetLength() != utf16_length) { result = false; } else { bool str1_can_be_compressed = !str1->IsUtf16(); - bool data2_can_be_compressed = compressed_strings_enabled ? utf::IsMUtf8OnlySingleBytes(mutf8_data) : false; - if (str1_can_be_compressed != data2_can_be_compressed) { + if (str1_can_be_compressed != can_be_compressed) { return false; } - ASSERT(str1_can_be_compressed == data2_can_be_compressed); + ASSERT(str1_can_be_compressed == can_be_compressed); if (str1_can_be_compressed) { Span data1(str1->GetDataMUtf8(), str1->GetLength()); Span data2(mutf8_data, utf16_length); @@ -608,7 +631,12 @@ uint32_t String::ComputeHashcode() /* static */ uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length) { - bool can_be_compressed = compressed_strings_enabled ? utf::IsMUtf8OnlySingleBytes(mutf8_data) : false; + return ComputeHashcodeMutf8(mutf8_data, utf16_length, CanBeCompressedMUtf8(mutf8_data)); +} + +/* static */ +uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed) +{ uint32_t hash; if (can_be_compressed) { hash = ComputeHashForMutf8(mutf8_data); diff --git a/runtime/include/coretypes/string.h b/runtime/include/coretypes/string.h index 5d673d931a..6168fabb58 100644 --- a/runtime/include/coretypes/string.h +++ b/runtime/include/coretypes/string.h @@ -37,6 +37,9 @@ public: } static String *CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length, + bool can_be_compressed, LanguageContext ctx, PandaVM *vm, bool movable = true); + + static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed, LanguageContext ctx, PandaVM *vm, bool movable = true); static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm, @@ -235,11 +238,14 @@ public: * Compares strings by bytes. It doesn't check canonical unicode equivalence. */ static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length); + static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length, + bool can_be_compressed); /** * Compares strings by bytes. It doesn't check canonical unicode equivalence. */ static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16_data, uint32_t utf16_data_length); static String *DoReplace(String *src, uint16_t old_c, uint16_t new_c, LanguageContext ctx, PandaVM *vm); + static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length, bool can_be_compressed); static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length); static uint32_t ComputeHashcodeUtf16(uint16_t *utf16_data, uint32_t length); @@ -256,6 +262,8 @@ public: static String *FastSubString(String *src, uint32_t start, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm = nullptr); + static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data); + protected: void SetLength(uint32_t length, bool compressed = false) { diff --git a/runtime/string_table.cpp b/runtime/string_table.cpp index 7f52166fbd..6098d369cf 100644 --- a/runtime/string_table.cpp +++ b/runtime/string_table.cpp @@ -24,9 +24,10 @@ namespace panda { coretypes::String *StringTable::GetOrInternString(const uint8_t *mutf8_data, uint32_t utf16_length, LanguageContext ctx) { - auto *str = internal_table_.GetString(mutf8_data, utf16_length, ctx); + bool can_be_compressed = coretypes::String::CanBeCompressedMUtf8(mutf8_data); + auto *str = internal_table_.GetString(mutf8_data, utf16_length, can_be_compressed, ctx); if (str == nullptr) { - str = table_.GetOrInternString(mutf8_data, utf16_length, ctx); + str = table_.GetOrInternString(mutf8_data, utf16_length, can_be_compressed, ctx); } return str; } @@ -54,7 +55,7 @@ coretypes::String *StringTable::GetOrInternInternalString(const panda_file::File LanguageContext ctx) { auto data = pf.GetStringData(id); - coretypes::String *str = table_.GetString(data.data, data.utf16_length, ctx); + coretypes::String *str = table_.GetString(data.data, data.utf16_length, data.is_ascii, ctx); if (str != nullptr) { return str; } @@ -77,13 +78,13 @@ size_t StringTable::Size() } coretypes::String *StringTable::Table::GetString(const uint8_t *utf8_data, uint32_t utf16_length, - [[maybe_unused]] LanguageContext ctx) + bool can_be_compressed, [[maybe_unused]] LanguageContext ctx) { - uint32_t hash_code = coretypes::String::ComputeHashcodeMutf8(utf8_data, utf16_length); + uint32_t hash_code = coretypes::String::ComputeHashcodeMutf8(utf8_data, utf16_length, can_be_compressed); os::memory::ReadLockHolder holder(table_lock_); for (auto it = table_.find(hash_code); it != table_.end(); it++) { auto found_string = it->second; - if (coretypes::String::StringsAreEqualMUtf8(found_string, utf8_data, utf16_length)) { + if (coretypes::String::StringsAreEqualMUtf8(found_string, utf8_data, utf16_length, can_be_compressed)) { return found_string; } } @@ -140,15 +141,16 @@ coretypes::String *StringTable::Table::InternString(coretypes::String *string, [ } coretypes::String *StringTable::Table::GetOrInternString(const uint8_t *mutf8_data, uint32_t utf16_length, - LanguageContext ctx) + bool can_be_compressed, LanguageContext ctx) { - coretypes::String *result = GetString(mutf8_data, utf16_length, ctx); + coretypes::String *result = GetString(mutf8_data, utf16_length, can_be_compressed, ctx); if (result != nullptr) { return result; } // Even if this string is not inserted, it should get removed during GC - result = coretypes::String::CreateFromMUtf8(mutf8_data, utf16_length, ctx, Runtime::GetCurrent()->GetPandaVM()); + result = coretypes::String::CreateFromMUtf8(mutf8_data, utf16_length, can_be_compressed, ctx, + Runtime::GetCurrent()->GetPandaVM()); result = InternString(result, ctx); @@ -233,15 +235,15 @@ size_t StringTable::Table::Size() } coretypes::String *StringTable::InternalTable::GetOrInternString(const uint8_t *mutf8_data, uint32_t utf16_length, - LanguageContext ctx) + bool can_be_compressed, LanguageContext ctx) { - coretypes::String *result = GetString(mutf8_data, utf16_length, ctx); + coretypes::String *result = GetString(mutf8_data, utf16_length, can_be_compressed, ctx); if (result != nullptr) { return result; } - result = - coretypes::String::CreateFromMUtf8(mutf8_data, utf16_length, ctx, Runtime::GetCurrent()->GetPandaVM(), false); + result = coretypes::String::CreateFromMUtf8(mutf8_data, utf16_length, can_be_compressed, ctx, + Runtime::GetCurrent()->GetPandaVM(), false); return InternStringNonMovable(result, ctx); } @@ -262,12 +264,12 @@ coretypes::String *StringTable::InternalTable::GetOrInternString(const panda_fil panda_file::File::EntityId id, LanguageContext ctx) { auto data = pf.GetStringData(id); - coretypes::String *result = GetString(data.data, data.utf16_length, ctx); + coretypes::String *result = GetString(data.data, data.utf16_length, data.is_ascii, ctx); if (result != nullptr) { return result; } - result = coretypes::String::CreateFromMUtf8(data.data, data.utf16_length, ctx, Runtime::GetCurrent()->GetPandaVM(), - false); + result = coretypes::String::CreateFromMUtf8(data.data, data.utf16_length, data.is_ascii, ctx, + Runtime::GetCurrent()->GetPandaVM(), false); result = InternStringNonMovable(result, ctx); // Update cache. diff --git a/runtime/string_table.h b/runtime/string_table.h index 40a293ce32..76f3d6d572 100644 --- a/runtime/string_table.h +++ b/runtime/string_table.h @@ -69,7 +69,7 @@ protected: virtual ~Table() = default; virtual coretypes::String *GetOrInternString(const uint8_t *mutf8_data, uint32_t utf16_length, - LanguageContext ctx); + bool can_be_compressed, LanguageContext ctx); virtual coretypes::String *GetOrInternString(const uint16_t *utf16_data, uint32_t utf16_length, LanguageContext ctx); coretypes::String *GetOrInternString(coretypes::String *string, LanguageContext ctx); @@ -79,7 +79,8 @@ protected: size_t Size(); - coretypes::String *GetString(const uint8_t *utf8_data, uint32_t utf16_length, LanguageContext ctx); + coretypes::String *GetString(const uint8_t *utf8_data, uint32_t utf16_length, bool can_be_compressed, + LanguageContext ctx); coretypes::String *GetString(const uint16_t *utf16_data, uint32_t utf16_length, LanguageContext ctx); coretypes::String *GetString(coretypes::String *string, LanguageContext ctx); @@ -107,7 +108,7 @@ protected: } ~InternalTable() override = default; - coretypes::String *GetOrInternString(const uint8_t *mutf8_data, uint32_t utf16_length, + coretypes::String *GetOrInternString(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed, LanguageContext ctx) override; coretypes::String *GetOrInternString(const uint16_t *utf16_data, uint32_t utf16_length, -- Gitee