diff --git a/frameworks/libs/distributeddb/sqlite_adapter/CMakeLists.txt b/frameworks/libs/distributeddb/sqlite_adapter/CMakeLists.txt
index 7e3ddb6bf48af19d1b6bafb8827a01f8aa78a2d0..15a3a216b44f2b6cb201a4c005524acb2d055287 100644
--- a/frameworks/libs/distributeddb/sqlite_adapter/CMakeLists.txt
+++ b/frameworks/libs/distributeddb/sqlite_adapter/CMakeLists.txt
@@ -45,3 +45,5 @@ add_library(customtokenizer SHARED ${SOURCE_FILES})
 
 target_link_libraries(customtokenizer PUBLIC sqlite3 securec)
 target_include_directories(customtokenizer PUBLIC include)
+
+install(TARGETS customtokenizer LIBRARY DESTINATION lib)
diff --git a/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_export_type.h b/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_export_type.h
index a8b6961d3e93edd97e32f9480f042ff454997fcd..eccbc88dcdca5c2f08bea33404c82ddf3477b112 100644
--- a/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_export_type.h
+++ b/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_export_type.h
@@ -72,6 +72,7 @@ typedef struct GRD_TokenizerParam {
 typedef struct GRD_CutOption {
     bool needPreProcess;
     GRD_CutSceneE cutScene;
+    bool toLowerCase;
 } GRD_CutOptionT;
 
 #define EXTRACT_USE_POS_WEIGHT 1
diff --git a/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_sqlite.cpp b/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_sqlite.cpp
index 998766b78d211c5229a2d3fe4011fe9dacb7f085..0397fee8574806b468c28c60d330d260d345f6c7 100644
--- a/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_sqlite.cpp
+++ b/frameworks/libs/distributeddb/sqlite_adapter/src/tokenizer_sqlite.cpp
@@ -30,16 +30,91 @@ using namespace CNTokenizer;
 typedef struct Fts5TokenizerParam {
     uint32_t magicCode = 0;
     GRD_CutScene cutScene = DEFAULT;
+    bool caseSensitive = true;
 } Fts5TokenizerParamT;
 
 static std::mutex g_mtx;
 static uint32_t g_refCount = 0;
 constexpr int FTS5_MAX_VERSION = 2;
-constexpr int CUSTOM_TOKENIZER_PARAM_NUM = 2;
 constexpr int MAGIC_CODE = 0x12345678;
 constexpr const char *CUT_SCENE_PARAM_NAME = "cut_mode";
 constexpr const char *CUT_SCENE_SHORT_WORDS = "short_words";
 constexpr const char *CUT_SCENE_DEFAULT = "default";
+constexpr const char *CUT_CASE_SENSITIVE = "case_sensitive";
+
+/**
+ * Translate the value of the "cut_mode" option into para->cutScene.
+ * "short_words" selects SEARCH and "default" selects DEFAULT; any other value
+ * is logged and rejected with SQLITE_ERROR.
+ */
+static int AnalyzeCutMode(const std::string &value, Fts5TokenizerParamT *para)
+{
+    if (value == CUT_SCENE_SHORT_WORDS) {
+        para->cutScene = SEARCH;
+    } else if (value == CUT_SCENE_DEFAULT) {
+        para->cutScene = DEFAULT;
+    } else {
+        sqlite3_log(SQLITE_ERROR, "invalid arg value of cut scene");
+        return SQLITE_ERROR;
+    }
+    return SQLITE_OK;
+}
+
+/**
+ * Translate the value of the "case_sensitive" option into para->caseSensitive.
+ * Only "1" (true) and "0" (false) are accepted; any other value is logged and
+ * rejected with SQLITE_ERROR.
+ */
+static int AnalyzeCaseSensitive(const std::string &value, Fts5TokenizerParamT *para)
+{
+    if (value == "1") {
+        para->caseSensitive = true;
+    } else if (value == "0") {
+        para->caseSensitive = false;
+    } else {
+        sqlite3_log(SQLITE_ERROR, "invalid arg value of case sensitive");
+        return SQLITE_ERROR;
+    }
+    return SQLITE_OK;
+}
+
+/**
+ * Parse the tokenizer arguments, which must form key/value pairs, into para.
+ * An empty list keeps the defaults. An odd argument count, a null entry, an
+ * unknown key or an invalid value is logged and rejected with SQLITE_ERROR.
+ */
+static int ParseArgs(const char **azArg, int nArg, Fts5TokenizerParamT *para)
+{
+    if (nArg == 0) {
+        return SQLITE_OK;
+    }
+    // arguments come as key/value pairs, so the count must be even
+    if (nArg % 2 != 0) {
+        sqlite3_log(SQLITE_ERROR, "|Parse Args| invalid args num %d", nArg);
+        return SQLITE_ERROR; // unpaired argument
+    }
+    int ret = SQLITE_OK;
+    for (int i = 0; i < nArg - 1; i += 2) { // consume one key/value pair per iteration
+        if (azArg[i] == nullptr || azArg[i + 1] == nullptr) {
+            sqlite3_log(SQLITE_ERROR, "|Parse Args| azArg[i] null");
+            return SQLITE_ERROR;
+        }
+        const std::string key(azArg[i]);
+        const std::string value(azArg[i + 1]);
+        if (key == CUT_SCENE_PARAM_NAME) {
+            ret = AnalyzeCutMode(value, para);
+        } else if (key == CUT_CASE_SENSITIVE) {
+            ret = AnalyzeCaseSensitive(value, para);
+        } else {
+            sqlite3_log(SQLITE_ERROR, "invalid key");
+            ret = SQLITE_ERROR;
+        }
+        if (ret != SQLITE_OK) {
+            return ret;
+        }
+    }
+    return SQLITE_OK;
+}
 
 int fts5_customtokenizer_xCreate(void *sqlite3, const char **azArg, int nArg, Fts5Tokenizer **ppOut)
 {
@@ -50,28 +125,11 @@ int fts5_customtokenizer_xCreate(void *sqlite3, const char **azArg, int nArg, Ft
         return SQLITE_ERROR;
     }
     pFts5TokenizerParam->magicCode = MAGIC_CODE;
-    if (nArg != 0 && nArg != CUSTOM_TOKENIZER_PARAM_NUM) {
-        sqlite3_log(SQLITE_ERROR, "invalid args num");
+    int ret = ParseArgs(azArg, nArg, pFts5TokenizerParam);
+    if (ret != SQLITE_OK) {
+        sqlite3_log(ret, "Parse Args wrong");
         delete pFts5TokenizerParam;
-        return SQLITE_ERROR;
-    }
-    if (nArg == CUSTOM_TOKENIZER_PARAM_NUM) {
-        std::string paramKey = std::string(azArg[0]);
-        std::string paramValue = std::string(azArg[1]);
-        if (paramKey != CUT_SCENE_PARAM_NAME) {
-            sqlite3_log(SQLITE_ERROR, "invalid arg name");
-            delete pFts5TokenizerParam;
-            return SQLITE_ERROR;
-        }
-        if (paramValue == CUT_SCENE_SHORT_WORDS) {
-            pFts5TokenizerParam->cutScene = SEARCH;
-        } else if (paramValue == CUT_SCENE_DEFAULT) {
-            pFts5TokenizerParam->cutScene = DEFAULT;
-        } else {
-            sqlite3_log(SQLITE_ERROR, "invalid arg value of cut scene");
-            delete pFts5TokenizerParam;
-            return SQLITE_ERROR;
-        }
+        return ret;
     }
     g_refCount++;
     if (g_refCount != 1) { // 说明已经初始化过了,直接返回
@@ -80,7 +138,7 @@ int fts5_customtokenizer_xCreate(void *sqlite3, const char **azArg, int nArg, Ft
     }
 
     GRD_TokenizerParamT param = {CUT_MMSEG, EXTRACT_TF_IDF};
-    int ret = GRD_TokenizerInit(NULL, NULL, param);
+    ret = GRD_TokenizerInit(NULL, NULL, param);
     if (ret != GRD_OK) {
         sqlite3_log(ret, "GRD_TokenizerInit wrong");
         delete pFts5TokenizerParam;
@@ -125,6 +183,7 @@ int fts5_customtokenizer_xTokenize(
         return GRD_FAILED_MEMORY_ALLOCATE;
     }
     GRD_CutOptionT option = {false, pFts5TokenizerParam->cutScene};
+    option.toLowerCase = !pFts5TokenizerParam->caseSensitive;
     GRD_WordEntryListT *entryList = nullptr;
     int ret = GRD_TokenizerCut(ptr, option, &entryList);
     if (ret != GRD_OK) {
diff --git a/frameworks/libs/distributeddb/test/unittest/common/tokenizer/sqlite_adapter_test.cpp b/frameworks/libs/distributeddb/test/unittest/common/tokenizer/sqlite_adapter_test.cpp
index 05d5935fde9e92ac35b4b986492ea3eb644be504..7f27016448d4416e85a2c7118a14496151078a11 100644
--- a/frameworks/libs/distributeddb/test/unittest/common/tokenizer/sqlite_adapter_test.cpp
+++ b/frameworks/libs/distributeddb/test/unittest/common/tokenizer/sqlite_adapter_test.cpp
@@ -343,4 +343,75 @@ HWTEST_F(SqliteAdapterTest, SqliteAdapterTest004, TestSize.Level0)
     const char *SQLDROP = "DROP TABLE IF EXISTS example;";
     SQLTest(SQLDROP);
     EXPECT_EQ(sqlite3_close(g_sqliteDb), SQLITE_OK);
-}
\ No newline at end of file
+}
+
+/**
+ * @tc.name: SqliteAdapterTest008
+ * @tc.desc: Test case Sensitive
+ * @tc.type: FUNC
+ * @tc.require:
+ * @tc.author: whs
+ */
+HWTEST_F(SqliteAdapterTest, SqliteAdapterTest008, TestSize.Level0)
+{
+    /**
+     * @tc.steps: step1. prepare db
+     * @tc.expected: step1. OK.
+     */
+    // Save any error messages
+    char *zErrMsg = nullptr;
+
+    // Save the connection result
+    int rc = sqlite3_open_v2(g_dbPath, &g_sqliteDb, SQLITE_OPEN_READWRITE | SQLITE_OPEN_CREATE, nullptr);
+    HandleRc(g_sqliteDb, rc);
+
+    rc = sqlite3_db_config(g_sqliteDb, SQLITE_DBCONFIG_ENABLE_LOAD_EXTENSION, 1, nullptr);
+    HandleRc(g_sqliteDb, rc);
+
+    rc = sqlite3_load_extension(g_sqliteDb, "libcustomtokenizer.z.so", nullptr, nullptr);
+    HandleRc(g_sqliteDb, rc);
+    /**
+     * @tc.steps: step2. create table
+     * @tc.expected: step2. OK.
+     */
+    string sql = "CREATE VIRTUAL TABLE example USING fts5(content, tokenize = 'customtokenizer cut_mode short_words case_sensitive 0')";
+    rc = sqlite3_exec(g_sqliteDb, sql.c_str(), Callback, 0, &zErrMsg);
+    HandleRc(g_sqliteDb, rc);
+    /**
+     * @tc.steps: step3. insert records
+     * @tc.expected: step3. OK.
+     */
+    std::vector<std::string> records = {
+        "电子邮件",
+        "这是一封电子邮件",
+        "这是一封关于少数民族的电子邮件",
+        "华中师范大学是一所位于武汉市的全日制综合性师范大学",
+        "中华人民共和国",
+        "武汉市长江大桥Wuhan Yangtze River Bridge是武汉市最长的桥"
+    };
+    for (const auto &record : records) {
+        std::string insertSql = "insert into example values('" + record + "');";
+        SQLTest(insertSql.c_str());
+    }
+    /**
+     * @tc.steps: step4. test cut for short words
+     * @tc.expected: step4. OK.
+     */
+    std::vector<std::pair<std::string, int>> expectResult = {
+        {"电子", 3}, {"邮件", 3}, {"电子邮件", 3}, {"少数", 1}, {"民族", 1}, {"少数民族", 1}, {"华中", 1},
+        {"中师", 1}, {"师范", 1}, {"共和", 1}, {"共和国", 1}, {"人民共和国", 0}, {"Yangtze", 1}, {"Wuhan", 1},
+        {"市长", 0}, {"yangtze", 1}, {"WUHAN", 1}, {"river", 1},
+    };
+    // The shared library is missing on some platforms and loading it fails there; skip the checks in that case
+    if (!g_needSkip) {
+        for (const auto &[word, expectMatchNum] : expectResult) {
+            std::string querySql = "SELECT count(*) FROM example WHERE content MATCH '" + word + "';";
+            EXPECT_EQ(sqlite3_exec(g_sqliteDb, querySql.c_str(), QueryCallback,
+                reinterpret_cast<void *>(expectMatchNum), nullptr), SQLITE_OK);
+        }
+    }
+
+    const char *SQLDROP = "DROP TABLE IF EXISTS example;";
+    SQLTest(SQLDROP);
+    EXPECT_EQ(sqlite3_close(g_sqliteDb), SQLITE_OK);
+}