From f10808289d2403e94e6cb9edd330dbb1476cdb39 Mon Sep 17 00:00:00 2001 From: xuan0126 Date: Tue, 7 Jan 2025 10:52:04 +0800 Subject: [PATCH] Integrating iconv with ICU - translit is supported - ignore is supported - Some encoding capabilities are supported Issue: https://gitee.com/openharmony/third_party_musl/issues/IBHP3E?from=project-issue Signed-off-by: xuan0126 Change-Id: Ic9b9374841d7ff6ab3875475748273dccb4050df --- .../src/functional/iconv_joint_icu_test.c | 377 ++++++++++++++++++ .../src/functional/test_src_functional.gni | 1 + src/internal/locale_impl.c | 68 ++++ src/internal/locale_impl.h | 19 + src/locale/iconv.c | 313 ++++++++++++++- 5 files changed, 771 insertions(+), 7 deletions(-) create mode 100644 libc-test/src/functional/iconv_joint_icu_test.c diff --git a/libc-test/src/functional/iconv_joint_icu_test.c b/libc-test/src/functional/iconv_joint_icu_test.c new file mode 100644 index 000000000..d6e2c36ae --- /dev/null +++ b/libc-test/src/functional/iconv_joint_icu_test.c @@ -0,0 +1,377 @@ +/* + * Copyright (c) 2025 Huawei Device Co., Ltd. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ +#ifndef __LITEOS__ +#include "test.h" +#include +#include +#include +#include +#define BUFFER_SIZE 1024 +#define IGNORE_SIZE 9 + +typedef struct StatefulCombined { + unsigned sign; + const unsigned char* to; + const unsigned char* from; + iconv_t base_cd; + unsigned state; +} StatefulCombined; + +typedef struct NameMap { + const char* source; + const char* target; +} NameMap; + + +// Global Variables +static NameMap g_mappings[] = { + {"utf8\0char\0\0", "UTF-8"}, + {"utf7\0\0", "UTF-7"}, + {"ucs2\0utf16\0ucs2be\0utf16be\0\0", "UTF-16BE"}, + {"ucs2le\0utf16le\0\0", "UTF-16LE"}, + {"ucs4\0utf32\0ucs4be\0utf32be\0\0", "UTF-32BE"}, + {"wchart\0ucs4le\0utf32le\0\0", "UTF-32LE"}, + {"ascii\0usascii\0""20127\0iso646\0iso646us\0\0", "US-ASCII"}, + {"eucjp\0eucjp2007\0\0", "euc-jp-2007"}, + {"shiftjis\0sjis\0cp932\0ibm943p15a2003\0\0", "ibm-943_P15A-2003"}, + {"gb18030\0\0", "gb18030"}, + {"gbk\0""54936\0windows9362000\0\0", "windows-936-2000"}, + {"gb2312\0""52936\0ibm1383p1101999\0\0", "ibm-1383_P110-1999"}, + {"big5\0""950\0bigfive\0cp950\0windows9502000\0\0", "windows-950-2000"}, + {"big5hk\0big5hkscs\0""951\0ibm1375p1002008\0\0", "ibm-1375_P100-2008"}, + {"euckr\0ibm970p110p1102006u2\0\0", "ibm-970_P110_P110-2006_U2"}, + {"ksc5601\0ksx1001\0cp949\0windows9492000\0\0", "windows-949-2000"}, + {"iso88591\0latin1\0\0", "ISO-8859-1"}, + {"iso88592\0ibm912p1001995\0\0", "ibm-912_P100-1995"}, + {"iso88593\0ibm913p1002000\0\0", "ibm-913_P100-2000"}, + {"iso88594\0ibm914p1001995\0\0", "ibm-914_P100-1995"}, + {"iso88595\0ibm915p1001995\0\0", "ibm-915_P100-1995"}, + {"iso88596\0ibm1089p1001995\0\0", "ibm-1089_P100-1995"}, + {"iso88597\0ibm9005x1102007\0\0", "ibm-9005_X110-2007"}, + 
{"iso88598\0ibm5012p1001999\0\0", "ibm-5012_P100-1999"}, + {"iso88599\0ibm920p1001995\0\0", "ibm-920_P100-1995"}, + {"iso885910\0iso8859101998\0\0", "iso-8859_10-1998"}, + {"iso885911\0iso8859112001\0\0", "iso-8859_11-2001"}, + {"tis620\0windows8742000\0\0", "windows-874-2000"}, + {"iso885913\0ibm921p1001995\0\0", "ibm-921_P100-1995"}, + {"iso885914\0iso8859141998\0\0", "iso-8859_14-1998"}, + {"iso885915\0latin9\0ibm923p1001998\0\0", "ibm-923_P100-1998"}, + {"cp1250\0windows1250\0ibm5346p1001998\0\0", "ibm-5346_P100-1998"}, + {"cp1251\0windows1251\0ibm5347p1001998\0\0", "ibm-5347_P100-1998"}, + {"cp1252\0windows1252\0ibm5348p1001997\0\0", "ibm-5348_P100-1997"}, + {"cp1253\0windows1253\0ibm5349p1001998\0\0", "ibm-5349_P100-1998"}, + {"cp1254\0windows1254\0ibm5350p1001998\0\0", "ibm-5350_P100-1998"}, + {"cp1255\0windows1255\0ibm9447p1002002\0\0", "ibm-9447_P100-2002"}, + {"cp1256\0windows1256\0ibm9448x1002005\0\0", "ibm-9448_X100-2005"}, + {"cp1257\0windows1257\0ibm9449p1002002\0\0", "ibm-9449_P100-2002"}, + {"cp1258\0windows1258\0ibm5354p1001998\0\0", "ibm-5354_P100-1998"}, + {"koi8r\0ibm878p1001996\0\0", "ibm-878_P100-1996"}, + {"koi8u\0ibm1168p1002002\0\0", "ibm-1168_P100-2002"}, + {"cp437\0ibm437p1001995\0\0", "ibm-437_P100-1995"}, + {"cp850\0ibm850p1001995\0\0", "ibm-850_P100-1995"}, + {"cp866\0ibm866p1001995\0\0", "ibm-866_P100-1995"}, + {"ibm1047\0cp1047\0ibm1047p1001995\0\0", "ibm-1047_P100-1995"}, +}; +char* g_source_str; +char* g_target_str; +char g_mids[BUFFER_SIZE]; +char g_outs[BUFFER_SIZE]; +int32_t g_target_num = sizeof(g_mappings) / sizeof(g_mappings[0]); +char g_ins[] = { + 0x00, 0x20, // ' ' (space) + 0x00, 0x21, // '!' + 0x00, 0x22, // '"' + 0x00, 0x23, // '#' + 0x00, 0x24, // '$' + 0x00, 0x25, // '%' + 0x00, 0x26, // '&' + 0x00, 0x27, // ''' + 0x00, 0x28, // '(' + 0x00, 0x29, // ')' + 0x00, 0x2A, // '*' + 0x00, 0x2B, // '+' + 0x00, 0x2C, // ',' + 0x00, 0x2D, // '-' + 0x00, 0x2E, // '.' 
+ 0x00, 0x2F, // '/' + 0x00, 0x30, // '0' + 0x00, 0x31, // '1' + 0x00, 0x32, // '2' + 0x00, 0x33, // '3' + 0x00, 0x34, // '4' + 0x00, 0x35, // '5' + 0x00, 0x36, // '6' + 0x00, 0x37, // '7' + 0x00, 0x38, // '8' + 0x00, 0x39, // '9' + 0x00, 0x3A, // ':' + 0x00, 0x3B, // ';' + 0x00, 0x3C, // '<' + 0x00, 0x3D, // '=' + 0x00, 0x3E, // '>' + 0x00, 0x3F, // '?' + 0x00, 0x40, // '@' + 0x00, 0x41, // 'A' + 0x00, 0x42, // 'B' + 0x00, 0x43, // 'C' + 0x00, 0x44, // 'D' + 0x00, 0x45, // 'E' + 0x00, 0x46, // 'F' + 0x00, 0x47, // 'G' + 0x00, 0x48, // 'H' + 0x00, 0x49, // 'I' + 0x00, 0x4A, // 'J' + 0x00, 0x4B, // 'K' + 0x00, 0x4C, // 'L' + 0x00, 0x4D, // 'M' + 0x00, 0x4E, // 'N' + 0x00, 0x4F, // 'O' + 0x00, 0x50, // 'P' + 0x00, 0x51, // 'Q' + 0x00, 0x52, // 'R' + 0x00, 0x53, // 'S' + 0x00, 0x54, // 'T' + 0x00, 0x55, // 'U' + 0x00, 0x56, // 'V' + 0x00, 0x57, // 'W' + 0x00, 0x58, // 'X' + 0x00, 0x59, // 'Y' + 0x00, 0x5A, // 'Z' + 0x00, 0x5B, // '[' + 0x00, 0x5C, // '\\' + 0x00, 0x5D, // ']' + 0x00, 0x5E, // '^' + 0x00, 0x5F, // '_' + 0x00, 0x60, // '`' + 0x00, 0x61, // 'a' + 0x00, 0x62, // 'b' + 0x00, 0x63, // 'c' + 0x00, 0x64, // 'd' + 0x00, 0x65, // 'e' + 0x00, 0x66, // 'f' + 0x00, 0x67, // 'g' + 0x00, 0x68, // 'h' + 0x00, 0x69, // 'i' + 0x00, 0x6A, // 'j' + 0x00, 0x6B, // 'k' + 0x00, 0x6C, // 'l' + 0x00, 0x6D, // 'm' + 0x00, 0x6E, // 'n' + 0x00, 0x6F, // 'o' + 0x00, 0x70, // 'p' + 0x00, 0x71, // 'q' + 0x00, 0x72, // 'r' + 0x00, 0x73, // 's' + 0x00, 0x74, // 't' + 0x00, 0x75, // 'u' + 0x00, 0x76, // 'v' + 0x00, 0x77, // 'w' + 0x00, 0x78, // 'x' + 0x00, 0x79, // 'y' + 0x00, 0x7A, // 'z' + 0x00, 0x7B, // '{' + 0x00, 0x7C, // '|' + 0x00, 0x7D, // '}' + 0x00, 0x7E // '~' +}; +size_t g_ins_len = sizeof(g_ins); +char* g_ins_zh = "Big5 编码示例"; +size_t g_ins_zh_len; + +void iconv_close_with_strerror(iconv_t cd) +{ + if (iconv_close(cd)) { + t_error("closed iconv failed, error: %s \n", strerror(errno)); + } +} + +void alias_test(void) +{ + char* s; + StatefulCombined *scd = 0; + for (s = 
g_source_str; *s;) { + scd = (void *)iconv_open(s, s); + if (scd == (iconv_t)-1) { + t_error("iconv opened failed, from: %s, to: %s, error: %s \n", s, s, strerror(errno)); + return; + } + if (strcmp((void*)scd->to, g_target_str) != 0) { + t_error("verify error: [input]%s, [actual_to]%s [expect_to]%s\n", s, scd->to, g_target_str); + } + if (strcmp((void*)scd->from, g_target_str) != 0) { + t_error("verify error: [input]%s, [actual_from]%s [expect_from]%s\n", s, scd->from, g_target_str); + } + s += strlen(s) + 1; + iconv_close_with_strerror(scd); + } +} + +size_t iconv_test(char* from, char* to, + char** inptrptr, size_t* input_len_ptr, char** outptrptr, size_t* output_len_ptr) +{ + iconv_t cd = iconv_open((void*)to, (void*)from); + if (cd == (iconv_t)-1) { + t_error("iconv opened failed, from: %s, to: %s, error: %s \n", from, to, strerror(errno)); + return (size_t)-1; + } + + size_t res = iconv(cd, inptrptr, input_len_ptr, outptrptr, output_len_ptr); + if (res) { + iconv_close_with_strerror(cd); + return res; + } + + iconv_close_with_strerror(cd); + return res; +} + +void iconv_exchange(char* from, char* to, char* ins, size_t ins_len) +{ + char* ins_ptr = ins; + char* mids_ptr = g_mids; + char* outs_ptr = g_outs; + size_t ins_bytes = ins_len; + size_t mids_bytes = BUFFER_SIZE; + size_t outs_bytes = BUFFER_SIZE; + + iconv_test(from, to, &ins_ptr, &ins_bytes, &mids_ptr, &mids_bytes); + + ins_ptr = g_mids; + ins_bytes = BUFFER_SIZE - mids_bytes; + iconv_test(to, from, &ins_ptr, &ins_bytes, &outs_ptr, &outs_bytes); +} + +static int charcmp(char* a, char* b) +{ + for (; *a && *b; a++, b++) { + if (*a != *b) {return 1;} + } + return *a != *b; +} + +void iconv_exchange_with_charcmp(char* from, char* to, char* ins, size_t ins_len) +{ + iconv_exchange(from, to, ins, ins_len); + if (charcmp(ins, g_outs) != 0) { + t_error("compare error [type]%s, [ins] %s, [outs] %s\n", + to, ins, g_outs); + } +} + +void test_to_ignore_skip(void) +{ + char* expect = "Big5 示例"; + 
iconv_exchange("utf8", "big5//IGNORE", g_ins_zh, g_ins_zh_len); + if (strcmp(expect, g_outs) != 0) { + t_error("iconv with ignore error: [ins] %s [outs] %s [expect] %s\n", + g_ins_zh, g_outs, expect); + } +} + +void test_to_translit_skip(void) +{ + char* expect = "Big5 ??示例"; + iconv_exchange("utf8", "big5//TRANSLIT", g_ins_zh, g_ins_zh_len); + if (strcmp(expect, g_outs) != 0) { + t_error("iconv with translit error: [ins] %s [outs] %s [expect] %s\n", + g_ins_zh, g_outs, expect); + } +} + +void test_errno_ilseq(void) +{ + char* ins_ptr = g_ins_zh; + size_t ins_bytes = g_ins_zh_len; + char* outs_ptr = g_outs; + size_t outs_bytes = BUFFER_SIZE; + size_t res = iconv_test("utf8", "big5", &ins_ptr, &ins_bytes, &outs_ptr, &outs_bytes); + if (res != (size_t)-1 || errno != EILSEQ) { + t_error("Failed: [ins] %s [outs] %s [res] %d [errno] %s\n", + g_ins_zh, g_outs, res, strerror(errno)); + } +} + +void test_errno_e2big(void) +{ + char* ins_ptr = g_ins_zh; + size_t ins_bytes = g_ins_zh_len; + char* outs_ptr = g_outs; + size_t outs_bytes = 10; + size_t res = iconv_test("utf8", "utf16", &ins_ptr, &ins_bytes, &outs_ptr, &outs_bytes); + if (res != (size_t)-1 || errno != E2BIG) { + t_error("Failed: [ins] %s [outs] %s [res] %d [errno] %s\n", + g_ins_zh, g_outs, res, strerror(errno)); + } +} + +void test_empty_type(void) +{ + StatefulCombined *scd = 0; + scd = (void *)iconv_open("", ""); + if (strcmp((void*)scd->to, "UTF-8") != 0 || strcmp((void*)scd->from, "UTF-8") != 0) { + t_error("verify error: empty type not default fill utf8"); + } + iconv_close_with_strerror(scd); +} + +int main(void) +{ + g_ins_zh_len = strlen(g_ins_zh); + + for (int i = 0; i < g_target_num; i++) { + g_target_str = (void*)g_mappings[i].target; + g_source_str = (void*)g_mappings[i].source; + + // test all avail alias with iconv_open + alias_test(); + + // test iconv without ignore + iconv_exchange_with_charcmp("utf16", g_target_str, g_ins, g_ins_len); + + // test iconv with ignore (norm input->not skip) + 
char* target_str_with_ignore = malloc(strlen(g_target_str) + IGNORE_SIZE); + strcpy(target_str_with_ignore, g_target_str); + strcat(target_str_with_ignore, "//IGNORE"); + iconv_exchange_with_charcmp("utf16", target_str_with_ignore, g_ins, g_ins_len); + iconv_exchange_with_charcmp("utf16//IGNORE", target_str_with_ignore, g_ins, g_ins_len); + free(target_str_with_ignore); + } + + // test iconv with ignore (special input -> skip) + test_to_ignore_skip(); + + // test iconv with translit (special input -> translit) + test_to_translit_skip(); + + // test basic type (iso885916) + iconv_exchange_with_charcmp("utf16", "iso885916", g_ins, g_ins_len); + + // test errno + test_errno_ilseq(); + test_errno_e2big(); + + // empty type -> fill UTF-8 + test_empty_type(); + + return t_status; +} +#endif \ No newline at end of file diff --git a/libc-test/src/functional/test_src_functional.gni b/libc-test/src/functional/test_src_functional.gni index bc73c84c9..76f36c650 100644 --- a/libc-test/src/functional/test_src_functional.gni +++ b/libc-test/src/functional/test_src_functional.gni @@ -86,6 +86,7 @@ functional_list = [ "renameat2_test", "fopencookie_test", "iconv_test", + "iconv_joint_icu_test", ] if (musl_use_pthread_cancel) { diff --git a/src/internal/locale_impl.c b/src/internal/locale_impl.c index f2bce30ba..595a44339 100644 --- a/src/internal/locale_impl.c +++ b/src/internal/locale_impl.c @@ -27,6 +27,7 @@ static void *g_icuuc_handle = NULL; static void *g_icui18n_handle = NULL; hidden struct icu_opt_func g_icu_opt_func = { NULL }; static int dlopen_fail_flag = 0; +static int icuuc_handle_init_fail = 0; static void *get_icu_handle(icu_so_type type, const char *symbol_name) { @@ -42,6 +43,11 @@ static void *get_icu_handle(icu_so_type type, const char *symbol_name) if (!cur_handle && !dlopen_fail_flag) { cur_handle = dlopen(cur_so, RTLD_LOCAL); + if (type == ICU_UC) { + g_icuuc_handle = cur_handle; + } else { + g_icui18n_handle = cur_handle; + } } if (!cur_handle) { 
dlopen_fail_flag = 1;
@@ -98,4 +104,78 @@ void get_valid_icu_locale_name(const char *name, const char *icu_name, int icu_n
         strncpy(icu_name, name, valid_len);
     }
 }
+
+/*
+ * Resolve the icuuc entry points used for charset conversion (the data
+ * directory setter and the ucnv_* converter functions) and cache them in
+ * g_icu_opt_func. Slots that are already filled are not looked up again.
+ *
+ * Returns true once every slot is filled. The first failed lookup is latched
+ * in icuuc_handle_init_fail, so later calls return false without retrying.
+ */
+bool icuuc_handle_init(void)
+{
+    if (icuuc_handle_init_fail) {
+        return false;
+    }
+
+    // Symbol loading may clobber errno; put the caller's value back on success
+    // instead of zeroing it, which POSIX forbids library functions from doing.
+    const int saved_errno = errno;
+
+    if (!g_icu_opt_func.set_data_directory) {
+        g_icu_opt_func.set_data_directory = get_icu_handle(ICU_UC, ICU_SET_DATA_DIRECTORY_SYMBOL);
+        if (g_icu_opt_func.set_data_directory) {
+            g_icu_opt_func.set_data_directory();
+        } else {
+            icuuc_handle_init_fail = 1;
+            return false;
+        }
+    }
+    if (!g_icu_opt_func.ucnv_open) {
+        get_icu_symbol(ICU_UC, &(g_icu_opt_func.ucnv_open), ICU_UCNV_OPEN_SYMBOL);
+        if (!g_icu_opt_func.ucnv_open) {
+            icuuc_handle_init_fail = 1;
+            return false;
+        }
+    }
+    if (!g_icu_opt_func.ucnv_setToUCallBack) {
+        get_icu_symbol(ICU_UC, &(g_icu_opt_func.ucnv_setToUCallBack), ICU_UCNV_SETTOUCALLBACK_SYMBOL);
+        if (!g_icu_opt_func.ucnv_setToUCallBack) {
+            icuuc_handle_init_fail = 1;
+            return false;
+        }
+    }
+    if (!g_icu_opt_func.ucnv_setFromUCallBack) {
+        get_icu_symbol(ICU_UC, &(g_icu_opt_func.ucnv_setFromUCallBack), ICU_UCNV_SETFROMUCALLBACK_SYMBOL);
+        if (!g_icu_opt_func.ucnv_setFromUCallBack) {
+            icuuc_handle_init_fail = 1;
+            return false;
+        }
+    }
+    if (!g_icu_opt_func.ucnv_toUChars) {
+        get_icu_symbol(ICU_UC, &(g_icu_opt_func.ucnv_toUChars), ICU_UCNV_TOUCHARS_SYMBOL);
+        if (!g_icu_opt_func.ucnv_toUChars) {
+            icuuc_handle_init_fail = 1;
+            return false;
+        }
+    }
+    if (!g_icu_opt_func.ucnv_fromUChars) {
+        get_icu_symbol(ICU_UC, &(g_icu_opt_func.ucnv_fromUChars), ICU_UCNV_FROMUCHARS_SYMBOL);
+        if (!g_icu_opt_func.ucnv_fromUChars) {
+            icuuc_handle_init_fail = 1;
+            return false;
+        }
+    }
+    if (!g_icu_opt_func.ucnv_close) {
+        get_icu_symbol(ICU_UC, &(g_icu_opt_func.ucnv_close), ICU_UCNV_CLOSE_SYMBOL);
+        if (!g_icu_opt_func.ucnv_close) {
+            icuuc_handle_init_fail = 1;
+            return false;
+        }
+    }
+
+    errno = saved_errno;
+    return true;
+}
 #endif
diff --git a/src/internal/locale_impl.h b/src/internal/locale_impl.h
index 
263128d40..3ca993500 100644
--- a/src/internal/locale_impl.h
+++ b/src/internal/locale_impl.h
@@ -43,6 +43,12 @@ hidden char *__gettextdomain(void);
 #define ICU_UNUM_PARSE_DOUBLE_SYMBOL "unum_parseDouble"
 #define ICU_UNUM_GET_SYMBOL_SYMBOL "unum_getSymbol"
 #define ICU_AUSTRNCPY_SYMBOL "u_austrncpy"
+#define ICU_UCNV_OPEN_SYMBOL "ucnv_open"
+#define ICU_UCNV_SETTOUCALLBACK_SYMBOL "ucnv_setToUCallBack"
+#define ICU_UCNV_SETFROMUCALLBACK_SYMBOL "ucnv_setFromUCallBack"
+#define ICU_UCNV_TOUCHARS_SYMBOL "ucnv_toUChars"
+#define ICU_UCNV_FROMUCHARS_SYMBOL "ucnv_fromUChars"
+#define ICU_UCNV_CLOSE_SYMBOL "ucnv_close"
 #ifdef FEATURE_ICU_LOCALE_TMP
 #define ICU_UCHAR_ISALNUM_SYMBOL "u_isalnum"
 #define ICU_UCHAR_ISALPHA_SYMBOL "u_isalpha"
@@ -74,6 +80,7 @@ hidden void get_valid_icu_locale_name(const char *name, const char *icu_name, in
 hidden void *icu_unum_open(char *icu_locale_name, int *cur_status);
 hidden void icu_unum_close(void *fmt);
 hidden double icu_parse_double(void *fmt, u_char *ustr, int32_t *parse_pos, int *cur_status);
+hidden bool icuuc_handle_init(void);
 
 typedef char *(*f_icuuc_get_icu_version)(void);
 typedef void (*f_icuuc_u_set_data_directory)(void);
@@ -84,6 +91,12 @@ typedef void *(*f_icu18n_u_str_from_utf32)(u_char *, int32_t, int32_t *, const w
 typedef double (*f_icu18n_unum_parse_double)(void *, u_char *, int32_t, int32_t *, int *);
 typedef int32_t(*f_icu18n_unum_get_symbol)(const void *, int, u_char *, int32_t, int *);
 typedef char *(*f_icuuc_u_austrncpy)(char *, const u_char *, int32_t);
+typedef void* (*f_ucnv_open)(const char*, int*); /* (converter name, UErrorCode*) -> UConverter* */
+typedef void (*f_ucnv_setToUCallBack)(void*, void*, void*, void*, void*, int*); /* returns void in ICU */
+typedef void (*f_ucnv_setFromUCallBack)(void*, void*, void*, void*, void*, int*); /* returns void in ICU */
+typedef int32_t (*f_ucnv_toUChars)(void*, uint16_t*, int32_t, const char*, int32_t, int*); /* UTF-16 dest, byte src */
+typedef int32_t (*f_ucnv_fromUChars)(void*, char*, int32_t, const uint16_t*, int32_t, int*); /* byte dest, UTF-16 src */
+typedef void (*f_ucnv_close)(void*);
 #ifdef FEATURE_ICU_LOCALE_TMP
 typedef 
int(*f_icu18n_u_isalnum)(int c); typedef int(*f_icu18n_u_isalpha)(int c); @@ -111,6 +124,12 @@ struct icu_opt_func { f_icu18n_unum_parse_double unum_parse_double; f_icu18n_unum_get_symbol unum_get_symbol; f_icuuc_u_austrncpy u_austrncpy; + f_ucnv_open ucnv_open; + f_ucnv_setToUCallBack ucnv_setToUCallBack; + f_ucnv_setFromUCallBack ucnv_setFromUCallBack; + f_ucnv_toUChars ucnv_toUChars; + f_ucnv_fromUChars ucnv_fromUChars; + f_ucnv_close ucnv_close; #ifdef FEATURE_ICU_LOCALE_TMP f_icu18n_u_isalnum u_isalnum; f_icu18n_u_isalpha u_isalpha; diff --git a/src/locale/iconv.c b/src/locale/iconv.c index 175def1c6..8ca83a40d 100644 --- a/src/locale/iconv.c +++ b/src/locale/iconv.c @@ -6,6 +6,11 @@ #include #include #include "locale_impl.h" +#ifndef __LITEOS__ +#ifdef FEATURE_ICU_LOCALE +#include +#endif +#endif #define UTF_32BE 0300 #define UTF_16LE 0301 @@ -27,7 +32,22 @@ #define GB2312 0332 #define BIG5 0340 #define EUC_KR 0350 - +#ifndef __LITEOS__ +#ifdef FEATURE_ICU_LOCALE +#define ICU_ZERO_ERROR 0 +#define ICU_IVALID_CHAR_ERROR 10 +#define ICU_TRUNCATED_CHAR_ERROR 11 +#define ICU_ILLEGAL_CHAR_ERROR 12 +#define ICU_BUFFER_OVERFLOW_ERROR 15 +#define ICU_SKIP_THRESHOLD 2 +#define DEVICE_VERSION_THRESHOLD 16 +#define TYPE_FLAG_POS 1 +#define TO_IGNORE_FLAG_POS 2 +#define FROM_IGNORE_FLAG_POS 3 +#define TO_TRANSLIT_FLAG_POS 4 +#define FROM_TRANSLIT_FLAG_POS 5 +#endif +#endif /* Definitions of charmaps. Each charmap consists of: * 1. Empty-string-terminated list of null-terminated aliases. * 2. Special type code or number of elided quads of entries. 
@@ -59,6 +79,60 @@ static const unsigned char charmaps[] = #include "codepages.h" ; +#ifndef __LITEOS__ +#ifdef FEATURE_ICU_LOCALE +// \0 split alias; \0\0 split name in icu +static const unsigned char icu_name_maps[] = +"utf8\0char\0\0UTF-8\0" +"utf7\0\0UTF-7\0" +"ucs2\0utf16\0ucs2be\0utf16be\0\0UTF-16BE\0" +"ucs2le\0utf16le\0\0UTF-16LE\0" +"ucs4\0utf32\0ucs4be\0utf32be\0\0UTF-32BE\0" +"wchart\0ucs4le\0utf32le\0\0UTF-32LE\0" +"ascii\0usascii\0""20127\0iso646\0iso646us\0\0US-ASCII\0" +"eucjp\0eucjp2007\0\0euc-jp-2007\0" +"shiftjis\0sjis\0cp932\0ibm943p15a2003\0\0ibm-943_P15A-2003\0" +"gb18030\0\0gb18030\0" +"gbk\0""54936\0windows9362000\0\0windows-936-2000\0" +"gb2312\0""52936\0ibm1383p1101999\0\0ibm-1383_P110-1999\0" +"big5\0""950\0bigfive\0cp950\0windows9502000\0\0windows-950-2000\0" +"big5hk\0big5hkscs\0""951\0ibm1375p1002008\0\0ibm-1375_P100-2008\0" +"euckr\0ibm970p110p1102006u2\0\0ibm-970_P110_P110-2006_U2\0" +"ksc5601\0ksx1001\0cp949\0windows9492000\0\0windows-949-2000\0" +"iso88591\0latin1\0\0ISO-8859-1\0" +"iso88592\0ibm912p1001995\0\0ibm-912_P100-1995\0" +"iso88593\0ibm913p1002000\0\0ibm-913_P100-2000\0" +"iso88594\0ibm914p1001995\0\0ibm-914_P100-1995\0" +"iso88595\0ibm915p1001995\0\0ibm-915_P100-1995\0" +"iso88596\0ibm1089p1001995\0\0ibm-1089_P100-1995\0" +"iso88597\0ibm9005x1102007\0\0ibm-9005_X110-2007\0" +"iso88598\0ibm5012p1001999\0\0ibm-5012_P100-1999\0" +"iso88599\0ibm920p1001995\0\0ibm-920_P100-1995\0" +"iso885910\0iso8859101998\0\0iso-8859_10-1998\0" +"iso885911\0iso8859112001\0\0iso-8859_11-2001\0" +"tis620\0windows8742000\0\0windows-874-2000\0" +"iso885913\0ibm921p1001995\0\0ibm-921_P100-1995\0" +"iso885914\0iso8859141998\0\0iso-8859_14-1998\0" +"iso885915\0latin9\0ibm923p1001998\0\0ibm-923_P100-1998\0" +"cp1250\0windows1250\0ibm5346p1001998\0\0ibm-5346_P100-1998\0" +"cp1251\0windows1251\0ibm5347p1001998\0\0ibm-5347_P100-1998\0" +"cp1252\0windows1252\0ibm5348p1001997\0\0ibm-5348_P100-1997\0" 
+"cp1253\0windows1253\0ibm5349p1001998\0\0ibm-5349_P100-1998\0" +"cp1254\0windows1254\0ibm5350p1001998\0\0ibm-5350_P100-1998\0" +"cp1255\0windows1255\0ibm9447p1002002\0\0ibm-9447_P100-2002\0" +"cp1256\0windows1256\0ibm9448x1002005\0\0ibm-9448_X100-2005\0" +"cp1257\0windows1257\0ibm9449p1002002\0\0ibm-9449_P100-2002\0" +"cp1258\0windows1258\0ibm5354p1001998\0\0ibm-5354_P100-1998\0" +"koi8r\0ibm878p1001996\0\0ibm-878_P100-1996\0" +"koi8u\0ibm1168p1002002\0\0ibm-1168_P100-2002\0" +"cp437\0ibm437p1001995\0\0ibm-437_P100-1995\0" +"cp850\0ibm850p1001995\0\0ibm-850_P100-1995\0" +"cp866\0ibm866p1001995\0\0ibm-866_P100-1995\0" +"ibm1047\0cp1047\0ibm1047p1001995\0\0ibm-1047_P100-1995\0" +; +#endif +#endif + /* Table of characters that appear in legacy 8-bit codepages, * limited to 1024 slots (10 bit indices). The first 256 entries * are elided since those characters are obviously all included. */ @@ -117,7 +191,42 @@ static size_t find_charmap(const void *name) return -1; } +#ifndef __LITEOS__ +#ifdef FEATURE_ICU_LOCALE +static const unsigned char* find_icu_map(const void *query_name) +{ + if (!*(char *)query_name) { + query_name = icu_name_maps; + } + + const unsigned char *icu_name = icu_name_maps; + while (*icu_name) { + if (!fuzzycmp(query_name, icu_name)) { + while (*icu_name) { + icu_name += strlen((void *)icu_name) + 1; //find nearly \0\0 + } + return icu_name + 1; + } + icu_name += strlen((void *)icu_name) + 1; // skip \0 + if (!*icu_name) { // skip \0\0 + icu_name++; + while (*icu_name) {icu_name++;} + icu_name++; + } + } + return NULL; +} +#endif +#endif + struct stateful_cd { +#ifndef __LITEOS__ +#ifdef FEATURE_ICU_LOCALE + unsigned sign; + const unsigned char* to; + const unsigned char* from; +#endif +#endif iconv_t base_cd; unsigned state; }; @@ -137,11 +246,92 @@ static size_t extract_to(iconv_t cd) return (size_t)cd >> 1 & 0x7fff; } -iconv_t iconv_open(const char *to, const char *from) +#ifndef __LITEOS__ +#ifdef FEATURE_ICU_LOCALE +static void 
set_type_flag(unsigned* value) {*value = (1 << TYPE_FLAG_POS) | *value;}
+static void set_to_ignore_flag(unsigned* value) {*value = (1 << TO_IGNORE_FLAG_POS) | *value;}
+static void set_from_ignore_flag(unsigned* value) {*value = (1 << FROM_IGNORE_FLAG_POS) | *value;}
+static void set_to_translit_flag(unsigned* value) {*value = (1 << TO_TRANSLIT_FLAG_POS) | *value;}
+static void set_from_translit_flag(unsigned* value) {*value = (1 << FROM_TRANSLIT_FLAG_POS) | *value;}
+static bool get_type_flag(unsigned value) {return (value >> TYPE_FLAG_POS) & 1;}
+static bool get_to_ignore_flag(unsigned value) {return (value >> TO_IGNORE_FLAG_POS) & 1;}
+static bool get_from_ignore_flag(unsigned value) {return (value >> FROM_IGNORE_FLAG_POS) & 1;}
+static bool get_to_translit_flag(unsigned value) {return (value >> TO_TRANSLIT_FLAG_POS) & 1;}
+static bool get_from_translit_flag(unsigned value) {return (value >> FROM_TRANSLIT_FLAG_POS) & 1;}
+// Cut a "//IGNORE" or "//TRANSLIT" suffix off ins, record it in *sign, and look the rest up in icu_name_maps.
+static bool deal_with_tail(const char* ins, unsigned* sign, const unsigned char** res, bool is_from)
 {
-	size_t f, t;
-	struct stateful_cd *scd;
+    char* ins_tmp = strdup(ins);
+    if (!ins_tmp) {return false;} // the only failure reported through the return value
+    char* ins_ignore_pos = strstr(ins_tmp, "//IGNORE");
+    char* ins_translit_pos = strstr(ins_tmp, "//TRANSLIT");
+    if (ins_ignore_pos) {
+        if (is_from) {
+            set_from_ignore_flag(sign);
+        } else {
+            set_to_ignore_flag(sign);
+        }
+        *ins_ignore_pos = '\0';
+        *res = find_icu_map((void*)ins_tmp);
+    } else if (ins_translit_pos) {
+        if (is_from) {
+            set_from_translit_flag(sign);
+        } else {
+            set_to_translit_flag(sign);
+        }
+        *ins_translit_pos = '\0';
+        *res = find_icu_map((void*)ins_tmp);
+    } else {
+        *res = find_icu_map(ins);
+    }
+    free(ins_tmp);
+    return true; // *res is NULL when the name is not in icu_name_maps
+}
+#endif
+#endif
 
+iconv_t iconv_open(const char *to, const char *from)
+{
+    struct stateful_cd *scd;
+
+#ifndef __LITEOS__
+#ifdef FEATURE_ICU_LOCALE
+    bool is_basic_open = false;
+    if (get_device_api_version_inner() < DEVICE_VERSION_THRESHOLD) {
+        is_basic_open = true;
+    } else {
+        for (const char* s = "iso885916\0iso2022jp\0\0"; *s;) { // charsets ICU lacks: use the built-in tables
+            if (!fuzzycmp((void*)to, (void*)s) || !fuzzycmp((void*)from, (void*)s)) {
+                is_basic_open = true;
+            }
+            s += strlen(s) + 1;
+        }
+    }
+
+    // icu open: the descriptor only stores the two ICU converter names and the option flags
+    if (!is_basic_open && icuuc_handle_init()) {
+        scd = malloc(sizeof *scd);
+        if (!scd) {return (iconv_t)-1;}
+        // Zero every field: iconv() loads base_cd before it tests the ICU flag.
+        memset(scd, 0, sizeof(*scd));
+
+        if (!deal_with_tail(to, &scd->sign, &scd->to, false)) {free(scd); return (iconv_t)-1;}
+        if (!deal_with_tail(from, &scd->sign, &scd->from, true)) {free(scd); return (iconv_t)-1;}
+
+        if (!scd->to || !scd->from) {
+            errno = EINVAL;
+            free(scd);
+            return (iconv_t)-1;
+        }
+
+        set_type_flag(&scd->sign);
+        return (iconv_t)scd;
+    }
+#endif
+#endif
+
+    // basic open
+    size_t f, t;
 	if ((t = find_charmap(to))==-1
 	 || (f = find_charmap(from))==-1
 	 || (charmaps[t] >= 0330)) {
@@ -157,6 +347,7 @@ iconv_t iconv_open(const char *to, const char *from)
 	case ISO2022_JP:
 		scd = malloc(sizeof *scd);
 		if (!scd) return (iconv_t)-1;
+		memset(scd, 0, sizeof(*scd));
 		scd->base_cd = cd;
 		scd->state = 0;
 		cd = (iconv_t)scd;
@@ -224,13 +415,155 @@ static unsigned uni_to_jis(unsigned c)
 	}
 }
 
+#ifndef __LITEOS__
+#ifdef FEATURE_ICU_LOCALE
+/*
+ * ICU converter error callbacks. A callback that clears *err makes ICU skip
+ * the offending input; one that leaves *err set stops the conversion. Reason
+ * codes 0..2 (unassigned, illegal, irregular) report bad input; higher codes
+ * are reset/close/clone notifications and are left alone.
+ */
+static void ucnv_from_u_callback_ignore(
+    const void* context,
+    void* fromUArgs,
+    const void* codeUnits,
+    int32_t length,
+    int32_t codePoint,
+    int reason,
+    int* err)
+{
+    if (reason <= ICU_SKIP_THRESHOLD) {
+        *err = ICU_ZERO_ERROR;
+    }
+}
+
+// Spelled with the real callback prototype: ICU calls it through a non-variadic pointer.
+static void ucnv_from_u_callback_stop(const void* context, void* fromUArgs, const void* codeUnits,
+    int32_t length, int32_t codePoint, int reason, int* err)
+{
+}
+
+static void ucnv_to_u_callback_ignore(
+    const void* context,
+    void* toUArgs,
+    const void* codeUnits,
+    int32_t length,
+    int reason,
+    int* err)
+{
+    if (reason <= ICU_SKIP_THRESHOLD) {
+        *err = ICU_ZERO_ERROR;
+    }
+}
+
+static void ucnv_to_u_callback_stop(const void* context, void* toUArgs, const void* codeUnits,
+    int32_t length, int reason, int* err)
+{
+}
+
+// Map an ICU error code to the errno value iconv() reports; success leaves errno untouched.
+static void set_errno(int errCode)
+{
+    if (errCode <= ICU_ZERO_ERROR) {
+        return; // zero is success, negative codes are warnings; never reset errno to 0
+    }
+    if (errCode == ICU_BUFFER_OVERFLOW_ERROR) {
+        errno = E2BIG;
+    } else if (errCode == ICU_IVALID_CHAR_ERROR ||
+        errCode == ICU_TRUNCATED_CHAR_ERROR ||
+        errCode == ICU_ILLEGAL_CHAR_ERROR) {
+        errno = EILSEQ;
+    } else {
+        errno = EINVAL;
+    }
+}
+
+/*
+ * ICU-backed conversion: decode all *inb bytes at *in with converter `from`
+ * into a UTF-16 pivot buffer, then encode that buffer with converter `to`
+ * into *out. `sign` carries the IGNORE and TRANSLIT flags for each direction.
+ *
+ * On success the whole input is consumed, *out and *outb advance by the bytes
+ * written, and 0 is returned. On failure (size_t)-1 is returned with errno set
+ * (E2BIG, EILSEQ, EINVAL or ENOMEM) and none of the four cursors is moved.
+ */
+static size_t iconv_icu(unsigned sign, const unsigned char* to, const unsigned char* from,
+char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
+{
+    int errCode = ICU_ZERO_ERROR;
+
+    // ICU takes int32_t lengths, so bound both buffers before handing them over.
+    if (*inb > (size_t)INT32_MAX / 4) {
+        errno = ENOMEM;
+        return (size_t)-1;
+    }
+    size_t uchars_len = *inb * 4;
+    size_t out_cap = *outb > (size_t)INT32_MAX ? (size_t)INT32_MAX : *outb;
+
+    // Heap instead of a VLA: the size is caller-controlled and could overrun the stack.
+    uint16_t* uchars = malloc(uchars_len * sizeof(*uchars));
+    if (!uchars) {
+        return (size_t)-1;
+    }
+
+    // (from -> UChars) <=> ucnv_toUChars
+    void* conv_to_u = g_icu_opt_func.ucnv_open((void*)from, &errCode);
+    if (get_from_ignore_flag(sign)) {
+        g_icu_opt_func.ucnv_setToUCallBack(conv_to_u, ucnv_to_u_callback_ignore, NULL, NULL, NULL, &errCode);
+    } else if (!get_from_translit_flag(sign)) {
+        g_icu_opt_func.ucnv_setToUCallBack(conv_to_u, ucnv_to_u_callback_stop, NULL, NULL, NULL, &errCode);
+    }
+    uchars_len = g_icu_opt_func.ucnv_toUChars(conv_to_u, (void*)uchars, uchars_len, *in, *inb, &errCode);
+    g_icu_opt_func.ucnv_close(conv_to_u); // before the error check, so a failed decode cannot leak it
+    if (errCode > ICU_ZERO_ERROR) {
+        free(uchars);
+        set_errno(errCode);
+        return (size_t)-1;
+    }
+    errCode = ICU_ZERO_ERROR; // drop warnings (negative codes) before reusing the variable
+
+    // (UChars -> to) <=> ucnv_fromUChars
+    void* conv_from_u = g_icu_opt_func.ucnv_open((void*)to, &errCode);
+    if (get_to_ignore_flag(sign)) {
+        g_icu_opt_func.ucnv_setFromUCallBack(conv_from_u, ucnv_from_u_callback_ignore, NULL, NULL, NULL, &errCode);
+    } else if (!get_to_translit_flag(sign)) {
+        g_icu_opt_func.ucnv_setFromUCallBack(conv_from_u, ucnv_from_u_callback_stop, NULL, NULL, NULL, &errCode);
+    }
+    size_t out_size = g_icu_opt_func.ucnv_fromUChars(conv_from_u, *out, out_cap, uchars, uchars_len, &errCode);
+    g_icu_opt_func.ucnv_close(conv_from_u);
+    free(uchars);
+    if (errCode > ICU_ZERO_ERROR) {
+        set_errno(errCode);
+        return (size_t)-1;
+    }
+
+    *out += out_size;
+    *outb -= out_size;
+    *in += *inb;
+    *inb = 0;
+    return 0;
+}
+#endif
+#endif
+
 size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restrict out, size_t *restrict outb)
 {
-	size_t x=0;
+    if (!in || !*in || !*inb) {
+        return 0;
+    }
+
+    size_t x=0;
 	struct stateful_cd *scd=0;
 	if (!((size_t)cd & 1)) {
 		scd = (void *)cd;
 		cd = scd->base_cd;
+#ifndef __LITEOS__
+#ifdef FEATURE_ICU_LOCALE
+        if (get_type_flag(scd->sign)) {
+            return iconv_icu(scd->sign, scd->to, scd->from, in, inb, out, outb);
+        }
+#endif
+#endif
 	}
 	unsigned to = extract_to(cd);
 	unsigned from = extract_from(cd);
@@ -245,8 +578,6 @@ size_t iconv(iconv_t cd, char **restrict in, size_t *restrict inb, char **restri
 	unsigned char totype = tomap[-1];
 	locale_t *ploc = &CURRENT_LOCALE, loc = *ploc;
 
-	if (!in || !*in || !*inb) return 0;
-
 	*ploc = UTF8_LOCALE;
 
 	for (; *inb; *in+=l, *inb-=l) {
-- 
Gitee