From 83c3a3ca94fd8169c41da1dd9e1d0e86f8f1a43f Mon Sep 17 00:00:00 2001 From: lushi Date: Thu, 24 Feb 2022 17:53:23 +0800 Subject: [PATCH 1/5] fix cve-2022-25236 Signed-off-by: lushi Change-Id: Ib238722c0075720c416d362874daf41bc19f6289 --- lib/xmlparse.c | 17 +++++++++++++---- tests/runtests.c | 30 ++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 4 deletions(-) diff --git a/lib/xmlparse.c b/lib/xmlparse.c index f19e368..a286335 100644 --- a/lib/xmlparse.c +++ b/lib/xmlparse.c @@ -716,8 +716,7 @@ XML_ParserCreate(const XML_Char *encodingName) { XML_Parser XMLCALL XML_ParserCreateNS(const XML_Char *encodingName, XML_Char nsSep) { - XML_Char tmp[2]; - *tmp = nsSep; + XML_Char tmp[2] = {nsSep, 0}; return XML_ParserCreate_MM(encodingName, NULL, tmp); } @@ -1342,8 +1341,7 @@ XML_ExternalEntityParserCreate(XML_Parser oldParser, const XML_Char *context, would be otherwise. */ if (parser->m_ns) { - XML_Char tmp[2]; - *tmp = parser->m_namespaceSeparator; + XML_Char tmp[2] = {parser->m_namespaceSeparator, 0}; parser = parserCreate(encodingName, &parser->m_mem, tmp, newDtd); } else { parser = parserCreate(encodingName, &parser->m_mem, NULL, newDtd); @@ -3754,6 +3752,17 @@ addBinding(XML_Parser parser, PREFIX *prefix, const ATTRIBUTE_ID *attId, if (! mustBeXML && isXMLNS && (len > xmlnsLen || uri[len] != xmlnsNamespace[len])) isXMLNS = XML_FALSE; + + // NOTE: While Expat does not validate namespace URIs against RFC 3986, + // we have to at least make sure that the XML processor on top of + // Expat (that is splitting tag names by namespace separator into + // 2- or 3-tuples (uri-local or uri-local-prefix)) cannot be confused + // by an attacker putting additional namespace separator characters + // into namespace declarations. That would be ambiguous and not to + // be expected. + if (parser->m_ns && (uri[len] == parser->m_namespaceSeparator)) { + return XML_ERROR_SYNTAX; + } } isXML = isXML && len == xmlLen; isXMLNS = isXMLNS && len == xmlnsLen; diff --git a/tests/runtests.c b/tests/runtests.c index 45ba5d5..d64187e 100644 --- a/tests/runtests.c +++ b/tests/runtests.c @@ -7195,6 +7195,35 @@ START_TEST(test_ns_double_colon_doctype) { } END_TEST +START_TEST(test_ns_separator_in_uri) { + struct test_case { + enum XML_Status expectedStatus; + const char *doc; + }; + struct test_case cases[] = { + {XML_STATUS_OK, ""}, + {XML_STATUS_ERROR, ""}, + }; + + size_t i = 0; + size_t failCount = 0; + for (; i < sizeof(cases) / sizeof(cases[0]); i++) { + XML_Parser parser = XML_ParserCreateNS(NULL, '\n'); + XML_SetElementHandler(parser, dummy_start_element, dummy_end_element); + if (XML_Parse(parser, cases[i].doc, (int)strlen(cases[i].doc), + /*isFinal*/ XML_TRUE) + != cases[i].expectedStatus) { + failCount++; + } + XML_ParserFree(parser); + } + + if (failCount) { + fail("Namespace separator handling is broken"); + } +} +END_TEST + /* Control variable; the number of times duff_allocator() will successfully * allocate */ #define ALLOC_ALWAYS_SUCCEED (-1) @@ -11877,6 +11906,7 @@ make_suite(void) { tcase_add_test(tc_namespace, test_ns_utf16_doctype); tcase_add_test(tc_namespace, test_ns_invalid_doctype); tcase_add_test(tc_namespace, test_ns_double_colon_doctype); + tcase_add_test(tc_namespace, test_ns_separator_in_uri); suite_add_tcase(s, tc_misc); tcase_add_checked_fixture(tc_misc, NULL, basic_teardown); -- Gitee From e09c2419ece153d39b5e0d430ba575ee13883a3d Mon Sep 17 00:00:00 2001 From: lushi Date: Thu, 24 Feb 2022 18:17:41 +0800 Subject: [PATCH 2/5] fix cve-2022-25235 Signed-off-by: lushi Change-Id: I4a8861ea5d593532d75d6fd97777b410fa7f2211 --- lib/xmltok.c | 5 --- lib/xmltok_impl.c | 18 +++++--- tests/runtests.c | 109 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 120 insertions(+), 12 deletions(-) diff --git a/lib/xmltok.c b/lib/xmltok.c index 5114016..53ec1d0 100644 --- a/lib/xmltok.c +++ b/lib/xmltok.c @@ -99,11 +99,6 @@ + ((((byte)[1]) & 3) << 1) + ((((byte)[2]) >> 5) & 1)] \ & (1u << (((byte)[2]) & 0x1F))) -#define UTF8_GET_NAMING(pages, p, n) \ - ((n) == 2 \ - ? UTF8_GET_NAMING2(pages, (const unsigned char *)(p)) \ - : ((n) == 3 ? UTF8_GET_NAMING3(pages, (const unsigned char *)(p)) : 0)) - /* Detection of invalid UTF-8 sequences is based on Table 3.1B of Unicode 3.2: http://www.unicode.org/unicode/reports/tr28/ with the additional restriction of not allowing the Unicode diff --git a/lib/xmltok_impl.c b/lib/xmltok_impl.c index 0430591..84ff35f 100644 --- a/lib/xmltok_impl.c +++ b/lib/xmltok_impl.c @@ -69,7 +69,7 @@ case BT_LEAD##n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ - if (! IS_NAME_CHAR(enc, ptr, n)) { \ + if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NAME_CHAR(enc, ptr, n)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -98,7 +98,7 @@ case BT_LEAD##n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ - if (! IS_NMSTRT_CHAR(enc, ptr, n)) { \ + if (IS_INVALID_CHAR(enc, ptr, n) || ! IS_NMSTRT_CHAR(enc, ptr, n)) { \ *nextTokPtr = ptr; \ return XML_TOK_INVALID; \ } \ @@ -1142,6 +1142,10 @@ PREFIX(prologTok)(const ENCODING *enc, const char *ptr, const char *end, case BT_LEAD##n: \ if (end - ptr < n) \ return XML_TOK_PARTIAL_CHAR; \ + if (IS_INVALID_CHAR(enc, ptr, n)) { \ + *nextTokPtr = ptr; \ + return XML_TOK_INVALID; \ + } \ if (IS_NMSTRT_CHAR(enc, ptr, n)) { \ ptr += n; \ tok = XML_TOK_NAME; \ @@ -1270,7 +1274,7 @@ PREFIX(attributeValueTok)(const ENCODING *enc, const char *ptr, const char *end, switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. */ \ break; LEAD_CASE(2) LEAD_CASE(3) @@ -1339,7 +1343,7 @@ PREFIX(entityValueTok)(const ENCODING *enc, const char *ptr, const char *end, switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. */ \ break; LEAD_CASE(2) LEAD_CASE(3) @@ -1518,7 +1522,7 @@ PREFIX(getAtts)(const ENCODING *enc, const char *ptr, int attsMax, state = inName; \ } # define LEAD_CASE(n) \ - case BT_LEAD##n: \ + case BT_LEAD##n: /* NOTE: The encoding has already been validated. */ \ START_NAME ptr += (n - MINBPC(enc)); \ break; LEAD_CASE(2) @@ -1730,7 +1734,7 @@ PREFIX(nameLength)(const ENCODING *enc, const char *ptr) { switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. */ \ break; LEAD_CASE(2) LEAD_CASE(3) @@ -1775,7 +1779,7 @@ PREFIX(updatePosition)(const ENCODING *enc, const char *ptr, const char *end, switch (BYTE_TYPE(enc, ptr)) { # define LEAD_CASE(n) \ case BT_LEAD##n: \ - ptr += n; \ + ptr += n; /* NOTE: The encoding has already been validated. */ \ pos->columnNumber++; \ break; LEAD_CASE(2) diff --git a/tests/runtests.c b/tests/runtests.c index d64187e..7453038 100644 --- a/tests/runtests.c +++ b/tests/runtests.c @@ -5973,6 +5973,105 @@ START_TEST(test_utf8_in_cdata_section_2) { } END_TEST +START_TEST(test_utf8_in_start_tags) { + struct test_case { + bool goodName; + bool goodNameStart; + const char *tagName; + }; + + // The idea with the tests below is this: + // We want to cover 1-, 2- and 3-byte sequences, 4-byte sequences + // go to isNever and are hence not a concern. + // + // We start with a character that is a valid name character + // (or even name-start character, see XML 1.0r4 spec) and then we flip + // single bits at places where (1) the result leaves the UTF-8 encoding space + // and (2) we stay in the same n-byte sequence family. + // + // The flipped bits are highlighted in angle brackets in comments, + // e.g. "[<1>011 1001]" means we had [0011 1001] but we now flipped + // the most significant bit to 1 to leave UTF-8 encoding space. + struct test_case cases[] = { + // 1-byte UTF-8: [0xxx xxxx] + {true, true, "\x3A"}, // [0011 1010] = ASCII colon ':' + {false, false, "\xBA"}, // [<1>011 1010] + {true, false, "\x39"}, // [0011 1001] = ASCII nine '9' + {false, false, "\xB9"}, // [<1>011 1001] + + // 2-byte UTF-8: [110x xxxx] [10xx xxxx] + {true, true, "\xDB\xA5"}, // [1101 1011] [1010 0101] = + // Arabic small waw U+06E5 + {false, false, "\x9B\xA5"}, // [1<0>01 1011] [1010 0101] + {false, false, "\xDB\x25"}, // [1101 1011] [<0>010 0101] + {false, false, "\xDB\xE5"}, // [1101 1011] [1<1>10 0101] + {true, false, "\xCC\x81"}, // [1100 1100] [1000 0001] = + // combining char U+0301 + {false, false, "\x8C\x81"}, // [1<0>00 1100] [1000 0001] + {false, false, "\xCC\x01"}, // [1100 1100] [<0>000 0001] + {false, false, "\xCC\xC1"}, // [1100 1100] [1<1>00 0001] + + // 3-byte UTF-8: [1110 xxxx] [10xx xxxx] [10xxxxxx] + {true, true, "\xE0\xA4\x85"}, // [1110 0000] [1010 0100] [1000 0101] = + // Devanagari Letter A U+0905 + {false, false, "\xA0\xA4\x85"}, // [1<0>10 0000] [1010 0100] [1000 0101] + {false, false, "\xE0\x24\x85"}, // [1110 0000] [<0>010 0100] [1000 0101] + {false, false, "\xE0\xE4\x85"}, // [1110 0000] [1<1>10 0100] [1000 0101] + {false, false, "\xE0\xA4\x05"}, // [1110 0000] [1010 0100] [<0>000 0101] + {false, false, "\xE0\xA4\xC5"}, // [1110 0000] [1010 0100] [1<1>00 0101] + {true, false, "\xE0\xA4\x81"}, // [1110 0000] [1010 0100] [1000 0001] = + // combining char U+0901 + {false, false, "\xA0\xA4\x81"}, // [1<0>10 0000] [1010 0100] [1000 0001] + {false, false, "\xE0\x24\x81"}, // [1110 0000] [<0>010 0100] [1000 0001] + {false, false, "\xE0\xE4\x81"}, // [1110 0000] [1<1>10 0100] [1000 0001] + {false, false, "\xE0\xA4\x01"}, // [1110 0000] [1010 0100] [<0>000 0001] + {false, false, "\xE0\xA4\xC1"}, // [1110 0000] [1010 0100] [1<1>00 0001] + }; + const bool atNameStart[] = {true, false}; + + size_t i = 0; + char doc[1024]; + size_t failCount = 0; + + for (; i < sizeof(cases) / sizeof(cases[0]); i++) { + size_t j = 0; + for (; j < sizeof(atNameStart) / sizeof(atNameStart[0]); j++) { + const bool expectedSuccess + = atNameStart[j] ? cases[i].goodNameStart : cases[i].goodName; + sprintf(doc, "<%s%s>