From 7914b8cefd21fccda53df3c8ca54bba55efff205 Mon Sep 17 00:00:00 2001 From: Alex Dowad Date: Thu, 11 May 2023 20:21:30 +0200 Subject: [PATCH] Use pakutoma's encoding check functions for mb_detect_encoding even in non-strict mode In 6fc8d014df, pakutoma added specialized validity checking functions for some legacy text encodings like ISO-2022-JP and UTF-7. These check functions perform a more strict validity check than the encoding conversion functions for the same text encodings. For example, the check function for ISO-2022-JP verifies that the string ends in the correct state required by the specification for ISO-2022-JP. These check functions are already being used to make detection of text encoding more accurate when 'strict' detection mode is enabled. However, since the default is 'non-strict' detection (a bad API design but we're stuck with it now), most users will not benefit from pakutoma's work. I was previously reluctant to enable this new logic for non-strict detection mode. My intention was to reduce the scope of behavior changes, since almost *any* behavior change may affect *some* user in a way we don't expect. However, we definitely have users whose (production) code was broken by the changes I made in 28b346bc06, and enabling pakutoma's check functions for non-strict detection mode would un-break it. (See GH-10192 as an example.) The added checks do also make sense. In non-strict detection mode, we will not immediately reject candidate encodings whose validity check function returns false; but they will be much less likely to be selected. However, failure of the validity check function is weighted less heavily than an encoding error detected by the encoding conversion function. --- ext/mbstring/mbstring.c | 16 ++++++++++------ ext/mbstring/tests/gh10192_utf7.phpt | 14 +++++++------- ext/mbstring/tests/mb_detect_encoding.phpt | 8 ++++++++ 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 17140747776..88bc7334253 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1816,7 +1816,6 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding) return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string)); } - uint32_t wchar_buf[128]; unsigned char *in = (unsigned char*)ZSTR_VAL(string); size_t in_len = ZSTR_LEN(string); @@ -3006,19 +3005,24 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const for (size_t i = 0; i < length; i++) { const mbfl_encoding *enc = encodings[i]; + array[j].enc = enc; + array[j].state = 0; + array[j].demerits = 0; + /* If any candidate encodings have specialized validation functions, use them * to eliminate as many candidates as possible */ - if (strict && enc->check != NULL) { + if (enc->check != NULL) { for (size_t k = 0; k < n; k++) { if (!enc->check((unsigned char*)in[k], in_len[k])) { - goto skip_to_next; + if (strict) { + goto skip_to_next; + } else { + array[j].demerits += 500; + } } } } - array[j].enc = enc; - array[j].state = 0; - array[j].demerits = 0; /* This multiplier can optionally be used to make candidate encodings listed * first more likely to be chosen. It is a weight factor which multiplies * the number of demerits counted for each candidate. */ diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt index 2930942c12c..9aa4eb69254 100644 --- a/ext/mbstring/tests/gh10192_utf7.phpt +++ b/ext/mbstring/tests/gh10192_utf7.phpt @@ -75,7 +75,7 @@ foreach ($testcases as $title => $case) { --EXPECT-- non-base64 character after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -93,7 +93,7 @@ int(0) base64 character before + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -174,7 +174,7 @@ int(2) - and + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -219,7 +219,7 @@ int(2) valid direct encoding character = after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -228,7 +228,7 @@ int(2) invalid direct encoding character ~ after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -237,7 +237,7 @@ int(2) invalid direct encoding character \ after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -246,7 +246,7 @@ int(2) invalid direct encoding character ESC after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt index 97136d89b8e..11d5a1c3136 100644 --- a/ext/mbstring/tests/mb_detect_encoding.phpt +++ b/ext/mbstring/tests/mb_detect_encoding.phpt @@ -78,6 +78,13 @@ echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF- echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n"; echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n"; +// This test case courtesy of Adrien Foulon +// It depends on the below use of '+' being recognized as invalid UTF-7 +$css = 'input[type="radio"]:checked + img { + border: 5px solid #0083ca; +}'; +echo mb_detect_encoding($css, mb_list_encodings(), true), "\n"; + echo "== DETECT ORDER ==\n"; mb_detect_order('auto'); @@ -400,6 +407,7 @@ UTF-8 UTF-8 UTF-8 SJIS +UTF-8 == DETECT ORDER == JIS: JIS EUC-JP: EUC-JP