diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c index 17140747776..88bc7334253 100644 --- a/ext/mbstring/mbstring.c +++ b/ext/mbstring/mbstring.c @@ -1816,7 +1816,6 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding) return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string)); } - uint32_t wchar_buf[128]; unsigned char *in = (unsigned char*)ZSTR_VAL(string); size_t in_len = ZSTR_LEN(string); @@ -3006,19 +3005,24 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const for (size_t i = 0; i < length; i++) { const mbfl_encoding *enc = encodings[i]; + array[j].enc = enc; + array[j].state = 0; + array[j].demerits = 0; + /* If any candidate encodings have specialized validation functions, use them * to eliminate as many candidates as possible */ - if (strict && enc->check != NULL) { + if (enc->check != NULL) { for (size_t k = 0; k < n; k++) { if (!enc->check((unsigned char*)in[k], in_len[k])) { - goto skip_to_next; + if (strict) { + goto skip_to_next; + } else { + array[j].demerits += 500; + } } } } - array[j].enc = enc; - array[j].state = 0; - array[j].demerits = 0; /* This multiplier can optionally be used to make candidate encodings listed * first more likely to be chosen. It is a weight factor which multiplies * the number of demerits counted for each candidate. */ diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt index 2930942c12c..9aa4eb69254 100644 --- a/ext/mbstring/tests/gh10192_utf7.phpt +++ b/ext/mbstring/tests/gh10192_utf7.phpt @@ -75,7 +75,7 @@ foreach ($testcases as $title => $case) { --EXPECT-- non-base64 character after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -93,7 +93,7 @@ int(0) base64 character before + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -174,7 +174,7 @@ int(2) - and + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -219,7 +219,7 @@ int(2) valid direct encoding character = after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -228,7 +228,7 @@ int(2) invalid direct encoding character ~ after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -237,7 +237,7 @@ int(2) invalid direct encoding character \ after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) @@ -246,7 +246,7 @@ int(2) invalid direct encoding character ESC after + string(5) "UTF-8" -string(5) "UTF-7" +string(5) "UTF-8" bool(false) string(5) "UTF-7" bool(false) diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt index 97136d89b8e..11d5a1c3136 100644 --- a/ext/mbstring/tests/mb_detect_encoding.phpt +++ b/ext/mbstring/tests/mb_detect_encoding.phpt @@ -78,6 +78,13 @@ echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF- echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n"; echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n"; +// This test case courtesy of Adrien Foulon +// It depends on the below use of '+' being recognized as invalid UTF-7 +$css = 'input[type="radio"]:checked + img { + border: 5px solid #0083ca; +}'; +echo mb_detect_encoding($css, mb_list_encodings(), true), "\n"; + echo "== DETECT ORDER ==\n"; mb_detect_order('auto'); @@ -400,6 +407,7 @@ UTF-8 UTF-8 UTF-8 SJIS +UTF-8 == DETECT ORDER == JIS: JIS EUC-JP: EUC-JP