1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Use pakutoma's encoding check functions for mb_detect_encoding even in non-strict mode

In 6fc8d014df, pakutoma added specialized validity checking functions
for some legacy text encodings like ISO-2022-JP and UTF-7. These
check functions perform a stricter validity check than the encoding
conversion functions for the same text encodings. For example, the
check function for ISO-2022-JP verifies that the string ends in the
correct state required by the specification for ISO-2022-JP.

These check functions are already being used to make detection of text
encoding more accurate when 'strict' detection mode is enabled.

However, since the default is 'non-strict' detection (a bad API design
but we're stuck with it now), most users will not benefit from
pakutoma's work. I was previously reluctant to enable this new logic
for non-strict detection mode. My intention was to reduce the scope of
behavior changes, since almost *any* behavior change may affect *some*
user in a way we don't expect.

However, we definitely have users whose (production) code was broken
by the changes I made in 28b346bc06, and enabling pakutoma's check
functions for non-strict detection mode would un-break it. (See
GH-10192 as an example.) The added checks also make sense in their own right.

In non-strict detection mode, we will not immediately reject candidate
encodings whose validity check function returns false, but they will
be much less likely to be selected. However, failure of the validity
check function is weighted less heavily than an encoding error detected
by the encoding conversion function.
This commit is contained in:
Alex Dowad
2023-05-11 20:21:30 +02:00
parent 3ab10da758
commit 7914b8cefd
3 changed files with 25 additions and 13 deletions

View File

@@ -1816,7 +1816,6 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
}
uint32_t wchar_buf[128];
unsigned char *in = (unsigned char*)ZSTR_VAL(string);
size_t in_len = ZSTR_LEN(string);
@@ -3006,19 +3005,24 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const
for (size_t i = 0; i < length; i++) {
const mbfl_encoding *enc = encodings[i];
array[j].enc = enc;
array[j].state = 0;
array[j].demerits = 0;
/* If any candidate encodings have specialized validation functions, use them
* to eliminate as many candidates as possible */
if (strict && enc->check != NULL) {
if (enc->check != NULL) {
for (size_t k = 0; k < n; k++) {
if (!enc->check((unsigned char*)in[k], in_len[k])) {
goto skip_to_next;
if (strict) {
goto skip_to_next;
} else {
array[j].demerits += 500;
}
}
}
}
array[j].enc = enc;
array[j].state = 0;
array[j].demerits = 0;
/* This multiplier can optionally be used to make candidate encodings listed
* first more likely to be chosen. It is a weight factor which multiplies
* the number of demerits counted for each candidate. */

View File

@@ -75,7 +75,7 @@ foreach ($testcases as $title => $case) {
--EXPECT--
non-base64 character after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
@@ -93,7 +93,7 @@ int(0)
base64 character before +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
@@ -174,7 +174,7 @@ int(2)
- and +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
@@ -219,7 +219,7 @@ int(2)
valid direct encoding character = after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
@@ -228,7 +228,7 @@ int(2)
invalid direct encoding character ~ after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
@@ -237,7 +237,7 @@ int(2)
invalid direct encoding character \ after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)
@@ -246,7 +246,7 @@ int(2)
invalid direct encoding character ESC after +
string(5) "UTF-8"
string(5) "UTF-7"
string(5) "UTF-8"
bool(false)
string(5) "UTF-7"
bool(false)

View File

@@ -78,6 +78,13 @@ echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-
echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
// This test case courtesy of Adrien Foulon
// It depends on the below use of '+' being recognized as invalid UTF-7
$css = 'input[type="radio"]:checked + img {
border: 5px solid #0083ca;
}';
echo mb_detect_encoding($css, mb_list_encodings(), true), "\n";
echo "== DETECT ORDER ==\n";
mb_detect_order('auto');
@@ -400,6 +407,7 @@ UTF-8
UTF-8
UTF-8
SJIS
UTF-8
== DETECT ORDER ==
JIS: JIS
EUC-JP: EUC-JP