From 7914b8cefd21fccda53df3c8ca54bba55efff205 Mon Sep 17 00:00:00 2001
From: Alex Dowad <alexinbeijing@gmail.com>
Date: Thu, 11 May 2023 20:21:30 +0200
Subject: [PATCH] Use pakutoma's encoding check functions for
 mb_detect_encoding even in non-strict mode

In 6fc8d014df, pakutoma added specialized validity checking functions
for some legacy text encodings like ISO-2022-JP and UTF-7. These
check functions perform a more strict validity check than the encoding
conversion functions for the same text encodings. For example, the
check function for ISO-2022-JP verifies that the string ends in the
correct state required by the specification for ISO-2022-JP.

These check functions are already being used to make detection of text
encoding more accurate when 'strict' detection mode is enabled.

However, since the default is 'non-strict' detection (a bad API design
but we're stuck with it now), most users will not benefit from
pakutoma's work. I was previously reluctant to enable this new logic
for non-strict detection mode. My intention was to reduce the scope of
behavior changes, since almost *any* behavior change may affect *some*
user in a way we don't expect.

However, we definitely have users whose (production) code was broken
by the changes I made in 28b346bc06, and enabling pakutoma's check
functions for non-strict detection mode would un-break it. (See
GH-10192 as an example.) The added checks do also make sense.

In non-strict detection mode, we will not immediately reject candidate
encodings whose validity check function returns false; but they will
be much less likely to be selected. However, failure of the validity
check function is weighted less heavily than an encoding error detected
by the encoding conversion function.
---
 ext/mbstring/mbstring.c                    | 16 ++++++++++------
 ext/mbstring/tests/gh10192_utf7.phpt       | 14 +++++++-------
 ext/mbstring/tests/mb_detect_encoding.phpt |  8 ++++++++
 3 files changed, 25 insertions(+), 13 deletions(-)

diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 17140747776..88bc7334253 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -1816,7 +1816,6 @@ static size_t mb_get_strlen(zend_string *string, const mbfl_encoding *encoding)
 		return mb_fast_strlen_utf8((unsigned char*)ZSTR_VAL(string), ZSTR_LEN(string));
 	}
 
-
 	uint32_t wchar_buf[128];
 	unsigned char *in = (unsigned char*)ZSTR_VAL(string);
 	size_t in_len = ZSTR_LEN(string);
@@ -3006,19 +3005,24 @@ static size_t init_candidate_array(struct candidate *array, size_t length, const
 	for (size_t i = 0; i < length; i++) {
 		const mbfl_encoding *enc = encodings[i];
 
+		array[j].enc = enc;
+		array[j].state = 0;
+		array[j].demerits = 0;
+
 		/* If any candidate encodings have specialized validation functions, use them
 		 * to eliminate as many candidates as possible */
-		if (strict && enc->check != NULL) {
+		if (enc->check != NULL) {
 			for (size_t k = 0; k < n; k++) {
 				if (!enc->check((unsigned char*)in[k], in_len[k])) {
-					goto skip_to_next;
+					if (strict) {
+						goto skip_to_next;
+					} else {
+						array[j].demerits += 500;
+					}
 				}
 			}
 		}
 
-		array[j].enc = enc;
-		array[j].state = 0;
-		array[j].demerits = 0;
 		/* This multiplier can optionally be used to make candidate encodings listed
 		 * first more likely to be chosen. It is a weight factor which multiplies
 		 * the number of demerits counted for each candidate. */
diff --git a/ext/mbstring/tests/gh10192_utf7.phpt b/ext/mbstring/tests/gh10192_utf7.phpt
index 2930942c12c..9aa4eb69254 100644
--- a/ext/mbstring/tests/gh10192_utf7.phpt
+++ b/ext/mbstring/tests/gh10192_utf7.phpt
@@ -75,7 +75,7 @@ foreach ($testcases as $title => $case) {
 --EXPECT--
 non-base64 character after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -93,7 +93,7 @@ int(0)
 
 base64 character before +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -174,7 +174,7 @@ int(2)
 
 - and +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -219,7 +219,7 @@ int(2)
 
 valid direct encoding character = after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -228,7 +228,7 @@ int(2)
 
 invalid direct encoding character ~ after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -237,7 +237,7 @@ int(2)
 
 invalid direct encoding character \ after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
@@ -246,7 +246,7 @@ int(2)
 
 invalid direct encoding character ESC after +
 string(5) "UTF-8"
-string(5) "UTF-7"
+string(5) "UTF-8"
 bool(false)
 string(5) "UTF-7"
 bool(false)
diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt
index 97136d89b8e..11d5a1c3136 100644
--- a/ext/mbstring/tests/mb_detect_encoding.phpt
+++ b/ext/mbstring/tests/mb_detect_encoding.phpt
@@ -78,6 +78,13 @@ echo mb_detect_encoding($test, ['UTF-8', 'ISO-8859-1']), "\n"; // Should be UTF-
 echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
 echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
 
+// This test case courtesy of Adrien Foulon
+// It depends on the below use of '+' being recognized as invalid UTF-7
+$css = 'input[type="radio"]:checked + img {
+    border: 5px solid #0083ca;
+}';
+echo mb_detect_encoding($css, mb_list_encodings(), true), "\n";
+
 echo "== DETECT ORDER ==\n";
 
 mb_detect_order('auto');
@@ -400,6 +407,7 @@ UTF-8
 UTF-8
 UTF-8
 SJIS
+UTF-8
 == DETECT ORDER ==
 JIS: JIS
 EUC-JP: EUC-JP