From a2bc57e0e531367f40fc50aa935bffac60cd61e8 Mon Sep 17 00:00:00 2001
From: Alex Dowad
Date: Mon, 18 Oct 2021 17:42:33 +0200
Subject: [PATCH] mb_detect_encoding will not return non-encodings

Among the text encodings supported by mbstring are several which are
not really 'text encodings'. These include Base64, QPrint, UUencode,
HTML entities, '7 bit', and '8 bit'.

Rather than providing an explicit list of text encodings which they are
interested in, users may pass the output of mb_list_encodings to
mb_detect_encoding. Since Base64, QPrint, and so on are included in
the output of mb_list_encodings, mb_detect_encoding can return one of
these as its 'detected encoding' (and in fact, this often happens).

Before mb_detect_encoding was enhanced so it could detect any of the
supported text encodings, this did not happen, and it is never desired.
---
 ext/mbstring/mbstring.c                    | 25 ++++++++++++++++++++++
 ext/mbstring/tests/bug81298.phpt           |  2 +-
 ext/mbstring/tests/mb_detect_encoding.phpt |  2 ++
 ext/mbstring/tests/other_encodings.phpt    |  2 +-
 4 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/ext/mbstring/mbstring.c b/ext/mbstring/mbstring.c
index 725d591427d..79be29db51d 100644
--- a/ext/mbstring/mbstring.c
+++ b/ext/mbstring/mbstring.c
@@ -2664,6 +2664,23 @@ PHP_FUNCTION(mb_strtolower)
 }
 /* }}} */
 
+static void remove_non_encodings_from_elist(const mbfl_encoding **elist, size_t *size)
+{
+	/* Compact `elist` in place, dropping mbstring's 'byte encodings' (Base64, QPrint,
+	 * and so on), which aren't really text encodings and should never be returned by
+	 * `mb_detect_encoding`. Kept entries retain their order; `*size` becomes their count. */
+	size_t shift = 0; /* Entries removed so far; size_t to match `*size` */
+	for (size_t i = 0; i < *size; i++) {
+		const mbfl_encoding *encoding = elist[i];
+		if (encoding->no_encoding <= mbfl_no_encoding_charset_min) {
+			shift++; /* Remove this encoding from the list */
+		} else if (shift) {
+			elist[i - shift] = encoding; /* Slide kept entry down over the gap */
+		}
+	}
+	*size -= shift;
+}
+
 /* {{{ Encodings of the given string is returned (as a string) */
 PHP_FUNCTION(mb_detect_encoding)
 {
@@ -2709,6 +2726,14 @@ PHP_FUNCTION(mb_detect_encoding)
 		RETURN_THROWS();
 	}
 
+	if (free_elist) {
+		remove_non_encodings_from_elist(elist, &size);
+		if (size == 0) {
+			efree(ZEND_VOIDP(elist));
+			RETURN_FALSE;
+		}
+	}
+
 	if (ZEND_NUM_ARGS() < 3) {
 		strict = MBSTRG(strict_detection);
 	}
diff --git a/ext/mbstring/tests/bug81298.phpt b/ext/mbstring/tests/bug81298.phpt
index d8565421fe7..37e0bc131ed 100644
--- a/ext/mbstring/tests/bug81298.phpt
+++ b/ext/mbstring/tests/bug81298.phpt
@@ -16,5 +16,5 @@ var_dump(mb_detect_encoding("foobar.", "ascii,html"));
 bool(false)
 string(5) "ASCII"
 string(5) "ASCII"
-string(13) "HTML-ENTITIES"
+bool(false)
 string(5) "ASCII"
diff --git a/ext/mbstring/tests/mb_detect_encoding.phpt b/ext/mbstring/tests/mb_detect_encoding.phpt
index 571fa1ca7f3..f2be2a617db 100644
--- a/ext/mbstring/tests/mb_detect_encoding.phpt
+++ b/ext/mbstring/tests/mb_detect_encoding.phpt
@@ -61,6 +61,7 @@ echo mb_detect_encoding($test, ['UTF-8', 'UTF-16']), "\n";
 
 // We once had a problem where all kind of strings would be detected as 'UUENCODE'
 echo mb_detect_encoding('abc', ['UUENCODE', 'UTF-8']), "\n";
+echo mb_detect_encoding('abc', ['UUENCODE', 'QPrint', 'HTML-ENTITIES', 'Base64', '7bit', '8bit', 'SJIS']), "\n";
 
 echo "== DETECT ORDER ==\n";
 
@@ -242,6 +243,7 @@ ISO-8859-1
 UTF-8
 UTF-8
 UTF-8
+SJIS
 == DETECT ORDER ==
 JIS: JIS
 EUC-JP: EUC-JP
diff --git a/ext/mbstring/tests/other_encodings.phpt b/ext/mbstring/tests/other_encodings.phpt
index 4a62966be50..321eccb247b 100644
--- a/ext/mbstring/tests/other_encodings.phpt
+++ b/ext/mbstring/tests/other_encodings.phpt
@@ -17,7 +17,7 @@
var_dump(mb_convert_encoding("ABC", "8bit", "7bit")); echo "7bit done\n"; // "8bit" -var_dump(mb_convert_encoding("\x01\x00", "8bit", "UTF-16BE")); // codepoints over 0xFF are illegal for '8-bit' +var_dump(mb_convert_encoding("\x01\x00", "8bit", "UTF-16BE")); // codepoints over 0xFF are illegal for '8-bit' echo "8bit done\n"; // UCS-2