mirror of
https://github.com/php/php-src.git
synced 2026-03-24 00:02:20 +01:00
Merge branch 'PHP-8.3'
* PHP-8.3: Character indices used by mb_strpos and mb_substr have same meaning, even on invalid strings
This commit is contained in:
@@ -35,6 +35,13 @@ PHP 8.4 UPGRADE NOTES
|
||||
. mb_encode_numericentity() and mb_decode_numericentity() now check that
|
||||
the $map is only composed of integers, if not a ValueError is thrown.
|
||||
. mb_http_input() now always throws a ValueError if the $type is invalid.
|
||||
. On invalid strings (those with encoding errors), mb_substr() now interprets
|
||||
character indices in the same manner as most other mbstring functions. This
|
||||
means that character indices returned by mb_strpos() can be passed to mb_substr().
|
||||
. For SJIS-Mac (MacJapanese) strings, character indices passed to mb_substr() now
|
||||
refer to the indices of the Unicode codepoints which are produced when the string
|
||||
is converted to Unicode. This is significant because around 40 SJIS-Mac characters
|
||||
convert to a sequence of multiple Unicode codepoints.
|
||||
|
||||
- PDO_DBLIB:
|
||||
. setAttribute, DBLIB_ATTR_STRINGIFY_UNIQUEIDENTIFIER and DBLIB_ATTR_DATETIME_CONVERT
|
||||
|
||||
@@ -38,6 +38,7 @@
|
||||
#include "libmbfl/mbfl/mbfilter_wchar.h"
|
||||
#include "libmbfl/mbfl/eaw_table.h"
|
||||
#include "libmbfl/filters/mbfilter_base64.h"
|
||||
#include "libmbfl/filters/mbfilter_cjk.h"
|
||||
#include "libmbfl/filters/mbfilter_qprint.h"
|
||||
#include "libmbfl/filters/mbfilter_htmlent.h"
|
||||
#include "libmbfl/filters/mbfilter_uuencode.h"
|
||||
@@ -2112,8 +2113,9 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
|
||||
unsigned char *in = (unsigned char*)ZSTR_VAL(input);
|
||||
size_t in_len = ZSTR_LEN(input);
|
||||
|
||||
if (from >= in_len || len == 0) {
|
||||
/* No supported text encoding decodes to more than one codepoint per byte
|
||||
if (len == 0 || (from >= in_len && enc != &mbfl_encoding_sjis_mac)) {
|
||||
/* Other than MacJapanese, no supported text encoding decodes to
|
||||
* more than one codepoint per byte
|
||||
* So if the number of codepoints to skip >= number of input bytes,
|
||||
* then definitely the output should be empty */
|
||||
return zend_empty_string;
|
||||
@@ -2134,30 +2136,6 @@ static zend_string* mb_get_substr(zend_string *input, size_t from, size_t len, c
|
||||
len = in_len;
|
||||
}
|
||||
return zend_string_init_fast((const char*)in, len);
|
||||
} else if (enc->mblen_table) {
|
||||
/* The use of the `mblen_table` means that for encodings like MacJapanese,
|
||||
* we treat each character in its native charset as "1 character", even if it
|
||||
* maps to a sequence of several codepoints */
|
||||
const unsigned char *mbtab = enc->mblen_table;
|
||||
unsigned char *limit = in + in_len;
|
||||
while (from && in < limit) {
|
||||
in += mbtab[*in];
|
||||
from--;
|
||||
}
|
||||
if (in >= limit) {
|
||||
return zend_empty_string;
|
||||
} else if (len == MBFL_SUBSTR_UNTIL_END) {
|
||||
return zend_string_init_fast((const char*)in, limit - in);
|
||||
}
|
||||
unsigned char *end = in;
|
||||
while (len && end < limit) {
|
||||
end += mbtab[*end];
|
||||
len--;
|
||||
}
|
||||
if (end > limit) {
|
||||
end = limit;
|
||||
}
|
||||
return zend_string_init_fast((const char*)in, end - in);
|
||||
}
|
||||
|
||||
return mb_get_substr_slow(in, in_len, from, len, enc);
|
||||
@@ -2350,21 +2328,7 @@ PHP_FUNCTION(mb_substr)
|
||||
|
||||
size_t mblen = 0;
|
||||
if (from < 0 || (!len_is_null && len < 0)) {
|
||||
if (enc->mblen_table) {
|
||||
/* Because we use the `mblen_table` when iterating over the string and
|
||||
* extracting the requested part, we also need to use it here for counting
|
||||
* the "length" of the string
|
||||
* Otherwise, we can get wrong results for text encodings like MacJapanese,
|
||||
* where one native 'character' can map to a sequence of several codepoints */
|
||||
const unsigned char *mbtab = enc->mblen_table;
|
||||
unsigned char *p = (unsigned char*)ZSTR_VAL(str), *e = p + ZSTR_LEN(str);
|
||||
while (p < e) {
|
||||
p += mbtab[*p];
|
||||
mblen++;
|
||||
}
|
||||
} else {
|
||||
mblen = mb_get_strlen(str, enc);
|
||||
}
|
||||
mblen = mb_get_strlen(str, enc);
|
||||
}
|
||||
|
||||
/* if "from" position is negative, count start position from the end
|
||||
|
||||
@@ -26,6 +26,11 @@ var_dump(FROM_EUC_JP(mb_strstr(EUC_JP("あいうえおかきくけこ"), EUC_JP(
|
||||
var_dump(bin2hex(mb_strstr("\xdd\x00", "", false, 'UTF-8')));
|
||||
var_dump(bin2hex(mb_strstr("M\xff\xff\xff\x00", "\x00", false, "SJIS")));
|
||||
|
||||
// Test handling of invalid UTF-8 string
|
||||
// Thanks to Stefan Schiller
|
||||
var_dump(mb_strstr("\xf0start", "start", false, "UTF-8"));
|
||||
var_dump(mb_strstr("\xf0start", "start", true, "UTF-8"));
|
||||
|
||||
?>
|
||||
--EXPECT--
|
||||
string(18) "おかきくけこ"
|
||||
@@ -36,5 +41,7 @@ string(12) "あいうえ"
|
||||
string(18) "おかきくけこ"
|
||||
string(18) "おかきくけこ"
|
||||
string(12) "あいうえ"
|
||||
string(4) "dd00"
|
||||
string(4) "3f00"
|
||||
string(2) "00"
|
||||
string(5) "start"
|
||||
string(1) "?"
|
||||
|
||||
@@ -118,6 +118,15 @@ print "3: " . mb_convert_encoding(mb_substr($utf7, -5, 3, 'UTF-7'), 'UTF-8', 'UT
|
||||
print "4: " . mb_convert_encoding(mb_substr($utf7, 1, null, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
|
||||
print "5:" . mb_convert_encoding(mb_substr($utf7, 10, 0, 'UTF-7'), 'UTF-8', 'UTF-7') . "\n";
|
||||
|
||||
echo "Testing agreement with mb_strpos on invalid UTF-8 string:\n";
|
||||
/* Stefan Schiller pointed out that on invalid UTF-8 strings, character indices returned
|
||||
* by mb_strpos would not extract the desired part of the string when passed to mb_substr.
|
||||
* This is the test case which he provided: */
|
||||
$data = "\xF0AAA<b>";
|
||||
$pos = mb_strpos($data, "<", 0, "UTF-8");
|
||||
$out = mb_substr($data, 0, $pos, "UTF-8");
|
||||
print $out . "\n";
|
||||
|
||||
echo "Regression:\n";
|
||||
/* During development, one >= comparison in mb_get_substr was wrongly written as >
|
||||
* This was caught by libFuzzer */
|
||||
@@ -138,30 +147,30 @@ SJIS:
|
||||
4: 967b8cea8365834c8358836782c582b781423031323334825482558256825782588142
|
||||
5:
|
||||
-- Testing illegal SJIS byte 0x80 --
|
||||
6380
|
||||
806162
|
||||
633f
|
||||
3f6162
|
||||
SJIS-2004:
|
||||
6380
|
||||
806162
|
||||
633f
|
||||
3f6162
|
||||
MacJapanese:
|
||||
6380
|
||||
806162
|
||||
SJIS-Mobile#DOCOMO:
|
||||
6380
|
||||
806162
|
||||
633f
|
||||
3f6162
|
||||
SJIS-Mobile#KDDI:
|
||||
6380
|
||||
806162
|
||||
633f
|
||||
3f6162
|
||||
SJIS-Mobile#SoftBank:
|
||||
6380
|
||||
806162
|
||||
633f
|
||||
3f6162
|
||||
-- Testing MacJapanese characters which map to 3-5 codepoints each --
|
||||
616263
|
||||
85ab85ac
|
||||
85ac
|
||||
3f3f
|
||||
58
|
||||
616263
|
||||
85bf85c0
|
||||
85c0
|
||||
3f3f
|
||||
78
|
||||
ISO-2022-JP:
|
||||
1: 1b2442212121721b284241
|
||||
2: 43
|
||||
@@ -200,5 +209,7 @@ UTF-7:
|
||||
3: йте
|
||||
4: reek: Σὲ γνωρίζω ἀπὸ τὴν κόψη Russian: Зарегистрируйтесь
|
||||
5:
|
||||
Testing agreement with mb_strpos on invalid UTF-8 string:
|
||||
?AAA
|
||||
Regression:
|
||||
1b28493d3d3d3d3d3d3d3e3d3d3d1b28423f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f000000003f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f1b28493d3d3d3d3d3d3d3e1b2842013a4f1b28492a1b2842
|
||||
|
||||
Reference in New Issue
Block a user