1
0
mirror of https://github.com/php/php-src.git synced 2026-04-27 18:23:26 +02:00

Refactor mb_substitute_character()

Using the new Fast ZPP API for string|int|null

This also fixes Bug #79448 which was too disruptive to fix in PHP 7.x
This commit is contained in:
George Peter Banyard
2020-05-03 04:44:16 +02:00
parent 28af364d2a
commit 7dd332f110
9 changed files with 295 additions and 166 deletions
+46 -49
View File
@@ -1527,7 +1527,7 @@ PHP_FUNCTION(mb_detect_order)
static inline int php_mb_check_code_point(zend_long cp)
{
if (cp <= 0 || cp >= 0x110000) {
if (cp < 0 || cp >= 0x110000) {
/* Out of Unicode range */
return 0;
}
@@ -1544,61 +1544,58 @@ static inline int php_mb_check_code_point(zend_long cp)
return 1;
}
/* {{{ proto mixed mb_substitute_character([mixed substchar])
/* {{{ proto string|int|true mb_substitute_character([string|int|null substitute_character])
Sets the current substitute_character or returns the current substitute_character */
PHP_FUNCTION(mb_substitute_character)
{
zval *arg1 = NULL;
zend_string *substitute_character = NULL;
zend_long substitute_codepoint;
zend_bool substitute_is_null = 1;
if (zend_parse_parameters(ZEND_NUM_ARGS(), "|z", &arg1) == FAILURE) {
ZEND_PARSE_PARAMETERS_START(0, 1)
Z_PARAM_OPTIONAL
Z_PARAM_STR_OR_LONG_OR_NULL(substitute_character, substitute_codepoint, substitute_is_null)
ZEND_PARSE_PARAMETERS_END();
if (substitute_is_null) {
if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
RETURN_STRING("none");
}
if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
RETURN_STRING("long");
}
if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
RETURN_STRING("entity");
}
RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
}
if (substitute_character != NULL) {
if (zend_string_equals_literal_ci(substitute_character, "none")) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
RETURN_TRUE;
}
if (zend_string_equals_literal_ci(substitute_character, "long")) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
RETURN_TRUE;
}
if (zend_string_equals_literal_ci(substitute_character, "entity")) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
RETURN_TRUE;
}
/* Invalid string value */
zend_argument_value_error(1, "must be 'none', 'long', 'entity' or a valid codepoint");
RETURN_THROWS();
}
/* Integer codepoint passed */
if (!php_mb_check_code_point(substitute_codepoint)) {
zend_argument_value_error(1, "is not a valid codepoint");
RETURN_THROWS();
}
if (!arg1) {
if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE) {
RETURN_STRING("none");
} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG) {
RETURN_STRING("long");
} else if (MBSTRG(current_filter_illegal_mode) == MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY) {
RETURN_STRING("entity");
} else {
RETURN_LONG(MBSTRG(current_filter_illegal_substchar));
}
} else {
RETVAL_TRUE;
switch (Z_TYPE_P(arg1)) {
case IS_STRING:
if (strncasecmp("none", Z_STRVAL_P(arg1), Z_STRLEN_P(arg1)) == 0) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_NONE;
} else if (strncasecmp("long", Z_STRVAL_P(arg1), Z_STRLEN_P(arg1)) == 0) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_LONG;
} else if (strncasecmp("entity", Z_STRVAL_P(arg1), Z_STRLEN_P(arg1)) == 0) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_ENTITY;
} else {
convert_to_long_ex(arg1);
if (php_mb_check_code_point(Z_LVAL_P(arg1))) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
} else {
php_error_docref(NULL, E_WARNING, "Unknown character");
RETURN_FALSE;
}
}
break;
default:
convert_to_long_ex(arg1);
if (php_mb_check_code_point(Z_LVAL_P(arg1))) {
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
MBSTRG(current_filter_illegal_substchar) = Z_LVAL_P(arg1);
} else {
php_error_docref(NULL, E_WARNING, "Unknown character");
RETURN_FALSE;
}
break;
}
}
MBSTRG(current_filter_illegal_mode) = MBFL_OUTPUTFILTER_ILLEGAL_MODE_CHAR;
MBSTRG(current_filter_illegal_substchar) = substitute_codepoint;
RETURN_TRUE;
}
/* }}} */
+1 -2
View File
@@ -12,8 +12,7 @@ function mb_http_output(string $encoding = UNKNOWN): string|bool {}
function mb_detect_order(array|string $encoding = UNKNOWN): array|bool {}
/** @param string|int $substchar */
function mb_substitute_character($substchar = UNKNOWN): string|int|bool {}
function mb_substitute_character(string|int|null $substitute_character = null): string|int|bool {}
function mb_preferred_mime_name(string $encoding): string|false {}
+1 -1
View File
@@ -19,7 +19,7 @@ ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_detect_order, 0, 0, MAY_BE_AR
ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_substitute_character, 0, 0, MAY_BE_STRING|MAY_BE_LONG|MAY_BE_BOOL)
ZEND_ARG_INFO(0, substchar)
ZEND_ARG_TYPE_MASK(0, substitute_character, MAY_BE_STRING|MAY_BE_LONG|MAY_BE_NULL, "null")
ZEND_END_ARG_INFO()
ZEND_BEGIN_ARG_WITH_RETURN_TYPE_MASK_EX(arginfo_mb_preferred_mime_name, 0, 1, MAY_BE_STRING|MAY_BE_FALSE)
+13 -8
View File
@@ -9,27 +9,32 @@ mb_internal_encoding('UTF-8');
var_dump(mb_substitute_character(0x1F600));
var_dump(bin2hex(mb_scrub("\xff")));
mb_substitute_character(0x3f); // Reset to '?', as the next call will fail
var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal)
try {
var_dump(mb_substitute_character(0xD800)); // Surrogate (illegal)
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
var_dump(bin2hex(mb_scrub("\xff")));
mb_internal_encoding('EUC-JP-2004');
mb_substitute_character(0x63); // Reset to '?', as the next call will fail
mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal)
try {
mb_substitute_character(0x8fa1ef); // EUC-JP-2004 encoding of U+50AA (illegal)
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
var_dump(bin2hex(mb_scrub("\x8d")));
mb_substitute_character(0x50aa);
var_dump(bin2hex(mb_scrub("\x8d")));
?>
--EXPECTF--
--EXPECT--
bool(true)
string(8) "f09f9880"
Warning: mb_substitute_character(): Unknown character in %s on line %d
bool(false)
mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint
string(2) "3f"
Warning: mb_substitute_character(): Unknown character in %s on line %d
mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint
string(2) "63"
string(6) "8fa1ef"
@@ -23,9 +23,13 @@ var_dump(mb_substitute_character('entity'));
var_dump(mb_substitute_character());
var_dump(bin2hex(mb_convert_encoding("\xe2\x99\xa0\xe3\x81\x82", "CP932", "UTF-8")));
var_dump(mb_substitute_character('BAD_NAME'));
try {
var_dump(mb_substitute_character('BAD_NAME'));
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
?>
--EXPECTF--
--EXPECT--
bool(true)
int(12356)
string(8) "82a282a0"
@@ -38,6 +42,4 @@ string(4) "82a0"
bool(true)
string(6) "entity"
string(20) "262378323636303b82a0"
Warning: mb_substitute_character(): Unknown character in %s on line %d
bool(false)
mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
@@ -3,7 +3,6 @@ Test mb_substitute_character() function : basic functionality
--SKIPIF--
<?php
extension_loaded('mbstring') or die('skip');
function_exists('mb_substitute_character') or die("skip mb_substitute_character() is not available in this build");
?>
--FILE--
<?php
@@ -22,12 +21,19 @@ var_dump( mb_substitute_character(66) );
var_dump( mb_substitute_character() );
var_dump( mb_substitute_character(1234) );
var_dump( mb_substitute_character() );
var_dump( mb_substitute_character("none") );
var_dump( mb_substitute_character('none') );
var_dump( mb_substitute_character() );
var_dump( mb_substitute_character("b") );
// Check string case insensitivity
var_dump( mb_substitute_character('LoNg') );
var_dump( mb_substitute_character() );
try {
var_dump( mb_substitute_character("b") );
} catch (\ValueError $e) {
echo $e->getMessage() . \PHP_EOL;
}
?>
--EXPECTF--
--EXPECT--
*** Testing mb_substitute_character() : basic functionality ***
int(63)
bool(true)
@@ -36,6 +42,6 @@ bool(true)
int(1234)
bool(true)
string(4) "none"
Warning: mb_substitute_character(): Unknown character in %s on line %d
bool(false)
bool(true)
string(4) "long"
mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
@@ -3,7 +3,6 @@ Test mb_substitute_character() function : variation unmappable out char for conv
--SKIPIF--
<?php
extension_loaded('mbstring') or die('skip');
function_exists('mb_substitute_character') or die("skip mb_substitute_character() is not available in this build");
?>
--FILE--
<?php
@@ -0,0 +1,175 @@
--TEST--
Test mb_substitute_character() function : usage variation
--SKIPIF--
<?php
extension_loaded('mbstring') or die('skip');
?>
--FILE--
<?php
declare(strict_types=1);
/* Prototype : string|int|true mb_substitute_character([string|int|null substitute_character])
* Description: Sets the current substitute_character or returns the current substitute_character
* Source code: ext/mbstring/mbstring.c
* Alias to functions:
*/
echo "*** Testing mb_substitute_character(): various types in strict typing mode ***\n";
// Initialise function arguments not being substituted (if any)
//get an unset variable
$unset_var = 10;
unset ($unset_var);
// define some classes
class classWithToString
{
public function __toString() {
return "Class A object";
}
}
class classWithoutToString
{
}
// heredoc string
$heredoc = <<<EOT
hello world
EOT;
// get a resource variable
$fp = fopen(__FILE__, "r");
// add arrays
$index_array = array (1, 2, 3);
$assoc_array = array ('one' => 1, 'two' => 2);
//array of values to iterate over
$inputs = array(
// int data
'int 0' => 0,
'int 1' => 1,
'int 12345' => 12345,
'int -12345' => -2345,
// float data
'float 10.5' => 10.5,
'float -10.5' => -10.5,
'float 10.0e19' => 10.0e19, // Cannot be represented as int
'float -10.0e19' => -10.0e19, // Cannot be represented as int
'float .5' => .5,
// array data
'empty array' => array(),
'int indexed array' => $index_array,
'associative array' => $assoc_array,
'nested arrays' => array('foo', $index_array, $assoc_array),
// null data
'uppercase NULL' => NULL,
'lowercase null' => null,
// boolean data
'lowercase true' => true,
'lowercase false' =>false,
'uppercase TRUE' =>TRUE,
'uppercase FALSE' =>FALSE,
// empty data
'empty string DQ' => "",
'empty string SQ' => '',
// string data
'string DQ' => "string",
'string SQ' => 'string',
'mixed case string' => "sTrInG",
'heredoc' => $heredoc,
// object data
'instance of classWithToString' => new classWithToString(),
'instance of classWithoutToString' => new classWithoutToString(),
// undefined data
'undefined var' => @$undefined_var,
// unset data
'unset var' => @$unset_var,
);
// loop through each element of the array for substchar
mb_internal_encoding('utf-8');
foreach($inputs as $key =>$value) {
echo "--$key--\n";
try {
var_dump( mb_substitute_character($value) );
} catch (\ValueError|\TypeError $e) {
echo get_class($e) . ': ' . $e->getMessage() . \PHP_EOL;
}
}
fclose($fp);
?>
--EXPECT--
*** Testing mb_substitute_character(): various types in strict typing mode ***
--int 0--
bool(true)
--int 1--
bool(true)
--int 12345--
bool(true)
--int -12345--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint
--float 10.5--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given
--float -10.5--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given
--float 10.0e19--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given
--float -10.0e19--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given
--float .5--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, float given
--empty array--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--int indexed array--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--associative array--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--nested arrays--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--uppercase NULL--
int(12345)
--lowercase null--
int(12345)
--lowercase true--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given
--lowercase false--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given
--uppercase TRUE--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given
--uppercase FALSE--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, bool given
--empty string DQ--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--empty string SQ--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--string DQ--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--string SQ--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--mixed case string--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--heredoc--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--instance of classWithToString--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, object given
--instance of classWithoutToString--
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, object given
--undefined var--
int(12345)
--unset var--
int(12345)
@@ -3,26 +3,16 @@ Test mb_substitute_character() function : usage variation
--SKIPIF--
<?php
extension_loaded('mbstring') or die('skip');
function_exists('mb_substitute_character') or die("skip mb_substitute_character() is not available in this build");
?>
--FILE--
<?php
/* Prototype : mixed mb_substitute_character([mixed substchar])
/* Prototype : string|int|true mb_substitute_character([string|int|null substitute_character])
* Description: Sets the current substitute_character or returns the current substitute_character
* Source code: ext/mbstring/mbstring.c
* Alias to functions:
*/
echo "*** Testing mb_substitute_character() : usage variation ***\n";
// Define error handler
function test_error_handler($err_no, $err_msg, $filename, $linenum) {
if (error_reporting() & $err_no) {
// report non-silenced errors
echo "Error: $err_no - $err_msg, $filename($linenum)\n";
}
}
set_error_handler('test_error_handler');
echo "*** Testing mb_substitute_character(): various types in weak typing mode ***\n";
// Initialise function arguments not being substituted (if any)
@@ -66,8 +56,8 @@ $inputs = array(
// float data
'float 10.5' => 10.5,
'float -10.5' => -10.5,
'float 12.3456789000e10' => 12.3456789000e10,
'float -12.3456789000e10' => -12.3456789000e10,
'float 10.0e19' => 10.0e19, // Cannot be represented as int
'float -10.0e19' => -10.0e19, // Cannot be represented as int
'float .5' => .5,
// array data
@@ -111,118 +101,74 @@ $inputs = array(
mb_internal_encoding('utf-8');
foreach($inputs as $key =>$value) {
echo "\n--$key--\n";
var_dump( mb_substitute_character($value) );
};
echo "--$key--\n";
try {
var_dump( mb_substitute_character($value) );
} catch (\ValueError|\TypeError $e) {
echo get_class($e) . ': ' . $e->getMessage() . \PHP_EOL;
}
}
fclose($fp);
?>
--EXPECTF--
*** Testing mb_substitute_character() : usage variation ***
--EXPECT--
*** Testing mb_substitute_character(): various types in weak typing mode ***
--int 0--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
bool(true)
--int 1--
bool(true)
--int 12345--
bool(true)
--int -12345--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint
--float 10.5--
bool(true)
--float -10.5--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--float 12.3456789000e10--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
--float -12.3456789000e10--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) is not a valid codepoint
--float 10.0e19--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--float -10.0e19--
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--float .5--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
bool(true)
--empty array--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--int indexed array--
bool(true)
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--associative array--
bool(true)
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--nested arrays--
bool(true)
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, array given
--uppercase NULL--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
int(0)
--lowercase null--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
int(0)
--lowercase true--
bool(true)
--lowercase false--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
bool(true)
--uppercase TRUE--
bool(true)
--uppercase FALSE--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
bool(true)
--empty string DQ--
bool(true)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--empty string SQ--
bool(true)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--string DQ--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--string SQ--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--mixed case string--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--heredoc--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--instance of classWithToString--
Error: 8 - Object of class classWithToString could not be converted to int, %s(%d)
bool(true)
ValueError: mb_substitute_character(): Argument #1 ($substitute_character) must be 'none', 'long', 'entity' or a valid codepoint
--instance of classWithoutToString--
Error: 8 - Object of class classWithoutToString could not be converted to int, %s(%d)
bool(true)
TypeError: mb_substitute_character(): Argument #1 ($substitute_character) must be of type string|int|null, object given
--undefined var--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
int(0)
--unset var--
Error: 2 - mb_substitute_character(): Unknown character, %s(%d)
bool(false)
int(0)