From 6c0578d31c341b505983fc5b63cab04c540fa86a Mon Sep 17 00:00:00 2001 From: Niels Dossche <7771979+nielsdos@users.noreply.github.com> Date: Mon, 21 Apr 2025 13:32:38 +0200 Subject: [PATCH] Improve performance of urldecode() and rawurldecode() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are two hot spots on my machines: 1. We copy the string because the internal PHP API works in-place. 2. The conversion of hex characters is slow due to going through the C locale handling. This patch resolves the first hot spot by introducing 2 new internal APIs that avoid the redundant copy and allocate an empty string upfront. The second hot spot is resolved by having a specialised htoi handler. For the following benchmark: ```php $encoded = "Hello%20World%21+This%20is%20a%20test%3A%20%40%23%24%25%5E%26*%28%29"; for ($i=0;$i<2000000;$i++) { rawurldecode($encoded); urldecode($encoded); } ``` On an i7-4790: ``` Benchmark 1: ./sapi/cli/php x.php Time (mean ± σ): 364.8 ms ± 3.7 ms [User: 359.9 ms, System: 3.3 ms] Range (min … max): 359.9 ms … 372.0 ms 10 runs Benchmark 2: ./sapi/cli/php_old x.php Time (mean ± σ): 565.5 ms ± 4.9 ms [User: 561.8 ms, System: 2.5 ms] Range (min … max): 560.7 ms … 578.2 ms 10 runs Summary ./sapi/cli/php x.php ran 1.55 ± 0.02 times faster than ./sapi/cli/php_old x.php ``` On an i7-1185G7: ``` Benchmark 1: ./sapi/cli/php x.php Time (mean ± σ): 708.8 ms ± 6.1 ms [User: 701.4 ms, System: 6.3 ms] Range (min … max): 701.9 ms … 722.3 ms 10 runs Benchmark 2: ./sapi/cli/php_old x.php Time (mean ± σ): 1.311 s ± 0.019 s [User: 1.300 s, System: 0.008 s] Range (min … max): 1.281 s … 1.348 s 10 runs Summary ./sapi/cli/php x.php ran 1.85 ± 0.03 times faster than ./sapi/cli/php_old x.php ``` Closes GH-18378. 
--- UPGRADING | 1 + UPGRADING.INTERNALS | 4 +++ ext/standard/url.c | 69 +++++++++++++++++++++++++++------------------ ext/standard/url.h | 2 ++ 4 files changed, 48 insertions(+), 28 deletions(-) diff --git a/UPGRADING b/UPGRADING index 40645a28ac6..68f7d20a7f3 100644 --- a/UPGRADING +++ b/UPGRADING @@ -476,6 +476,7 @@ PHP 8.5 UPGRADE NOTES - Standard: . Improved performance of array functions with callbacks (array_find, array_filter, array_map, usort, ...). + . Improved performance of urldecode() and rawurldecode(). - XMLReader: . Improved property access performance. diff --git a/UPGRADING.INTERNALS b/UPGRADING.INTERNALS index 56c75351584..7c7f093e50c 100644 --- a/UPGRADING.INTERNALS +++ b/UPGRADING.INTERNALS @@ -61,6 +61,10 @@ PHP 8.5 INTERNALS UPGRADE NOTES is still valid. This is useful when a GC cycle is collected and the database object can be destroyed prior to destroying the statement. +- ext/standard + . Added php_url_decode_ex() and php_raw_url_decode_ex() that unlike their + non-ex counterparts do not work in-place. + ======================== 4. OpCode changes ======================== diff --git a/ext/standard/url.c b/ext/standard/url.c index 3d704b0140c..da2ddea0673 100644 --- a/ext/standard/url.c +++ b/ext/standard/url.c @@ -411,21 +411,24 @@ done: } /* }}} */ +/* https://stackoverflow.com/questions/34365746/whats-the-fastest-way-to-convert-hex-to-integer-in-c */ +static unsigned int php_htoi_single(unsigned char x) +{ + ZEND_ASSERT((x >= 'a' && x <= 'f') || (x >= 'A' && x <= 'F') || (x >= '0' && x <= '9')); + return 9 * (x >> 6) + (x & 0xf); +} + /* {{{ php_htoi */ -static int php_htoi(char *s) +static int php_htoi(const char *s) { int value; - int c; + unsigned char c; c = ((unsigned char *)s)[0]; - if (isupper(c)) - c = tolower(c); - value = (c >= '0' && c <= '9' ? c - '0' : c - 'a' + 10) * 16; + value = php_htoi_single(c) * 16; c = ((unsigned char *)s)[1]; - if (isupper(c)) - c = tolower(c); - value += c >= '0' && c <= '9' ? 
c - '0' : c - 'a' + 10; + value += php_htoi_single(c); return (value); } @@ -572,28 +575,27 @@ PHP_FUNCTION(urldecode) Z_PARAM_STR(in_str) ZEND_PARSE_PARAMETERS_END(); - out_str = zend_string_init(ZSTR_VAL(in_str), ZSTR_LEN(in_str), 0); - ZSTR_LEN(out_str) = php_url_decode(ZSTR_VAL(out_str), ZSTR_LEN(out_str)); + out_str = zend_string_alloc(ZSTR_LEN(in_str), false); + ZSTR_LEN(out_str) = php_url_decode_ex(ZSTR_VAL(out_str), ZSTR_VAL(in_str), ZSTR_LEN(in_str)); RETURN_NEW_STR(out_str); } /* }}} */ -/* {{{ php_url_decode */ -PHPAPI size_t php_url_decode(char *str, size_t len) +PHPAPI size_t php_url_decode_ex(char *dest, const char *src, size_t src_len) { - char *dest = str; - char *data = str; + char *dest_start = dest; + const char *data = src; - while (len--) { + while (src_len--) { if (*data == '+') { *dest = ' '; } - else if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1)) + else if (*data == '%' && src_len >= 2 && isxdigit((int) *(data + 1)) && isxdigit((int) *(data + 2))) { *dest = (char) php_htoi(data + 1); data += 2; - len -= 2; + src_len -= 2; } else { *dest = *data; } @@ -601,7 +603,13 @@ PHPAPI size_t php_url_decode(char *str, size_t len) dest++; } *dest = '\0'; - return dest - str; + return dest - dest_start; +} + +/* {{{ php_url_decode */ +PHPAPI size_t php_url_decode(char *str, size_t len) +{ + return php_url_decode_ex(str, str, len); } /* }}} */ @@ -634,25 +642,24 @@ PHP_FUNCTION(rawurldecode) Z_PARAM_STR(in_str) ZEND_PARSE_PARAMETERS_END(); - out_str = zend_string_init(ZSTR_VAL(in_str), ZSTR_LEN(in_str), 0); - ZSTR_LEN(out_str) = php_raw_url_decode(ZSTR_VAL(out_str), ZSTR_LEN(out_str)); + out_str = zend_string_alloc(ZSTR_LEN(in_str), false); + ZSTR_LEN(out_str) = php_raw_url_decode_ex(ZSTR_VAL(out_str), ZSTR_VAL(in_str), ZSTR_LEN(in_str)); RETURN_NEW_STR(out_str); } /* }}} */ -/* {{{ php_raw_url_decode */ -PHPAPI size_t php_raw_url_decode(char *str, size_t len) +PHPAPI size_t php_raw_url_decode_ex(char *dest, const char *src, size_t src_len) { 
- char *dest = str; - char *data = str; + char *dest_start = dest; + const char *data = src; - while (len--) { - if (*data == '%' && len >= 2 && isxdigit((int) *(data + 1)) + while (src_len--) { + if (*data == '%' && src_len >= 2 && isxdigit((int) *(data + 1)) && isxdigit((int) *(data + 2))) { *dest = (char) php_htoi(data + 1); data += 2; - len -= 2; + src_len -= 2; } else { *dest = *data; } @@ -660,7 +667,13 @@ PHPAPI size_t php_raw_url_decode(char *str, size_t len) dest++; } *dest = '\0'; - return dest - str; + return dest - dest_start; +} + +/* {{{ php_raw_url_decode */ +PHPAPI size_t php_raw_url_decode(char *str, size_t len) +{ + return php_raw_url_decode_ex(str, str, len); } /* }}} */ diff --git a/ext/standard/url.h b/ext/standard/url.h index 4126ee6c6db..5c531c0086a 100644 --- a/ext/standard/url.h +++ b/ext/standard/url.h @@ -33,7 +33,9 @@ PHPAPI php_url *php_url_parse(char const *str); PHPAPI php_url *php_url_parse_ex(char const *str, size_t length); PHPAPI php_url *php_url_parse_ex2(char const *str, size_t length, bool *has_port); PHPAPI size_t php_url_decode(char *str, size_t len); /* return value: length of decoded string */ +PHPAPI size_t php_url_decode_ex(char *dest, const char *src, size_t src_len); PHPAPI size_t php_raw_url_decode(char *str, size_t len); /* return value: length of decoded string */ +PHPAPI size_t php_raw_url_decode_ex(char *dest, const char *src, size_t src_len); PHPAPI zend_string *php_url_encode(char const *s, size_t len); PHPAPI zend_string *php_raw_url_encode(char const *s, size_t len);