1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Optimizations for mb_trim (#12803)

* Fast path for when there is nothing to trim in mb_trim

* Make mb_trim decide between linear search vs hash table lookup

Using empirical experiments I noticed that on my i7-4790 the hash table
approach becomes faster once we have more than 4 code points in the trim
characters, when evaluated on the worst case.

This patch changes the logic so that a hash table is used for a large
number of trim characters, and linear search when the number of trim
characters is <= 4.
This commit is contained in:
Niels Dossche
2023-11-28 18:49:36 +00:00
committed by GitHub
parent fc1b467d0b
commit 803cd824e5
2 changed files with 49 additions and 10 deletions

View File

@@ -2951,12 +2951,21 @@ typedef enum {
MB_BOTH_TRIM = 3
} mb_trim_mode;
static zend_always_inline bool is_trim_wchar(uint32_t w, const HashTable *ht)
static bool is_trim_wchar(uint32_t w, const HashTable *ht, const uint32_t *default_chars, size_t default_chars_length)
{
return zend_hash_index_exists(ht, w);
if (ht) {
return zend_hash_index_exists(ht, w);
} else {
for (size_t i = 0; i < default_chars_length; i++) {
if (w == default_chars[i]) {
return true;
}
}
return false;
}
}
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, mb_trim_mode mode, const mbfl_encoding *enc)
static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht, const uint32_t *default_chars, size_t default_chars_length, mb_trim_mode mode, const mbfl_encoding *enc)
{
unsigned char *in = (unsigned char*)ZSTR_VAL(str);
uint32_t wchar_buf[128];
@@ -2974,7 +2983,7 @@ static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht,
for (size_t i = 0; i < out_len; i++) {
uint32_t w = wchar_buf[i];
if (is_trim_wchar(w, what_ht)) {
if (is_trim_wchar(w, what_ht, default_chars, default_chars_length)) {
if (mode & MB_LTRIM) {
left += 1;
}
@@ -2990,6 +2999,9 @@ static zend_string* trim_each_wchar(zend_string *str, const HashTable *what_ht,
}
}
if (left == 0 && right == 0) {
return zend_string_copy(str);
}
return mb_get_substr(str, left, total_len - (right + left), enc);
}
@@ -3012,7 +3024,7 @@ static zend_string* mb_trim_default_chars(zend_string *str, mb_trim_mode mode, c
for (size_t i = 0; i < trim_default_chars_length; i++) {
zend_hash_index_add_new(&what_ht, trim_default_chars[i], &val);
}
zend_string* retval = trim_each_wchar(str, &what_ht, mode, enc);
zend_string* retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
zend_hash_destroy(&what_ht);
return retval;
@@ -3027,18 +3039,32 @@ static zend_string* mb_trim_what_chars(zend_string *str, zend_string *what, mb_t
size_t what_len = ZSTR_LEN(what);
HashTable what_ht;
zval val;
ZVAL_TRUE(&val);
zend_hash_init(&what_ht, what_len, NULL, NULL, false);
bool hash_initialized = false;
while (what_len) {
what_out_len = enc->to_wchar(&what_in, &what_len, what_wchar_buf, 128, &state);
ZEND_ASSERT(what_out_len <= 128);
for (size_t i = 0; i < what_out_len; i++) {
zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
if (what_out_len <= 4 && !hash_initialized) {
return trim_each_wchar(str, NULL, what_wchar_buf, what_out_len, mode, enc);
} else {
if (!hash_initialized) {
hash_initialized = true;
ZVAL_TRUE(&val);
zend_hash_init(&what_ht, what_len, NULL, NULL, false);
}
for (size_t i = 0; i < what_out_len; i++) {
zend_hash_index_add(&what_ht, what_wchar_buf[i], &val);
}
}
}
zend_string *retval = trim_each_wchar(str, &what_ht, mode, enc);
if (UNEXPECTED(!hash_initialized)) {
/* This is only possible if what is empty */
return zend_string_copy(str);
}
zend_string *retval = trim_each_wchar(str, &what_ht, NULL, 0, mode, enc);
zend_hash_destroy(&what_ht);
return retval;

View File

@@ -40,6 +40,15 @@ var_dump(mb_trim(str_repeat(" ", 129)));
var_dump(mb_trim(str_repeat(" ", 129) . "a"));
var_dump(mb_rtrim(str_repeat(" ", 129) . "a"));
echo "== Very long trim characters ==\n";
$trim_chars = "";
for ($i = 1024; $i < 2048; $i++) {
$trim_chars .= mb_chr($i);
}
var_dump(mb_trim($trim_chars . "hello" . $trim_chars, $trim_chars));
var_dump(strlen(mb_ltrim($trim_chars . "hello" . $trim_chars, $trim_chars)));
var_dump(strlen(mb_rtrim($trim_chars . "hello" . $trim_chars, $trim_chars)));
echo "== mb_ltrim ==\n";
var_dump(mb_ltrim("あああああああああああああああああああああああああああああああああいああああ", "あ"));
echo "== mb_rtrim ==\n";
@@ -103,6 +112,10 @@ string(26) " あいうおえお  a"
string(0) ""
string(1) "a"
string(388) "                                                                                                                                 a"
== Very long trim characters ==
string(5) "hello"
int(2053)
int(2053)
== mb_ltrim ==
string(15) "いああああ"
== mb_rtrim ==