mirror of
https://github.com/php/php-src.git
synced 2026-03-24 00:02:20 +01:00
Improve performance of mbfl_name2encoding() by using perfect hashing (#12707)
mbfl_name2encoding() uses a linear loop through the encodings, comparing the name against each one in turn, which is very slow. In the benchmark [1], just looking up the name takes about 50% of the run time. With perfect hashing we no longer have to loop over the list, and the number of string comparisons is reduced to a single one. The perfect hash table was generated with GNU gperf, then manually adapted to fit mbstring and to reduce its data-cache footprint. [1] https://github.com/php/php-src/issues/12684#issuecomment-1813799924
This commit is contained in:
@@ -227,3 +227,5 @@ PHP 8.4 UPGRADE NOTES
|
||||
* mb_strcut() is much faster now for UTF-8 and UTF-16 strings.
|
||||
|
||||
* get_browser() is much faster now, up to 1.5x - 2.5x for some test cases.
|
||||
|
||||
* Looking up mbstring encoding names is much faster now.
|
||||
|
||||
@@ -144,13 +144,212 @@ static const mbfl_encoding *mbfl_encoding_ptr_list[] = {
|
||||
NULL
|
||||
};
|
||||
|
||||
/* The following perfect hashing table was amended from gperf, and hashing code was generated using gperf.
|
||||
* The table was amended to refer to the table above such that it is lighter for the data cache.
|
||||
* Command used: gperf encodings.txt --readonly-tables --null-strings --ignore-case
|
||||
* The encodings.txt contains all the contents of the name fields of the mbfl_encoding_ptr_list table. */
|
||||
|
||||
/* Lookup table indexed by the key returned from mbfl_name2encoding_perfect_hash()
 * (valid keys are 0..186). Each entry is an offset into mbfl_encoding_ptr_list;
 * a negative entry (-1) means no encoding is stored for that key.
 * Offsets are kept as int8_t rather than pointers to keep the table small.
 * The leading comment on each row is the key of the row's first entry. */
static const int8_t mbfl_encoding_ptr_list_after_hashing[187] = {
	/*   0 */ -1, -1, -1, 65, 23,  9, -1, 60, 36, -1,
	/*  10 */ -1, 58, 42, -1, -1, 18, 27, 77, 26, 40,
	/*  20 */ 72, 12, 10,  2, 31, -1, -1, 75, 74, 33,
	/*  30 */ 45, -1, 67, 13, -1, 51, 53, 11,  1, -1,
	/*  40 */ 48, 56, -1, 38, 20, 46, 54, -1, 14, 24,
	/*  50 */ 44, 39, 43, -1, 30, 49, 57, 76, -1, -1,
	/*  60 */ 68, 73,  7, 16, -1, 35, 66, -1, -1, -1,
	/*  70 */ 47, 55, -1, -1, -1, 63, 15,  8, 17, -1,
	/*  80 */ 21, 70, -1, 29,  5,  6, 61, -1, -1, 71,
	/*  90 */ 52,  3, 37, -1, -1, 28, -1, -1, -1, 32,
	/* 100 */ 50, 34, -1, -1, -1, 62, -1, -1, -1, -1,
	/* 110 */ -1, -1, -1, -1, -1, 59,  0, -1, -1, -1,
	/* 120 */ -1, 22, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 130 */ -1, 25, 41, -1, -1, -1, -1, -1, -1, -1,
	/* 140 */ -1, -1, -1, -1, -1, 19, -1, -1, -1,  4,
	/* 150 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 160 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 170 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
	/* 180 */ -1, 69, -1, -1, -1, -1, 64
};
|
||||
|
||||
static unsigned int mbfl_name2encoding_perfect_hash(const char *str, size_t len)
|
||||
{
|
||||
static const unsigned char asso_values[] =
|
||||
{
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 0, 187, 187, 5, 20,
|
||||
0, 15, 40, 10, 25, 70, 5, 60, 187, 187,
|
||||
187, 187, 187, 187, 187, 75, 5, 0, 20, 5,
|
||||
0, 75, 5, 0, 40, 75, 20, 0, 0, 0,
|
||||
35, 45, 50, 0, 75, 0, 187, 0, 187, 187,
|
||||
0, 187, 187, 187, 187, 187, 187, 75, 5, 0,
|
||||
20, 5, 0, 75, 5, 0, 40, 75, 20, 0,
|
||||
0, 0, 35, 45, 50, 0, 75, 0, 187, 0,
|
||||
187, 187, 0, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187, 187, 187, 187, 187,
|
||||
187, 187, 187, 187, 187, 187
|
||||
};
|
||||
unsigned int hval = len;
|
||||
|
||||
switch (hval)
|
||||
{
|
||||
default:
|
||||
hval += asso_values[(unsigned char)str[6]];
|
||||
ZEND_FALLTHROUGH;
|
||||
case 6:
|
||||
hval += asso_values[(unsigned char)str[5]];
|
||||
ZEND_FALLTHROUGH;
|
||||
case 5:
|
||||
hval += asso_values[(unsigned char)str[4]];
|
||||
ZEND_FALLTHROUGH;
|
||||
case 4:
|
||||
case 3:
|
||||
hval += asso_values[(unsigned char)str[2]];
|
||||
ZEND_FALLTHROUGH;
|
||||
case 2:
|
||||
case 1:
|
||||
hval += asso_values[(unsigned char)str[0]];
|
||||
break;
|
||||
}
|
||||
return hval + asso_values[(unsigned char)str[len - 1]];
|
||||
}
|
||||
|
||||
#define NAME_HASH_MIN_NAME_LENGTH 2
|
||||
#define NAME_HASH_MAX_NAME_LENGTH 23
|
||||
|
||||
const mbfl_encoding *mbfl_name2encoding(const char *name)
|
||||
{
|
||||
const mbfl_encoding **encoding;
|
||||
const mbfl_encoding *const *encoding;
|
||||
|
||||
/* Sanity check perfect hash for name.
|
||||
* Never enable this in production, this is only a development-time sanity check! */
|
||||
#if ZEND_DEBUG && 0
|
||||
for (encoding = mbfl_encoding_ptr_list; *encoding; encoding++) {
|
||||
if (strcasecmp((*encoding)->name, name) == 0) {
|
||||
return *encoding;
|
||||
size_t name_length = strlen((*encoding)->name);
|
||||
if (!(name_length <= NAME_HASH_MAX_NAME_LENGTH && name_length >= NAME_HASH_MIN_NAME_LENGTH)) {
|
||||
fprintf(stderr, "name length is not satisfying bound check: %zu %s\n", name_length, (*encoding)->name);
|
||||
abort();
|
||||
}
|
||||
unsigned int key = mbfl_name2encoding_perfect_hash((*encoding)->name, name_length);
|
||||
if (mbfl_encoding_ptr_list[mbfl_encoding_ptr_list_after_hashing[key]] != *encoding) {
|
||||
fprintf(stderr, "mbfl_name2encoding_perfect_hash: key %u %s mismatch\n", key, (*encoding)->name);
|
||||
abort();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
/* Use perfect hash lookup for name */
|
||||
size_t name_len = strlen(name);
|
||||
if (name_len <= NAME_HASH_MAX_NAME_LENGTH && name_len >= NAME_HASH_MIN_NAME_LENGTH) {
|
||||
unsigned int key = mbfl_name2encoding_perfect_hash(name, name_len);
|
||||
if (key <= 186) {
|
||||
int8_t offset = mbfl_encoding_ptr_list_after_hashing[key];
|
||||
if (offset >= 0) {
|
||||
encoding = mbfl_encoding_ptr_list + offset;
|
||||
if (strcasecmp((*encoding)->name, name) == 0) {
|
||||
return *encoding;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user