1
0
mirror of https://github.com/php/php-src.git synced 2026-03-24 00:02:20 +01:00

Refactor some ext/pcre code for performance (#12447)

* Always inline populate_match_value and fix argument type

The call overhead of this function is quite large.

* Use _new variant of zend_hash in some places to avoid additional check

* Move allocation of match_sets down to simplify and reduce code size

* Move pcre2_get_ovector_pointer out of the loop

This is allocated together with the match data and stays loop invariant:
the pointer is always the same (the values not however).

* Mark error condition as cold block

* Simplify condition: subpats is already checked

* Move array size preallocation to use allocate the up-to-date size

* Simplify condition

* Rework internal functions to avoid repeated unwrapping

* Remember Z_ARRVAL_P(return_value)

The lookup is loop invariant.

* Mark some pointers as const
This commit is contained in:
Niels Dossche
2023-10-16 18:39:29 +01:00
committed by GitHub
parent 191966be05
commit 1e2e2f391c

View File

@@ -948,9 +948,9 @@ static zend_always_inline void populate_match_value_str(
ZVAL_STRINGL_FAST(val, subject + start_offset, end_offset - start_offset);
}
static inline void populate_match_value(
static zend_always_inline void populate_match_value(
zval *val, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
uint32_t unmatched_as_null) {
bool unmatched_as_null) {
if (PCRE2_UNSET == start_offset) {
if (unmatched_as_null) {
ZVAL_NULL(val);
@@ -963,13 +963,13 @@ static inline void populate_match_value(
}
static inline void add_named(
zval *subpats, zend_string *name, zval *val, bool unmatched) {
HashTable *const subpats, zend_string *name, zval *val, bool unmatched) {
/* If the DUPNAMES option is used, multiple subpatterns might have the same name.
* In this case we want to preserve the one that actually has a value. */
if (!unmatched) {
zend_hash_update(Z_ARRVAL_P(subpats), name, val);
zend_hash_update(subpats, name, val);
} else {
if (!zend_hash_add(Z_ARRVAL_P(subpats), name, val)) {
if (!zend_hash_add(subpats, name, val)) {
return;
}
}
@@ -978,7 +978,7 @@ static inline void add_named(
/* {{{ add_offset_pair */
static inline void add_offset_pair(
zval *result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
HashTable *const result, const char *subject, PCRE2_SIZE start_offset, PCRE2_SIZE end_offset,
zend_string *name, uint32_t unmatched_as_null)
{
zval match_pair;
@@ -1006,7 +1006,7 @@ static inline void add_offset_pair(
if (name) {
add_named(result, name, &match_pair, start_offset == PCRE2_UNSET);
}
zend_hash_next_index_insert(Z_ARRVAL_P(result), &match_pair);
zend_hash_next_index_insert_new(result, &match_pair);
}
/* }}} */
@@ -1017,16 +1017,17 @@ static void populate_subpat_array(
bool unmatched_as_null = (flags & PREG_UNMATCHED_AS_NULL) != 0;
zval val;
int i;
HashTable *subpats_ht = Z_ARRVAL_P(subpats);
if (subpat_names) {
if (offset_capture) {
for (i = 0; i < count; i++) {
add_offset_pair(
subpats, subject, offsets[2*i], offsets[2*i+1],
subpats_ht, subject, offsets[2*i], offsets[2*i+1],
subpat_names[i], unmatched_as_null);
}
if (unmatched_as_null) {
for (i = count; i < num_subpats; i++) {
add_offset_pair(subpats, NULL, PCRE2_UNSET, PCRE2_UNSET, subpat_names[i], 1);
add_offset_pair(subpats_ht, NULL, PCRE2_UNSET, PCRE2_UNSET, subpat_names[i], 1);
}
}
} else {
@@ -1034,17 +1035,17 @@ static void populate_subpat_array(
populate_match_value(
&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
if (subpat_names[i]) {
add_named(subpats, subpat_names[i], &val, offsets[2*i] == PCRE2_UNSET);
add_named(subpats_ht, subpat_names[i], &val, offsets[2*i] == PCRE2_UNSET);
}
zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &val);
zend_hash_next_index_insert_new(subpats_ht, &val);
}
if (unmatched_as_null) {
for (i = count; i < num_subpats; i++) {
ZVAL_NULL(&val);
if (subpat_names[i]) {
zend_hash_add(Z_ARRVAL_P(subpats), subpat_names[i], &val);
zend_hash_add(subpats_ht, subpat_names[i], &val);
}
zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &val);
zend_hash_next_index_insert_new(subpats_ht, &val);
}
}
}
@@ -1052,18 +1053,18 @@ static void populate_subpat_array(
if (offset_capture) {
for (i = 0; i < count; i++) {
add_offset_pair(
subpats, subject, offsets[2*i], offsets[2*i+1], NULL, unmatched_as_null);
subpats_ht, subject, offsets[2*i], offsets[2*i+1], NULL, unmatched_as_null);
}
if (unmatched_as_null) {
for (i = count; i < num_subpats; i++) {
add_offset_pair(subpats, NULL, PCRE2_UNSET, PCRE2_UNSET, NULL, 1);
add_offset_pair(subpats_ht, NULL, PCRE2_UNSET, PCRE2_UNSET, NULL, 1);
}
}
} else {
for (i = 0; i < count; i++) {
populate_match_value(
&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &val);
zend_hash_next_index_insert_new(subpats_ht, &val);
}
if (unmatched_as_null) {
for (i = count; i < num_subpats; i++) {
@@ -1129,13 +1130,12 @@ static zend_always_inline bool is_known_valid_utf8(
PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
zval *subpats, int global, int use_flags, zend_long flags, zend_off_t start_offset)
{
zval result_set, /* Holds a set of subpatterns after
zval result_set; /* Holds a set of subpatterns after
a global match */
*match_sets = NULL; /* An array of sets of matches for each
HashTable **match_sets = NULL; /* An array of sets of matches for each
subpattern after a global match */
uint32_t options; /* Execution options */
int count; /* Count of matched subpatterns */
PCRE2_SIZE *offsets; /* Array of subpattern offsets */
uint32_t num_subpats; /* Number of captured subpatterns */
int matched; /* Has anything matched */
zend_string **subpat_names; /* Array for named subpatterns */
@@ -1215,14 +1215,6 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
}
}
/* Allocate match sets array and initialize the values. */
if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
match_sets = (zval *)safe_emalloc(num_subpats, sizeof(zval), 0);
for (i=0; i<num_subpats; i++) {
array_init(&match_sets[i]);
}
}
matched = 0;
PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
@@ -1235,13 +1227,21 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
if (subpat_names) {
free_subpats_table(subpat_names, num_subpats);
}
if (match_sets) {
efree(match_sets);
}
RETURN_FALSE;
}
}
/* Allocate match sets array and initialize the values. */
if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
match_sets = safe_emalloc(num_subpats, sizeof(HashTable *), 0);
for (i=0; i<num_subpats; i++) {
match_sets[i] = zend_new_array(0);
}
}
/* Array of subpattern offsets */
PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
orig_start_offset = start_offset2;
options =
(pce->compile_options & PCRE2_UTF) && !is_known_valid_utf8(subject_str, orig_start_offset)
@@ -1269,12 +1269,10 @@ PHPAPI void php_pcre_match_impl(pcre_cache_entry *pce, zend_string *subject_str,
matched:
matched++;
offsets = pcre2_get_ovector_pointer(match_data);
/* If subpatterns array has been passed, fill it in with values. */
if (subpats != NULL) {
/* Try to get the list of substrings and display a warning if failed. */
if (offsets[1] < offsets[0]) {
if (UNEXPECTED(offsets[1] < offsets[0])) {
if (subpat_names) {
free_subpats_table(subpat_names, num_subpats);
}
@@ -1284,12 +1282,12 @@ matched:
}
if (global) { /* global pattern matching */
if (subpats && subpats_order == PREG_PATTERN_ORDER) {
if (subpats_order == PREG_PATTERN_ORDER) {
/* For each subpattern, insert it into the appropriate array. */
if (offset_capture) {
for (i = 0; i < count; i++) {
add_offset_pair(
&match_sets[i], subject, offsets[2*i], offsets[2*i+1],
match_sets[i], subject, offsets[2*i], offsets[2*i+1],
NULL, unmatched_as_null);
}
} else {
@@ -1297,7 +1295,7 @@ matched:
zval val;
populate_match_value(
&val, subject, offsets[2*i], offsets[2*i+1], unmatched_as_null);
zend_hash_next_index_insert_new(Z_ARRVAL(match_sets[i]), &val);
zend_hash_next_index_insert_new(match_sets[i], &val);
}
}
mark = pcre2_get_mark(match_data);
@@ -1317,24 +1315,28 @@ matched:
for (; i < num_subpats; i++) {
if (offset_capture) {
add_offset_pair(
&match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
match_sets[i], NULL, PCRE2_UNSET, PCRE2_UNSET,
NULL, unmatched_as_null);
} else if (unmatched_as_null) {
add_next_index_null(&match_sets[i]);
zval tmp;
ZVAL_NULL(&tmp);
zend_hash_next_index_insert_new(match_sets[i], &tmp);
} else {
add_next_index_str(&match_sets[i], ZSTR_EMPTY_ALLOC());
zval tmp;
ZVAL_EMPTY_STRING(&tmp);
zend_hash_next_index_insert_new(match_sets[i], &tmp);
}
}
}
} else {
/* Allocate and populate the result set array */
array_init_size(&result_set, count + (mark ? 1 : 0));
mark = pcre2_get_mark(match_data);
array_init_size(&result_set, count + (mark ? 1 : 0));
populate_subpat_array(
&result_set, subject, offsets, subpat_names,
num_subpats, count, mark, flags);
/* And add it to the output array */
zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &result_set);
zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &result_set);
}
} else { /* single pattern matching */
/* For each subpattern, insert it into the subpatterns array. */
@@ -1408,18 +1410,22 @@ error:
}
/* Add the match sets to the output array and clean up */
if (global && subpats && subpats_order == PREG_PATTERN_ORDER) {
if (match_sets) {
if (subpat_names) {
for (i = 0; i < num_subpats; i++) {
zval wrapper;
ZVAL_ARR(&wrapper, match_sets[i]);
if (subpat_names[i]) {
zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &match_sets[i]);
Z_ADDREF(match_sets[i]);
zend_hash_update(Z_ARRVAL_P(subpats), subpat_names[i], &wrapper);
GC_ADDREF(match_sets[i]);
}
zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
}
} else {
for (i = 0; i < num_subpats; i++) {
zend_hash_next_index_insert(Z_ARRVAL_P(subpats), &match_sets[i]);
zval wrapper;
ZVAL_ARR(&wrapper, match_sets[i]);
zend_hash_next_index_insert_new(Z_ARRVAL_P(subpats), &wrapper);
}
}
efree(match_sets);
@@ -1567,7 +1573,6 @@ PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *su
{
uint32_t options; /* Execution options */
int count; /* Count of matched subpatterns */
PCRE2_SIZE *offsets; /* Array of subpattern offsets */
uint32_t num_subpats; /* Number of captured subpatterns */
size_t new_len; /* Length of needed storage */
size_t alloc_len; /* Actual allocated length */
@@ -1609,6 +1614,9 @@ PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *su
options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
/* Array of subpattern offsets */
PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
/* Execute the regular expression. */
#ifdef HAVE_PCRE_JIT_SUPPORT
if ((pce->preg_options & PREG_JIT) && options) {
@@ -1632,8 +1640,6 @@ PHPAPI zend_string *php_pcre_replace_impl(pcre_cache_entry *pce, zend_string *su
}
matched:
offsets = pcre2_get_ovector_pointer(match_data);
if (UNEXPECTED(offsets[1] < offsets[0])) {
PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
if (result) {
@@ -1807,7 +1813,6 @@ static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_strin
{
uint32_t options; /* Execution options */
int count; /* Count of matched subpatterns */
PCRE2_SIZE *offsets; /* Array of subpattern offsets */
zend_string **subpat_names; /* Array for named subpatterns */
uint32_t num_subpats; /* Number of captured subpatterns */
size_t new_len; /* Length of needed storage */
@@ -1865,6 +1870,9 @@ static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_strin
options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
/* Array of subpattern offsets */
PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
/* Execute the regular expression. */
#ifdef HAVE_PCRE_JIT_SUPPORT
if ((pce->preg_options & PREG_JIT) && options) {
@@ -1886,8 +1894,6 @@ static zend_string *php_pcre_replace_func_impl(pcre_cache_entry *pce, zend_strin
}
matched:
offsets = pcre2_get_ovector_pointer(match_data);
if (UNEXPECTED(offsets[1] < offsets[0])) {
PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
if (result) {
@@ -2203,6 +2209,7 @@ static size_t preg_replace_func_impl(zval *return_value,
ZEND_ASSERT(subject_ht != NULL);
array_init_size(return_value, zend_hash_num_elements(subject_ht));
HashTable *return_value_ht = Z_ARRVAL_P(return_value);
/* For each subject entry, convert it to string, then perform replacement
and add the result to the return_value array. */
@@ -2216,9 +2223,9 @@ static size_t preg_replace_func_impl(zval *return_value,
/* Add to return array */
ZVAL_STR(&zv, result);
if (string_key) {
zend_hash_add_new(Z_ARRVAL_P(return_value), string_key, &zv);
zend_hash_add_new(return_value_ht, string_key, &zv);
} else {
zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, &zv);
zend_hash_index_add_new(return_value_ht, num_key, &zv);
}
}
zend_tmp_string_release(tmp_subject_entry_str);
@@ -2283,6 +2290,7 @@ static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
ZEND_ASSERT(subject_ht != NULL);
array_init_size(return_value, zend_hash_num_elements(subject_ht));
HashTable *return_value_ht = Z_ARRVAL_P(return_value);
/* For each subject entry, convert it to string, then perform replacement
and add the result to the return_value array. */
@@ -2298,9 +2306,9 @@ static void preg_replace_common(INTERNAL_FUNCTION_PARAMETERS, bool is_filter)
/* Add to return array */
ZVAL_STR(&zv, result);
if (string_key) {
zend_hash_add_new(Z_ARRVAL_P(return_value), string_key, &zv);
zend_hash_add_new(return_value_ht, string_key, &zv);
} else {
zend_hash_index_add_new(Z_ARRVAL_P(return_value), num_key, &zv);
zend_hash_index_add_new(return_value_ht, num_key, &zv);
}
} else {
zend_string_release_ex(result, 0);
@@ -2487,7 +2495,6 @@ PHP_FUNCTION(preg_split)
PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str, zval *return_value,
zend_long limit_val, zend_long flags)
{
PCRE2_SIZE *offsets; /* Array of subpattern offsets */
uint32_t options; /* Execution options */
int count; /* Count of matched subpatterns */
PCRE2_SIZE start_offset; /* Where the new search starts */
@@ -2506,6 +2513,7 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str,
/* Initialize return value */
array_init(return_value);
HashTable *return_value_ht = Z_ARRVAL_P(return_value);
/* Calculate the size of the offsets array, and allocate memory for it. */
num_subpats = pce->capture_count + 1;
@@ -2536,6 +2544,9 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str,
options = (pce->compile_options & PCRE2_UTF) ? 0 : PCRE2_NO_UTF_CHECK;
/* Array of subpattern offsets */
PCRE2_SIZE *const offsets = pcre2_get_ovector_pointer(match_data);
#ifdef HAVE_PCRE_JIT_SUPPORT
if ((pce->preg_options & PREG_JIT) && options) {
count = pcre2_jit_match(pce->re, (PCRE2_SPTR)subject, ZSTR_LEN(subject_str), start_offset,
@@ -2555,8 +2566,6 @@ PHPAPI void php_pcre_split_impl(pcre_cache_entry *pce, zend_string *subject_str,
}
matched:
offsets = pcre2_get_ovector_pointer(match_data);
if (UNEXPECTED(offsets[1] < offsets[0])) {
PCRE_G(error_code) = PHP_PCRE_INTERNAL_ERROR;
break;
@@ -2566,12 +2575,12 @@ matched:
if (offset_capture) {
/* Add (match, offset) pair to the return value */
add_offset_pair(
return_value, subject, last_match_offset, offsets[0],
return_value_ht, subject, last_match_offset, offsets[0],
NULL, 0);
} else {
/* Add the piece to the return value */
populate_match_value_str(&tmp, subject, last_match_offset, offsets[0]);
zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
zend_hash_next_index_insert_new(return_value_ht, &tmp);
}
/* One less left to do */
@@ -2586,10 +2595,10 @@ matched:
if (!no_empty || offsets[2*i] != offsets[2*i+1]) {
if (offset_capture) {
add_offset_pair(
return_value, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
return_value_ht, subject, offsets[2*i], offsets[2*i+1], NULL, 0);
} else {
populate_match_value_str(&tmp, subject, offsets[2*i], offsets[2*i+1]);
zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
zend_hash_next_index_insert_new(return_value_ht, &tmp);
}
}
}
@@ -2663,7 +2672,7 @@ last:
if (!no_empty || start_offset < ZSTR_LEN(subject_str)) {
if (offset_capture) {
/* Add the last (match, offset) pair to the return value */
add_offset_pair(return_value, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
add_offset_pair(return_value_ht, subject, start_offset, ZSTR_LEN(subject_str), NULL, 0);
} else {
/* Add the last piece to the return value */
if (start_offset == 0) {
@@ -2671,7 +2680,7 @@ last:
} else {
populate_match_value_str(&tmp, subject, start_offset, ZSTR_LEN(subject_str));
}
zend_hash_next_index_insert_new(Z_ARRVAL_P(return_value), &tmp);
zend_hash_next_index_insert_new(return_value_ht, &tmp);
}
}
}
@@ -2858,6 +2867,7 @@ PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return
/* Initialize return array */
array_init(return_value);
HashTable *return_value_ht = Z_ARRVAL_P(return_value);
PCRE_G(error_code) = PHP_PCRE_NO_ERROR;
@@ -2899,9 +2909,9 @@ PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return
/* Add to return array */
if (string_key) {
zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
zend_hash_update(return_value_ht, string_key, entry);
} else {
zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
zend_hash_index_update(return_value_ht, num_key, entry);
}
}
} else if (count == PCRE2_ERROR_NOMATCH) {
@@ -2910,9 +2920,9 @@ PHPAPI void php_pcre_grep_impl(pcre_cache_entry *pce, zval *input, zval *return
/* Add to return array */
if (string_key) {
zend_hash_update(Z_ARRVAL_P(return_value), string_key, entry);
zend_hash_update(return_value_ht, string_key, entry);
} else {
zend_hash_index_update(Z_ARRVAL_P(return_value), num_key, entry);
zend_hash_index_update(return_value_ht, num_key, entry);
}
}
} else {