From af7639683f072c15b8af3648dc6b0c2b14201d21 Mon Sep 17 00:00:00 2001
From: Sebastian Pop
Date: Wed, 10 Jul 2019 18:31:27 +0000
Subject: [PATCH] vectorize string-escape with NEON

On this benchmark:

function simple_string_escape() {
  $a = "test'asd'asd'' asd\'\"asdfasdf";
  for($i=0; $i<512; $i++) {
    $a .= chr($i%256);
  }
  for ($i = 0; $i < 100000; $i++) {
    if ($a === stripslashes(addslashes($a)))
      $a .= chr($i%256);
    else {
      echo "error at i=".$i."\n";
      return;
    }
  }
}

the execution time goes from 21.619s to 8.139s (165% speedup) on an A1
Graviton instance.

When removing the characters that need escaping, i.e., this benchmark:

function simple_string() {
  $a = "testasdasd asdasdfasdf";
  for ($i = 0; $i < 10000; $i++) {
    if ($a === stripslashes(addslashes($a)))
      $a .= "test dedeasdf";
    else {
      echo "error at i=".$i."\n";
      return;
    }
  }
}

the execution time goes from 2.932s down to 0.516s (468% speedup) on an A1
Graviton instance.
---
Review note: the NEON loop in php_stripslashes_impl now stops on a backslash
found in the last lane when exactly 16 bytes remain, instead of reading
str[len] and letting the unsigned len wrap around. Explanatory comments were
added to the new code; hunk headers and the diffstat are updated to match.

 ext/standard/string.c | 120 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/ext/standard/string.c b/ext/standard/string.c
index d097be78b78..c5a18f5364c 100644
--- a/ext/standard/string.c
+++ b/ext/standard/string.c
@@ -4068,6 +4068,53 @@ do_escape:
 /* }}} */
 #endif
 
+#ifdef __aarch64__
+/* The 16 bytes stored from a NEON vector, readable per byte lane (mem) or
+ * as two 64-bit halves (dw) for a cheap "is any lane set?" test. */
+typedef union {
+	uint8_t mem[16];
+	uint64_t dw[2];
+} quad_word;
+
+/* Compare the 16 bytes of x against NUL, ', " and \. In the returned
+ * quad_word, mem[i] is non-zero when byte i is one of those characters and
+ * zero otherwise. */
+static zend_always_inline quad_word aarch64_contains_slash_chars(uint8x16_t x) {
+	uint8x16_t s0 = vceqq_u8(x, vdupq_n_u8('\0'));
+	uint8x16_t s1 = vceqq_u8(x, vdupq_n_u8('\''));
+	uint8x16_t s2 = vceqq_u8(x, vdupq_n_u8('\"'));
+	uint8x16_t s3 = vceqq_u8(x, vdupq_n_u8('\\'));
+	uint8x16_t s01 = vorrq_u8(s0, s1);
+	uint8x16_t s23 = vorrq_u8(s2, s3);
+	uint8x16_t s0123 = vorrq_u8(s01, s23);
+	quad_word qw;
+	vst1q_u8(qw.mem, s0123);
+	return qw;
+}
+
+/* Escape one 16-byte chunk. res is the lane mask computed for source[0..15]:
+ * bytes with a zero lane are copied, the others are written as a backslash
+ * followed by the byte itself ('0' in place of a NUL byte). Writes 16 to 32
+ * bytes at target and returns the position just past the last one. */
+static zend_always_inline char *aarch64_add_slashes(quad_word res, const char *source, char *target)
+{
+	int i = 0;
+	for (; i < 16; i++) {
+		char s = source[i];
+		if (res.mem[i] == 0)
+			*target++ = s;
+		else {
+			*target++ = '\\';
+			if (s == '\0')
+				*target++ = '0';
+			else
+				*target++ = s;
+		}
+	}
+	return target;
+}
+#endif /* __aarch64__ */
+
 #if !ZEND_INTRIN_SSE4_2_NATIVE
 # if ZEND_INTRIN_SSE4_2_RESOLVER
 zend_string *php_addslashes_default(zend_string *str) /* {{{ */
@@ -4088,6 +4135,21 @@ PHPAPI zend_string *php_addslashes(zend_string *str)
 	source = ZSTR_VAL(str);
 	end = source + ZSTR_LEN(str);
 
+# ifdef __aarch64__
+	quad_word res = {0};
+	if (ZSTR_LEN(str) > 15) {
+		/* Scan 16 bytes at a time; on the first hit jump to do_escape with
+		 * res still holding the lane mask of the chunk at source. */
+		do {
+			res = aarch64_contains_slash_chars(vld1q_u8((uint8_t *)source));
+			if (res.dw[0] | res.dw[1])
+				goto do_escape;
+			source += 16;
+		} while ((end - source) > 15);
+	}
+	/* Finish the last 15 bytes or less with the scalar loop. */
+# endif /* __aarch64__ */
+
 	while (source < end) {
 		switch (*source) {
 			case '\0':
@@ -4109,6 +4171,27 @@ do_escape:
 	memcpy(ZSTR_VAL(new_str), ZSTR_VAL(str), offset);
 	target = ZSTR_VAL(new_str) + offset;
 
+# ifdef __aarch64__
+	/* A non-zero res was left by the vector scan above: escape the chunk
+	 * it stopped on, reusing the mask that is already computed. */
+	if (res.dw[0] | res.dw[1]) {
+		target = aarch64_add_slashes(res, source, target);
+		source += 16;
+	}
+	for (; end - source > 15; source += 16) {
+		uint8x16_t x = vld1q_u8((uint8_t *)source);
+		res = aarch64_contains_slash_chars(x);
+		if (res.dw[0] | res.dw[1]) {
+			target = aarch64_add_slashes(res, source, target);
+		} else {
+			/* Nothing to escape: copy the chunk with a single store. */
+			vst1q_u8((uint8_t*)target, x);
+			target += 16;
+		}
+	}
+	/* Finish the last 15 bytes or less with the scalar loop. */
+# endif /* __aarch64__ */
+
 	while (source < end) {
 		switch (*source) {
 			case '\0':
@@ -4146,6 +4229,43 @@ do_escape:
  * be careful, this edits the string in-place */
 static zend_always_inline char *php_stripslashes_impl(const char *str, char *out, size_t len)
 {
+#ifdef __aarch64__
+	/* Unescape 16 bytes at a time for as long as at least 16 remain. */
+	while (len > 15) {
+		uint8x16_t x = vld1q_u8((uint8_t *)str);
+		quad_word q;
+		vst1q_u8(q.mem, vceqq_u8(x, vdupq_n_u8('\\')));
+		if (q.dw[0] | q.dw[1]) {
+			int i = 0;
+			for (; i < 16; i++) {
+				if (q.mem[i] == 0) {
+					*out++ = str[i];
+					continue;
+				}
+
+				i++;			/* skip the slash */
+				/* A slash in lane 15 with only 16 bytes left has no
+				 * character after it: stop here so that str[len] is not
+				 * read and "len -= i" below cannot wrap around. */
+				if ((size_t)i >= len)
+					break;
+				char s = str[i];
+				if (s == '0')
+					*out++ = '\0';
+				else
+					*out++ = s;	/* preserve the next character */
+			}
+			str += i;
+			len -= i;
+		} else {
+			vst1q_u8((uint8_t*)out, x);
+			out += 16;
+			str += 16;
+			len -= 16;
+		}
+	}
+	/* Finish the last 15 bytes or less with the scalar loop. */
+#endif /* __aarch64__ */
 	while (len > 0) {
 		if (*str == '\\') {
 			str++;				/* skip the slash */