From 7c2a3a94000890081904ea3af57bdc1998dcdea3 Mon Sep 17 00:00:00 2001 From: Hao Sun Date: Mon, 31 May 2021 14:27:00 +0800 Subject: [PATCH] JIT/AArch64: [macos][ZTS] Support fast path for tlv_get_addr (#7042) * JIT/AArch64: [macos][ZTS] Support fast path for tlv_get_addr Access to TLV(thread local variable) in macOS is in "dynamic" form and function tlv_get_addr() is invoked to resolve the address. See the example in [1]. Note there is one struct TLVDescriptor [2] for each TLV. The first member holds the address of function tlv_get_addr(), and the other two members, "key" and "offset", would be used inside tlv_get_addr(). The disassembly code for function tlv_get_addr() is shown in [3]. With the value from system register, i.e. tpidrro_el0, together with "key" and "offset", the TLV address can be obtained. Note that the value from tpidrro_el0 varies for different threads, and unique address for TLV is resolved. It's worth noting that slow path would be executed, i.e. function tlv_allocate_and_initialize_for_key(), for the first time of TLV access. In this patch: 1. "_tsrm_ls_cache" is guaranteed to be accessed before any VM/JIT code during the request startup, e.g. in init_executor(), therefore, slow path can be avoided. 2. As TLVDescriptor is immutable and zend_jit_setup() executes once, we get this structure in tsrm_get_ls_cache_tcb_offset(). Note the 'ldr' instruction would be patched to 'add' by the linker. 3. Only fast path for tlv_get_addr() is implemented in macro LOAD_TSRM_CACHE. With this patch, all ~4k test cases can pass for ZTS+CALL in macOS on Apple silicon. 
[1] https://gist.github.com/shqking/4aab67e0105f7c1f2c549d57d5799f94 [2] https://opensource.apple.com/source/dyld/dyld-195.6/src/threadLocalVariables.c.auto.html [3] https://gist.github.com/shqking/329d7712c26bad49786ab0a544a4af43 Change-Id: I613e9c37e3ff2ecc3fab0f53f1e48a0246e12ee3 --- TSRM/TSRM.c | 7 +++++++ ext/opcache/jit/zend_jit.c | 4 ++++ ext/opcache/jit/zend_jit_arm64.dasc | 25 +++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/TSRM/TSRM.c b/TSRM/TSRM.c index a39564b8930..5618849d4d6 100644 --- a/TSRM/TSRM.c +++ b/TSRM/TSRM.c @@ -744,10 +744,17 @@ TSRM_API size_t tsrm_get_ls_cache_tcb_offset(void) #elif defined(__aarch64__) size_t ret; +# ifdef __APPLE__ + // Points to struct TLVDescriptor for _tsrm_ls_cache in macOS. + asm("adrp %0, #__tsrm_ls_cache@TLVPPAGE\n\t" + "ldr %0, [%0, #__tsrm_ls_cache@TLVPPAGEOFF]" + : "=r" (ret)); +# else asm("mov %0, xzr\n\t" "add %0, %0, #:tprel_hi12:_tsrm_ls_cache, lsl #12\n\t" "add %0, %0, #:tprel_lo12_nc:_tsrm_ls_cache" : "=r" (ret)); +# endif return ret; #else return 0; diff --git a/ext/opcache/jit/zend_jit.c b/ext/opcache/jit/zend_jit.c index 3427ae16a8d..e7226aae6dc 100644 --- a/ext/opcache/jit/zend_jit.c +++ b/ext/opcache/jit/zend_jit.c @@ -4055,8 +4055,12 @@ ZEND_EXT_API void zend_jit_unprotect(void) if (!(JIT_G(debug) & (ZEND_JIT_DEBUG_GDB|ZEND_JIT_DEBUG_PERF_DUMP))) { int opts = PROT_READ | PROT_WRITE; #ifdef ZTS + /* TODO: EXEC+WRITE is not supported in macOS. Removing EXEC is still buggy as + * other threads, which are executing the JITed code, would crash anyway. */ +# ifndef __APPLE__ /* Another thread may be executing JITed code. 
*/ opts |= PROT_EXEC; +# endif #endif if (mprotect(dasm_buf, dasm_size, opts) != 0) { fprintf(stderr, "mprotect() failed [%d] %s\n", errno, strerror(errno)); diff --git a/ext/opcache/jit/zend_jit_arm64.dasc b/ext/opcache/jit/zend_jit_arm64.dasc index 6961f533983..62893a822fc 100644 --- a/ext/opcache/jit/zend_jit_arm64.dasc +++ b/ext/opcache/jit/zend_jit_arm64.dasc @@ -184,6 +184,14 @@ const char* zend_reg_name[] = { #if ZTS static size_t tsrm_ls_cache_tcb_offset = 0; +# ifdef __APPLE__ +struct TLVDescriptor { + void* (*thunk)(struct TLVDescriptor*); + uint64_t key; + uint64_t offset; +}; +typedef struct TLVDescriptor TLVDescriptor; +# endif #endif /* By default avoid JITing inline handlers if it does not seem profitable due to lack of @@ -483,10 +491,27 @@ static int logical_immediate_p (uint64_t value, uint32_t reg_size) || } |.endmacro +// Safe memory load/store with an unsigned 64-bit offset. +|.macro SAFE_MEM_ACC_WITH_64_UOFFSET, ldr_str_ins, op, base_reg, offset, tmp_reg +|| if (((uintptr_t)(offset)) > LDR_STR_PIMM64) { +| LOAD_64BIT_VAL tmp_reg, offset +| ldr_str_ins op, [base_reg, tmp_reg] +|| } else { +| ldr_str_ins op, [base_reg, #(offset)] +|| } +|.endmacro + |.macro LOAD_TSRM_CACHE, reg +||#ifdef __APPLE__ +| .long 0xd53bd071 // TODO: hard-coded: mrs TMP3, tpidrro_el0 +| and TMP3, TMP3, #0xfffffffffffffff8 +| SAFE_MEM_ACC_WITH_64_UOFFSET ldr, TMP3, TMP3, (((TLVDescriptor*)tsrm_ls_cache_tcb_offset)->key << 3), TMP1 +| SAFE_MEM_ACC_WITH_64_UOFFSET ldr, reg, TMP3, (((TLVDescriptor*)tsrm_ls_cache_tcb_offset)->offset), TMP1 +||#else | .long 0xd53bd051 // TODO: hard-coded: mrs TMP3, tpidr_el0 || ZEND_ASSERT(tsrm_ls_cache_tcb_offset <= LDR_STR_PIMM64); | ldr reg, [TMP3, #tsrm_ls_cache_tcb_offset] +||#endif |.endmacro |.macro LOAD_ADDR_ZTS, reg, struct, field