diff --git a/ext/opcache/jit/ir/gen_ir_fold_hash.c b/ext/opcache/jit/ir/gen_ir_fold_hash.c index 52205297438..efd656f5b7a 100644 --- a/ext/opcache/jit/ir/gen_ir_fold_hash.c +++ b/ext/opcache/jit/ir/gen_ir_fold_hash.c @@ -15,6 +15,10 @@ #define MAX_RULES 2048 #define MAX_SLOTS (MAX_RULES * 4) +#define USE_SEMI_PERFECT_HASH 1 +#define USE_SHL_HASH 1 +#define USE_ROL_HASH 0 + static ir_strtab strtab; void print_hash(uint32_t *mask, uint32_t count) @@ -28,12 +32,14 @@ void print_hash(uint32_t *mask, uint32_t count) printf("};\n\n"); } -#if 0 +#if USE_SHL_HASH static uint32_t hash_shl2(uint32_t mask, uint32_t r1, uint32_t r2) { return ((mask << r1) - mask) << r2; } -#else +#endif + +#if USE_ROL_HASH #define ir_rol(x, n) (((x)<<(n)) | ((x)>>(-(int)(n)&(8*sizeof(x)-1)))) #define ir_ror(x, n) (((x)<<(-(int)(n)&(8*sizeof(x)-1))) | ((x)>>(n))) @@ -50,29 +56,64 @@ int find_hash(uint32_t *mask, uint32_t count) uint32_t n, r1, r2, i, h; for (n = (count | 1); n < MAX_SLOTS; n += 2) { +#if USE_SEMI_PERFECT_HASH + int semi_perfect = 0; +#endif + for (r1 = 0; r1 < 31; r1++) { for (r2 = 0; r2 < 32; r2++) { -#if 0 +#if USE_SHL_HASH memset(hash, 0, n * sizeof(uint32_t)); for (i = 0; i < count; i++) { h = hash_shl2(mask[i] & 0x1fffff, r1, r2) % n; - if (hash[h]) break; /* collision */ + if (hash[h]) { +#if USE_SEMI_PERFECT_HASH + h++; + if (!hash[h]) { + hash[h] = mask[i]; + semi_perfect = 1; + continue; + } +#endif + break; /* collision */ + } hash[h] = mask[i]; } if (i == count) { print_hash(hash, n); +#if USE_SEMI_PERFECT_HASH + if (semi_perfect) { + printf("#define IR_FOLD_SEMI_PERFECT_HASH\n\n"); + } +#endif printf("static uint32_t _ir_fold_hashkey(uint32_t h)\n{\n\treturn (((h << %d) - h) << %d) %% %d;\n}\n", r1, r2, n); return 1; } -#else +#endif +#if USE_ROL_HASH memset(hash, 0, n * sizeof(uint32_t)); for (i = 0; i < count; i++) { h = hash_rol2(mask[i] & 0x1fffff, r1, r2) % n; - if (hash[h]) break; /* collision */ + if (hash[h]) { +#if USE_SEMI_PERFECT_HASH + h++; + if 
(!hash[h]) { + hash[h] = mask[i]; + semi_perfect = 1; + continue; + } +#endif + break; /* collision */ + } hash[h] = mask[i]; } if (i == count) { print_hash(hash, n); +#if USE_SEMI_PERFECT_HASH + if (semi_perfect) { + printf("#define IR_FOLD_SEMI_PERFECT_HASH\n\n"); + } +#endif printf("static uint32_t _ir_fold_hashkey(uint32_t h)\n{\nreturn ir_rol32((ir_rol32(h, %d) - h), %d) %% %d;\n}\n", r1, r2, n); return 1; } diff --git a/ext/opcache/jit/ir/ir.c b/ext/opcache/jit/ir/ir.c index 2db9b569806..d4e3314f1b1 100644 --- a/ext/opcache/jit/ir/ir.c +++ b/ext/opcache/jit/ir/ir.c @@ -926,7 +926,11 @@ restart: uint32_t k = key & any; uint32_t h = _ir_fold_hashkey(k); uint32_t fh = _ir_fold_hash[h]; - if (IR_FOLD_KEY(fh) == k /*|| (fh = _ir_fold_hash[h+1], (fh & 0x1fffff) == k)*/) { + if (IR_FOLD_KEY(fh) == k +#ifdef IR_FOLD_SEMI_PERFECT_HASH + || (fh = _ir_fold_hash[h+1], (fh & 0x1fffff) == k) +#endif + ) { switch (IR_FOLD_RULE(fh)) { #include "ir_fold.h" default: @@ -1287,6 +1291,7 @@ void ir_use_list_remove_one(ir_ctx *ctx, ir_ref from, ir_ref ref) *p = IR_UNUSED; break; } + p++; j++; } } diff --git a/ext/opcache/jit/ir/ir_aarch64.dasc b/ext/opcache/jit/ir/ir_aarch64.dasc index 891fc1c4e03..08634a91015 100644 --- a/ext/opcache/jit/ir/ir_aarch64.dasc +++ b/ext/opcache/jit/ir/ir_aarch64.dasc @@ -4309,7 +4309,7 @@ static void ir_emit_switch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn) } /* Generate a table jmp or a sequence of calls */ - if ((max.i64-min.i64) < count * 8) { + if (count > 2 && (max.i64-min.i64) < count * 8) { int *labels = ir_mem_malloc(sizeof(int) * (max.i64 - min.i64 + 1)); for (i = 0; i <= (max.i64 - min.i64); i++) { diff --git a/ext/opcache/jit/ir/ir_cfg.c b/ext/opcache/jit/ir/ir_cfg.c index 20600c97abe..824cdb61c93 100644 --- a/ext/opcache/jit/ir/ir_cfg.c +++ b/ext/opcache/jit/ir/ir_cfg.c @@ -2027,13 +2027,10 @@ static int ir_schedule_blocks_bottom_up(ir_ctx *ctx) ir_chain *chains; ir_bitqueue worklist; ir_bitset visited; - uint32_t *empty, count; 
-#ifdef IR_DEBUG - uint32_t empty_count = 0; -#endif + uint32_t *schedule_end, count; ctx->cfg_schedule = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2)); - empty = ctx->cfg_schedule + ctx->cfg_blocks_count; + schedule_end = ctx->cfg_schedule + ctx->cfg_blocks_count; /* 1. Create initial chains for each BB */ chains = ir_mem_malloc(sizeof(ir_chain) * (ctx->cfg_blocks_count + 1)); @@ -2083,11 +2080,8 @@ restart: /* move empty blocks to the end */ IR_ASSERT(chains[b].head == b); chains[b].head = 0; -#ifdef IR_DEBUG - empty_count++; -#endif - *empty = b; - empty--; + *schedule_end = b; + schedule_end--; if (successor > b) { bb_freq[successor] += bb_freq[b]; @@ -2168,14 +2162,22 @@ restart: } else { prob1 = prob2 = 50; } - IR_ASSERT(edges_count < max_edges_count); - freq = bb_freq[b] * (float)prob1 / (float)probN; - if (successor1 > b) { - IR_ASSERT(!ir_bitset_in(visited, successor1)); - bb_freq[successor1] += freq; - ir_bitqueue_add(&worklist, successor1); - } do { + freq = bb_freq[b] * (float)prob1 / (float)probN; + if (successor1 > b) { + IR_ASSERT(!ir_bitset_in(visited, successor1)); + bb_freq[successor1] += freq; + if (successor1_bb->successors_count == 0 && insn1->op2 == 1) { + /* move cold block without successors to the end */ + IR_ASSERT(chains[successor1].head == successor1); + chains[successor1].head = 0; + *schedule_end = successor1; + schedule_end--; + break; + } else { + ir_bitqueue_add(&worklist, successor1); + } + } /* try to join edges early to reduce number of edges and the cost of their sorting */ if (prob1 > prob2 && (successor1_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) { @@ -2187,19 +2189,28 @@ restart: if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break; } successor1 = _ir_skip_empty_blocks(ctx, successor1); + IR_ASSERT(edges_count < max_edges_count); edges[edges_count].from = b; edges[edges_count].to = successor1; edges[edges_count].freq = freq; edges_count++; } while (0); - IR_ASSERT(edges_count < max_edges_count); - 
freq = bb_freq[b] * (float)prob2 / (float)probN; - if (successor2 > b) { - IR_ASSERT(!ir_bitset_in(visited, successor2)); - bb_freq[successor2] += freq; - ir_bitqueue_add(&worklist, successor2); - } do { + freq = bb_freq[b] * (float)prob2 / (float)probN; + if (successor2 > b) { + IR_ASSERT(!ir_bitset_in(visited, successor2)); + bb_freq[successor2] += freq; + if (successor2_bb->successors_count == 0 && insn2->op2 == 1) { + /* move cold block without successors to the end */ + IR_ASSERT(chains[successor2].head == successor2); + chains[successor2].head = 0; + *schedule_end = successor2; + schedule_end--; + break; + } else { + ir_bitqueue_add(&worklist, successor2); + } + } if (prob2 > prob1 && (successor2_bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) != IR_BB_EMPTY) { uint32_t src = chains[b].next; @@ -2210,6 +2221,7 @@ restart: if (!IR_DEBUG_BB_SCHEDULE_GRAPH) break; } successor2 = _ir_skip_empty_blocks(ctx, successor2); + IR_ASSERT(edges_count < max_edges_count); edges[edges_count].from = b; edges[edges_count].to = successor2; edges[edges_count].freq = freq; @@ -2242,7 +2254,6 @@ restart: } else { prob = 100 / bb->successors_count; } - IR_ASSERT(edges_count < max_edges_count); freq = bb_freq[b] * (float)prob / 100.0f; if (successor > b) { IR_ASSERT(!ir_bitset_in(visited, successor)); @@ -2250,6 +2261,7 @@ restart: ir_bitqueue_add(&worklist, successor); } successor = _ir_skip_empty_blocks(ctx, successor); + IR_ASSERT(edges_count < max_edges_count); edges[edges_count].from = b; edges[edges_count].to = successor; edges[edges_count].freq = freq; @@ -2383,7 +2395,7 @@ restart: } } - IR_ASSERT(count + empty_count == ctx->cfg_blocks_count); + IR_ASSERT(ctx->cfg_schedule + count == schedule_end); ctx->cfg_schedule[ctx->cfg_blocks_count + 1] = 0; ir_mem_free(edges); @@ -2401,17 +2413,14 @@ static int ir_schedule_blocks_top_down(ir_ctx *ctx) uint32_t b, best_successor, last_non_empty; ir_block *bb, *best_successor_bb; ir_insn *insn; - uint32_t *list, *empty; + uint32_t 
*list, *schedule_end; uint32_t count = 0; -#ifdef IR_DEBUG - uint32_t empty_count = 0; -#endif ir_bitqueue_init(&blocks, ctx->cfg_blocks_count + 1); blocks.pos = 0; list = ir_mem_malloc(sizeof(uint32_t) * (ctx->cfg_blocks_count + 2)); list[ctx->cfg_blocks_count + 1] = 0; - empty = list + ctx->cfg_blocks_count; + schedule_end = list + ctx->cfg_blocks_count; for (b = 1; b <= ctx->cfg_blocks_count; b++) { ir_bitset_incl(blocks.set, b); } @@ -2431,11 +2440,8 @@ static int ir_schedule_blocks_top_down(ir_ctx *ctx) } if ((bb->flags & (IR_BB_START|IR_BB_ENTRY|IR_BB_EMPTY)) == IR_BB_EMPTY) { /* move empty blocks to the end */ -#ifdef IR_DEBUG - empty_count++; -#endif - *empty = b; - empty--; + *schedule_end = b; + schedule_end--; } else { count++; list[count] = b; @@ -2520,7 +2526,7 @@ static int ir_schedule_blocks_top_down(ir_ctx *ctx) } while (1); } - IR_ASSERT(count + empty_count == ctx->cfg_blocks_count); + IR_ASSERT(list + count == schedule_end); ctx->cfg_schedule = list; ir_bitqueue_free(&blocks); diff --git a/ext/opcache/jit/ir/ir_fold.h b/ext/opcache/jit/ir/ir_fold.h index 8a50641e5ea..9ff0f6d9db1 100644 --- a/ext/opcache/jit/ir/ir_fold.h +++ b/ext/opcache/jit/ir/ir_fold.h @@ -2508,6 +2508,7 @@ IR_FOLD(MUL(MUL, C_I8)) IR_FOLD(MUL(MUL, C_I16)) IR_FOLD(MUL(MUL, C_I32)) IR_FOLD(MUL(MUL, C_I64)) +IR_FOLD(MUL(MUL, C_ADDR)) { if (IR_IS_CONST_REF(op1_insn->op2) && !IR_IS_SYM_CONST(ctx->ir_base[op1_insn->op2].op)) { /* (x * c1) * c2 => x * (c1 * c2) */ @@ -2527,6 +2528,7 @@ IR_FOLD(AND(AND, C_I8)) IR_FOLD(AND(AND, C_I16)) IR_FOLD(AND(AND, C_I32)) IR_FOLD(AND(AND, C_I64)) +IR_FOLD(AND(AND, C_ADDR)) { if (IR_IS_CONST_REF(op1_insn->op2) && !IR_IS_SYM_CONST(ctx->ir_base[op1_insn->op2].op)) { /* (x & c1) & c2 => x & (c1 & c2) */ @@ -2546,6 +2548,7 @@ IR_FOLD(OR(OR, C_I8)) IR_FOLD(OR(OR, C_I16)) IR_FOLD(OR(OR, C_I32)) IR_FOLD(OR(OR, C_I64)) +IR_FOLD(OR(OR, C_ADDR)) { if (IR_IS_CONST_REF(op1_insn->op2) && !IR_IS_SYM_CONST(ctx->ir_base[op1_insn->op2].op)) { /* (x | c1) | c2 => x | 
(c1 | c2) */ @@ -2565,6 +2568,7 @@ IR_FOLD(XOR(XOR, C_I8)) IR_FOLD(XOR(XOR, C_I16)) IR_FOLD(XOR(XOR, C_I32)) IR_FOLD(XOR(XOR, C_I64)) +IR_FOLD(XOR(XOR, C_ADDR)) { if (IR_IS_CONST_REF(op1_insn->op2) && !IR_IS_SYM_CONST(ctx->ir_base[op1_insn->op2].op)) { /* (x ^ c1) ^ c2 => x ^ (c1 ^ c2) */ diff --git a/ext/opcache/jit/ir/ir_ra.c b/ext/opcache/jit/ir/ir_ra.c index 4860dae0ca0..44d00e27cb3 100644 --- a/ext/opcache/jit/ir/ir_ra.c +++ b/ext/opcache/jit/ir/ir_ra.c @@ -1554,6 +1554,10 @@ static bool ir_vregs_inside(ir_ctx *ctx, uint32_t parent, uint32_t child) ir_live_interval *child_ival = ctx->live_intervals[child]; ir_live_interval *parent_ival = ctx->live_intervals[parent]; + if ((child_ival->flags | parent_ival->flags) & IR_LIVE_INTERVAL_COALESCED) { + // TODO: Support valid cases with already coalesced "parent_ival" + return 0; + } #if 0 if (child_ival->end >= parent_ival->end) { return 0; } @@ -1629,6 +1633,13 @@ static void ir_vregs_coalesce(ir_ctx *ctx, uint32_t v1, uint32_t v2, ir_ref from uint16_t f1 = ctx->live_intervals[v1]->flags; uint16_t f2 = ctx->live_intervals[v2]->flags; +#if 0 + if (ctx->binding) { + ir_ref b1 = ir_binding_find(ctx, from); + ir_ref b2 = ir_binding_find(ctx, to); + IR_ASSERT(b1 == b2); + } +#endif if ((f1 & IR_LIVE_INTERVAL_COALESCED) && !(f2 & IR_LIVE_INTERVAL_COALESCED)) { ir_vregs_join(ctx, v1, v2); ctx->vregs[to] = v1; @@ -1971,6 +1982,13 @@ int ir_coalesce(ir_ctx *ctx) && ctx->vregs[insn->op1] && ctx->vregs[i] != ctx->vregs[insn->op1]) { if (ir_vregs_inside(ctx, ctx->vregs[insn->op1], ctx->vregs[i])) { + if (ctx->binding) { + ir_ref b1 = ir_binding_find(ctx, i); + ir_ref b2 = ir_binding_find(ctx, insn->op1); + if (b1 != b2) { + continue; + } + } ir_vregs_coalesce(ctx, ctx->vregs[i], ctx->vregs[insn->op1], i, insn->op1); compact = 1; } diff --git a/ext/opcache/jit/ir/ir_x86.dasc b/ext/opcache/jit/ir/ir_x86.dasc index c853ec2c929..9afd8ace1d3 100644 --- a/ext/opcache/jit/ir/ir_x86.dasc +++ b/ext/opcache/jit/ir/ir_x86.dasc @@ -7746,7
+7746,7 @@ static void ir_emit_switch(ir_ctx *ctx, uint32_t b, ir_ref def, ir_insn *insn) } /* Generate a table jmp or a seqence of calls */ - if ((max.i64-min.i64) < count * 8) { + if (count > 2 && (max.i64-min.i64) < count * 8) { int *labels = ir_mem_malloc(sizeof(int) * (size_t)(max.i64 - min.i64 + 1)); for (i = 0; i <= (max.i64 - min.i64); i++) {