diff --git a/ext/opcache/jit/ir/ir.c b/ext/opcache/jit/ir/ir.c index ce92137a7e1..faf450c34d7 100644 --- a/ext/opcache/jit/ir/ir.c +++ b/ext/opcache/jit/ir/ir.c @@ -1299,7 +1299,7 @@ static uint32_t ir_hashtab_hash_size(uint32_t size) size |= (size >> 4); size |= (size >> 8); size |= (size >> 16); - return size + 1; + return IR_MAX(size + 1, 4); } static void ir_hashtab_resize(ir_hashtab *tab) diff --git a/ext/opcache/jit/ir/ir_aarch64.dasc b/ext/opcache/jit/ir/ir_aarch64.dasc index 0eece76fdb2..cf444f6f840 100644 --- a/ext/opcache/jit/ir/ir_aarch64.dasc +++ b/ext/opcache/jit/ir/ir_aarch64.dasc @@ -1038,21 +1038,27 @@ static int32_t ir_ref_spill_slot_offset(ir_ctx *ctx, ir_ref ref, ir_reg *reg) return IR_SPILL_POS_TO_OFFSET(offset); } +static ir_mem ir_vreg_spill_slot(ir_ctx *ctx, ir_ref v) +{ + int32_t offset; + ir_reg base; + + IR_ASSERT(v > 0 && v <= ctx->vregs_count && ctx->live_intervals[v]); + offset = ctx->live_intervals[v]->stack_spill_pos; + IR_ASSERT(offset != -1); + if (ctx->live_intervals[v]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) { + IR_ASSERT(ctx->spill_base != IR_REG_NONE); + return IR_MEM_BO(ctx->spill_base, offset); + } + base = (ctx->flags & IR_USE_FRAME_POINTER) ? IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER; + offset = IR_SPILL_POS_TO_OFFSET(offset); + return IR_MEM_BO(base, offset); +} + static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref) { - ir_reg reg; - int32_t offset; - - IR_ASSERT(ref >= 0); - offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos; - IR_ASSERT(offset != -1); - if (ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) { - IR_ASSERT(ctx->spill_base != IR_REG_NONE); - reg = ctx->spill_base; - return IR_MEM_BO(reg, offset); - } - reg = (ctx->flags & IR_USE_FRAME_POINTER) ? 
IR_REG_FRAME_POINTER : IR_REG_STACK_POINTER; - return IR_MEM_BO(reg, IR_SPILL_POS_TO_OFFSET(offset)); + IR_ASSERT(!IR_IS_CONST_REF(ref)); + return ir_vreg_spill_slot(ctx, ctx->vregs[ref]); } static bool ir_is_same_spill_slot(ir_ctx *ctx, ir_ref ref, ir_mem mem) @@ -1404,12 +1410,8 @@ static void ir_emit_store_mem_fp(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg r } } -static void ir_emit_store(ir_ctx *ctx, ir_type type, ir_ref dst, ir_reg reg) +static void ir_emit_store_mem(ir_ctx *ctx, ir_type type, ir_mem mem, ir_reg reg) { - ir_mem mem; - - IR_ASSERT(dst >= 0); - mem = ir_ref_spill_slot(ctx, dst); if (IR_IS_TYPE_INT(type)) { ir_emit_store_mem_int(ctx, type, mem, reg); } else { @@ -1417,6 +1419,12 @@ static void ir_emit_store(ir_ctx *ctx, ir_type type, ir_ref dst, ir_reg reg) } } +static void ir_emit_store(ir_ctx *ctx, ir_type type, ir_ref dst, ir_reg reg) +{ + IR_ASSERT(dst >= 0); + ir_emit_store_mem(ctx, type, ir_ref_spill_slot(ctx, dst), reg); +} + static void ir_emit_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src) { ir_backend_data *data = ctx->data; @@ -3558,11 +3566,7 @@ static void ir_emit_vstore(ir_ctx *ctx, ir_ref ref, ir_insn *insn) ir_emit_load(ctx, type, op3_reg, insn->op3); } mem = IR_MEM_BO(fp, offset); - if (IR_IS_TYPE_INT(type)) { - ir_emit_store_mem_int(ctx, type, mem, op3_reg); - } else { - ir_emit_store_mem_fp(ctx, type, mem, op3_reg); - } + ir_emit_store_mem(ctx, type, mem, op3_reg); } static ir_mem ir_fuse_addr(ir_ctx *ctx, ir_ref root, ir_ref ref) @@ -3944,7 +3948,7 @@ static void ir_emit_va_start(ir_ctx *ctx, ir_ref def, ir_insn *insn) if (op2_reg != IR_REG_NONE) { | str Rx(tmp_reg), [Rx(op2_reg)] } else { - int32_t offset = ir_ref_spill_slot(ctx, insn->op2, &op2_reg); + int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &op2_reg); | str Rx(tmp_reg), [Rx(op2_reg), #offset] } @@ -4033,7 +4037,7 @@ static void ir_emit_va_arg(ir_ctx *ctx, ir_ref def, ir_insn *insn) if (op2_reg != IR_REG_NONE) { | str Rx(tmp_reg), [Rx(op2_reg)] } 
else { - int32_t offset = ir_ref_spill_slot(ctx, insn->op2, &op2_reg); + int32_t offset = ir_ref_spill_slot_offset(ctx, insn->op2, &op2_reg); | str Rx(tmp_reg), [Rx(op2_reg), #offset] } @@ -4404,11 +4408,7 @@ static int32_t ir_emit_arguments(ir_ctx *ctx, ir_ref def, ir_insn *insn, ir_reg } else { /* Pass register arguments to stack (REG->MEM moves) */ if (!IR_IS_CONST_REF(arg) && src_reg != IR_REG_NONE && !IR_REG_SPILLED(src_reg)) { - if (IR_IS_TYPE_INT(type)) { - ir_emit_store_mem_int(ctx, type, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), src_reg); - } else { - ir_emit_store_mem_fp(ctx, type, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), src_reg); - } + ir_emit_store_mem(ctx, type, IR_MEM_BO(IR_REG_STACK_POINTER, stack_offset), src_reg); } else { do_pass3 = 1; } diff --git a/ext/opcache/jit/ir/ir_dump.c b/ext/opcache/jit/ir/ir_dump.c index dbc18d87e78..b4641ab381a 100644 --- a/ext/opcache/jit/ir/ir_dump.c +++ b/ext/opcache/jit/ir/ir_dump.c @@ -162,39 +162,52 @@ void ir_dump_use_lists(const ir_ctx *ctx, FILE *f) } } -static int ir_dump_dessa_move(ir_ctx *ctx, uint8_t type, ir_ref from, ir_ref to) +static void ir_dump_dessa_moves(const ir_ctx *ctx, int b, ir_block *bb, FILE *f) { - FILE *f = ctx->data; - int8_t reg; + uint32_t succ; + ir_block *succ_bb; + ir_use_list *use_list; + ir_ref k, i, *p, use_ref, input; + ir_insn *use_insn; - if (IR_IS_CONST_REF(from)) { - fprintf(f, "\tmov c_%d -> ", -from); - } else if (from) { - fprintf(f, "\tmov R%d", ctx->vregs[from]); - if (ctx->live_intervals && ctx->live_intervals[ctx->vregs[from]]) { - reg = ctx->live_intervals[ctx->vregs[from]]->reg; - if (reg >= 0) { - fprintf(f, " [%%%s]", ir_reg_name(reg, type)); - } - } - fprintf(f, " -> "); - } else { - fprintf(f, "\tmov TMP -> "); - } + IR_ASSERT(bb->successors_count == 1); + succ = ctx->cfg_edges[bb->successors]; + succ_bb = &ctx->cfg_blocks[succ]; + IR_ASSERT(succ_bb->predecessors_count > 1); + use_list = &ctx->use_lists[succ_bb->start]; + k = ir_phi_input_number(ctx, 
succ_bb, b); - if (to) { - fprintf(f, "R%d", ctx->vregs[to]); - if (ctx->live_intervals && ctx->live_intervals[ctx->vregs[to]]) { - reg = ctx->live_intervals[ctx->vregs[to]]->reg; - if (reg >= 0) { - fprintf(f, " [%%%s]", ir_reg_name(reg, type)); + for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) { + use_ref = *p; + use_insn = &ctx->ir_base[use_ref]; + if (use_insn->op == IR_PHI) { + input = ir_insn_op(use_insn, k); + if (IR_IS_CONST_REF(input)) { + fprintf(f, "\t# DESSA MOV c_%d", -input); + } else if (ctx->vregs[input] != ctx->vregs[use_ref]) { + fprintf(f, "\t# DESSA MOV d_%d {R%d}", input, ctx->vregs[input]); + } else { + continue; } + if (ctx->regs) { + int8_t *regs = ctx->regs[use_ref]; + int8_t reg = regs[k]; + if (reg != IR_REG_NONE) { + fprintf(f, " {%%%s%s}", ir_reg_name(IR_REG_NUM(reg), ctx->ir_base[input].type), + (reg & (IR_REG_SPILL_LOAD|IR_REG_SPILL_SPECIAL)) ? ":load" : ""); + } + } + fprintf(f, " -> d_%d {R%d}", use_ref, ctx->vregs[use_ref]); + if (ctx->regs) { + int8_t reg = ctx->regs[use_ref][0]; + if (reg != IR_REG_NONE) { + fprintf(f, " {%%%s%s}", ir_reg_name(IR_REG_NUM(reg), ctx->ir_base[use_ref].type), + (reg & (IR_REG_SPILL_STORE|IR_REG_SPILL_SPECIAL)) ? 
":store" : ""); + } + } + fprintf(f, "\n"); } - fprintf(f, "\n"); - } else { - fprintf(f, "TMP\n"); } - return 1; } void ir_dump_cfg(ir_ctx *ctx, FILE *f) @@ -283,8 +296,7 @@ void ir_dump_cfg(ir_ctx *ctx, FILE *f) } } if (bb->flags & IR_BB_DESSA_MOVES) { - ctx->data = f; - ir_gen_dessa_moves(ctx, b, ir_dump_dessa_move); + ir_dump_dessa_moves(ctx, b, bb, f); } } fprintf(f, "}\n"); @@ -621,50 +633,7 @@ void ir_dump_codegen(const ir_ctx *ctx, FILE *f) } if (bb->flags & IR_BB_DESSA_MOVES) { - uint32_t succ; - ir_block *succ_bb; - ir_use_list *use_list; - ir_ref k, i, *p, use_ref, input; - ir_insn *use_insn; - - IR_ASSERT(bb->successors_count == 1); - succ = ctx->cfg_edges[bb->successors]; - succ_bb = &ctx->cfg_blocks[succ]; - IR_ASSERT(succ_bb->predecessors_count > 1); - use_list = &ctx->use_lists[succ_bb->start]; - k = ir_phi_input_number(ctx, succ_bb, b); - - for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) { - use_ref = *p; - use_insn = &ctx->ir_base[use_ref]; - if (use_insn->op == IR_PHI) { - input = ir_insn_op(use_insn, k); - if (IR_IS_CONST_REF(input)) { - fprintf(f, "\t# DESSA MOV c_%d", -input); - } else if (ctx->vregs[input] != ctx->vregs[use_ref]) { - fprintf(f, "\t# DESSA MOV d_%d {R%d}", input, ctx->vregs[input]); - } else { - continue; - } - if (ctx->regs) { - int8_t *regs = ctx->regs[use_ref]; - int8_t reg = regs[k]; - if (reg != IR_REG_NONE) { - fprintf(f, " {%%%s%s}", ir_reg_name(IR_REG_NUM(reg), ctx->ir_base[input].type), - (reg & (IR_REG_SPILL_LOAD|IR_REG_SPILL_SPECIAL)) ? ":load" : ""); - } - } - fprintf(f, " -> d_%d {R%d}", use_ref, ctx->vregs[use_ref]); - if (ctx->regs) { - int8_t reg = ctx->regs[use_ref][0]; - if (reg != IR_REG_NONE) { - fprintf(f, " {%%%s%s}", ir_reg_name(IR_REG_NUM(reg), ctx->ir_base[use_ref].type), - (reg & (IR_REG_SPILL_STORE|IR_REG_SPILL_SPECIAL)) ? 
":store" : ""); - } - } - fprintf(f, "\n"); - } - } + ir_dump_dessa_moves(ctx, b, bb, f); } insn = &ctx->ir_base[bb->end]; diff --git a/ext/opcache/jit/ir/ir_emit.c b/ext/opcache/jit/ir/ir_emit.c index e13270a9ea1..79972457a6a 100644 --- a/ext/opcache/jit/ir/ir_emit.c +++ b/ext/opcache/jit/ir/ir_emit.c @@ -51,13 +51,11 @@ typedef struct _ir_copy { ir_reg to; } ir_copy; -typedef struct _ir_delayed_copy { - ir_ref input; - ir_ref output; +typedef struct _ir_dessa_copy { ir_type type; - ir_reg from; - ir_reg to; -} ir_delayed_copy; + int32_t from; /* negative - constant ref, [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */ + int32_t to; /* [0..IR_REG_NUM) - CPU reg, [IR_REG_NUM...) - virtual reg */ +} ir_dessa_copy; #if IR_REG_INT_ARGS static const int8_t _ir_int_reg_params[IR_REG_INT_ARGS]; @@ -255,25 +253,6 @@ static int ir_get_args_regs(const ir_ctx *ctx, const ir_insn *insn, int8_t *regs return count; } -static bool ir_is_same_mem(const ir_ctx *ctx, ir_ref r1, ir_ref r2) -{ - ir_live_interval *ival1, *ival2; - int32_t o1, o2; - - if (IR_IS_CONST_REF(r1) || IR_IS_CONST_REF(r2)) { - return 0; - } - - IR_ASSERT(ctx->vregs[r1] && ctx->vregs[r2]); - ival1 = ctx->live_intervals[ctx->vregs[r1]]; - ival2 = ctx->live_intervals[ctx->vregs[r2]]; - IR_ASSERT(ival1 && ival2); - o1 = ival1->stack_spill_pos; - o2 = ival2->stack_spill_pos; - IR_ASSERT(o1 != -1 && o2 != -1); - return o1 == o2; -} - static bool ir_is_same_mem_var(const ir_ctx *ctx, ir_ref r1, int32_t offset) { ir_live_interval *ival1; @@ -479,7 +458,6 @@ static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_ ir_reg to, from; ir_type type; ir_regset todo, ready, srcs; - ir_reg last_reg, last_fp_reg; if (count == 1) { to = copies[0].to; @@ -529,6 +507,11 @@ static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_ return 1; } + /* temporary registers can't be the same as some of the destinations */ + IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, 
tmp_reg)); + IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(todo, tmp_fp_reg)); + + /* first we resolve all "windmill blades" - trees (this doesn't require temporary registers) */ while (ready != IR_REGSET_EMPTY) { ir_reg r; @@ -551,25 +534,11 @@ static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_ return 1; } - /* temporary registers may be the same as some of the destinations */ - last_reg = IR_REG_NONE; - if (tmp_reg != IR_REG_NONE) { - IR_ASSERT(!IR_REGSET_IN(srcs, tmp_reg)); - if (IR_REGSET_IN(todo, tmp_reg)) { - last_reg = tmp_reg; - IR_REGSET_EXCL(todo, tmp_reg); - } - } - - last_fp_reg = IR_REG_NONE; - if (tmp_fp_reg != IR_REG_NONE) { - IR_ASSERT(!IR_REGSET_IN(srcs, tmp_fp_reg)); - if (IR_REGSET_IN(todo, tmp_fp_reg)) { - last_fp_reg = tmp_fp_reg; - IR_REGSET_EXCL(todo, tmp_fp_reg); - } - } + /* at this point the sources that are the same as temporaries are already moved */ + IR_ASSERT(tmp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_reg) || pred[loc[tmp_reg]] == tmp_reg); + IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !IR_REGSET_IN(srcs, tmp_fp_reg) || pred[loc[tmp_fp_reg]] == tmp_fp_reg); + /* now we resolve all "windmill axles" - cycles (this requires temporary registers) */ while (todo != IR_REGSET_EMPTY) { to = ir_regset_pop_first(&todo); from = pred[to]; @@ -625,39 +594,271 @@ static int ir_parallel_copy(ir_ctx *ctx, ir_copy *copies, int count, ir_reg tmp_ } } - if (last_reg != IR_REG_NONE) { - to = last_reg; - from = pred[to]; - type = types[from]; - from = loc[from]; - if (to != from) { - IR_ASSERT(IR_IS_TYPE_INT(type)); - ir_emit_mov_ext(ctx, type, to, from); + return 1; +} + +static void ir_emit_dessa_move(ir_ctx *ctx, ir_type type, ir_ref to, ir_ref from, ir_reg tmp_reg, ir_reg tmp_fp_reg) +{ + ir_mem mem_from, mem_to; + + IR_ASSERT(from != to); + if (to < IR_REG_NUM) { + if (IR_IS_CONST_REF(from)) { + ir_emit_load(ctx, type, to, from); + } else if (from < IR_REG_NUM) { + if (IR_IS_TYPE_INT(type)) { + ir_emit_mov(ctx, 
type, to, from); + } else { + ir_emit_fp_mov(ctx, type, to, from); + } + } else { + mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM); + ir_emit_load_mem(ctx, type, to, mem_from); + } + } else { + mem_to = ir_vreg_spill_slot(ctx, to - IR_REG_NUM); + if (IR_IS_CONST_REF(from)) { +#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64) + if (IR_IS_TYPE_INT(type) + && !IR_IS_SYM_CONST(ctx->ir_base[from].op) + && (ir_type_size[type] != 8 || IR_IS_SIGNED_32BIT(ctx->ir_base[from].val.i64))) { + ir_emit_store_mem_imm(ctx, type, mem_to, ctx->ir_base[from].val.i32); + return; + } +#endif + ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg; + IR_ASSERT(tmp != IR_REG_NONE); + ir_emit_load(ctx, type, tmp, from); + ir_emit_store_mem(ctx, type, mem_to, tmp); + } else if (from < IR_REG_NUM) { + ir_emit_store_mem(ctx, type, mem_to, from); + } else { + mem_from = ir_vreg_spill_slot(ctx, from - IR_REG_NUM); + IR_ASSERT(IR_MEM_VAL(mem_to) != IR_MEM_VAL(mem_from)); + ir_reg tmp = IR_IS_TYPE_INT(type) ? 
tmp_reg : tmp_fp_reg; + IR_ASSERT(tmp != IR_REG_NONE); + ir_emit_load_mem(ctx, type, tmp, mem_from); + ir_emit_store_mem(ctx, type, mem_to, tmp); + } + } +} + +IR_ALWAYS_INLINE void ir_dessa_resolve_cycle(ir_ctx *ctx, int32_t *pred, int32_t *loc, ir_bitset todo, ir_type type, int32_t to, ir_reg tmp_reg, ir_reg tmp_fp_reg) +{ /* Emits the moves for the copy cycle that contains "to": the value of "to" is parked in tmp_reg/tmp_fp_reg, the chain of pred[] moves is emitted, and every resolved destination is removed from "todo" */ + int32_t from; /* same width as pred[]/loc[] entries: they may be IR_REG_NUM + vreg, which is outside the CPU register range */ + ir_mem tmp_spill_slot; + + IR_MEM_VAL(tmp_spill_slot) = 0; /* 0 means the temporary spill slot has not been used yet */ + IR_ASSERT(!IR_IS_CONST_REF(to)); + from = pred[to]; + IR_ASSERT(!IR_IS_CONST_REF(from)); + IR_ASSERT(from != to); + IR_ASSERT(loc[from] == from); + + if (IR_IS_TYPE_INT(type)) { +#ifdef IR_HAVE_SWAP_INT + if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM) { + /* a simple cycle of 2 elements */ + ir_emit_swap(ctx, type, to, from); + ir_bitset_excl(todo, from); + ir_bitset_excl(todo, to); + loc[to] = from; + loc[from] = to; + return; + } +#endif + IR_ASSERT(tmp_reg != IR_REG_NONE); + IR_ASSERT(tmp_reg >= IR_REG_GP_FIRST && tmp_reg <= IR_REG_GP_LAST); + loc[to] = tmp_reg; + if (to < IR_REG_NUM) { + ir_emit_mov(ctx, type, tmp_reg, to); + } else { + ir_emit_load_mem_int(ctx, type, tmp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM)); + } + } else { +#ifdef IR_HAVE_SWAP_FP + if (pred[from] == to && to < IR_REG_NUM && from < IR_REG_NUM) { + /* a simple cycle of 2 elements */ + ir_emit_swap_fp(ctx, type, to, from); + ir_bitset_excl(todo, from); /* "todo" is an ir_bitset here, same as in the integer branch */ + ir_bitset_excl(todo, to); + loc[to] = from; + loc[from] = to; + return; + } +#endif + IR_ASSERT(tmp_fp_reg != IR_REG_NONE); + IR_ASSERT(tmp_fp_reg >= IR_REG_FP_FIRST && tmp_fp_reg <= IR_REG_FP_LAST); + loc[to] = tmp_fp_reg; + if (to < IR_REG_NUM) { + ir_emit_fp_mov(ctx, type, tmp_fp_reg, to); + } else { + ir_emit_load_mem_fp(ctx, type, tmp_fp_reg, ir_vreg_spill_slot(ctx, to - IR_REG_NUM)); } } - if (last_fp_reg != IR_REG_NONE) { - to = last_fp_reg; + while (1) { + int32_t r; + from = pred[to]; - type = types[from]; - from = loc[from]; - if (to != from) { - IR_ASSERT(!IR_IS_TYPE_INT(type)); - ir_emit_fp_mov(ctx, type, to, 
from); + r = loc[from]; + + if (from == r && ir_bitset_in(todo, from)) { + /* Memory to memory move inside an isolated or "blocked" cycle requires an additional temporary register */ + if (to >= IR_REG_NUM && r >= IR_REG_NUM) { + ir_reg tmp = IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg; + + if (!IR_MEM_VAL(tmp_spill_slot)) { + /* Free a register, saving it in a temporary spill slot */ + tmp_spill_slot = IR_MEM_BO(IR_REG_STACK_POINTER, -16); /* NOTE(review): this slot is below the stack pointer; presumably that area is safe to use on every supported target - confirm */ + ir_emit_store_mem(ctx, type, tmp_spill_slot, tmp); + } + ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg); + } else { + ir_emit_dessa_move(ctx, type, to, r, IR_REG_NONE, IR_REG_NONE); + } + ir_bitset_excl(todo, to); + loc[from] = to; + to = from; + } else { + break; + } + } + if (IR_MEM_VAL(tmp_spill_slot)) { + ir_emit_load_mem(ctx, type, IR_IS_TYPE_INT(type) ? tmp_reg : tmp_fp_reg, tmp_spill_slot); + } + ir_emit_dessa_move(ctx, type, to, loc[from], IR_REG_NONE, IR_REG_NONE); + ir_bitset_excl(todo, to); + loc[from] = to; +} + +static int ir_dessa_parallel_copy(ir_ctx *ctx, ir_dessa_copy *copies, int count, ir_reg tmp_reg, ir_reg tmp_fp_reg) +{ + int i; + int32_t *pred, *loc, to, from; + int8_t *types; + ir_type type; + uint32_t len; + ir_bitset todo, ready, srcs, visited; + + if (count == 1) { + to = copies[0].to; + from = copies[0].from; + IR_ASSERT(from != to); + type = copies[0].type; + ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg); + return 1; + } + + len = IR_REG_NUM + ctx->vregs_count + 1; + todo = ir_bitset_malloc(len); + srcs = ir_bitset_malloc(len); + loc = ir_mem_malloc(len * 2 * sizeof(int32_t) + len * sizeof(int8_t)); + pred = loc + len; + types = (int8_t*)(pred + len); + + for (i = 0; i < count; i++) { + from = copies[i].from; + to = copies[i].to; + IR_ASSERT(from != to); + if (!IR_IS_CONST_REF(from)) { + ir_bitset_incl(srcs, from); + loc[from] = from; + } + pred[to] = from; + types[to] = copies[i].type; + IR_ASSERT(!ir_bitset_in(todo, to)); + ir_bitset_incl(todo, to); + } + + /* temporary 
registers can't be the same as some of the sources */ + IR_ASSERT(tmp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_reg)); + IR_ASSERT(tmp_fp_reg == IR_REG_NONE || !ir_bitset_in(srcs, tmp_fp_reg)); + + /* first we resolve all "windmill blades" - trees, that don't set temporary registers */ + ready = ir_bitset_malloc(len); + ir_bitset_copy(ready, todo, ir_bitset_len(len)); + ir_bitset_difference(ready, srcs, ir_bitset_len(len)); + if (tmp_reg != IR_REG_NONE) { + ir_bitset_excl(ready, tmp_reg); + } + if (tmp_fp_reg != IR_REG_NONE) { + ir_bitset_excl(ready, tmp_fp_reg); + } + while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) { + ir_bitset_excl(todo, to); + type = types[to]; + from = pred[to]; + if (IR_IS_CONST_REF(from)) { + ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg); + } else { + int32_t r = loc[from]; + ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg); + loc[from] = to; + if (from == r && ir_bitset_in(todo, from) && from != tmp_reg && from != tmp_fp_reg) { + ir_bitset_incl(ready, from); + } } } + /* then we resolve all "windmill axles" - cycles (this requires temporary registers) */ + visited = ir_bitset_malloc(len); + ir_bitset_copy(ready, todo, ir_bitset_len(len)); + ir_bitset_intersection(ready, srcs, ir_bitset_len(len)); + while ((to = ir_bitset_first(ready, ir_bitset_len(len))) >= 0) { + ir_bitset_clear(visited, ir_bitset_len(len)); + ir_bitset_incl(visited, to); + to = pred[to]; + while (!IR_IS_CONST_REF(to) && ir_bitset_in(ready, to)) { + to = pred[to]; + if (!IR_IS_CONST_REF(to) && ir_bitset_in(visited, to)) { + /* We found a cycle. Resolve it. 
*/ + ir_bitset_incl(visited, to); + type = types[to]; + ir_dessa_resolve_cycle(ctx, pred, loc, todo, type, to, tmp_reg, tmp_fp_reg); + break; + } + ir_bitset_incl(visited, to); + } + ir_bitset_difference(ready, visited, ir_bitset_len(len)); + } + + /* finally we resolve remaining "windmill blades" - trees that set temporary registers */ + ir_bitset_copy(ready, todo, ir_bitset_len(len)); + ir_bitset_difference(ready, srcs, ir_bitset_len(len)); + while ((to = ir_bitset_pop_first(ready, ir_bitset_len(len))) >= 0) { + ir_bitset_excl(todo, to); + type = types[to]; + from = pred[to]; + if (IR_IS_CONST_REF(from)) { + ir_emit_dessa_move(ctx, type, to, from, tmp_reg, tmp_fp_reg); + } else { + int32_t r = loc[from]; + ir_emit_dessa_move(ctx, type, to, r, tmp_reg, tmp_fp_reg); + loc[from] = to; + if (from == r && ir_bitset_in(todo, from)) { + ir_bitset_incl(ready, from); + } + } + } + + IR_ASSERT(ir_bitset_empty(todo, ir_bitset_len(len))); + + ir_mem_free(visited); + ir_mem_free(ready); + ir_mem_free(loc); + ir_mem_free(srcs); + ir_mem_free(todo); return 1; } static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb) { - uint32_t succ, k, n = 0, n2 = 0; + uint32_t succ, k, n = 0; ir_block *succ_bb; ir_use_list *use_list; ir_ref i, *p; - ir_copy *copies; - ir_delayed_copy *copies2; + ir_dessa_copy *copies; ir_reg tmp_reg = ctx->regs[bb->end][0]; ir_reg tmp_fp_reg = ctx->regs[bb->end][1]; @@ -668,8 +869,7 @@ static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb) use_list = &ctx->use_lists[succ_bb->start]; k = ir_phi_input_number(ctx, succ_bb, b); - copies = ir_mem_malloc(use_list->count * sizeof(ir_copy) + use_list->count * sizeof(ir_delayed_copy)); - copies2 = (ir_delayed_copy*)(copies + use_list->count); + copies = alloca(use_list->count * sizeof(ir_dessa_copy)); for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) { ir_ref ref = *p; @@ -679,96 +879,29 @@ static void ir_emit_dessa_moves(ir_ctx *ctx, int b, ir_block *bb) ir_ref 
input = ir_insn_op(insn, k); ir_reg src = ir_get_alocated_reg(ctx, ref, k); ir_reg dst = ctx->regs[ref][0]; + ir_ref from, to; - if (dst == IR_REG_NONE) { - /* STORE to memory cannot clobber any input register (do it right now) */ - if (IR_IS_CONST_REF(input)) { - IR_ASSERT(src == IR_REG_NONE); -#if defined(IR_TARGET_X86) || defined(IR_TARGET_X64) - if (IR_IS_TYPE_INT(insn->type) - && !IR_IS_SYM_CONST(ctx->ir_base[input].op) - && (ir_type_size[insn->type] != 8 || IR_IS_SIGNED_32BIT(ctx->ir_base[input].val.i64))) { - ir_emit_store_imm(ctx, insn->type, ref, ctx->ir_base[input].val.i32); - continue; - } -#endif - ir_reg tmp = IR_IS_TYPE_INT(insn->type) ? tmp_reg : tmp_fp_reg; - - IR_ASSERT(tmp != IR_REG_NONE); - ir_emit_load(ctx, insn->type, tmp, input); - ir_emit_store(ctx, insn->type, ref, tmp); - } else if (src == IR_REG_NONE) { - if (!ir_is_same_mem(ctx, input, ref)) { - ir_reg tmp = IR_IS_TYPE_INT(insn->type) ? tmp_reg : tmp_fp_reg; - - IR_ASSERT(tmp != IR_REG_NONE); - ir_emit_load(ctx, insn->type, tmp, input); - ir_emit_store(ctx, insn->type, ref, tmp); - } - } else { - if (IR_REG_SPILLED(src)) { - src = IR_REG_NUM(src); - ir_emit_load(ctx, insn->type, src, input); - if (ir_is_same_mem(ctx, input, ref)) { - continue; - } - } - ir_emit_store(ctx, insn->type, ref, src); - } - } else if (src == IR_REG_NONE) { - /* STORE of constant or memory can't be clobbered by parallel reg->reg copies (delay it) */ - copies2[n2].input = input; - copies2[n2].output = ref; - copies2[n2].type = insn->type; - copies2[n2].from = src; - copies2[n2].to = dst; - n2++; + IR_ASSERT(dst == IR_REG_NONE || !IR_REG_SPILLED(dst)); + if (IR_IS_CONST_REF(input)) { + from = input; } else { - IR_ASSERT(!IR_IS_CONST_REF(input)); - if (IR_REG_SPILLED(src)) { - ir_emit_load(ctx, insn->type, IR_REG_NUM(src), input); - } - if (IR_REG_SPILLED(dst) && (!IR_REG_SPILLED(src) || !ir_is_same_mem(ctx, input, ref))) { - ir_emit_store(ctx, insn->type, ref, IR_REG_NUM(src)); - } - if (IR_REG_NUM(src) != 
IR_REG_NUM(dst)) { - /* Schedule parallel reg->reg copy */ - copies[n].type = insn->type; - copies[n].from = IR_REG_NUM(src); - copies[n].to = IR_REG_NUM(dst); - n++; - } + from = (src != IR_REG_NONE && !IR_REG_SPILLED(src)) ? + (ir_ref)src : (ir_ref)(IR_REG_NUM + ctx->vregs[input]); + } + to = (dst != IR_REG_NONE) ? + (ir_ref)dst : (ir_ref)(IR_REG_NUM + ctx->vregs[ref]); + if (to != from) { + copies[n].type = insn->type; + copies[n].from = from; + copies[n].to = to; + n++; } } } if (n > 0) { - ir_parallel_copy(ctx, copies, n, tmp_reg, tmp_fp_reg); + ir_dessa_parallel_copy(ctx, copies, n, tmp_reg, tmp_fp_reg); } - - for (n = 0; n < n2; n++) { - ir_ref input = copies2[n].input; - ir_ref ref = copies2[n].output; - ir_type type = copies2[n].type; - ir_reg dst = copies2[n].to; - - IR_ASSERT(dst != IR_REG_NONE); - if (IR_IS_CONST_REF(input)) { - ir_emit_load(ctx, type, IR_REG_NUM(dst), input); - } else { - IR_ASSERT(copies2[n].from == IR_REG_NONE); - if (IR_REG_SPILLED(dst) && ir_is_same_mem(ctx, input, ref)) { - /* avoid LOAD and STORE to the same memory */ - continue; - } - ir_emit_load(ctx, type, IR_REG_NUM(dst), input); - } - if (IR_REG_SPILLED(dst)) { - ir_emit_store(ctx, type, ref, IR_REG_NUM(dst)); - } - } - - ir_mem_free(copies); } int ir_match(ir_ctx *ctx) diff --git a/ext/opcache/jit/ir/ir_ra.c b/ext/opcache/jit/ir/ir_ra.c index 8cca8d08be1..d94108171da 100644 --- a/ext/opcache/jit/ir/ir_ra.c +++ b/ext/opcache/jit/ir/ir_ra.c @@ -2021,12 +2021,22 @@ int ir_compute_dessa_moves(ir_ctx *ctx) return 1; } +/* + * Parallel copy sequentialization algorithm + * + * The implementation is based on algorithm 1 described in + * "Revisiting Out-of-SSA Translation for Correctness, Code Quality and Efficiency", + * Benoit Boissinot, Alain Darte, Fabrice Rastello, Benoit Dupont de Dinechin, Christophe Guillon. + * 2009 International Symposium on Code Generation and Optimization, Seattle, WA, USA, 2009, + * pp. 114-125, doi: 10.1109/CGO.2009.19. 
+ */ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy) { uint32_t succ, k, n = 0; ir_block *bb, *succ_bb; ir_use_list *use_list; - ir_ref *loc, *pred, i, *p, ref, input; + ir_ref *loc, *pred, *src, *dst, i, *p, ref, input; + ir_ref s, d; ir_insn *insn; uint32_t len; ir_bitset todo, ready; @@ -2044,10 +2054,12 @@ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy) k = ir_phi_input_number(ctx, succ_bb, b); - loc = ir_mem_malloc(ctx->insns_count * 2 * sizeof(ir_ref)); - pred = loc + ctx->insns_count; - len = ir_bitset_len(ctx->insns_count); - todo = ir_bitset_malloc(ctx->insns_count); + loc = ir_mem_malloc((ctx->vregs_count + 1) * 4 * sizeof(ir_ref)); + pred = loc + ctx->vregs_count + 1; + src = pred + ctx->vregs_count + 1; + dst = src + ctx->vregs_count + 1; + len = ir_bitset_len(ctx->vregs_count + 1); + todo = ir_bitset_malloc(ctx->vregs_count + 1); for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) { ref = *p; @@ -2057,21 +2069,28 @@ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy) if (IR_IS_CONST_REF(input)) { have_constants = 1; } else if (ctx->vregs[input] != ctx->vregs[ref]) { - loc[ref] = pred[input] = 0; - ir_bitset_incl(todo, ref); + s = ctx->vregs[input]; + d = ctx->vregs[ref]; + src[s] = input; + dst[d] = ref; + loc[d] = pred[s] = 0; + ir_bitset_incl(todo, d); n++; } } } if (n > 0) { - ready = ir_bitset_malloc(ctx->insns_count); - IR_BITSET_FOREACH(todo, len, ref) { + src[0] = dst[0] = 0; + ready = ir_bitset_malloc(ctx->vregs_count + 1); + IR_BITSET_FOREACH(todo, len, d) { + ref = dst[d]; insn = &ctx->ir_base[ref]; IR_ASSERT(insn->op == IR_PHI); input = ir_insn_op(insn, k); - loc[input] = input; - pred[ref] = input; + s = ctx->vregs[input]; + loc[s] = s; + pred[d] = s; } IR_BITSET_FOREACH_END(); IR_BITSET_FOREACH(todo, len, i) { @@ -2086,9 +2105,10 @@ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy) while ((b = ir_bitset_pop_first(ready, len)) 
>= 0) { a = pred[b]; c = loc[a]; - emit_copy(ctx, ctx->ir_base[b].type, c, b); + emit_copy(ctx, ctx->ir_base[dst[b]].type, src[c], dst[b]); ir_bitset_excl(todo, b); loc[a] = b; + src[b] = dst[b]; if (a == c && pred[a]) { ir_bitset_incl(ready, a); } @@ -2098,7 +2118,7 @@ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy) break; } IR_ASSERT(b != loc[pred[b]]); - emit_copy(ctx, ctx->ir_base[b].type, b, 0); + emit_copy(ctx, ctx->ir_base[src[b]].type, src[b], 0); loc[b] = 0; ir_bitset_incl(ready, b); } diff --git a/ext/opcache/jit/ir/ir_x86.dasc b/ext/opcache/jit/ir/ir_x86.dasc index dd34e023ba8..0b9ac8af32e 100644 --- a/ext/opcache/jit/ir/ir_x86.dasc +++ b/ext/opcache/jit/ir/ir_x86.dasc @@ -2560,15 +2560,15 @@ static int32_t ir_ref_spill_slot_offset(ir_ctx *ctx, ir_ref ref, ir_reg *reg) return IR_SPILL_POS_TO_OFFSET(offset); } -static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref) +static ir_mem ir_vreg_spill_slot(ir_ctx *ctx, ir_ref v) { int32_t offset; ir_reg base; - IR_ASSERT(ref >= 0 && ctx->vregs[ref] && ctx->live_intervals[ctx->vregs[ref]]); - offset = ctx->live_intervals[ctx->vregs[ref]]->stack_spill_pos; + IR_ASSERT(v > 0 && v <= ctx->vregs_count && ctx->live_intervals[v]); + offset = ctx->live_intervals[v]->stack_spill_pos; IR_ASSERT(offset != -1); - if (ctx->live_intervals[ctx->vregs[ref]]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) { + if (ctx->live_intervals[v]->flags & IR_LIVE_INTERVAL_SPILL_SPECIAL) { IR_ASSERT(ctx->spill_base != IR_REG_NONE); return IR_MEM_BO(ctx->spill_base, offset); } @@ -2577,6 +2577,12 @@ static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref) return IR_MEM_BO(base, offset); } +static ir_mem ir_ref_spill_slot(ir_ctx *ctx, ir_ref ref) +{ + IR_ASSERT(!IR_IS_CONST_REF(ref)); + return ir_vreg_spill_slot(ctx, ctx->vregs[ref]); +} + static bool ir_is_same_spill_slot(ir_ctx *ctx, ir_ref ref, ir_mem mem) { ir_mem m = ir_ref_spill_slot(ctx, ref); @@ -2814,13 +2820,6 @@ static void ir_emit_store(ir_ctx *ctx, ir_type type, 
ir_ref dst, ir_reg reg) ir_emit_store_mem(ctx, type, ir_ref_spill_slot(ctx, dst), reg); } -static void ir_emit_store_imm(ir_ctx *ctx, ir_type type, ir_ref dst, int32_t imm) -{ - IR_ASSERT(dst >= 0); - IR_ASSERT(IR_IS_TYPE_INT(type)); - ir_emit_store_mem_imm(ctx, type, ir_ref_spill_slot(ctx, dst), imm); -} - static void ir_emit_mov(ir_ctx *ctx, ir_type type, ir_reg dst, ir_reg src) { ir_backend_data *data = ctx->data;