Update IR

IR commit: fe4ba285bc576d83bea4a8099fb7315b8bc8c7fb
2026-03-24 00:02:20 +01:00 · 2024-05-06 22:22:15 +03:00
parent 3fcf6ffbc4
commit bb21d195c1
8 changed files with 156 additions and 65 deletions
--- a/ext/opcache/jit/ir/ir.h
+++ b/ext/opcache/jit/ir/ir.h
@@ -541,6 +541,7 @@ void ir_strtab_free(ir_strtab *strtab);
 # define IR_DEBUG_GCM_SPLIT    (1<<28)
 # define IR_DEBUG_SCHEDULE     (1<<29)
 # define IR_DEBUG_RA           (1<<30)
+# define IR_DEBUG_BB_SCHEDULE  (1U<<31)
 #endif

 typedef struct _ir_ctx           ir_ctx;
--- a/ext/opcache/jit/ir/ir_aarch64.dasc
+++ b/ext/opcache/jit/ir/ir_aarch64.dasc
@@ -1010,7 +1010,7 @@ binop_fp:
 				return IR_RETURN_FP;
 			}
 		case IR_IF:
-			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
+			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
 				op2_insn = &ctx->ir_base[insn->op2];
 				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT) {
 					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
@@ -1020,7 +1020,7 @@ binop_fp:
 						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
 						return IR_CMP_AND_BRANCH_FP;
 					}
-				} else if (op2_insn->op == IR_OVERFLOW) {
+				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
 					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
 					return IR_OVERFLOW_AND_BRANCH;
 				}
@@ -1033,7 +1033,7 @@ binop_fp:
 			}
 		case IR_GUARD:
 		case IR_GUARD_NOT:
-			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
+			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
 				op2_insn = &ctx->ir_base[insn->op2];
 				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT
 					// TODO: register allocator may clobber operands of CMP before they are used in the GUARD_CMP
@@ -1047,7 +1047,7 @@ binop_fp:
 						ctx->rules[insn->op2] = IR_FUSED | IR_CMP_FP;
 						return IR_GUARD_CMP_FP;
 					}
-				} else if (op2_insn->op == IR_OVERFLOW) {
+				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
 					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
 					return IR_GUARD_OVERFLOW;
 				}
--- a/ext/opcache/jit/ir/ir_cfg.c
+++ b/ext/opcache/jit/ir/ir_cfg.c
@@ -1151,13 +1151,11 @@ static void ir_insert_chain_before(ir_chain *chains, uint32_t c, uint32_t before
 }

 #ifndef IR_DEBUG_BB_SCHEDULE_GRAPH
-# define IR_DEBUG_BB_SCHEDULE_GRAPH 0
-#endif
-#ifndef IR_DEBUG_BB_SCHEDULE_EDGES
-# define IR_DEBUG_BB_SCHEDULE_EDGES 0
-#endif
-#ifndef IR_DEBUG_BB_SCHEDULE_CHAINS
-# define IR_DEBUG_BB_SCHEDULE_CHAINS 0
+# ifdef IR_DEBUG
+#  define IR_DEBUG_BB_SCHEDULE_GRAPH 1
+# else
+#  define IR_DEBUG_BB_SCHEDULE_GRAPH 0
+# endif
 #endif

 #if IR_DEBUG_BB_SCHEDULE_GRAPH
@@ -1210,20 +1208,17 @@ static void ir_dump_cfg_freq_graph(ir_ctx *ctx, float *bb_freq, uint32_t edges_c
 }
 #endif

-#if IR_DEBUG_BB_SCHEDULE_EDGES
+#ifdef IR_DEBUG
 static void ir_dump_edges(ir_ctx *ctx, uint32_t edges_count, ir_edge_info *edges)
 {
 	uint32_t i;

 	fprintf(stderr, "Edges:\n");
 	for (i = 0; i < edges_count; i++) {
-		fprintf(stderr, "\tBB%d -> BB%d [label=\"%0.3f\"]\n", edges[i].from, edges[i].to, edges[i].freq);
+		fprintf(stderr, "\tBB%d -> BB%d %0.3f\n", edges[i].from, edges[i].to, edges[i].freq);
 	}
-	fprintf(stderr, "}\n");
 }
-#endif

-#if IR_DEBUG_BB_SCHEDULE_CHAINS
 static void ir_dump_chains(ir_ctx *ctx, ir_chain *chains)
 {
 	uint32_t b, tail, i;
@@ -1507,8 +1502,10 @@ restart:
 	/* 2. Sort EDGEs according to their frequencies */
 	qsort(edges, edges_count, sizeof(ir_edge_info), ir_edge_info_cmp);

-#if IR_DEBUG_BB_SCHEDULE_EDGES
-	ir_dump_edges(ctx, edges_count, edges);
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
+		ir_dump_edges(ctx, edges_count, edges);
+	}
 #endif

 	/* 3. Process EDGEs in the decreasing frequency order and join the connected chains */
@@ -1555,13 +1552,17 @@ restart:
 	}

 #if IR_DEBUG_BB_SCHEDULE_GRAPH
-	ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
+	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
+		ir_dump_cfg_freq_graph(ctx, bb_freq, edges_count, edges, chains);
+	}
 #endif

 	ir_mem_free(bb_freq);

-#if IR_DEBUG_BB_SCHEDULE_CHAINS
-	ir_dump_chains(ctx, chains);
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
+		ir_dump_chains(ctx, chains);
+	}
 #endif

 	/* 4. Merge empty entry blocks */
@@ -1585,8 +1586,10 @@ restart:
 			}
 		}

-#if IR_DEBUG_BB_SCHEDULE_CHAINS
-		ir_dump_chains(ctx, chains);
+#ifdef IR_DEBUG
+		if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
+			ir_dump_chains(ctx, chains);
+		}
 #endif
 	}

@@ -1619,8 +1622,10 @@ restart:
 		}
 	}

-#if IR_DEBUG_BB_SCHEDULE_CHAINS
-	ir_dump_chains(ctx, chains);
+#ifdef IR_DEBUG
+	if (ctx->flags & IR_DEBUG_BB_SCHEDULE) {
+		ir_dump_chains(ctx, chains);
+	}
 #endif

 	/* 7. Form a final BB order */
--- a/ext/opcache/jit/ir/ir_gcm.c
+++ b/ext/opcache/jit/ir/ir_gcm.c
@@ -84,6 +84,7 @@ static uint32_t ir_gcm_select_best_block(ir_ctx *ctx, ir_ref ref, uint32_t lca)
 		return lca;
 	}

+#if 0 /* This is not necessary anymore. Conditions may be fused with IF across BBs. */
 	if (ctx->ir_base[ref].op >= IR_EQ && ctx->ir_base[ref].op <= IR_UGT) {
 		ir_use_list *use_list = &ctx->use_lists[ref];

@@ -96,6 +97,7 @@ static uint32_t ir_gcm_select_best_block(ir_ctx *ctx, ir_ref ref, uint32_t lca)
 			}
 		}
 	}
+#endif

 	flags = (bb->flags & IR_BB_LOOP_HEADER) ? bb->flags : ctx->cfg_blocks[bb->loop_header].flags;
 	if ((flags & IR_BB_LOOP_WITH_ENTRY)
@@ -487,9 +489,19 @@ static void ir_gcm_schedule_late(ir_ctx *ctx, ir_ref ref, uint32_t b)
 		b = ir_gcm_select_best_block(ctx, ref, lca);

 		ctx->cfg_map[ref] = b;
-		if (ctx->ir_base[ref + 1].op == IR_OVERFLOW) {
-			/* OVERFLOW is a projection and must be scheduled together with previous ADD/SUB/MUL_OV */
-			ctx->cfg_map[ref + 1] = b;
+
+		/* OVERFLOW is a projection of ADD/SUB/MUL_OV and must be scheduled into the same block */
+		if (ctx->ir_base[ref].op >= IR_ADD_OV && ctx->ir_base[ref].op <= IR_MUL_OV) {
+			ir_use_list *use_list = &ctx->use_lists[ref];
+			ir_ref n, *p, use;
+			/* Scan the uses of the *_OV node for its OVERFLOW projection ("n" counts the remaining uses down) */
+			for (n = use_list->count, p = &ctx->use_edges[use_list->refs]; n > 0; p++, n--) {
+				use = *p;
+				if (ctx->ir_base[use].op == IR_OVERFLOW) {
+					ctx->cfg_map[use] = b;
+					break;
+				}
+			}
 		}
 	}
 }
--- a/ext/opcache/jit/ir/ir_private.h
+++ b/ext/opcache/jit/ir/ir_private.h
@@ -582,6 +582,17 @@ IR_ALWAYS_INLINE void ir_bitqueue_init(ir_bitqueue *q, uint32_t n)
 	q->set = ir_bitset_malloc(n);
 }

+IR_ALWAYS_INLINE void ir_bitqueue_grow(ir_bitqueue *q, uint32_t n) /* Grow q->set to ir_bitset_len(n) elements, zero-filling the new tail; never shrinks. */
+{
+	uint32_t len = ir_bitset_len(n); /* presumably the number of bitset elements needed for n bits - confirm against ir_bitset_len() */
+	IR_ASSERT(len >= q->len); /* shrinking is not supported */
+	if (len > q->len) {
+		q->set = ir_mem_realloc(q->set, len * (IR_BITSET_BITS / 8)); /* size in bytes: IR_BITSET_BITS / 8 bytes per element */
+		memset(q->set + q->len, 0, (len - q->len) * (IR_BITSET_BITS / 8)); /* clear only the newly added elements */
+		q->len = len;
+	}
+}
+
 IR_ALWAYS_INLINE void ir_bitqueue_free(ir_bitqueue *q)
 {
 	ir_mem_free(q->set);
--- a/ext/opcache/jit/ir/ir_ra.c
+++ b/ext/opcache/jit/ir/ir_ra.c
@@ -2115,7 +2115,7 @@ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy)
 	ir_insn *insn;
 	uint32_t len;
 	ir_bitset todo, ready;
-	bool have_constants = 0;
+	bool have_constants_or_addresses = 0;

 	bb = &ctx->cfg_blocks[b];
 	if (!(bb->flags & IR_BB_DESSA_MOVES)) {
@@ -2141,8 +2141,8 @@ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy)
 		insn = &ctx->ir_base[ref];
 		if (insn->op == IR_PHI) {
 			input = ir_insn_op(insn, k);
-			if (IR_IS_CONST_REF(input)) {
-				have_constants = 1;
+			if (IR_IS_CONST_REF(input) || !ctx->vregs[input]) {
+				have_constants_or_addresses = 1;
 			} else if (ctx->vregs[input] != ctx->vregs[ref]) {
 				s = ctx->vregs[input];
 				d = ctx->vregs[ref];
@@ -2204,13 +2204,13 @@ int ir_gen_dessa_moves(ir_ctx *ctx, uint32_t b, emit_copy_t emit_copy)
 	ir_mem_free(todo);
 	ir_mem_free(loc);

-	if (have_constants) {
+	if (have_constants_or_addresses) {
 		for (i = 0, p = &ctx->use_edges[use_list->refs]; i < use_list->count; i++, p++) {
 			ref = *p;
 			insn = &ctx->ir_base[ref];
 			if (insn->op == IR_PHI) {
 				input = ir_insn_op(insn, k);
-				if (IR_IS_CONST_REF(input)) {
+				if (IR_IS_CONST_REF(input) || !ctx->vregs[input]) {
 					emit_copy(ctx, insn->type, input, ref);
 				}
 			}
--- a/ext/opcache/jit/ir/ir_sccp.c
+++ b/ext/opcache/jit/ir/ir_sccp.c
@@ -347,7 +347,8 @@ static void ir_sccp_remove_insn(ir_ctx *ctx, ir_insn *_values, ir_ref ref, ir_bi
 	for (j = 1, p = insn->ops + j; j <= n; j++, p++) {
 		ir_ref input = *p;
 		*p = IR_UNUSED;
-		if (input > 0 && _values[input].op == IR_BOTTOM) {
+		/* we may skip nodes that are going to be removed by SCCP (TOP, CONST and COPY) */
+		if (input > 0 && _values[input].op > IR_COPY) {
 			ir_use_list_remove_all(ctx, input, ref);
 			if (ir_is_dead(ctx, input)) {
 				/* schedule DCE */
@@ -396,13 +397,12 @@ static void ir_sccp_replace_insn(ir_ctx *ctx, ir_insn *_values, ir_ref ref, ir_r
 	for (j = 1, p = insn->ops + 1; j <= n; j++, p++) {
 		ir_ref input = *p;
 		*p = IR_UNUSED;
-		if (input > 0) {
+		/* we may skip nodes that are going to be removed by SCCP (TOP, CONST and COPY) */
+		if (input > 0 && _values[input].op > IR_COPY) {
 			ir_use_list_remove_all(ctx, input, ref);
-			if (_values[input].op == IR_BOTTOM) {
-				if (ir_is_dead(ctx, input)) {
-					/* schedule DCE */
-					ir_bitqueue_add(worklist, input);
-				}
+			if (ir_is_dead(ctx, input)) {
+				/* schedule DCE */
+				ir_bitqueue_add(worklist, input);
 			}
 		}
 	}
@@ -429,8 +429,9 @@ static void ir_sccp_replace_insn(ir_ctx *ctx, ir_insn *_values, ir_ref ref, ir_r
 				}
 			}
 #endif
-			/* schedule folding */
-			if (worklist && _values[use].op == IR_BOTTOM) {
+			/* we may skip nodes that are going to be removed by SCCP (TOP, CONST and COPY) */
+			if (worklist && _values[use].op > IR_COPY) {
+				/* schedule folding */
 				ir_bitqueue_add(worklist, use);
 			}
 		}
@@ -1067,7 +1068,7 @@ static ir_ref ir_ext_const(ir_ctx *ctx, ir_insn *val_insn, ir_op op, ir_type typ
 	return ir_const(ctx, new_val, type);
 }

-static ir_ref ir_ext_ref(ir_ctx *ctx, ir_ref var_ref, ir_ref src_ref, ir_op op, ir_type type)
+static ir_ref ir_ext_ref(ir_ctx *ctx, ir_ref var_ref, ir_ref src_ref, ir_op op, ir_type type, ir_bitqueue *worklist)
 {
 	uint32_t optx = IR_OPTX(op, type, 1);
 	ir_ref ref;
@@ -1079,6 +1080,7 @@ static ir_ref ir_ext_ref(ir_ctx *ctx, ir_ref var_ref, ir_ref src_ref, ir_op op,
 			if (!IR_IS_CONST_REF(src_ref)) {
 				ir_use_list_remove_one(ctx, src_ref, var_ref);
 			}
+			ir_bitqueue_add(worklist, ref);
 			return ref;
 		}
 	}
@@ -1091,6 +1093,8 @@ static ir_ref ir_ext_ref(ir_ctx *ctx, ir_ref var_ref, ir_ref src_ref, ir_op op,
 	if (!IR_IS_CONST_REF(src_ref)) {
 		ir_use_list_replace_one(ctx, src_ref, var_ref, ref);
 	}
+	ir_bitqueue_grow(worklist, ref + 1);
+	ir_bitqueue_add(worklist, ref);
 	return ref;
 }

@@ -1162,8 +1166,7 @@ static bool ir_try_promote_ext(ir_ctx *ctx, ir_ref ext_ref, ir_insn *insn, ir_bi
 				 && !IR_IS_SYM_CONST(ctx->ir_base[use_insn->op1].op)) {
 					ctx->ir_base[use].op1 = ir_ext_const(ctx, &ctx->ir_base[use_insn->op1], op, type);
 				} else {
-					ctx->ir_base[use].op1 = ir_ext_ref(ctx, use, use_insn->op1, op, type);
-					ir_bitqueue_add(worklist, ctx->ir_base[use].op1);
+					ctx->ir_base[use].op1 = ir_ext_ref(ctx, use, use_insn->op1, op, type, worklist);
 				}
 			}
 			if (use_insn->op2 != ref) {
@@ -1171,8 +1174,7 @@ static bool ir_try_promote_ext(ir_ctx *ctx, ir_ref ext_ref, ir_insn *insn, ir_bi
 				 && !IR_IS_SYM_CONST(ctx->ir_base[use_insn->op2].op)) {
 					ctx->ir_base[use].op2 = ir_ext_const(ctx, &ctx->ir_base[use_insn->op2], op, type);
 				} else {
-					ctx->ir_base[use].op2 = ir_ext_ref(ctx, use, use_insn->op2, op, type);
-					ir_bitqueue_add(worklist, ctx->ir_base[use].op2);
+					ctx->ir_base[use].op2 = ir_ext_ref(ctx, use, use_insn->op2, op, type, worklist);
 				}
 			}
 		}
@@ -1185,8 +1187,7 @@ static bool ir_try_promote_ext(ir_ctx *ctx, ir_ref ext_ref, ir_insn *insn, ir_bi
 	 && !IR_IS_SYM_CONST(ctx->ir_base[phi_insn->op2].op)) {
 		ctx->ir_base[ref].op2 = ir_ext_const(ctx, &ctx->ir_base[phi_insn->op2], op, type);
 	} else {
-		ctx->ir_base[ref].op2 = ir_ext_ref(ctx, ref, phi_insn->op2, op, type);
-		ir_bitqueue_add(worklist, ctx->ir_base[ref].op2);
+		ctx->ir_base[ref].op2 = ir_ext_ref(ctx, ref, phi_insn->op2, op, type, worklist);
 	}

 	return 1;
--- a/ext/opcache/jit/ir/ir_x86.dasc
+++ b/ext/opcache/jit/ir/ir_x86.dasc
@@ -1586,6 +1586,69 @@ static void ir_match_fuse_addr(ir_ctx *ctx, ir_ref addr_ref)
 	}
 }

+static bool ir_match_may_fuse_SI(ir_ctx *ctx, ir_ref ref, ir_ref use) /* Returns 1 if "use" is an ADD taking "ref" as an operand in one of the LEA_* shapes noted below, otherwise 0. */
+{
+	ir_insn *op2_insn, *insn = &ctx->ir_base[use];
+
+	if (insn->op == IR_ADD) {
+		if (insn->op1 == ref) {
+			if (IR_IS_CONST_REF(insn->op2)) { /* ADD(ref, constant) */
+				op2_insn = &ctx->ir_base[insn->op2];
+				if (IR_IS_SYM_CONST(op2_insn->op)) {
+					if (ir_may_fuse_addr(ctx, op2_insn)) { /* symbolic constant: accepted only if ir_may_fuse_addr() allows it */
+						return 1; // LEA_SI_O
+					}
+				} else if (IR_IS_SIGNED_32BIT(op2_insn->val.i64)) { /* plain constant: must pass the signed 32-bit check */
+					return 1; // LEA_SI_O
+				}
+			} else if (insn->op2 != ref) { /* ADD(ref, ref) is rejected */
+				return 1; // LEA_SI_B or LEA_SI_OB
+			}
+		} else if (insn->op2 == ref && insn->op1 != insn->op2) { /* "ref" is the second operand */
+			return 1; // LEA_B_SI or LEA_OB_SI
+		}
+	}
+	return 0; /* not an ADD, or an ADD shape that is not accepted above */
+}
+
+static bool ir_match_fuse_addr_all_useges(ir_ctx *ctx, ir_ref ref) /* Returns 1 if "ref" is already a fused LEA_SI, or an unmatched MUL by constant 2/4/8 whose every use passes ir_match_may_fuse_SI(); otherwise 0. */
+{
+	uint32_t rule = ctx->rules[ref]; /* NOTE(review): "useges" in the function name is presumably a typo for "usages"; a rename must also update the callers */
+	ir_use_list *use_list;
+	ir_ref n, *p, use;
+
+	if (rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) { /* already marked as fused by an earlier match */
+		return 1;
+	} else if (!rule) { /* no rule assigned yet: inspect the instruction itself */
+		ir_insn *insn = &ctx->ir_base[ref];
+
+		IR_ASSERT(IR_IS_TYPE_INT(insn->type) && ir_type_size[insn->type] >= 4);
+		if (insn->op == IR_MUL
+		 && IR_IS_CONST_REF(insn->op2)) {
+			insn = &ctx->ir_base[insn->op2]; /* from here on "insn" is the constant multiplier */
+			if (!IR_IS_SYM_CONST(insn->op)
+			 &&	(insn->val.u64 == 2 || insn->val.u64 == 4 || insn->val.u64 == 8)) {
+				ctx->rules[ref] = IR_LEA_SI; /* NOTE: stored before the uses are checked and left in place even when 0 is returned below */
+
+				use_list = &ctx->use_lists[ref];
+				n = use_list->count;
+				IR_ASSERT(n > 1); /* the callers visible in this patch test the count == 1 case before calling */
+				p = &ctx->use_edges[use_list->refs];
+				for (; n > 0; p++, n--) {
+					use = *p;
+					if (!ir_match_may_fuse_SI(ctx, ref, use)) { /* a single non-fusible use rejects the whole node */
+						return 0;
+					}
+				}
+
+				return 1;
+			}
+		}
+	}
+
+	return 0; /* any other existing rule, or not a MUL by constant 2/4/8 */
+}
+
 /* A naive check if there is a STORE or CALL between this LOAD and the fusion root */
 static bool ir_match_has_mem_deps(ir_ctx *ctx, ir_ref ref, ir_ref root)
 {
@@ -1895,13 +1958,13 @@ static uint32_t ir_match_insn(ir_ctx *ctx, ir_ref ref)
 					} else if ((ir_type_size[insn->type] >= 4 && insn->op == IR_ADD && IR_IS_SIGNED_32BIT(op2_insn->val.i64)) ||
 							(ir_type_size[insn->type] >= 4 && insn->op == IR_SUB && IR_IS_SIGNED_NEG_32BIT(op2_insn->val.i64))) {
 lea:
-						if (ctx->use_lists[insn->op1].count == 1) {
+						if (ctx->use_lists[insn->op1].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op1)) {
 							uint32_t rule = ctx->rules[insn->op1];

 							if (!rule) {
 								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
 							}
-							if (rule == IR_LEA_SI) {
+							if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
 								/* z = MUL(Y, 2|4|8) ... ADD(z, imm32) => SKIP ... LEA [Y*2|4|8+im32] */
 								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
 								return IR_LEA_SI_O;
@@ -1938,19 +2001,19 @@ lea:
 					}
 				} else if ((ctx->flags & IR_OPT_CODEGEN) && insn->op == IR_ADD && ir_type_size[insn->type] >= 4) {
 					if (insn->op1 != insn->op2) {
-						if (ctx->use_lists[insn->op1].count == 1) {
+						if (ctx->use_lists[insn->op1].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op1)) {
 							uint32_t rule =ctx->rules[insn->op1];
 							if (!rule) {
 								ctx->rules[insn->op1] = rule = ir_match_insn(ctx, insn->op1);
 							}
 							if (rule == IR_LEA_OB) {
 								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
-								if (ctx->use_lists[insn->op2].count == 1) {
+								if (ctx->use_lists[insn->op2].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op2)) {
 									rule = ctx->rules[insn->op2];
 									if (!rule) {
 										ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
 									}
-									if (rule == IR_LEA_SI) {
+									if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
 										/* x = ADD(X, imm32) ... y = MUL(Y, 2|4|8) ... ADD(x, y) => SKIP ... SKIP ... LEA */
 										ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
 										return IR_LEA_OB_SI;
@@ -1958,7 +2021,7 @@ lea:
 								}
 								/* x = ADD(X, imm32) ... ADD(x, Y) => SKIP ... LEA */
 								return IR_LEA_OB_I;
-							} else if (rule == IR_LEA_SI) {
+							} else if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
 								ctx->rules[insn->op1] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
 								if (ctx->use_lists[insn->op2].count == 1) {
 									rule = ctx->rules[insn->op2];
@@ -1975,7 +2038,7 @@ lea:
 								return IR_LEA_SI_B;
 							}
 						}
-						if (ctx->use_lists[insn->op2].count == 1) {
+						if (ctx->use_lists[insn->op2].count == 1 || ir_match_fuse_addr_all_useges(ctx, insn->op2)) {
 							uint32_t rule = ctx->rules[insn->op2];
 							if (!rule) {
 								ctx->rules[insn->op2] = rule = ir_match_insn(ctx, insn->op2);
@@ -1984,7 +2047,7 @@ lea:
 								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_OB;
 								/* x = ADD(X, imm32) ... ADD(Y, x) => SKIP ... LEA */
 								return IR_LEA_I_OB;
-							} else if (rule == IR_LEA_SI) {
+							} else if (rule == IR_LEA_SI || rule == (IR_FUSED | IR_SIMPLE | IR_LEA_SI)) {
 								ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_LEA_SI;
 								/* x = MUL(X, 2|4|8) ... ADD(Y, x) => SKIP ... LEA */
 								return IR_LEA_B_SI;
@@ -2497,7 +2560,7 @@ store_int:
 				return IR_RETURN_FP;
 			}
 		case IR_IF:
-			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
+			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
 				op2_insn = &ctx->ir_base[insn->op2];
 				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT) {
 					if (IR_IS_TYPE_INT(ctx->ir_base[op2_insn->op1].type)) {
@@ -2545,15 +2608,14 @@ store_int:
 					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
 					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
 					return IR_TEST_AND_BRANCH_INT;
-				} else if (op2_insn->op == IR_OVERFLOW) {
+				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
 					/* c = OVERFLOW(_) ... IF(c) => SKIP_OVERFLOW ... OVERFLOW_AND_BRANCH */
 					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
 					return IR_OVERFLOW_AND_BRANCH;
 				}
 			}
 			if (IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
-				if (insn->op2 == ref - 1 /* previous instruction */
-				 && ir_in_same_block(ctx, insn->op2)) {
+				if (insn->op2 == ref - 1) { /* previous instruction */
 					op2_insn = &ctx->ir_base[insn->op2];
 					if (op2_insn->op == IR_ADD ||
 					    op2_insn->op == IR_SUB ||
@@ -2575,7 +2637,6 @@ store_int:
 				} else if ((ctx->flags & IR_OPT_CODEGEN)
 				 && insn->op1 == ref - 1 /* previous instruction */
 				 && insn->op2 == ref - 2 /* previous instruction */
-				 && ir_in_same_block(ctx, insn->op2)
 				 && ctx->use_lists[insn->op2].count == 2
 				 && IR_IS_TYPE_INT(ctx->ir_base[insn->op2].type)) {
 					ir_insn *store_insn = &ctx->ir_base[insn->op1];
@@ -2626,7 +2687,7 @@ store_int:
 				break;
 			}
 		case IR_COND:
-			if (ir_in_same_block(ctx, insn->op1) && ctx->use_lists[insn->op1].count == 1) {
+			if (!IR_IS_CONST_REF(insn->op1) && ctx->use_lists[insn->op1].count == 1) {
 				ir_insn *op1_insn = &ctx->ir_base[insn->op1];

 				if (op1_insn->op >= IR_EQ && op1_insn->op <= IR_UGT) {
@@ -2644,7 +2705,7 @@ store_int:
 			return IR_COND;
 		case IR_GUARD:
 		case IR_GUARD_NOT:
-			if (ir_in_same_block(ctx, insn->op2) && ctx->use_lists[insn->op2].count == 1) {
+			if (!IR_IS_CONST_REF(insn->op2) && ctx->use_lists[insn->op2].count == 1) {
 				op2_insn = &ctx->ir_base[insn->op2];
 				if (op2_insn->op >= IR_EQ && op2_insn->op <= IR_UGT
 					// TODO: register allocator may clobber operands of CMP before they are used in the GUARD_CMP
@@ -2734,7 +2795,7 @@ store_int:
 					ir_match_fuse_load_test_int(ctx, op2_insn, ref);
 					ctx->rules[insn->op2] = IR_FUSED | IR_TEST_INT;
 					return IR_GUARD_TEST_INT;
-				} else if (op2_insn->op == IR_OVERFLOW) {
+				} else if (op2_insn->op == IR_OVERFLOW && ir_in_same_block(ctx, insn->op2)) {
 					/* c = OVERFLOW(_) ... GUARD(c) => SKIP_OVERFLOW ... GUARD_OVERFLOW */
 					ctx->rules[insn->op2] = IR_FUSED | IR_SIMPLE | IR_OVERFLOW;
 					return IR_GUARD_OVERFLOW;