@@ -82,6 +82,16 @@ DEF(shr, 1, 2, 0, TCG_OPF_INT)
DEF(sub, 1, 2, 0, TCG_OPF_INT)
DEF(xor, 1, 2, 0, TCG_OPF_INT)
+DEF(addco, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(addc1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(addci, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
+DEF(addcio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
+
+DEF(subbo, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(subb1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(subbi, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
+DEF(subbio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
+
/* load/store */
DEF(ld8u_i32, 1, 1, 1, 0)
DEF(ld8s_i32, 1, 1, 1, 0)
@@ -418,6 +418,11 @@ struct TCGContext {
MemOp riscv_cur_vsew;
TCGType riscv_cur_type;
#endif
+ /*
+ * During the tcg_reg_alloc_op loop, we are within a sequence of
+ * carry-using opcodes like addco+addci.
+ */
+ bool carry_live;
GHashTable *const_table[TCG_TYPE_COUNT];
TCGTempSet free_temps[TCG_TYPE_COUNT];
@@ -749,13 +754,17 @@ enum {
/* Instruction operands are vectors. */
TCG_OPF_VECTOR = 0x40,
/* Instruction is a conditional branch. */
- TCG_OPF_COND_BRANCH = 0x80
+ TCG_OPF_COND_BRANCH = 0x80,
+ /* Instruction produces carry out. */
+ TCG_OPF_CARRY_OUT = 0x100,
+ /* Instruction consumes carry in. */
+ TCG_OPF_CARRY_IN = 0x200,
};
typedef struct TCGOpDef {
const char *name;
uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args;
- uint8_t flags;
+ uint16_t flags;
} TCGOpDef;
extern const TCGOpDef tcg_op_defs[];
@@ -1214,6 +1214,12 @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op)
return finish_folding(ctx, op);
}
+static bool fold_add_carry(OptContext *ctx, TCGOp *op)
+{
+ fold_commutative(ctx, op);
+ return finish_folding(ctx, op);
+}
+
static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
{
bool a_const = arg_is_const(op->args[2]) && arg_is_const(op->args[3]);
@@ -2817,6 +2823,11 @@ void tcg_optimize(TCGContext *s)
case INDEX_op_add_vec:
done = fold_add_vec(&ctx, op);
break;
+ case INDEX_op_addci:
+ case INDEX_op_addco:
+ case INDEX_op_addcio:
+ done = fold_add_carry(&ctx, op);
+ break;
CASE_OP_32_64(add2):
done = fold_add2(&ctx, op);
break;
@@ -3914,6 +3914,17 @@ liveness_pass_0(TCGContext *s)
}
}
+static void assert_carry_dead(TCGContext *s)
+{
+ /*
+ * Carry operations can be separated by a few insns like mov,
+ * load or store, but they should always be "close", and
+ * carry-out operations should always be paired with carry-in.
+ * At various boundaries, carry must have been consumed.
+ */
+ tcg_debug_assert(!s->carry_live);
+}
+
/* Liveness analysis : update the opc_arg_life array to tell if a
given input arguments is dead. Instructions updating dead
temporaries are removed. */
@@ -3933,17 +3944,19 @@ liveness_pass_1(TCGContext *s)
/* ??? Should be redundant with the exit_tb that ends the TB. */
la_func_end(s, nb_globals, nb_temps);
+ s->carry_live = false;
QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, link, op_prev) {
int nb_iargs, nb_oargs;
TCGOpcode opc_new, opc_new2;
TCGLifeData arg_life = 0;
TCGTemp *ts;
TCGOpcode opc = op->opc;
- const TCGOpDef *def = &tcg_op_defs[opc];
+ const TCGOpDef *def;
const TCGArgConstraint *args_ct;
switch (opc) {
case INDEX_op_call:
+ assert_carry_dead(s);
{
const TCGHelperInfo *info = tcg_call_info(op);
int call_flags = tcg_call_flags(op);
@@ -4055,6 +4068,7 @@ liveness_pass_1(TCGContext *s)
}
break;
case INDEX_op_insn_start:
+ assert_carry_dead(s);
break;
case INDEX_op_discard:
/* mark the temporary as dead */
@@ -4071,6 +4085,7 @@ liveness_pass_1(TCGContext *s)
case INDEX_op_sub2_i64:
opc_new = INDEX_op_sub;
do_addsub2:
+ assert_carry_dead(s);
/* Test if the high part of the operation is dead, but not
the low part. The result can be optimized to a simple
add or sub. This happens often for x86_64 guest when the
@@ -4096,6 +4111,7 @@ liveness_pass_1(TCGContext *s)
opc_new = INDEX_op_mul;
opc_new2 = INDEX_op_muluh;
do_mul2:
+ assert_carry_dead(s);
if (arg_temp(op->args[1])->state == TS_DEAD) {
if (arg_temp(op->args[0])->state == TS_DEAD) {
/* Both parts of the operation are dead. */
@@ -4118,10 +4134,62 @@ liveness_pass_1(TCGContext *s)
/* Mark the single-word operation live. */
goto do_not_remove;
+ case INDEX_op_addco:
+ if (s->carry_live) {
+ goto do_not_remove;
+ }
+ op->opc = opc = INDEX_op_add;
+ goto do_default;
+
+ case INDEX_op_addcio:
+ if (s->carry_live) {
+ goto do_not_remove;
+ }
+ op->opc = opc = INDEX_op_addci;
+ goto do_default;
+
+ case INDEX_op_subbo:
+ if (s->carry_live) {
+ goto do_not_remove;
+ }
+ /* Lower to sub, but this may also require canonicalization. */
+ op->opc = opc = INDEX_op_sub;
+ ts = arg_temp(op->args[2]);
+ if (ts->kind == TEMP_CONST) {
+ ts = tcg_constant_internal(ts->type, -ts->val);
+ if (ts->state_ptr == NULL) {
+ tcg_debug_assert(temp_idx(ts) == nb_temps);
+ nb_temps++;
+ ts->state_ptr = tcg_malloc(sizeof(TCGRegSet));
+ ts->state = TS_DEAD;
+ la_reset_pref(ts);
+ }
+ op->args[2] = temp_arg(ts);
+ op->opc = opc = INDEX_op_add;
+ }
+ goto do_default;
+
+ case INDEX_op_subbio:
+ if (s->carry_live) {
+ goto do_not_remove;
+ }
+ op->opc = opc = INDEX_op_subbi;
+ goto do_default;
+
+ case INDEX_op_addc1o:
+ case INDEX_op_subb1o:
+ if (s->carry_live) {
+ goto do_not_remove;
+ }
+ goto do_default;
+
default:
- /* Test if the operation can be removed because all
- its outputs are dead. We assume that nb_oargs == 0
- implies side effects */
+ do_default:
+ /*
+ * Test if the operation can be removed because all
+ * its outputs are dead. We assume that nb_oargs == 0
+ * implies side effects.
+ */
def = &tcg_op_defs[opc];
if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && def->nb_oargs != 0) {
for (int i = def->nb_oargs - 1; i >= 0; i--) {
@@ -4163,12 +4231,16 @@ liveness_pass_1(TCGContext *s)
/* If end of basic block, update. */
if (def->flags & TCG_OPF_BB_EXIT) {
+ assert_carry_dead(s);
la_func_end(s, nb_globals, nb_temps);
} else if (def->flags & TCG_OPF_COND_BRANCH) {
+ assert_carry_dead(s);
la_bb_sync(s, nb_globals, nb_temps);
} else if (def->flags & TCG_OPF_BB_END) {
+ assert_carry_dead(s);
la_bb_end(s, nb_globals, nb_temps);
} else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
+ assert_carry_dead(s);
la_global_sync(s, nb_globals);
if (def->flags & TCG_OPF_CALL_CLOBBER) {
la_cross_call(s, nb_temps);
@@ -4182,6 +4254,9 @@ liveness_pass_1(TCGContext *s)
arg_life |= DEAD_ARG << i;
}
}
+ if (def->flags & TCG_OPF_CARRY_OUT) {
+ s->carry_live = false;
+ }
/* Input arguments are live for preceding opcodes. */
for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
@@ -4193,6 +4268,9 @@ liveness_pass_1(TCGContext *s)
ts->state &= ~TS_DEAD;
}
}
+ if (def->flags & TCG_OPF_CARRY_IN) {
+ s->carry_live = true;
+ }
/* Incorporate constraints for this operand. */
switch (opc) {
@@ -4232,6 +4310,7 @@ liveness_pass_1(TCGContext *s)
}
op->life = arg_life;
}
+ assert_carry_dead(s);
}
/* Liveness analysis: Convert indirect regs to direct temporaries. */
@@ -4814,9 +4893,8 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
all globals are stored at their canonical location. */
static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
{
- int i;
-
- for (i = s->nb_globals; i < s->nb_temps; i++) {
+ assert_carry_dead(s);
+ for (int i = s->nb_globals; i < s->nb_temps; i++) {
TCGTemp *ts = &s->temps[i];
switch (ts->kind) {
@@ -4847,6 +4925,7 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
*/
static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
{
+ assert_carry_dead(s);
sync_globals(s, allocated_regs);
for (int i = s->nb_globals; i < s->nb_temps; i++) {
@@ -5118,6 +5197,10 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
int const_args[TCG_MAX_OP_ARGS];
TCGCond op_cond;
+ if (def->flags & TCG_OPF_CARRY_IN) {
+ tcg_debug_assert(s->carry_live);
+ }
+
nb_oargs = def->nb_oargs;
nb_iargs = def->nb_iargs;
@@ -5374,6 +5457,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
tcg_reg_alloc_bb_end(s, i_allocated_regs);
} else {
if (def->flags & TCG_OPF_CALL_CLOBBER) {
+ assert_carry_dead(s);
/* XXX: permit generic clobber register list ? */
for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
@@ -5490,7 +5574,8 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
case INDEX_op_sub:
{
- const TCGOutOpSubtract *out = &outop_sub;
+ const TCGOutOpSubtract *out =
+ container_of(all_outop[op->opc], TCGOutOpSubtract, base);
tcg_debug_assert(!const_args[2]);
if (const_args[1]) {
@@ -5501,6 +5586,16 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
}
break;
+ case INDEX_op_addco:
+ case INDEX_op_subbo:
+ case INDEX_op_addci:
+ case INDEX_op_subbi:
+ case INDEX_op_addcio:
+ case INDEX_op_subbio:
+ case INDEX_op_addc1o:
+ case INDEX_op_subb1o:
+ g_assert_not_reached();
+
case INDEX_op_bswap64:
case INDEX_op_ext_i32_i64:
case INDEX_op_extu_i32_i64:
@@ -5688,6 +5783,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
break;
}
+ if (def->flags & TCG_OPF_CARRY_IN) {
+ s->carry_live = false;
+ }
+ if (def->flags & TCG_OPF_CARRY_OUT) {
+ s->carry_live = true;
+ }
+
/* move the outputs in the correct register if needed */
for(i = 0; i < nb_oargs; i++) {
ts = arg_temp(op->args[i]);
@@ -6690,6 +6792,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
tcg_out_tb_start(s);
num_insns = -1;
+ s->carry_live = false;
QTAILQ_FOREACH(op, &s->ops, link) {
TCGOpcode opc = op->opc;
@@ -6718,6 +6821,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
tcg_reg_alloc_dup(s, op);
break;
case INDEX_op_insn_start:
+ assert_carry_dead(s);
if (num_insns >= 0) {
size_t off = tcg_current_code_size(s);
s->gen_insn_end_off[num_insns] = off;
@@ -6738,6 +6842,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
tcg_out_label(s, arg_label(op->args[0]));
break;
case INDEX_op_call:
+ assert_carry_dead(s);
tcg_reg_alloc_call(s, op);
break;
case INDEX_op_exit_tb:
@@ -6774,6 +6879,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
return -2;
}
}
+ assert_carry_dead(s);
+
tcg_debug_assert(num_insns + 1 == s->gen_tb->icount);
s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
@@ -593,6 +593,67 @@ Multiword arithmetic support
.. list-table::
+ * - addco *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* + *t2* and in addition output to the
+ carry bit provided by the host architecture.
+
+ * - addci *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
+ input carry bit provided by the host architecture.
+ The output carry bit need not be computed.
+
+ * - addcio *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
+ input carry bit provided by the host architecture,
+ and also compute the output carry bit.
+
+ * - addc1o *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* + *t2* + 1, and in addition output to the
+ carry bit provided by the host architecture. This is akin to
+ *addcio* with a fixed carry-in value of 1.
+ | This is intended to be used by the optimization pass,
+ intermediate to complete folding of the addition chain.
+ In some cases complete folding is not possible and this
+ opcode will remain until output. If this happens, the
+ code generator will use ``tcg_out_set_carry`` and then
+ the output routine for *addcio*.
+
+ * - subbo *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* and in addition output to the
+ borrow bit provided by the host architecture.
+ | Depending on the host architecture, the carry bit may or may not be
+ identical to the borrow bit. Thus the addc\* and subb\*
+ opcodes must not be mixed.
+
+ * - subbi *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
+ input borrow bit provided by the host architecture.
+ The output borrow bit need not be computed.
+
+ * - subbio *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
+ input borrow bit provided by the host architecture,
+ and also compute the output borrow bit.
+
+ * - subb1o *t0*, *t1*, *t2*
+
+ - | Compute *t0* = *t1* - *t2* - 1, and in addition output to the
+ borrow bit provided by the host architecture. This is akin to
+ *subbio* with a fixed borrow-in value of 1.
+ | This is intended to be used by the optimization pass,
+ intermediate to complete folding of the subtraction chain.
+ In some cases complete folding is not possible and this
+ opcode will remain until output. If this happens, the
+ code generator will use ``tcg_out_set_borrow`` and then
+ the output routine for *subbio*.
+
* - add2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
sub2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
Liveness needs to track carry-live state in order to determine if the (hidden) output of the opcode is used. Code generation needs to track carry-live state in order to avoid clobbering cpu flags when loading constants. So far, output routines and backends are unchanged. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- include/tcg/tcg-opc.h | 10 ++++ include/tcg/tcg.h | 13 ++++- tcg/optimize.c | 11 ++++ tcg/tcg.c | 123 ++++++++++++++++++++++++++++++++++++++--- docs/devel/tcg-ops.rst | 61 ++++++++++++++++++++ 5 files changed, 208 insertions(+), 10 deletions(-)