diff mbox series

[v4,113/163] tcg: Add add/sub with carry opcodes and infrastructure

Message ID 20250415192515.232910-114-richard.henderson@linaro.org
State New
Headers show
Series tcg: Convert to TCGOutOp structures | expand

Commit Message

Richard Henderson April 15, 2025, 7:24 p.m. UTC
Liveness needs to track carry-live state in order to
determine if the (hidden) output of the opcode is used.
Code generation needs to track carry-live state in order
to avoid clobbering cpu flags when loading constants.

So far, output routines and backends are unchanged.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
---
 include/tcg/tcg-opc.h  |  10 +++
 include/tcg/tcg.h      |  13 +++-
 tcg/optimize.c         |  11 +++
 tcg/tcg.c              | 148 ++++++++++++++++++++++++++++++++++++++---
 docs/devel/tcg-ops.rst |  61 +++++++++++++++++
 5 files changed, 233 insertions(+), 10 deletions(-)

Comments

Pierrick Bouvier April 16, 2025, 7:01 p.m. UTC | #1
On 4/15/25 12:24, Richard Henderson wrote:
> Liveness needs to track carry-live state in order to
> determine if the (hidden) output of the opcode is used.
> Code generation needs to track carry-live state in order
> to avoid clobbering cpu flags when loading constants.
> 
> So far, output routines and backends are unchanged.
> 
> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>
> ---
>   include/tcg/tcg-opc.h  |  10 +++
>   include/tcg/tcg.h      |  13 +++-
>   tcg/optimize.c         |  11 +++
>   tcg/tcg.c              | 148 ++++++++++++++++++++++++++++++++++++++---
>   docs/devel/tcg-ops.rst |  61 +++++++++++++++++
>   5 files changed, 233 insertions(+), 10 deletions(-)
> 
> diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
> index 1f995c54be..9cc20cd62c 100644
> --- a/include/tcg/tcg-opc.h
> +++ b/include/tcg/tcg-opc.h
> @@ -82,6 +82,16 @@ DEF(shr, 1, 2, 0, TCG_OPF_INT)
>   DEF(sub, 1, 2, 0, TCG_OPF_INT)
>   DEF(xor, 1, 2, 0, TCG_OPF_INT)
>   
> +DEF(addco, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
> +DEF(addc1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
> +DEF(addci, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
> +DEF(addcio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
> +
> +DEF(subbo, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
> +DEF(subb1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
> +DEF(subbi, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
> +DEF(subbio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
> +
>   /* load/store */
>   DEF(ld8u_i32, 1, 1, 1, 0)
>   DEF(ld8s_i32, 1, 1, 1, 0)
> diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
> index c6b50b5226..aa300a2f8b 100644
> --- a/include/tcg/tcg.h
> +++ b/include/tcg/tcg.h
> @@ -418,6 +418,11 @@ struct TCGContext {
>       MemOp riscv_cur_vsew;
>       TCGType riscv_cur_type;
>   #endif
> +    /*
> +     * During the tcg_reg_alloc_op loop, we are within a sequence of
> +     * carry-using opcodes like addco+addci.
> +     */
> +    bool carry_live;
>   
>       GHashTable *const_table[TCG_TYPE_COUNT];
>       TCGTempSet free_temps[TCG_TYPE_COUNT];
> @@ -749,13 +754,17 @@ enum {
>       /* Instruction operands are vectors.  */
>       TCG_OPF_VECTOR       = 0x40,
>       /* Instruction is a conditional branch. */
> -    TCG_OPF_COND_BRANCH  = 0x80
> +    TCG_OPF_COND_BRANCH  = 0x80,
> +    /* Instruction produces carry out. */
> +    TCG_OPF_CARRY_OUT    = 0x100,
> +    /* Instruction consumes carry in. */
> +    TCG_OPF_CARRY_IN     = 0x200,
>   };
>   
>   typedef struct TCGOpDef {
>       const char *name;
>       uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args;
> -    uint8_t flags;
> +    uint16_t flags;
>   } TCGOpDef;
>   
>   extern const TCGOpDef tcg_op_defs[];
> diff --git a/tcg/optimize.c b/tcg/optimize.c
> index 9595b32d54..5a21f8bfd9 100644
> --- a/tcg/optimize.c
> +++ b/tcg/optimize.c
> @@ -1214,6 +1214,12 @@ static bool fold_add_vec(OptContext *ctx, TCGOp *op)
>       return finish_folding(ctx, op);
>   }
>   
> +static bool fold_add_carry(OptContext *ctx, TCGOp *op)
> +{
> +    fold_commutative(ctx, op);
> +    return finish_folding(ctx, op);
> +}
> +
>   static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
>   {
>       bool a_const = arg_is_const(op->args[2]) && arg_is_const(op->args[3]);
> @@ -2817,6 +2823,11 @@ void tcg_optimize(TCGContext *s)
>           case INDEX_op_add_vec:
>               done = fold_add_vec(&ctx, op);
>               break;
> +        case INDEX_op_addci:
> +        case INDEX_op_addco:
> +        case INDEX_op_addcio:
> +            done = fold_add_carry(&ctx, op);
> +            break;
>           CASE_OP_32_64(add2):
>               done = fold_add2(&ctx, op);
>               break;
> diff --git a/tcg/tcg.c b/tcg/tcg.c
> index 381e76cfc8..c6a49f5648 100644
> --- a/tcg/tcg.c
> +++ b/tcg/tcg.c
> @@ -3914,6 +3914,17 @@ liveness_pass_0(TCGContext *s)
>       }
>   }
>   
> +static void assert_carry_dead(TCGContext *s)
> +{
> +    /*
> +     * Carry operations can be separated by a few insns like mov,
> +     * load or store, but they should always be "close", and
> +     * carry-out operations should always be paired with carry-in.
> +     * At various boundaries, carry must have been consumed.
> +     */
> +    tcg_debug_assert(!s->carry_live);
> +}
> +
>   /* Liveness analysis : update the opc_arg_life array to tell if a
>      given input arguments is dead. Instructions updating dead
>      temporaries are removed. */
> @@ -3933,17 +3944,19 @@ liveness_pass_1(TCGContext *s)
>       /* ??? Should be redundant with the exit_tb that ends the TB.  */
>       la_func_end(s, nb_globals, nb_temps);
>   
> +    s->carry_live = false;
>       QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, link, op_prev) {
>           int nb_iargs, nb_oargs;
>           TCGOpcode opc_new, opc_new2;
>           TCGLifeData arg_life = 0;
>           TCGTemp *ts;
>           TCGOpcode opc = op->opc;
> -        const TCGOpDef *def = &tcg_op_defs[opc];
> +        const TCGOpDef *def;
>           const TCGArgConstraint *args_ct;
>   
>           switch (opc) {
>           case INDEX_op_call:
> +            assert_carry_dead(s);
>               {
>                   const TCGHelperInfo *info = tcg_call_info(op);
>                   int call_flags = tcg_call_flags(op);
> @@ -4055,6 +4068,7 @@ liveness_pass_1(TCGContext *s)
>               }
>               break;
>           case INDEX_op_insn_start:
> +            assert_carry_dead(s);
>               break;
>           case INDEX_op_discard:
>               /* mark the temporary as dead */
> @@ -4071,6 +4085,7 @@ liveness_pass_1(TCGContext *s)
>           case INDEX_op_sub2_i64:
>               opc_new = INDEX_op_sub;
>           do_addsub2:
> +            assert_carry_dead(s);
>               /* Test if the high part of the operation is dead, but not
>                  the low part.  The result can be optimized to a simple
>                  add or sub.  This happens often for x86_64 guest when the
> @@ -4096,6 +4111,7 @@ liveness_pass_1(TCGContext *s)
>               opc_new = INDEX_op_mul;
>               opc_new2 = INDEX_op_muluh;
>           do_mul2:
> +            assert_carry_dead(s);
>               if (arg_temp(op->args[1])->state == TS_DEAD) {
>                   if (arg_temp(op->args[0])->state == TS_DEAD) {
>                       /* Both parts of the operation are dead.  */
> @@ -4118,10 +4134,87 @@ liveness_pass_1(TCGContext *s)
>               /* Mark the single-word operation live.  */
>               goto do_not_remove;
>   
> +        case INDEX_op_addco:
> +            if (s->carry_live) {
> +                goto do_not_remove;
> +            }
> +            op->opc = opc = INDEX_op_add;
> +            goto do_default;
> +
> +        case INDEX_op_addcio:
> +            if (s->carry_live) {
> +                goto do_not_remove;
> +            }
> +            op->opc = opc = INDEX_op_addci;
> +            goto do_default;
> +
> +        case INDEX_op_subbo:
> +            if (s->carry_live) {
> +                goto do_not_remove;
> +            }
> +            /* Lower to sub, but this may also require canonicalization. */
> +            op->opc = opc = INDEX_op_sub;
> +            ts = arg_temp(op->args[2]);
> +            if (ts->kind == TEMP_CONST) {
> +                ts = tcg_constant_internal(ts->type, -ts->val);
> +                if (ts->state_ptr == NULL) {
> +                    tcg_debug_assert(temp_idx(ts) == nb_temps);
> +                    nb_temps++;
> +                    ts->state_ptr = tcg_malloc(sizeof(TCGRegSet));
> +                    ts->state = TS_DEAD;
> +                    la_reset_pref(ts);
> +                }
> +                op->args[2] = temp_arg(ts);
> +                op->opc = opc = INDEX_op_add;
> +            }
> +            goto do_default;
> +
> +        case INDEX_op_subbio:
> +            if (s->carry_live) {
> +                goto do_not_remove;
> +            }
> +            op->opc = opc = INDEX_op_subbi;
> +            goto do_default;
> +
> +        case INDEX_op_addc1o:
> +            if (s->carry_live) {
> +                goto do_not_remove;
> +            }
> +            /* Lower to add, add +1. */
> +            op_prev = tcg_op_insert_before(s, op, INDEX_op_add, 3);
> +            op_prev->args[0] = op->args[0];
> +            op_prev->args[1] = op->args[1];
> +            op_prev->args[2] = op->args[2];
> +            op->opc = opc = INDEX_op_add;
> +            op->args[1] = op->args[0];
> +            ts = arg_temp(op->args[0]);
> +            ts = tcg_constant_internal(ts->type, 1);
> +            op->args[2] = temp_arg(ts);
> +            goto do_default;
> +
> +        case INDEX_op_subb1o:
> +            if (s->carry_live) {
> +                goto do_not_remove;
> +            }
> +            /* Lower to sub, add -1. */
> +            op_prev = tcg_op_insert_before(s, op, INDEX_op_sub, 3);
> +            op_prev->args[0] = op->args[0];
> +            op_prev->args[1] = op->args[1];
> +            op_prev->args[2] = op->args[2];
> +            op->opc = opc = INDEX_op_add;
> +            op->args[1] = op->args[0];
> +            ts = arg_temp(op->args[0]);
> +            ts = tcg_constant_internal(ts->type, -1);
> +            op->args[2] = temp_arg(ts);
> +            goto do_default;
> +
>           default:
> -            /* Test if the operation can be removed because all
> -               its outputs are dead. We assume that nb_oargs == 0
> -               implies side effects */
> +        do_default:
> +            /*
> +             * Test if the operation can be removed because all
> +             * its outputs are dead. We assume that nb_oargs == 0
> +             * implies side effects.
> +             */
>               def = &tcg_op_defs[opc];
>               if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && def->nb_oargs != 0) {
>                   for (int i = def->nb_oargs - 1; i >= 0; i--) {
> @@ -4163,12 +4256,16 @@ liveness_pass_1(TCGContext *s)
>   
>               /* If end of basic block, update.  */
>               if (def->flags & TCG_OPF_BB_EXIT) {
> +                assert_carry_dead(s);
>                   la_func_end(s, nb_globals, nb_temps);
>               } else if (def->flags & TCG_OPF_COND_BRANCH) {
> +                assert_carry_dead(s);
>                   la_bb_sync(s, nb_globals, nb_temps);
>               } else if (def->flags & TCG_OPF_BB_END) {
> +                assert_carry_dead(s);
>                   la_bb_end(s, nb_globals, nb_temps);
>               } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
> +                assert_carry_dead(s);
>                   la_global_sync(s, nb_globals);
>                   if (def->flags & TCG_OPF_CALL_CLOBBER) {
>                       la_cross_call(s, nb_temps);
> @@ -4182,6 +4279,9 @@ liveness_pass_1(TCGContext *s)
>                       arg_life |= DEAD_ARG << i;
>                   }
>               }
> +            if (def->flags & TCG_OPF_CARRY_OUT) {
> +                s->carry_live = false;
> +            }
>   
>               /* Input arguments are live for preceding opcodes.  */
>               for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
> @@ -4193,6 +4293,9 @@ liveness_pass_1(TCGContext *s)
>                       ts->state &= ~TS_DEAD;
>                   }
>               }
> +            if (def->flags & TCG_OPF_CARRY_IN) {
> +                s->carry_live = true;
> +            }
>   
>               /* Incorporate constraints for this operand.  */
>               switch (opc) {
> @@ -4232,6 +4335,7 @@ liveness_pass_1(TCGContext *s)
>           }
>           op->life = arg_life;
>       }
> +    assert_carry_dead(s);
>   }
>   
>   /* Liveness analysis: Convert indirect regs to direct temporaries.  */
> @@ -4817,9 +4921,8 @@ static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
>      all globals are stored at their canonical location. */
>   static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
>   {
> -    int i;
> -
> -    for (i = s->nb_globals; i < s->nb_temps; i++) {
> +    assert_carry_dead(s);
> +    for (int i = s->nb_globals; i < s->nb_temps; i++) {
>           TCGTemp *ts = &s->temps[i];
>   
>           switch (ts->kind) {
> @@ -4850,6 +4953,7 @@ static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
>    */
>   static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
>   {
> +    assert_carry_dead(s);
>       sync_globals(s, allocated_regs);
>   
>       for (int i = s->nb_globals; i < s->nb_temps; i++) {
> @@ -5121,6 +5225,10 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
>       int const_args[TCG_MAX_OP_ARGS];
>       TCGCond op_cond;
>   
> +    if (def->flags & TCG_OPF_CARRY_IN) {
> +        tcg_debug_assert(s->carry_live);
> +    }
> +
>       nb_oargs = def->nb_oargs;
>       nb_iargs = def->nb_iargs;
>   
> @@ -5377,6 +5485,7 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
>           tcg_reg_alloc_bb_end(s, i_allocated_regs);
>       } else {
>           if (def->flags & TCG_OPF_CALL_CLOBBER) {
> +            assert_carry_dead(s);
>               /* XXX: permit generic clobber register list ? */
>               for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
>                   if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
> @@ -5494,7 +5603,8 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
>   
>       case INDEX_op_sub:
>           {
> -            const TCGOutOpSubtract *out = &outop_sub;
> +            const TCGOutOpSubtract *out =
> +                container_of(all_outop[op->opc], TCGOutOpSubtract, base);
>   
>               /*
>                * Constants should never appear in the second source operand.
> @@ -5509,6 +5619,16 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
>           }
>           break;
>   
> +    case INDEX_op_addco:
> +    case INDEX_op_subbo:
> +    case INDEX_op_addci:
> +    case INDEX_op_subbi:
> +    case INDEX_op_addcio:
> +    case INDEX_op_subbio:
> +    case INDEX_op_addc1o:
> +    case INDEX_op_subb1o:
> +        g_assert_not_reached();
> +
>       case INDEX_op_bswap64:
>       case INDEX_op_ext_i32_i64:
>       case INDEX_op_extu_i32_i64:
> @@ -5697,6 +5817,13 @@ static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
>           break;
>       }
>   
> +    if (def->flags & TCG_OPF_CARRY_IN) {
> +        s->carry_live = false;
> +    }
> +    if (def->flags & TCG_OPF_CARRY_OUT) {
> +        s->carry_live = true;
> +    }
> +
>       /* move the outputs in the correct register if needed */
>       for(i = 0; i < nb_oargs; i++) {
>           ts = arg_temp(op->args[i]);
> @@ -6699,6 +6826,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
>       tcg_out_tb_start(s);
>   
>       num_insns = -1;
> +    s->carry_live = false;
>       QTAILQ_FOREACH(op, &s->ops, link) {
>           TCGOpcode opc = op->opc;
>   
> @@ -6727,6 +6855,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
>               tcg_reg_alloc_dup(s, op);
>               break;
>           case INDEX_op_insn_start:
> +            assert_carry_dead(s);
>               if (num_insns >= 0) {
>                   size_t off = tcg_current_code_size(s);
>                   s->gen_insn_end_off[num_insns] = off;
> @@ -6747,6 +6876,7 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
>               tcg_out_label(s, arg_label(op->args[0]));
>               break;
>           case INDEX_op_call:
> +            assert_carry_dead(s);
>               tcg_reg_alloc_call(s, op);
>               break;
>           case INDEX_op_exit_tb:
> @@ -6783,6 +6913,8 @@ int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
>               return -2;
>           }
>       }
> +    assert_carry_dead(s);
> +
>       tcg_debug_assert(num_insns + 1 == s->gen_tb->icount);
>       s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
>   
> diff --git a/docs/devel/tcg-ops.rst b/docs/devel/tcg-ops.rst
> index 9392d88069..93bcc70639 100644
> --- a/docs/devel/tcg-ops.rst
> +++ b/docs/devel/tcg-ops.rst
> @@ -593,6 +593,67 @@ Multiword arithmetic support
>   
>   .. list-table::
>   
> +   * - addco *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* + *t2* and in addition output to the
> +         carry bit provided by the host architecture.
> +
> +   * - addci *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
> +         input carry bit provided by the host architecture.
> +         The output carry bit need not be computed.
> +
> +   * - addcio *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
> +         input carry bit provided by the host architecture,
> +         and also compute the output carry bit.
> +
> +   * - addc1o *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* + *t2* + 1, and in addition output to the
> +         carry bit provided by the host architecture.  This is akin to
> +         *addcio* with a fixed carry-in value of 1.
> +       | This is intended to be used by the optimization pass,
> +         intermediate to complete folding of the addition chain.
> +         In some cases complete folding is not possible and this
> +         opcode will remain until output.  If this happens, the
> +         code generator will use ``tcg_out_set_carry`` and then
> +         the output routine for *addcio*.
> +
> +   * - subbo *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* - *t2* and in addition output to the
> +         borrow bit provided by the host architecture.
> +       | Depending on the host architecture, the carry bit may or may not be
> +         identical to the borrow bit.  Thus the addc\* and subb\*
> +         opcodes must not be mixed.
> +
> +   * - subbi *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
> +         input borrow bit provided by the host architecture.
> +         The output borrow bit need not be computed.
> +
> +   * - subbio *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
> +         input borrow bit provided by the host architecture,
> +         and also compute the output borrow bit.
> +
> +   * - subb1o *t0*, *t1*, *t2*
> +
> +     - | Compute *t0* = *t1* - *t2* - 1, and in addition output to the
> +         borrow bit provided by the host architecture.  This is akin to
> +         *subbio* with a fixed borrow-in value of 1.
> +       | This is intended to be used by the optimization pass,
> +         intermediate to complete folding of the subtraction chain.
> +         In some cases complete folding is not possible and this
> +         opcode will remain until output.  If this happens, the
> +         code generator will use ``tcg_out_set_borrow`` and then
> +         the output routine for *subbio*.
> +
>      * - add2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
>   
>          sub2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*

Reviewed-by: Pierrick Bouvier <pierrick.bouvier@linaro.org>
diff mbox series

Patch

diff --git a/include/tcg/tcg-opc.h b/include/tcg/tcg-opc.h
index 1f995c54be..9cc20cd62c 100644
--- a/include/tcg/tcg-opc.h
+++ b/include/tcg/tcg-opc.h
@@ -82,6 +82,16 @@  DEF(shr, 1, 2, 0, TCG_OPF_INT)
 DEF(sub, 1, 2, 0, TCG_OPF_INT)
 DEF(xor, 1, 2, 0, TCG_OPF_INT)
 
+DEF(addco, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(addc1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(addci, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
+DEF(addcio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
+
+DEF(subbo, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(subb1o, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_OUT)
+DEF(subbi, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN)
+DEF(subbio, 1, 2, 0, TCG_OPF_INT | TCG_OPF_CARRY_IN | TCG_OPF_CARRY_OUT)
+
 /* load/store */
 DEF(ld8u_i32, 1, 1, 1, 0)
 DEF(ld8s_i32, 1, 1, 1, 0)
diff --git a/include/tcg/tcg.h b/include/tcg/tcg.h
index c6b50b5226..aa300a2f8b 100644
--- a/include/tcg/tcg.h
+++ b/include/tcg/tcg.h
@@ -418,6 +418,11 @@  struct TCGContext {
     MemOp riscv_cur_vsew;
     TCGType riscv_cur_type;
 #endif
+    /*
+     * During the tcg_reg_alloc_op loop, we are within a sequence of
+     * carry-using opcodes like addco+addci.
+     */
+    bool carry_live;
 
     GHashTable *const_table[TCG_TYPE_COUNT];
     TCGTempSet free_temps[TCG_TYPE_COUNT];
@@ -749,13 +754,17 @@  enum {
     /* Instruction operands are vectors.  */
     TCG_OPF_VECTOR       = 0x40,
     /* Instruction is a conditional branch. */
-    TCG_OPF_COND_BRANCH  = 0x80
+    TCG_OPF_COND_BRANCH  = 0x80,
+    /* Instruction produces carry out. */
+    TCG_OPF_CARRY_OUT    = 0x100,
+    /* Instruction consumes carry in. */
+    TCG_OPF_CARRY_IN     = 0x200,
 };
 
 typedef struct TCGOpDef {
     const char *name;
     uint8_t nb_oargs, nb_iargs, nb_cargs, nb_args;
-    uint8_t flags;
+    uint16_t flags;
 } TCGOpDef;
 
 extern const TCGOpDef tcg_op_defs[];
diff --git a/tcg/optimize.c b/tcg/optimize.c
index 9595b32d54..5a21f8bfd9 100644
--- a/tcg/optimize.c
+++ b/tcg/optimize.c
@@ -1214,6 +1214,12 @@  static bool fold_add_vec(OptContext *ctx, TCGOp *op)
     return finish_folding(ctx, op);
 }
 
+static bool fold_add_carry(OptContext *ctx, TCGOp *op)
+{
+    fold_commutative(ctx, op);
+    return finish_folding(ctx, op);
+}
+
 static bool fold_addsub2(OptContext *ctx, TCGOp *op, bool add)
 {
     bool a_const = arg_is_const(op->args[2]) && arg_is_const(op->args[3]);
@@ -2817,6 +2823,11 @@  void tcg_optimize(TCGContext *s)
         case INDEX_op_add_vec:
             done = fold_add_vec(&ctx, op);
             break;
+        case INDEX_op_addci:
+        case INDEX_op_addco:
+        case INDEX_op_addcio:
+            done = fold_add_carry(&ctx, op);
+            break;
         CASE_OP_32_64(add2):
             done = fold_add2(&ctx, op);
             break;
diff --git a/tcg/tcg.c b/tcg/tcg.c
index 381e76cfc8..c6a49f5648 100644
--- a/tcg/tcg.c
+++ b/tcg/tcg.c
@@ -3914,6 +3914,17 @@  liveness_pass_0(TCGContext *s)
     }
 }
 
+static void assert_carry_dead(TCGContext *s)
+{
+    /*
+     * Carry operations can be separated by a few insns like mov,
+     * load or store, but they should always be "close", and
+     * carry-out operations should always be paired with carry-in.
+     * At various boundaries, carry must have been consumed.
+     */
+    tcg_debug_assert(!s->carry_live);
+}
+
 /* Liveness analysis : update the opc_arg_life array to tell if a
    given input arguments is dead. Instructions updating dead
    temporaries are removed. */
@@ -3933,17 +3944,19 @@  liveness_pass_1(TCGContext *s)
     /* ??? Should be redundant with the exit_tb that ends the TB.  */
     la_func_end(s, nb_globals, nb_temps);
 
+    s->carry_live = false;
     QTAILQ_FOREACH_REVERSE_SAFE(op, &s->ops, link, op_prev) {
         int nb_iargs, nb_oargs;
         TCGOpcode opc_new, opc_new2;
         TCGLifeData arg_life = 0;
         TCGTemp *ts;
         TCGOpcode opc = op->opc;
-        const TCGOpDef *def = &tcg_op_defs[opc];
+        const TCGOpDef *def;
         const TCGArgConstraint *args_ct;
 
         switch (opc) {
         case INDEX_op_call:
+            assert_carry_dead(s);
             {
                 const TCGHelperInfo *info = tcg_call_info(op);
                 int call_flags = tcg_call_flags(op);
@@ -4055,6 +4068,7 @@  liveness_pass_1(TCGContext *s)
             }
             break;
         case INDEX_op_insn_start:
+            assert_carry_dead(s);
             break;
         case INDEX_op_discard:
             /* mark the temporary as dead */
@@ -4071,6 +4085,7 @@  liveness_pass_1(TCGContext *s)
         case INDEX_op_sub2_i64:
             opc_new = INDEX_op_sub;
         do_addsub2:
+            assert_carry_dead(s);
             /* Test if the high part of the operation is dead, but not
                the low part.  The result can be optimized to a simple
                add or sub.  This happens often for x86_64 guest when the
@@ -4096,6 +4111,7 @@  liveness_pass_1(TCGContext *s)
             opc_new = INDEX_op_mul;
             opc_new2 = INDEX_op_muluh;
         do_mul2:
+            assert_carry_dead(s);
             if (arg_temp(op->args[1])->state == TS_DEAD) {
                 if (arg_temp(op->args[0])->state == TS_DEAD) {
                     /* Both parts of the operation are dead.  */
@@ -4118,10 +4134,87 @@  liveness_pass_1(TCGContext *s)
             /* Mark the single-word operation live.  */
             goto do_not_remove;
 
+        case INDEX_op_addco:
+            if (s->carry_live) {
+                goto do_not_remove;
+            }
+            op->opc = opc = INDEX_op_add;
+            goto do_default;
+
+        case INDEX_op_addcio:
+            if (s->carry_live) {
+                goto do_not_remove;
+            }
+            op->opc = opc = INDEX_op_addci;
+            goto do_default;
+
+        case INDEX_op_subbo:
+            if (s->carry_live) {
+                goto do_not_remove;
+            }
+            /* Lower to sub, but this may also require canonicalization. */
+            op->opc = opc = INDEX_op_sub;
+            ts = arg_temp(op->args[2]);
+            if (ts->kind == TEMP_CONST) {
+                ts = tcg_constant_internal(ts->type, -ts->val);
+                if (ts->state_ptr == NULL) {
+                    tcg_debug_assert(temp_idx(ts) == nb_temps);
+                    nb_temps++;
+                    ts->state_ptr = tcg_malloc(sizeof(TCGRegSet));
+                    ts->state = TS_DEAD;
+                    la_reset_pref(ts);
+                }
+                op->args[2] = temp_arg(ts);
+                op->opc = opc = INDEX_op_add;
+            }
+            goto do_default;
+
+        case INDEX_op_subbio:
+            if (s->carry_live) {
+                goto do_not_remove;
+            }
+            op->opc = opc = INDEX_op_subbi;
+            goto do_default;
+
+        case INDEX_op_addc1o:
+            if (s->carry_live) {
+                goto do_not_remove;
+            }
+            /* Lower to add, add +1. */
+            op_prev = tcg_op_insert_before(s, op, INDEX_op_add, 3);
+            op_prev->args[0] = op->args[0];
+            op_prev->args[1] = op->args[1];
+            op_prev->args[2] = op->args[2];
+            op->opc = opc = INDEX_op_add;
+            op->args[1] = op->args[0];
+            ts = arg_temp(op->args[0]);
+            ts = tcg_constant_internal(ts->type, 1);
+            op->args[2] = temp_arg(ts);
+            goto do_default;
+
+        case INDEX_op_subb1o:
+            if (s->carry_live) {
+                goto do_not_remove;
+            }
+            /* Lower to sub, add -1. */
+            op_prev = tcg_op_insert_before(s, op, INDEX_op_sub, 3);
+            op_prev->args[0] = op->args[0];
+            op_prev->args[1] = op->args[1];
+            op_prev->args[2] = op->args[2];
+            op->opc = opc = INDEX_op_add;
+            op->args[1] = op->args[0];
+            ts = arg_temp(op->args[0]);
+            ts = tcg_constant_internal(ts->type, -1);
+            op->args[2] = temp_arg(ts);
+            goto do_default;
+
         default:
-            /* Test if the operation can be removed because all
-               its outputs are dead. We assume that nb_oargs == 0
-               implies side effects */
+        do_default:
+            /*
+             * Test if the operation can be removed because all
+             * its outputs are dead. We assume that nb_oargs == 0
+             * implies side effects.
+             */
             def = &tcg_op_defs[opc];
             if (!(def->flags & TCG_OPF_SIDE_EFFECTS) && def->nb_oargs != 0) {
                 for (int i = def->nb_oargs - 1; i >= 0; i--) {
@@ -4163,12 +4256,16 @@  liveness_pass_1(TCGContext *s)
 
             /* If end of basic block, update.  */
             if (def->flags & TCG_OPF_BB_EXIT) {
+                assert_carry_dead(s);
                 la_func_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_COND_BRANCH) {
+                assert_carry_dead(s);
                 la_bb_sync(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_BB_END) {
+                assert_carry_dead(s);
                 la_bb_end(s, nb_globals, nb_temps);
             } else if (def->flags & TCG_OPF_SIDE_EFFECTS) {
+                assert_carry_dead(s);
                 la_global_sync(s, nb_globals);
                 if (def->flags & TCG_OPF_CALL_CLOBBER) {
                     la_cross_call(s, nb_temps);
@@ -4182,6 +4279,9 @@  liveness_pass_1(TCGContext *s)
                     arg_life |= DEAD_ARG << i;
                 }
             }
+            if (def->flags & TCG_OPF_CARRY_OUT) {
+                s->carry_live = false;
+            }
 
             /* Input arguments are live for preceding opcodes.  */
             for (int i = nb_oargs; i < nb_oargs + nb_iargs; i++) {
@@ -4193,6 +4293,9 @@  liveness_pass_1(TCGContext *s)
                     ts->state &= ~TS_DEAD;
                 }
             }
+            if (def->flags & TCG_OPF_CARRY_IN) {
+                s->carry_live = true;
+            }
 
             /* Incorporate constraints for this operand.  */
             switch (opc) {
@@ -4232,6 +4335,7 @@  liveness_pass_1(TCGContext *s)
         }
         op->life = arg_life;
     }
+    assert_carry_dead(s);
 }
 
 /* Liveness analysis: Convert indirect regs to direct temporaries.  */
@@ -4817,9 +4921,8 @@  static void sync_globals(TCGContext *s, TCGRegSet allocated_regs)
    all globals are stored at their canonical location. */
 static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
 {
-    int i;
-
-    for (i = s->nb_globals; i < s->nb_temps; i++) {
+    assert_carry_dead(s);
+    for (int i = s->nb_globals; i < s->nb_temps; i++) {
         TCGTemp *ts = &s->temps[i];
 
         switch (ts->kind) {
@@ -4850,6 +4953,7 @@  static void tcg_reg_alloc_bb_end(TCGContext *s, TCGRegSet allocated_regs)
  */
 static void tcg_reg_alloc_cbranch(TCGContext *s, TCGRegSet allocated_regs)
 {
+    assert_carry_dead(s);
     sync_globals(s, allocated_regs);
 
     for (int i = s->nb_globals; i < s->nb_temps; i++) {
@@ -5121,6 +5225,10 @@  static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
     int const_args[TCG_MAX_OP_ARGS];
     TCGCond op_cond;
 
+    if (def->flags & TCG_OPF_CARRY_IN) {
+        tcg_debug_assert(s->carry_live);
+    }
+
     nb_oargs = def->nb_oargs;
     nb_iargs = def->nb_iargs;
 
@@ -5377,6 +5485,7 @@  static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         tcg_reg_alloc_bb_end(s, i_allocated_regs);
     } else {
         if (def->flags & TCG_OPF_CALL_CLOBBER) {
+            assert_carry_dead(s);
             /* XXX: permit generic clobber register list ? */
             for (i = 0; i < TCG_TARGET_NB_REGS; i++) {
                 if (tcg_regset_test_reg(tcg_target_call_clobber_regs, i)) {
@@ -5494,7 +5603,8 @@  static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
 
     case INDEX_op_sub:
         {
-            const TCGOutOpSubtract *out = &outop_sub;
+            const TCGOutOpSubtract *out =
+                container_of(all_outop[op->opc], TCGOutOpSubtract, base);
 
             /*
              * Constants should never appear in the second source operand.
@@ -5509,6 +5619,16 @@  static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         }
         break;
 
+    case INDEX_op_addco:
+    case INDEX_op_subbo:
+    case INDEX_op_addci:
+    case INDEX_op_subbi:
+    case INDEX_op_addcio:
+    case INDEX_op_subbio:
+    case INDEX_op_addc1o:
+    case INDEX_op_subb1o:
+        g_assert_not_reached();
+
     case INDEX_op_bswap64:
     case INDEX_op_ext_i32_i64:
     case INDEX_op_extu_i32_i64:
@@ -5697,6 +5817,13 @@  static void tcg_reg_alloc_op(TCGContext *s, const TCGOp *op)
         break;
     }
 
+    if (def->flags & TCG_OPF_CARRY_IN) {
+        s->carry_live = false;
+    }
+    if (def->flags & TCG_OPF_CARRY_OUT) {
+        s->carry_live = true;
+    }
+
     /* move the outputs in the correct register if needed */
     for(i = 0; i < nb_oargs; i++) {
         ts = arg_temp(op->args[i]);
@@ -6699,6 +6826,7 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
     tcg_out_tb_start(s);
 
     num_insns = -1;
+    s->carry_live = false;
     QTAILQ_FOREACH(op, &s->ops, link) {
         TCGOpcode opc = op->opc;
 
@@ -6727,6 +6855,7 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
             tcg_reg_alloc_dup(s, op);
             break;
         case INDEX_op_insn_start:
+            assert_carry_dead(s);
             if (num_insns >= 0) {
                 size_t off = tcg_current_code_size(s);
                 s->gen_insn_end_off[num_insns] = off;
@@ -6747,6 +6876,7 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
             tcg_out_label(s, arg_label(op->args[0]));
             break;
         case INDEX_op_call:
+            assert_carry_dead(s);
             tcg_reg_alloc_call(s, op);
             break;
         case INDEX_op_exit_tb:
@@ -6783,6 +6913,8 @@  int tcg_gen_code(TCGContext *s, TranslationBlock *tb, uint64_t pc_start)
             return -2;
         }
     }
+    assert_carry_dead(s);
+
     tcg_debug_assert(num_insns + 1 == s->gen_tb->icount);
     s->gen_insn_end_off[num_insns] = tcg_current_code_size(s);
 
diff --git a/docs/devel/tcg-ops.rst b/docs/devel/tcg-ops.rst
index 9392d88069..93bcc70639 100644
--- a/docs/devel/tcg-ops.rst
+++ b/docs/devel/tcg-ops.rst
@@ -593,6 +593,67 @@  Multiword arithmetic support
 
 .. list-table::
 
+   * - addco *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* + *t2* and in addition output to the
+         carry bit provided by the host architecture.
+
+   * - addci *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
+         input carry bit provided by the host architecture.
+         The output carry bit need not be computed.
+
+   * - addcio *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* + *t2* + *C*, where *C* is the
+         input carry bit provided by the host architecture,
+         and also compute the output carry bit.
+
+   * - addc1o *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* + *t2* + 1, and in addition output to the
+         carry bit provided by the host architecture.  This is akin to
+         *addcio* with a fixed carry-in value of 1.
+       | This is intended to be used by the optimization pass,
+         intermediate to complete folding of the addition chain.
+         In some cases complete folding is not possible and this
+         opcode will remain until output.  If this happens, the
+         code generator will use ``tcg_out_set_carry`` and then
+         the output routine for *addcio*.
+
+   * - subbo *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* - *t2* and in addition output to the
+         borrow bit provided by the host architecture.
+       | Depending on the host architecture, the carry bit may or may not be
+         identical to the borrow bit.  Thus the addc\* and subb\*
+         opcodes must not be mixed.
+
+   * - subbi *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
+         input borrow bit provided by the host architecture.
+         The output borrow bit need not be computed.
+
+   * - subbio *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* - *t2* - *B*, where *B* is the
+         input borrow bit provided by the host architecture,
+         and also compute the output borrow bit.
+
+   * - subb1o *t0*, *t1*, *t2*
+
+     - | Compute *t0* = *t1* - *t2* - 1, and in addition output to the
+         borrow bit provided by the host architecture.  This is akin to
+         *subbio* with a fixed borrow-in value of 1.
+       | This is intended to be used by the optimization pass,
+         intermediate to complete folding of the subtraction chain.
+         In some cases complete folding is not possible and this
+         opcode will remain until output.  If this happens, the
+         code generator will use ``tcg_out_set_borrow`` and then
+         the output routine for *subbio*.
+
    * - add2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*
 
        sub2_i32/i64 *t0_low*, *t0_high*, *t1_low*, *t1_high*, *t2_low*, *t2_high*