| Message ID | 87mv4cbu4z.fsf@linaro.org |
|---|---|
| State | New |
| Series | [03/nn,AArch64] Rework interface to add constant/offset routines |
Richard Sandiford <richard.sandiford@linaro.org> writes:
> The port had aarch64_add_offset and aarch64_add_constant routines
> that did similar things.  This patch replaces them with an expanded
> version of aarch64_add_offset that takes separate source and
> destination registers.  The new routine also takes a poly_int64 offset
> instead of a HOST_WIDE_INT offset, but it leaves the HOST_WIDE_INT
> case to aarch64_add_offset_1, which is basically a repurposed
> aarch64_add_constant_internal.  The SVE patch will put the handling
> of VL-based constants in aarch64_add_offset, while still using
> aarch64_add_offset_1 for the constant part.
>
> The vcall_offset == 0 path in aarch64_output_mi_thunk will use temp0
> as well as temp1 once SVE is added.
>
> A side-effect of the patch is that we now generate:
>
>     mov x29, sp
>
> instead of:
>
>     add x29, sp, 0
>
> in the pr70044.c test.

Sorry, I stupidly rebased the patch just before posting and so
introduced a last-minute bug.  Here's a fixed version that survives
testing on aarch64-linux-gnu.

Thanks,
Richard


2017-10-30  Richard Sandiford  <richard.sandiford@linaro.org>
            Alan Hayward  <alan.hayward@arm.com>
            David Sherwood  <david.sherwood@arm.com>

gcc/
    * config/aarch64/aarch64.c (aarch64_force_temporary): Assert that
    x exists before using it.
    (aarch64_add_constant_internal): Rename to...
    (aarch64_add_offset_1): ...this.  Replace regnum with separate
    src and dest rtxes.  Handle the case in which they're different,
    including when the offset is zero.  Replace scratchreg with an rtx.
    Use 2 additions if there is no spare register into which we can
    move a 16-bit constant.
    (aarch64_add_constant): Delete.
    (aarch64_add_offset): Replace reg with separate src and dest
    rtxes.  Take a poly_int64 offset instead of a HOST_WIDE_INT.
    Use aarch64_add_offset_1.
    (aarch64_add_sp, aarch64_sub_sp): Take the scratch register as
    an rtx rather than an int.  Take the delta as a poly_int64
    rather than a HOST_WIDE_INT.  Use aarch64_add_offset.
    (aarch64_expand_mov_immediate): Update uses of aarch64_add_offset.
    (aarch64_allocate_and_probe_stack_space): Take the scratch register
    as an rtx rather than an int.  Use Pmode rather than word_mode
    in the loop code.  Update calls to aarch64_sub_sp.
    (aarch64_expand_prologue): Update calls to aarch64_sub_sp,
    aarch64_allocate_and_probe_stack_space and aarch64_add_offset.
    (aarch64_expand_epilogue): Update calls to aarch64_add_offset
    and aarch64_add_sp.
    (aarch64_output_mi_thunk): Use aarch64_add_offset rather than
    aarch64_add_constant.

gcc/testsuite/
    * gcc.target/aarch64/pr70044.c: Allow "mov x29, sp" too.

```diff
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	2017-10-30 10:45:23.806582919 +0000
+++ gcc/config/aarch64/aarch64.c	2017-10-30 10:46:47.278836421 +0000
@@ -1818,30 +1818,13 @@ aarch64_force_temporary (machine_mode mo
     return force_reg (mode, value);
   else
     {
-      x = aarch64_emit_move (x, value);
+      gcc_assert (x);
+      aarch64_emit_move (x, value);
       return x;
     }
 }
 
-static rtx
-aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
-		    HOST_WIDE_INT offset)
-{
-  if (!aarch64_plus_immediate (GEN_INT (offset), mode))
-    {
-      rtx high;
-      /* Load the full offset into a register.  This
-         might be improvable in the future.  */
-      high = GEN_INT (offset);
-      offset = 0;
-      high = aarch64_force_temporary (mode, temp, high);
-      reg = aarch64_force_temporary (mode, temp,
-				     gen_rtx_PLUS (mode, high, reg));
-    }
-  return plus_constant (mode, reg, offset);
-}
-
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 				scalar_int_mode mode)
@@ -1966,86 +1949,123 @@ aarch64_internal_mov_immediate (rtx dest
   return num_insns;
 }
 
-/* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
-   temporary value if necessary.  FRAME_RELATED_P should be true if
-   the RTX_FRAME_RELATED flag should be set and CFA adjustments added
-   to the generated instructions.  If SCRATCHREG is known to hold
-   abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
-   immediate again.
-
-   Since this function may be used to adjust the stack pointer, we must
-   ensure that it cannot cause transient stack deallocation (for example
-   by first incrementing SP and then decrementing when adjusting by a
-   large immediate).  */
+/* A subroutine of aarch64_add_offset that handles the case in which
+   OFFSET is known at compile time.  The arguments are otherwise the same.  */
 
 static void
-aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
-			       int scratchreg, HOST_WIDE_INT delta,
-			       bool frame_related_p, bool emit_move_imm)
+aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
+		      rtx src, HOST_WIDE_INT offset, rtx temp1,
+		      bool frame_related_p, bool emit_move_imm)
 {
-  HOST_WIDE_INT mdelta = abs_hwi (delta);
-  rtx this_rtx = gen_rtx_REG (mode, regnum);
+  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
+  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
+
+  HOST_WIDE_INT moffset = abs_hwi (offset);
   rtx_insn *insn;
 
-  if (!mdelta)
-    return;
+  if (!moffset)
+    {
+      if (!rtx_equal_p (dest, src))
+	{
+	  insn = emit_insn (gen_rtx_SET (dest, src));
+	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
+	}
+      return;
+    }
 
   /* Single instruction adjustment.  */
-  if (aarch64_uimm12_shift (mdelta))
+  if (aarch64_uimm12_shift (moffset))
     {
-      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
+      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
       return;
     }
 
-  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
-     Only do this if mdelta is not a 16-bit move as adjusting using a move
-     is better.  */
-  if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
+  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
+     and either:
+
+     a) the offset cannot be loaded by a 16-bit move or
+     b) there is no spare register into which we can move it.  */
+  if (moffset < 0x1000000
+      && ((!temp1 && !can_create_pseudo_p ())
+	  || !aarch64_move_imm (moffset, mode)))
     {
-      HOST_WIDE_INT low_off = mdelta & 0xfff;
+      HOST_WIDE_INT low_off = moffset & 0xfff;
 
-      low_off = delta < 0 ? -low_off : low_off;
-      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
+      low_off = offset < 0 ? -low_off : low_off;
+      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
-      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
+      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
       return;
     }
 
   /* Emit a move immediate if required and an addition/subtraction.  */
-  rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
   if (emit_move_imm)
-    aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
-  insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
-			      : gen_add2_insn (this_rtx, scratch_rtx));
+    {
+      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
+      temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
+    }
+  insn = emit_insn (offset < 0
+		    ? gen_sub3_insn (dest, src, temp1)
+		    : gen_add3_insn (dest, src, temp1));
   if (frame_related_p)
     {
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
-      rtx adj = plus_constant (mode, this_rtx, delta);
-      add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
+      rtx adj = plus_constant (mode, src, offset);
+      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
     }
 }
 
-static inline void
-aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
-		      HOST_WIDE_INT delta)
-{
-  aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
-}
+/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
+   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
+   be set and CFA adjustments added to the generated instructions.
+
+   TEMP1, if nonnull, is a register of mode MODE that can be used as a
+   temporary if register allocation is already complete.  This temporary
+   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
+   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
+   the immediate again.
+
+   Since this function may be used to adjust the stack pointer, we must
+   ensure that it cannot cause transient stack deallocation (for example
+   by first incrementing SP and then decrementing when adjusting by a
+   large immediate).  */
+
+static void
+aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+		    poly_int64 offset, rtx temp1, bool frame_related_p,
+		    bool emit_move_imm = true)
+{
+  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
+  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
+
+  /* SVE support will go here.  */
+  HOST_WIDE_INT constant = offset.to_constant ();
+  aarch64_add_offset_1 (mode, dest, src, constant, temp1,
+			frame_related_p, emit_move_imm);
+}
+
+/* Add DELTA to the stack pointer, marking the instructions frame-related.
+   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
+   if TEMP1 already contains abs (DELTA).  */
 
 static inline void
-aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
+aarch64_add_sp (rtx temp1, poly_int64 delta, bool emit_move_imm)
 {
-  aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
-				 true, emit_move_imm);
+  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
+		      temp1, true, emit_move_imm);
 }
 
+/* Subtract DELTA from the stack pointer, marking the instructions
+   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
+   if nonnull.  */
+
 static inline void
-aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
+aarch64_sub_sp (rtx temp1, poly_int64 delta, bool frame_related_p)
 {
-  aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
-				 frame_related_p, true);
+  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
+		      temp1, frame_related_p);
 }
 
 void
@@ -2078,9 +2098,8 @@ aarch64_expand_mov_immediate (rtx dest,
 	    {
 	      gcc_assert (can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
-	      base = aarch64_add_offset (int_mode, NULL, base,
-					 INTVAL (offset));
-	      aarch64_emit_move (dest, base);
+	      aarch64_add_offset (int_mode, dest, base, INTVAL (offset),
+				  NULL_RTX, false);
 	      return;
 	    }
 
@@ -2119,9 +2138,8 @@ aarch64_expand_mov_immediate (rtx dest,
 	      gcc_assert(can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
-	      base = aarch64_add_offset (int_mode, NULL, base,
-					 INTVAL (offset));
-	      aarch64_emit_move (dest, base);
+	      aarch64_add_offset (int_mode, dest, base, INTVAL (offset),
+				  NULL_RTX, false);
 	      return;
 	    }
 	  /* FALLTHRU */
@@ -3613,11 +3631,10 @@ aarch64_set_handled_components (sbitmap
       cfun->machine->reg_is_wrapped_separately[regno] = true;
 }
 
-/* Allocate SIZE bytes of stack space using SCRATCH_REG as a scratch
-   register.  */
+/* Allocate SIZE bytes of stack space using TEMP1 as a scratch register.  */
 
 static void
-aarch64_allocate_and_probe_stack_space (int scratchreg, HOST_WIDE_INT size)
+aarch64_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size)
 {
   HOST_WIDE_INT probe_interval
     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
@@ -3631,7 +3648,7 @@ aarch64_allocate_and_probe_stack_space (
      We can allocate GUARD_SIZE - GUARD_USED_BY_CALLER as a single chunk
      without any probing.  */
   gcc_assert (size >= guard_size - guard_used_by_caller);
-  aarch64_sub_sp (scratchreg, guard_size - guard_used_by_caller, true);
+  aarch64_sub_sp (temp1, guard_size - guard_used_by_caller, true);
   HOST_WIDE_INT orig_size = size;
   size -= (guard_size - guard_used_by_caller);
@@ -3643,17 +3660,16 @@ aarch64_allocate_and_probe_stack_space (
   if (rounded_size && rounded_size <= 4 * probe_interval)
     {
       /* We don't use aarch64_sub_sp here because we don't want to
-	 repeatedly load SCRATCHREG.  */
-      rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
+	 repeatedly load TEMP1.  */
       if (probe_interval > ARITH_FACTOR)
-	emit_move_insn (scratch_rtx, GEN_INT (-probe_interval));
+	emit_move_insn (temp1, GEN_INT (-probe_interval));
       else
-	scratch_rtx = GEN_INT (-probe_interval);
+	temp1 = GEN_INT (-probe_interval);
 
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += probe_interval)
 	{
 	  rtx_insn *insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
-						     scratch_rtx));
+						     temp1));
 	  add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
 
 	  if (probe_interval > ARITH_FACTOR)
@@ -3674,10 +3690,10 @@ aarch64_allocate_and_probe_stack_space (
   else if (rounded_size)
     {
       /* Compute the ending address.  */
-      rtx temp = gen_rtx_REG (word_mode, scratchreg);
-      emit_move_insn (temp, GEN_INT (-rounded_size));
+      unsigned int scratchreg = REGNO (temp1);
+      emit_move_insn (temp1, GEN_INT (-rounded_size));
       rtx_insn *insn
-	= emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
+	= emit_insn (gen_add3_insn (temp1, stack_pointer_rtx, temp1));
 
       /* For the initial allocation, we don't have a frame pointer
 	 set up, so we always need CFI notes.  If we're doing the
@@ -3692,7 +3708,7 @@ aarch64_allocate_and_probe_stack_space (
 	  /* We want the CFA independent of the stack pointer for the
 	     duration of the loop.  */
 	  add_reg_note (insn, REG_CFA_DEF_CFA,
-			plus_constant (Pmode, temp,
+			plus_constant (Pmode, temp1,
 				       (rounded_size + (orig_size - size))));
 	  RTX_FRAME_RELATED_P (insn) = 1;
 	}
@@ -3702,7 +3718,7 @@ aarch64_allocate_and_probe_stack_space (
 	 It also probes at a 4k interval regardless of the value of
 	 PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL.  */
       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
-						stack_pointer_rtx, temp));
+						stack_pointer_rtx, temp1));
 
       /* Now reset the CFA register if needed.  */
       if (scratchreg == IP0_REGNUM || !frame_pointer_needed)
@@ -3723,7 +3739,7 @@ aarch64_allocate_and_probe_stack_space (
      Note that any residual must be probed.  */
   if (residual)
     {
-      aarch64_sub_sp (scratchreg, residual, true);
+      aarch64_sub_sp (temp1, residual, true);
       add_reg_note (get_last_insn (), REG_STACK_CHECK, const0_rtx);
       emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
 				       (residual - GET_MODE_SIZE (word_mode))));
@@ -3814,6 +3830,9 @@ aarch64_expand_prologue (void)
       aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
     }
 
+  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
+  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+
   /* We do not fully protect aarch64 against stack clash style attacks
      as doing so would be prohibitively expensive with less utility over
      time as newer compilers are deployed.
@@ -3859,9 +3878,9 @@ aarch64_expand_prologue (void)
      outgoing args.  */
   if (flag_stack_clash_protection
       && initial_adjust >= guard_size - guard_used_by_caller)
-    aarch64_allocate_and_probe_stack_space (IP0_REGNUM, initial_adjust);
+    aarch64_allocate_and_probe_stack_space (ip0_rtx, initial_adjust);
   else
-    aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
+    aarch64_sub_sp (ip0_rtx, initial_adjust, true);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -3871,10 +3890,9 @@ aarch64_expand_prologue (void)
       if (callee_adjust == 0)
 	aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
 				   R30_REGNUM, false);
-      insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (callee_offset)));
-      RTX_FRAME_RELATED_P (insn) = frame_pointer_needed;
+      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
+			  stack_pointer_rtx, callee_offset, ip1_rtx,
+			  frame_pointer_needed);
       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
     }
 
@@ -3890,9 +3908,9 @@ aarch64_expand_prologue (void)
	     less the amount of the guard reserved for use by the caller's
	     outgoing args.  */
	  if (final_adjust >= guard_size - guard_used_by_caller)
-	    aarch64_allocate_and_probe_stack_space (IP1_REGNUM, final_adjust);
+	    aarch64_allocate_and_probe_stack_space (ip1_rtx, final_adjust);
	  else
-	    aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+	    aarch64_sub_sp (ip1_rtx, final_adjust, !frame_pointer_needed);
 
	  /* We must also probe if the final adjustment is larger than the
	     guard that is assumed used by the caller.  This may be
	     sub-optimal.  */
@@ -3905,7 +3923,7 @@ aarch64_expand_prologue (void)
	    }
	}
       else
-	aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+	aarch64_sub_sp (ip1_rtx, final_adjust, !frame_pointer_needed);
 }
 
 /* Return TRUE if we can use a simple_return insn.
@@ -3961,17 +3979,16 @@ aarch64_expand_epilogue (bool for_sibcal
   /* Restore the stack pointer from the frame pointer if it may not
      be the same as the stack pointer.  */
+  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
+  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
-    {
-      insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
-				       hard_frame_pointer_rtx,
-				       GEN_INT (-callee_offset)));
-      /* If writeback is used when restoring callee-saves, the CFA
-	 is restored on the instruction doing the writeback.  */
-      RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
-    }
+    /* If writeback is used when restoring callee-saves, the CFA
+       is restored on the instruction doing the writeback.  */
+    aarch64_add_offset (Pmode, stack_pointer_rtx,
+			hard_frame_pointer_rtx, -callee_offset,
+			ip1_rtx, callee_adjust == 0);
   else
-    aarch64_add_sp (IP1_REGNUM, final_adjust,
+    aarch64_add_sp (ip1_rtx, final_adjust,
		    /* A stack clash protection prologue may not have
		       left IP1_REGNUM in a usable state.  */
		    (flag_stack_clash_protection
@@ -4000,7 +4017,7 @@ aarch64_expand_epilogue (bool for_sibcal
 
   /* A stack clash protection prologue may not have left IP0_REGNUM
      in a usable state.  */
-  aarch64_add_sp (IP0_REGNUM, initial_adjust,
+  aarch64_add_sp (ip0_rtx, initial_adjust,
		  (flag_stack_clash_protection
		   || df_regs_ever_live_p (IP0_REGNUM)));
 
@@ -4107,16 +4124,16 @@ aarch64_output_mi_thunk (FILE *file, tre
   reload_completed = 1;
   emit_note (NOTE_INSN_PROLOGUE_END);
 
+  this_rtx = gen_rtx_REG (Pmode, this_regno);
+  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
+  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
+
   if (vcall_offset == 0)
-    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
+    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, false);
   else
     {
       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
 
-      this_rtx = gen_rtx_REG (Pmode, this_regno);
-      temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
-      temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
-
       addr = this_rtx;
       if (delta != 0)
	{
@@ -4124,7 +4141,8 @@ aarch64_output_mi_thunk (FILE *file, tre
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
-	    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
+	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1,
+				false);
	}
 
       if (Pmode == ptr_mode)
Index: gcc/testsuite/gcc.target/aarch64/pr70044.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/pr70044.c	2017-10-30 10:45:23.806582919 +0000
+++ gcc/testsuite/gcc.target/aarch64/pr70044.c	2017-10-30 10:46:47.278836421 +0000
@@ -11,4 +11,4 @@ main (int argc, char **argv)
 }
 
 /* Check that the frame pointer really is created.  */
-/* { dg-final { scan-lto-assembler "add x29, sp," } } */
+/* { dg-final { scan-lto-assembler "(mov|add) x29, sp" } } */
```
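The new two-addition path in `aarch64_add_offset_1` is easiest to see with a concrete offset: for adjustments below 24 bits with no usable scratch register, the offset is split into its low 12 bits and a 4096-aligned remainder, each of which fits a single AArch64 add/sub immediate (a 12-bit value, optionally shifted left by 12). Below is an editor's standalone sketch of that arithmetic, not GCC code; `split_offset` is an illustrative name.

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the sub-24-bit case in aarch64_add_offset_1:
   split OFFSET into a low part (at most 12 bits) and a remainder that is
   a multiple of 4096, both carrying the sign of OFFSET.  Each part then
   fits one AArch64 add/sub immediate (uimm12, optionally LSL #12).  */
static void
split_offset (int64_t offset, int64_t *low, int64_t *high)
{
  int64_t moffset = offset < 0 ? -offset : offset;
  assert (moffset < 0x1000000);	/* The sub-24-bit case only.  */

  *low = moffset & 0xfff;
  if (offset < 0)
    *low = -*low;
  *high = offset - *low;	/* Multiple of 4096, at most 0xfff000.  */
}

int
main (void)
{
  int64_t low, high;
  split_offset (-0x123456, &low, &high);
  /* Prints low = -0x456, high = -0x123000, i.e. the adjustment becomes
     "sub sp, sp, #0x456" followed by "sub sp, sp, #0x123, lsl #12".  */
  printf ("low = %lld, high = %lld\n", (long long) low, (long long) high);
  return 0;
}
```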
On Mon, Oct 30, 2017 at 10:52:26AM +0000, Richard Sandiford wrote:
> Richard Sandiford <richard.sandiford@linaro.org> writes:
> > The port had aarch64_add_offset and aarch64_add_constant routines
> > that did similar things.  This patch replaces them with an expanded
> > version of aarch64_add_offset that takes separate source and
> > destination registers.  The new routine also takes a poly_int64 offset
> > instead of a HOST_WIDE_INT offset, but it leaves the HOST_WIDE_INT
> > case to aarch64_add_offset_1, which is basically a repurposed
> > aarch64_add_constant_internal.  The SVE patch will put the handling
> > of VL-based constants in aarch64_add_offset, while still using
> > aarch64_add_offset_1 for the constant part.
> >
> > The vcall_offset == 0 path in aarch64_output_mi_thunk will use temp0
> > as well as temp1 once SVE is added.
> >
> > A side-effect of the patch is that we now generate:
> >
> >     mov x29, sp
> >
> > instead of:
> >
> >     add x29, sp, 0
> >
> > in the pr70044.c test.
>
> Sorry, I stupidly rebased the patch just before posting and so
> introduced a last-minute bug.  Here's a fixed version that survives
> testing on aarch64-linux-gnu.
>
> 2017-10-30  Richard Sandiford  <richard.sandiford@linaro.org>
>             Alan Hayward  <alan.hayward@arm.com>
>             David Sherwood  <david.sherwood@arm.com>
>
> gcc/
>     * config/aarch64/aarch64.c (aarch64_force_temporary): Assert that
>     x exists before using it.
>     (aarch64_add_constant_internal): Rename to...
>     (aarch64_add_offset_1): ...this.  Replace regnum with separate
>     src and dest rtxes.  Handle the case in which they're different,
>     including when the offset is zero.  Replace scratchreg with an rtx.
>     Use 2 additions if there is no spare register into which we can
>     move a 16-bit constant.
>     (aarch64_add_constant): Delete.
>     (aarch64_add_offset): Replace reg with separate src and dest
>     rtxes.  Take a poly_int64 offset instead of a HOST_WIDE_INT.
>     Use aarch64_add_offset_1.
>     (aarch64_add_sp, aarch64_sub_sp): Take the scratch register as
>     an rtx rather than an int.  Take the delta as a poly_int64
>     rather than a HOST_WIDE_INT.  Use aarch64_add_offset.
>     (aarch64_expand_mov_immediate): Update uses of aarch64_add_offset.
>     (aarch64_allocate_and_probe_stack_space): Take the scratch register
>     as an rtx rather than an int.  Use Pmode rather than word_mode
>     in the loop code.  Update calls to aarch64_sub_sp.
>     (aarch64_expand_prologue): Update calls to aarch64_sub_sp,
>     aarch64_allocate_and_probe_stack_space and aarch64_add_offset.
>     (aarch64_expand_epilogue): Update calls to aarch64_add_offset
>     and aarch64_add_sp.
>     (aarch64_output_mi_thunk): Use aarch64_add_offset rather than
>     aarch64_add_constant.
>
> gcc/testsuite/
>     * gcc.target/aarch64/pr70044.c: Allow "mov x29, sp" too.

> @@ -1966,86 +1949,123 @@ aarch64_internal_mov_immediate (rtx dest
>    return num_insns;
>  }
>  
> -/* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
> -   temporary value if necessary.  FRAME_RELATED_P should be true if
> -   the RTX_FRAME_RELATED flag should be set and CFA adjustments added
> -   to the generated instructions.  If SCRATCHREG is known to hold
> -   abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
> -   immediate again.
> -
> -   Since this function may be used to adjust the stack pointer, we must
> -   ensure that it cannot cause transient stack deallocation (for example
> -   by first incrementing SP and then decrementing when adjusting by a
> -   large immediate).  */
> +/* A subroutine of aarch64_add_offset that handles the case in which
> +   OFFSET is known at compile time.  The arguments are otherwise the same.  */

Some of the restrictions listed in this comment are important to
keep here.

> -   Since this function may be used to adjust the stack pointer, we must
> -   ensure that it cannot cause transient stack deallocation (for example
> -   by first incrementing SP and then decrementing when adjusting by a
> -   large immediate).  */

This one in particular seems like we'd want it kept nearby the code.

OK with some sort of change to make the restrictions on what this code
should do clear on both functions.

Reviewed-by: James Greenhalgh <james.greenhalgh@arm.com>

Thanks,
James
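The restriction James highlights is the key safety property of this code: both steps of a split SP adjustment must move SP in the same direction, since a signal arriving between the two instructions could otherwise clobber stack that a momentarily-too-high SP no longer protects. The check below is an editor's standalone sketch (plain C, not GCC code) that mirrors the patch's `low_off` arithmetic and verifies the invariant over the whole sub-24-bit range:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Mirror the patch's low_off arithmetic and check, over the whole
   sub-24-bit range, that both parts of the split carry the sign of the
   total adjustment.  A two-step SP update made from same-sign parts
   moves monotonically toward its target, so it can never transiently
   deallocate live stack.  Illustrative only.  */
int
main (void)
{
  for (int64_t offset = -0xffffff; offset <= 0xffffff; offset++)
    {
      int64_t moffset = offset < 0 ? -offset : offset;
      int64_t low_off = moffset & 0xfff;
      low_off = offset < 0 ? -low_off : low_off;
      int64_t rest = offset - low_off;

      /* Neither step may move SP away from the target.  */
      assert (offset < 0 ? (low_off <= 0 && rest <= 0)
			 : (low_off >= 0 && rest >= 0));
      /* And each step fits a single add/sub immediate.  */
      assert ((low_off < 0 ? -low_off : low_off) <= 0xfff);
      assert ((rest & 0xfff) == 0
	      && (rest < 0 ? -rest : rest) <= 0xfff000);
    }
  printf ("two-step SP adjustments are monotonic\n");
  return 0;
}
```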
```diff
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	2017-10-27 14:10:17.740863052 +0100
+++ gcc/config/aarch64/aarch64.c	2017-10-27 14:11:14.425034427 +0100
@@ -1818,30 +1818,13 @@ aarch64_force_temporary (machine_mode mo
     return force_reg (mode, value);
   else
     {
-      x = aarch64_emit_move (x, value);
+      gcc_assert (x);
+      aarch64_emit_move (x, value);
       return x;
     }
 }
 
-static rtx
-aarch64_add_offset (scalar_int_mode mode, rtx temp, rtx reg,
-		    HOST_WIDE_INT offset)
-{
-  if (!aarch64_plus_immediate (GEN_INT (offset), mode))
-    {
-      rtx high;
-      /* Load the full offset into a register.  This
-         might be improvable in the future.  */
-      high = GEN_INT (offset);
-      offset = 0;
-      high = aarch64_force_temporary (mode, temp, high);
-      reg = aarch64_force_temporary (mode, temp,
-				     gen_rtx_PLUS (mode, high, reg));
-    }
-  return plus_constant (mode, reg, offset);
-}
-
 static int
 aarch64_internal_mov_immediate (rtx dest, rtx imm, bool generate,
 				scalar_int_mode mode)
@@ -1966,86 +1949,123 @@ aarch64_internal_mov_immediate (rtx dest
   return num_insns;
 }
 
-/* Add DELTA to REGNUM in mode MODE.  SCRATCHREG can be used to hold a
-   temporary value if necessary.  FRAME_RELATED_P should be true if
-   the RTX_FRAME_RELATED flag should be set and CFA adjustments added
-   to the generated instructions.  If SCRATCHREG is known to hold
-   abs (delta), EMIT_MOVE_IMM can be set to false to avoid emitting the
-   immediate again.
-
-   Since this function may be used to adjust the stack pointer, we must
-   ensure that it cannot cause transient stack deallocation (for example
-   by first incrementing SP and then decrementing when adjusting by a
-   large immediate).  */
+/* A subroutine of aarch64_add_offset that handles the case in which
+   OFFSET is known at compile time.  The arguments are otherwise the same.  */
 
 static void
-aarch64_add_constant_internal (scalar_int_mode mode, int regnum,
-			       int scratchreg, HOST_WIDE_INT delta,
-			       bool frame_related_p, bool emit_move_imm)
+aarch64_add_offset_1 (scalar_int_mode mode, rtx dest,
+		      rtx src, HOST_WIDE_INT offset, rtx temp1,
+		      bool frame_related_p, bool emit_move_imm)
 {
-  HOST_WIDE_INT mdelta = abs_hwi (delta);
-  rtx this_rtx = gen_rtx_REG (mode, regnum);
+  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
+  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
+
+  HOST_WIDE_INT moffset = abs_hwi (offset);
   rtx_insn *insn;
 
-  if (!mdelta)
-    return;
+  if (!moffset)
+    {
+      if (!rtx_equal_p (dest, src))
+	{
+	  insn = emit_insn (gen_rtx_SET (dest, src));
+	  RTX_FRAME_RELATED_P (insn) = frame_related_p;
+	}
+      return;
+    }
 
   /* Single instruction adjustment.  */
-  if (aarch64_uimm12_shift (mdelta))
+  if (aarch64_uimm12_shift (moffset))
     {
-      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta)));
+      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (offset)));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
       return;
     }
 
-  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits.
-     Only do this if mdelta is not a 16-bit move as adjusting using a move
-     is better.  */
-  if (mdelta < 0x1000000 && !aarch64_move_imm (mdelta, mode))
+  /* Emit 2 additions/subtractions if the adjustment is less than 24 bits
+     and either:
+
+     a) the offset cannot be loaded by a 16-bit move or
+     b) there is no spare register into which we can move it.  */
+  if (moffset < 0x1000000
+      && ((!temp1 && !can_create_pseudo_p ())
+	  || !aarch64_move_imm (moffset, mode)))
     {
-      HOST_WIDE_INT low_off = mdelta & 0xfff;
+      HOST_WIDE_INT low_off = moffset & 0xfff;
 
-      low_off = delta < 0 ? -low_off : low_off;
-      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (low_off)));
+      low_off = offset < 0 ? -low_off : low_off;
+      insn = emit_insn (gen_add3_insn (dest, src, GEN_INT (low_off)));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
-      insn = emit_insn (gen_add2_insn (this_rtx, GEN_INT (delta - low_off)));
+      insn = emit_insn (gen_add2_insn (dest, GEN_INT (offset - low_off)));
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
       return;
     }
 
   /* Emit a move immediate if required and an addition/subtraction.  */
-  rtx scratch_rtx = gen_rtx_REG (mode, scratchreg);
   if (emit_move_imm)
-    aarch64_internal_mov_immediate (scratch_rtx, GEN_INT (mdelta), true, mode);
-  insn = emit_insn (delta < 0 ? gen_sub2_insn (this_rtx, scratch_rtx)
-			      : gen_add2_insn (this_rtx, scratch_rtx));
+    {
+      gcc_assert (temp1 != NULL_RTX || can_create_pseudo_p ());
+      temp1 = aarch64_force_temporary (mode, temp1, GEN_INT (moffset));
+    }
+  insn = emit_insn (offset < 0
+		    ? gen_sub3_insn (dest, src, temp1)
+		    : gen_add3_insn (dest, src, temp1));
   if (frame_related_p)
     {
       RTX_FRAME_RELATED_P (insn) = frame_related_p;
-      rtx adj = plus_constant (mode, this_rtx, delta);
-      add_reg_note (insn , REG_CFA_ADJUST_CFA, gen_rtx_SET (this_rtx, adj));
+      rtx adj = plus_constant (mode, src, offset);
+      add_reg_note (insn, REG_CFA_ADJUST_CFA, gen_rtx_SET (dest, adj));
     }
 }
 
-static inline void
-aarch64_add_constant (scalar_int_mode mode, int regnum, int scratchreg,
-		      HOST_WIDE_INT delta)
-{
-  aarch64_add_constant_internal (mode, regnum, scratchreg, delta, false, true);
-}
+/* Set DEST to SRC + OFFSET.  MODE is the mode of the addition.
+   FRAME_RELATED_P is true if the RTX_FRAME_RELATED flag should
+   be set and CFA adjustments added to the generated instructions.
+
+   TEMP1, if nonnull, is a register of mode MODE that can be used as a
+   temporary if register allocation is already complete.  This temporary
+   register may overlap DEST but must not overlap SRC.  If TEMP1 is known
+   to hold abs (OFFSET), EMIT_MOVE_IMM can be set to false to avoid emitting
+   the immediate again.
+
+   Since this function may be used to adjust the stack pointer, we must
+   ensure that it cannot cause transient stack deallocation (for example
+   by first incrementing SP and then decrementing when adjusting by a
+   large immediate).  */
+
+static void
+aarch64_add_offset (scalar_int_mode mode, rtx dest, rtx src,
+		    poly_int64 offset, rtx temp1, bool frame_related_p,
+		    bool emit_move_imm = true)
+{
+  gcc_assert (emit_move_imm || temp1 != NULL_RTX);
+  gcc_assert (temp1 == NULL_RTX || !reg_overlap_mentioned_p (temp1, src));
+
+  /* SVE support will go here.  */
+  HOST_WIDE_INT constant = offset.to_constant ();
+  aarch64_add_offset_1 (mode, dest, src, constant, temp1,
+			frame_related_p, emit_move_imm);
+}
+
+/* Add DELTA to the stack pointer, marking the instructions frame-related.
+   TEMP1 is available as a temporary if nonnull.  EMIT_MOVE_IMM is false
+   if TEMP1 already contains abs (DELTA).  */
 
 static inline void
-aarch64_add_sp (int scratchreg, HOST_WIDE_INT delta, bool emit_move_imm)
+aarch64_add_sp (rtx temp1, poly_int64 delta, bool emit_move_imm)
 {
-  aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, delta,
-				 true, emit_move_imm);
+  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, delta,
+		      temp1, true, emit_move_imm);
 }
 
+/* Subtract DELTA fom the stack pointer, marking the instructions
+   frame-related if FRAME_RELATED_P.  TEMP1 is available as a temporary
+   if nonnull.  */
+
 static inline void
-aarch64_sub_sp (int scratchreg, HOST_WIDE_INT delta, bool frame_related_p)
+aarch64_sub_sp (rtx temp1, poly_int64 delta, bool frame_related_p)
 {
-  aarch64_add_constant_internal (Pmode, SP_REGNUM, scratchreg, -delta,
-				 frame_related_p, true);
+  aarch64_add_offset (Pmode, stack_pointer_rtx, stack_pointer_rtx, -delta,
+		      temp1, frame_related_p);
 }
 
 void
@@ -2078,9 +2098,8 @@ aarch64_expand_mov_immediate (rtx dest,
 	    {
 	      gcc_assert (can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
-	      base = aarch64_add_offset (int_mode, NULL, base,
-					 INTVAL (offset));
-	      aarch64_emit_move (dest, base);
+	      aarch64_add_offset (int_mode, dest, base, INTVAL (offset),
+				  NULL_RTX, false);
 	      return;
 	    }
 
@@ -2119,9 +2138,8 @@ aarch64_expand_mov_immediate (rtx dest,
 	      gcc_assert(can_create_pseudo_p ());
 	      base = aarch64_force_temporary (int_mode, dest, base);
-	      base = aarch64_add_offset (int_mode, NULL, base,
-					 INTVAL (offset));
-	      aarch64_emit_move (dest, base);
+	      aarch64_add_offset (int_mode, dest, base, INTVAL (offset),
+				  NULL_RTX, false);
 	      return;
 	    }
 	  /* FALLTHRU */
@@ -3613,11 +3631,10 @@ aarch64_set_handled_components (sbitmap
       cfun->machine->reg_is_wrapped_separately[regno] = true;
 }
 
-/* Allocate SIZE bytes of stack space using SCRATCH_REG as a scratch
-   register.  */
+/* Allocate SIZE bytes of stack space using TEMP1 as a scratch register.  */
 
 static void
-aarch64_allocate_and_probe_stack_space (int scratchreg, HOST_WIDE_INT size)
+aarch64_allocate_and_probe_stack_space (rtx temp1, HOST_WIDE_INT size)
 {
   HOST_WIDE_INT probe_interval
     = 1 << PARAM_VALUE (PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL);
@@ -3631,7 +3648,7 @@ aarch64_allocate_and_probe_stack_space (
      We can allocate GUARD_SIZE - GUARD_USED_BY_CALLER as a single chunk
     without any probing.  */
   gcc_assert (size >= guard_size - guard_used_by_caller);
-  aarch64_sub_sp (scratchreg, guard_size - guard_used_by_caller, true);
+  aarch64_sub_sp (temp1, guard_size - guard_used_by_caller, true);
   HOST_WIDE_INT orig_size = size;
   size -= (guard_size - guard_used_by_caller);
@@ -3643,17 +3660,16 @@ aarch64_allocate_and_probe_stack_space (
   if (rounded_size && rounded_size <= 4 * probe_interval)
     {
       /* We don't use aarch64_sub_sp here because we don't want to
-	 repeatedly load SCRATCHREG.  */
-      rtx scratch_rtx = gen_rtx_REG (Pmode, scratchreg);
+	 repeatedly load TEMP1.  */
       if (probe_interval > ARITH_FACTOR)
-	emit_move_insn (scratch_rtx, GEN_INT (-probe_interval));
+	emit_move_insn (temp1, GEN_INT (-probe_interval));
       else
-	scratch_rtx = GEN_INT (-probe_interval);
+	temp1 = GEN_INT (-probe_interval);
 
       for (HOST_WIDE_INT i = 0; i < rounded_size; i += probe_interval)
	{
	  rtx_insn *insn = emit_insn (gen_add2_insn (stack_pointer_rtx,
-						     scratch_rtx));
+						     temp1));
	  add_reg_note (insn, REG_STACK_CHECK, const0_rtx);
 
	  if (probe_interval > ARITH_FACTOR)
@@ -3674,10 +3690,10 @@ aarch64_allocate_and_probe_stack_space (
   else if (rounded_size)
     {
       /* Compute the ending address.  */
-      rtx temp = gen_rtx_REG (word_mode, scratchreg);
-      emit_move_insn (temp, GEN_INT (-rounded_size));
+      unsigned int scratchreg = REGNO (temp1);
+      emit_move_insn (temp1, GEN_INT (-rounded_size));
       rtx_insn *insn
-	= emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
+	= emit_insn (gen_add3_insn (temp1, stack_pointer_rtx, temp1));
 
       /* For the initial allocation, we don't have a frame pointer
	 set up, so we always need CFI notes.  If we're doing the
@@ -3692,7 +3708,7 @@ aarch64_allocate_and_probe_stack_space (
	  /* We want the CFA independent of the stack pointer for the
	     duration of the loop.  */
	  add_reg_note (insn, REG_CFA_DEF_CFA,
-			plus_constant (Pmode, temp,
+			plus_constant (Pmode, temp1,
				       (rounded_size + (orig_size - size))));
	  RTX_FRAME_RELATED_P (insn) = 1;
	}
@@ -3702,7 +3718,7 @@ aarch64_allocate_and_probe_stack_space (
	 It also probes at a 4k interval regardless of the value of
	 PARAM_STACK_CLASH_PROTECTION_PROBE_INTERVAL.  */
       insn = emit_insn (gen_probe_stack_range (stack_pointer_rtx,
-						stack_pointer_rtx, temp));
+						stack_pointer_rtx, temp1));
 
       /* Now reset the CFA register if needed.  */
       if (scratchreg == IP0_REGNUM || !frame_pointer_needed)
@@ -3723,7 +3739,7 @@ aarch64_allocate_and_probe_stack_space (
      Note that any residual must be probed.  */
   if (residual)
     {
-      aarch64_sub_sp (scratchreg, residual, true);
+      aarch64_sub_sp (temp1, residual, true);
       add_reg_note (get_last_insn (), REG_STACK_CHECK, const0_rtx);
       emit_stack_probe (plus_constant (Pmode, stack_pointer_rtx,
				       (residual - GET_MODE_SIZE (word_mode))));
@@ -3814,6 +3830,9 @@ aarch64_expand_prologue (void)
       aarch64_emit_probe_stack_range (get_stack_check_protect (), frame_size);
     }
 
+  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
+  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
+
   /* We do not fully protect aarch64 against stack clash style attacks
      as doing so would be prohibitively expensive with less utility over
      time as newer compilers are deployed.
@@ -3859,9 +3878,9 @@ aarch64_expand_prologue (void)
      outgoing args.  */
   if (flag_stack_clash_protection
       && initial_adjust >= guard_size - guard_used_by_caller)
-    aarch64_allocate_and_probe_stack_space (IP0_REGNUM, initial_adjust);
+    aarch64_allocate_and_probe_stack_space (ip0_rtx, initial_adjust);
   else
-    aarch64_sub_sp (IP0_REGNUM, initial_adjust, true);
+    aarch64_sub_sp (ip0_rtx, initial_adjust, true);
 
   if (callee_adjust != 0)
     aarch64_push_regs (reg1, reg2, callee_adjust);
@@ -3871,9 +3890,8 @@ aarch64_expand_prologue (void)
       if (callee_adjust == 0)
	aarch64_save_callee_saves (DImode, callee_offset, R29_REGNUM,
				   R30_REGNUM, false);
-      insn = emit_insn (gen_add3_insn (hard_frame_pointer_rtx,
-				       stack_pointer_rtx,
-				       GEN_INT (callee_offset)));
+      aarch64_add_offset (Pmode, hard_frame_pointer_rtx,
+			  stack_pointer_rtx, callee_offset, ip1_rtx, true);
       RTX_FRAME_RELATED_P (insn) = frame_pointer_needed;
       emit_insn (gen_stack_tie (stack_pointer_rtx, hard_frame_pointer_rtx));
     }
 
@@ -3890,9 +3908,9 @@ aarch64_expand_prologue (void)
	     less the amount of the guard reserved for use by the caller's
	     outgoing args.  */
	  if (final_adjust >= guard_size - guard_used_by_caller)
-	    aarch64_allocate_and_probe_stack_space (IP1_REGNUM, final_adjust);
+	    aarch64_allocate_and_probe_stack_space (ip1_rtx, final_adjust);
	  else
-	    aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+	    aarch64_sub_sp (ip1_rtx, final_adjust, !frame_pointer_needed);
 
	  /* We must also probe if the final adjustment is larger than the
	     guard that is assumed used by the caller.  This may be
	     sub-optimal.  */
@@ -3905,7 +3923,7 @@ aarch64_expand_prologue (void)
	    }
	}
       else
-	aarch64_sub_sp (IP1_REGNUM, final_adjust, !frame_pointer_needed);
+	aarch64_sub_sp (ip1_rtx, final_adjust, !frame_pointer_needed);
 }
 
 /* Return TRUE if we can use a simple_return insn.
@@ -3961,17 +3979,16 @@ aarch64_expand_epilogue (bool for_sibcal
   /* Restore the stack pointer from the frame pointer if it may not
      be the same as the stack pointer.  */
+  rtx ip0_rtx = gen_rtx_REG (Pmode, IP0_REGNUM);
+  rtx ip1_rtx = gen_rtx_REG (Pmode, IP1_REGNUM);
   if (frame_pointer_needed && (final_adjust || cfun->calls_alloca))
-    {
-      insn = emit_insn (gen_add3_insn (stack_pointer_rtx,
-				       hard_frame_pointer_rtx,
-				       GEN_INT (-callee_offset)));
-      /* If writeback is used when restoring callee-saves, the CFA
-	 is restored on the instruction doing the writeback.  */
-      RTX_FRAME_RELATED_P (insn) = callee_adjust == 0;
-    }
+    /* If writeback is used when restoring callee-saves, the CFA
+       is restored on the instruction doing the writeback.  */
+    aarch64_add_offset (Pmode, stack_pointer_rtx,
+			hard_frame_pointer_rtx, -callee_offset,
+			ip1_rtx, callee_adjust == 0);
   else
-    aarch64_add_sp (IP1_REGNUM, final_adjust,
+    aarch64_add_sp (ip1_rtx, final_adjust,
		    /* A stack clash protection prologue may not have
		       left IP1_REGNUM in a usable state.  */
		    (flag_stack_clash_protection
@@ -4000,7 +4017,7 @@ aarch64_expand_epilogue (bool for_sibcal
 
   /* A stack clash protection prologue may not have left IP0_REGNUM
      in a usable state.  */
-  aarch64_add_sp (IP0_REGNUM, initial_adjust,
+  aarch64_add_sp (ip0_rtx, initial_adjust,
		  (flag_stack_clash_protection
		   || df_regs_ever_live_p (IP0_REGNUM)));
 
@@ -4107,16 +4124,16 @@ aarch64_output_mi_thunk (FILE *file, tre
   reload_completed = 1;
   emit_note (NOTE_INSN_PROLOGUE_END);
 
+  this_rtx = gen_rtx_REG (Pmode, this_regno);
+  temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
+  temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
+
   if (vcall_offset == 0)
-    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
+    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1, false);
   else
     {
       gcc_assert ((vcall_offset & (POINTER_BYTES - 1)) == 0);
 
-      this_rtx = gen_rtx_REG (Pmode, this_regno);
-      temp0 = gen_rtx_REG (Pmode, IP0_REGNUM);
-      temp1 = gen_rtx_REG (Pmode, IP1_REGNUM);
-
       addr = this_rtx;
       if (delta != 0)
	{
@@ -4124,7 +4141,8 @@ aarch64_output_mi_thunk (FILE *file, tre
	    addr = gen_rtx_PRE_MODIFY (Pmode, this_rtx,
				       plus_constant (Pmode, this_rtx, delta));
	  else
-	    aarch64_add_constant (Pmode, this_regno, IP1_REGNUM, delta);
+	    aarch64_add_offset (Pmode, this_rtx, this_rtx, delta, temp1,
+				false);
	}
 
       if (Pmode == ptr_mode)
Index: gcc/testsuite/gcc.target/aarch64/pr70044.c
===================================================================
--- gcc/testsuite/gcc.target/aarch64/pr70044.c	2017-10-27 14:06:52.606994276 +0100
+++ gcc/testsuite/gcc.target/aarch64/pr70044.c	2017-10-27 14:10:24.015116329 +0100
@@ -11,4 +11,4 @@ main (int argc, char **argv)
 }
 
 /* Check that the frame pointer really is created.  */
-/* { dg-final { scan-lto-assembler "add x29, sp," } } */
+/* { dg-final { scan-lto-assembler "(mov|add) x29, sp" } } */
```
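As a quick sanity check on the relaxed testsuite pattern: both the old `add x29, sp, 0` form and the new `mov x29, sp` form should match. Below is an editor's small standalone demo using POSIX extended regexps; note that dg-final's scan-lto-assembler actually uses Tcl regexps, but the alternation behaves the same for this pattern.

```c
#include <regex.h>
#include <stdio.h>

/* Check that the updated dg-final pattern accepts both prologue forms.  */
int
main (void)
{
  regex_t re;
  const char *lines[] = { "mov x29, sp", "add x29, sp, 0" };
  if (regcomp (&re, "(mov|add) x29, sp", REG_EXTENDED) != 0)
    return 1;
  for (int i = 0; i < 2; i++)
    printf ("%-16s -> %s\n", lines[i],
	    regexec (&re, lines[i], 0, NULL, 0) == 0 ? "match" : "no match");
  regfree (&re);
  return 0;
}
```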