diff mbox

aarch64: Add split-stack initial support

Message ID 1e4a225c-ec8b-0a80-80c6-5f6ac940dfcc@linaro.org
State New
Headers show

Commit Message

Adhemerval Zanella Aug. 18, 2016, 10:17 p.m. UTC
On 08/08/2016 07:58, Jiong Wang wrote:
> 

> Adhemerval Zanella writes:

> 

>>> diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c

>>> index e56398a..2cf239f 100644

>>> --- a/gcc/config/aarch64/aarch64.c

>>> +++ b/gcc/config/aarch64/aarch64.c

>>> @@ -3227,6 +3227,34 @@ aarch64_expand_prologue (void)

>>> +      emit_move_insn (x11, GEN_INT (hard_fp_offset));

>>> +      emit_insn (gen_add3_insn (x10, x29, x11));

>>> +      jump = gen_rtx_IF_THEN_ELSE (VOIDmode,

>>> +				   gen_rtx_GEU (VOIDmode, cc_reg,

>>> +						const0_rtx),

>>> +				   gen_rtx_LABEL_REF (VOIDmode, not_more),

>>> +				   pc_rtx);

>>> +      jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));

>>> +      JUMP_LABEL (jump) = not_more;

>>> +      LABEL_NUSES (not_more) += 1;

>>> +      emit_move_insn (x10, x28);

>>> +      emit_label (not_more);

>>> +    }

>>>  }

> 

> This part needs rebase, there are major changes in AArch64 prologue code

> recently.

> 


Right, I see that 'hard_fp_offset' is not defined locally anymore.

>>>  

>>>  /* Return TRUE if we can use a simple_return insn.

>>> @@ -3303,6 +3331,7 @@ aarch64_expand_epilogue (bool for_sibcall)

>>>        offset = offset - fp_offset;

>>>      }

>>>  

>>> +

> 

> Unncessary new line.

>


Ack.

 
>>> +

>>> +  /* Load __private_ss from TCB.  */

>>> +  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);

>>> +  emit_insn (gen_aarch64_load_tp_hard (ssvalue));

>>> +  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));

>>> +  emit_move_insn (ssvalue, mem);

>>> +

>>> +  temp = gen_rtx_REG (Pmode, R10_REGNUM);

>>> +

>>> +  /* Always emit two insns to calculate the requested stack, so the linker

>>> +     can edit them when adjusting size for calling non-split-stack code.  */

>>> +  ninsn = aarch64_internal_mov_immediate (temp, GEN_INT (-frame_size), true,

>>> +					  Pmode);

>>> +  gcc_assert (ninsn == 1 || ninsn == 2);

>>> +  if (ninsn == 1)

>>> +    emit_insn (gen_nop ());

> 

> there will be trouble to linker if the following add is scheduled before

> the nop?


In theory yes, although I haven't seen gcc splitting it.  What would be the
correct way to tie the nop so that it is always emitted right after the mov immediate?
 
>>> +

>>> +	# Set up for a call to the target function.

>>> +	#ldp	x29, x30, [x28, STACKFRAME_BASE]

>>> +	ldr	x30, [x28, STACKFRAME_BASE + 8]

>>> +	ldp	x0, x1, [x28, STACKFRAME_BASE + 16]

>>> +	ldp	x2, x3, [x28, STACKFRAME_BASE + 32]

>>> +	ldp	x4, x5, [x28, STACKFRAME_BASE + 48]

>>> +	ldp	x6, x7, [x28, STACKFRAME_BASE + 64]

>>> +	add	x9, x30, 8

>>> +	cmp	x30, x9

> 

> Can you explain why do we need this "cmp" before jumping to target

> function?


This is due to the function prologue addition for varargs handling:

      mov     x11, <required stack allocation>
      sub     sp, sp, <required stack allocation>
      add     x10, x29, x11
      b.cs    function:
      mov     x10, x28

If __morestack is called, it will use the 'b.cs' to set up the
correct vararg pointer.


Below is the latest iteration of the patch; however, I am now seeing an issue
similar to the one s390 hit when building libgo:

../../../gcc-git/libgo/go/syscall/socket_linux.go:90:1: error: flow control insn inside a basic block
(jump_insn 90 89 91 14 (set (pc)
        (if_then_else (geu (reg:CC 66 cc)
                (const_int 0 [0]))
            (label_ref 92)
            (pc))) ../../../gcc-git/libgo/go/syscall/socket_linux.go:90 -1
     (nil)
 -> 92)
../../../gcc-git/libgo/go/syscall/socket_linux.go:90:1: internal compiler error: in rtl_verify_bb_insns, at cfgrtl.c:2658
0xac35af _fatal_insn(char const*, rtx_def const*, char const*, int, char const*)

It shows up only with -O2, which I think is due to how the block is reorganized
internally and to the pseudo-return instruction inserted by split-stack.
I am still debugging the issue and how to properly fix it, so if you have any
advice I am open to suggestions.


----

-- 
2.1.4
diff mbox

Patch

diff --git a/gcc/common/config/aarch64/aarch64-common.c b/gcc/common/config/aarch64/aarch64-common.c
index 08e7959..01c3239 100644
--- a/gcc/common/config/aarch64/aarch64-common.c
+++ b/gcc/common/config/aarch64/aarch64-common.c
@@ -106,6 +106,21 @@  aarch64_handle_option (struct gcc_options *opts,
     }
 }
 
+/* Implement TARGET_SUPPORTS_SPLIT_STACK.  -fsplit-stack uses a TCB field
+   available in glibc 2.25; glibc also exports a symbol, __tcb_private_ss,
+   to signal that the field is allocated in the TCB.  This aims to prevent
+   binaries linked against a newer glibc from running on unsupported ones.  */
+
+static bool
+aarch64_supports_split_stack (bool report ATTRIBUTE_UNUSED,
+			      struct gcc_options *opts ATTRIBUTE_UNUSED)
+{
+  return true;
+}
+
+#undef TARGET_SUPPORTS_SPLIT_STACK
+#define TARGET_SUPPORTS_SPLIT_STACK aarch64_supports_split_stack
+
 struct gcc_targetm_common targetm_common = TARGETM_COMMON_INITIALIZER;
 
 /* An ISA extension in the co-processor and main instruction set space.  */
@@ -342,4 +357,3 @@  aarch64_rewrite_mcpu (int argc, const char **argv)
 }
 
 #undef AARCH64_CPU_NAME_LENGTH
-
diff --git a/gcc/config/aarch64/aarch64-linux.h b/gcc/config/aarch64/aarch64-linux.h
index 5fcaa59..ab3208b 100644
--- a/gcc/config/aarch64/aarch64-linux.h
+++ b/gcc/config/aarch64/aarch64-linux.h
@@ -80,8 +80,6 @@ 
     }						\
   while (0)
 
-#define TARGET_ASM_FILE_END file_end_indicate_exec_stack
-
 /* Uninitialized common symbols in non-PIE executables, even with
    strong definitions in dependent shared libraries, will resolve
    to COPY relocated symbol in the executable.  See PR65780.  */
diff --git a/gcc/config/aarch64/aarch64-protos.h b/gcc/config/aarch64/aarch64-protos.h
index 3cdd69b..82a4e11 100644
--- a/gcc/config/aarch64/aarch64-protos.h
+++ b/gcc/config/aarch64/aarch64-protos.h
@@ -377,6 +377,8 @@  void aarch64_err_no_fpadvsimd (machine_mode, const char *);
 void aarch64_expand_epilogue (bool);
 void aarch64_expand_mov_immediate (rtx, rtx);
 void aarch64_expand_prologue (void);
+void aarch64_expand_split_stack_prologue (void);
+void aarch64_split_stack_space_check (rtx, rtx);
 void aarch64_expand_vector_init (rtx, rtx);
 void aarch64_init_cumulative_args (CUMULATIVE_ARGS *, const_tree, rtx,
 				   const_tree, unsigned);
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 3e663eb..e92195c 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -3203,6 +3203,10 @@  aarch64_expand_prologue (void)
   unsigned reg1 = cfun->machine->frame.wb_candidate1;
   unsigned reg2 = cfun->machine->frame.wb_candidate2;
   rtx_insn *insn;
+  bool using_split_stack = (flag_split_stack
+			    && (lookup_attribute ("no_split_stack",
+						  DECL_ATTRIBUTES (cfun->decl))
+                               == NULL));
 
   if (flag_stack_usage_info)
     current_function_static_stack_size = frame_size;
@@ -3242,6 +3246,36 @@  aarch64_expand_prologue (void)
 			     callee_adjust != 0 || frame_pointer_needed);
   aarch64_add_constant (Pmode, SP_REGNUM, IP1_REGNUM, -final_adjust,
 			!frame_pointer_needed);
+
+  if (using_split_stack &&
+      (cfun->machine->frame.split_stack_arg_pointer != NULL_RTX))
+    {
+      /* Set up the argument pointer (x10) for -fsplit-stack code.  If
+	 __morestack was called, it will have left the arg pointer to the
+	 old stack in x28.  Otherwise, the argument pointer is the top
+	 of the current frame.  */
+      HOST_WIDE_INT hard_fp_offset = cfun->machine->frame.hard_fp_offset;
+      rtx x10 = gen_rtx_REG (Pmode, R10_REGNUM);
+      rtx x11 = gen_rtx_REG (Pmode, R11_REGNUM);
+      rtx x28 = gen_rtx_REG (Pmode, R28_REGNUM);
+      rtx x29 = gen_rtx_REG (Pmode, R29_REGNUM);
+      rtx not_more = gen_label_rtx ();
+      rtx cc_reg = gen_rtx_REG (CCmode, CC_REGNUM);
+      rtx jump;
+
+      emit_move_insn (x11, GEN_INT (hard_fp_offset));
+      emit_insn (gen_add3_insn (x10, x29, x11));
+      jump = gen_rtx_IF_THEN_ELSE (VOIDmode,
+				   gen_rtx_GEU (VOIDmode, cc_reg,
+						const0_rtx),
+				   gen_rtx_LABEL_REF (VOIDmode, not_more),
+				   pc_rtx);
+      jump = emit_jump_insn (gen_rtx_SET (pc_rtx, jump));
+      JUMP_LABEL (jump) = not_more;
+      LABEL_NUSES (not_more) += 1;
+      emit_move_insn (x10, x28);
+      emit_label (not_more);
+    }
 }
 
 /* Return TRUE if we can use a simple_return insn.
@@ -9641,7 +9675,7 @@  aarch64_expand_builtin_va_start (tree valist, rtx nextarg ATTRIBUTE_UNUSED)
   /* Emit code to initialize STACK, which points to the next varargs stack
      argument.  CUM->AAPCS_STACK_SIZE gives the number of stack words used
      by named arguments.  STACK is 8-byte aligned.  */
-  t = make_tree (TREE_TYPE (stack), virtual_incoming_args_rtx);
+  t = make_tree (TREE_TYPE (stack), crtl->args.internal_arg_pointer);
   if (cum->aapcs_stack_size > 0)
     t = fold_build_pointer_plus_hwi (t, cum->aapcs_stack_size * UNITS_PER_WORD);
   t = build2 (MODIFY_EXPR, TREE_TYPE (stack), stack, t);
@@ -14000,6 +14034,200 @@  aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
     }
 }
 
+/* -fsplit-stack support.  */
+
+/* A SYMBOL_REF for __morestack.  */
+static GTY(()) rtx morestack_ref;
+
+/* Emit the -fsplit-stack prologue, which goes before the regular function
+   prologue; it calls __morestack if the stack segment is too small.  */
+void
+aarch64_expand_split_stack_prologue (void)
+{
+  HOST_WIDE_INT frame_size, args_size;
+  rtx_code_label *ok_label = NULL;
+  rtx mem, ssvalue, compare, jump, insn, call_fusage = NULL_RTX;
+  rtx reg11, reg30, temp;
+  rtx new_cfa, cfi_ops = NULL;
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+  int ninsn;
+
+  gcc_assert (flag_split_stack && reload_completed);
+
+  /* A minimal stack frame is also created for the __morestack call.  */
+  frame_size = cfun->machine->frame.frame_size + 16;
+
+  /* Limit the total stack allocation to 2G so that its value can always be
+     materialized with at most two instructions (movn/movk).  The linker
+     might use this to add some extra space when split-stack code calls
+     non-split-stack functions.  */
+  if (frame_size > ((HOST_WIDE_INT) 1 << 31))
+    {
+      sorry ("Stack frame larger than 2G is not supported for -fsplit-stack");
+      return;
+    }
+
+  if (morestack_ref == NULL_RTX)
+    {
+      morestack_ref = gen_rtx_SYMBOL_REF (Pmode, "__morestack");
+      SYMBOL_REF_FLAGS (morestack_ref) |= (SYMBOL_FLAG_LOCAL
+					   | SYMBOL_FLAG_FUNCTION);
+    }
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  temp = gen_rtx_REG (Pmode, R10_REGNUM);
+
+  /* Always emit two insns to calculate the requested stack, so the linker
+     can edit them when adjusting size for calling non-split-stack code.  */
+  ninsn = aarch64_internal_mov_immediate (temp, GEN_INT (-frame_size), true,
+					  Pmode);
+  gcc_assert (ninsn == 1 || ninsn == 2);
+  if (ninsn == 1)
+    emit_insn (gen_nop ());
+  emit_insn (gen_add3_insn (temp, stack_pointer_rtx, temp));
+
+  /* Skip the __morestack call if the new SP is not below __private_ss.  */
+  ok_label = gen_label_rtx ();
+  compare = aarch64_gen_compare_reg (LT, temp, ssvalue);
+  temp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+			       gen_rtx_GEU (VOIDmode, compare, const0_rtx),
+			       gen_rtx_LABEL_REF (VOIDmode, ok_label),
+			       pc_rtx);
+  jump = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
+  JUMP_LABEL (jump) = ok_label;
+  /* Mark the jump as very likely to be taken: REG_BR_PROB holds the
+     probability of the branch being taken, i.e. of skipping __morestack.  */
+  add_int_reg_note (jump, REG_BR_PROB,
+		    REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100);
+
+  /* Call __morestack with a non-standard calling convention: x10 holds the
+     requested stack pointer and x11 the size of the arguments to be
+     copied.  */
+  args_size = crtl->args.size >= 0 ? crtl->args.size : 0;
+  reg11 = gen_rtx_REG (DImode, R11_REGNUM);
+  emit_move_insn (reg11, GEN_INT (args_size));
+  use_reg (&call_fusage, reg11);
+
+  /* Set up a minimal frame to call __morestack.  The SP is not saved to
+     x29 beforehand, so in __morestack x29 points to the called SP.  */
+  reg30 = gen_rtx_REG (Pmode, R30_REGNUM);
+  aarch64_pushwb_single_reg (Pmode, R30_REGNUM, 16);
+
+  insn = emit_call_insn (gen_call (gen_rtx_MEM (DImode, morestack_ref),
+				   const0_rtx, const0_rtx));
+  add_function_usage_to (insn, call_fusage);
+
+  cfi_ops = alloc_reg_note (REG_CFA_RESTORE, reg30, cfi_ops);
+  mem = plus_constant (Pmode, stack_pointer_rtx, 16);
+  cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, stack_pointer_rtx, cfi_ops);
+
+  mem = gen_rtx_POST_MODIFY (Pmode, stack_pointer_rtx, mem);
+  mem = gen_rtx_MEM (DImode, mem);
+  insn = emit_move_insn (reg30, mem);
+
+  new_cfa = stack_pointer_rtx;
+  new_cfa = plus_constant (Pmode, new_cfa, 16);
+  cfi_ops = alloc_reg_note (REG_CFA_DEF_CFA, new_cfa, cfi_ops);
+  REG_NOTES (insn) = cfi_ops;
+  RTX_FRAME_RELATED_P (insn) = 1;
+
+  emit_insn (gen_split_stack_return ());
+
+  /* __morestack will call us here.  */
+
+  emit_label (ok_label);
+  LABEL_NUSES (ok_label)++;
+}
+
+/* Implement TARGET_ASM_FILE_END; extra split-stack step if -fsplit-stack.  */
+static void
+aarch64_file_end (void)
+{
+  file_end_indicate_exec_stack ();
+
+  if (flag_split_stack)
+    file_end_indicate_split_stack ();
+}
+
+/* Implement TARGET_INTERNAL_ARG_POINTER; split-stack uses a copy of x10.  */
+static rtx
+aarch64_internal_arg_pointer (void)
+{
+  if (flag_split_stack
+     && (lookup_attribute ("no_split_stack", DECL_ATTRIBUTES (cfun->decl))
+         == NULL))
+    {
+      if (cfun->machine->frame.split_stack_arg_pointer == NULL_RTX)
+	{
+	  rtx pat;
+
+	  cfun->machine->frame.split_stack_arg_pointer = gen_reg_rtx (Pmode);
+	  REG_POINTER (cfun->machine->frame.split_stack_arg_pointer) = 1;
+
+	  /* Initialize the pseudo from x10 right after the note at the
+	     beginning of the function.  */
+	  pat = gen_rtx_SET (cfun->machine->frame.split_stack_arg_pointer,
+			     gen_rtx_REG (Pmode, R10_REGNUM));
+	  push_topmost_sequence ();
+	  emit_insn_after (pat, get_insns ());
+	  pop_topmost_sequence ();
+	}
+      return plus_constant (Pmode, cfun->machine->frame.split_stack_arg_pointer,
+			    FIRST_PARM_OFFSET (current_function_decl));
+    }
+  return virtual_incoming_args_rtx;
+}
+
+static void
+aarch64_live_on_entry (bitmap regs)
+{
+  if (flag_split_stack)
+    bitmap_set_bit (regs, R10_REGNUM);  /* x10: split-stack arg pointer.  */
+}
+
+/* Emit a -fsplit-stack check: jump to LABEL if SIZE bytes are available.  */
+
+void
+aarch64_split_stack_space_check (rtx size, rtx label)
+{
+  rtx mem, ssvalue, compare, jump, temp;
+  rtx requested = gen_reg_rtx (Pmode);
+  /* Offset from thread pointer to __private_ss.  */
+  int psso = 0x10;
+
+  /* Load __private_ss from TCB.  */
+  ssvalue = gen_rtx_REG (Pmode, R9_REGNUM);
+  emit_insn (gen_aarch64_load_tp_hard (ssvalue));
+  mem = gen_rtx_MEM (Pmode, plus_constant (Pmode, ssvalue, psso));
+  emit_move_insn (ssvalue, mem);
+
+  /* Compute the requested stack pointer: SP minus the required size.  */
+  if (CONST_INT_P (size))
+     emit_insn (gen_add3_insn (requested, stack_pointer_rtx,
+			       GEN_INT (-INTVAL (size))));
+  else
+    {
+      size = force_reg (Pmode, size);
+      emit_move_insn (requested, gen_rtx_MINUS (Pmode, stack_pointer_rtx,
+						size));
+    }
+
+  /* Jump to LABEL if the requested SP is not below __private_ss.  */
+  compare = aarch64_gen_compare_reg (LT, requested, ssvalue);
+  temp = gen_rtx_IF_THEN_ELSE (VOIDmode,
+			       gen_rtx_GEU (VOIDmode, compare, const0_rtx),
+			       gen_rtx_LABEL_REF (VOIDmode, label),
+			       pc_rtx);
+  jump = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
+  JUMP_LABEL (jump) = label;
+}
+
 #undef TARGET_ADDRESS_COST
 #define TARGET_ADDRESS_COST aarch64_address_cost
 
@@ -14026,6 +14254,9 @@  aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 #undef TARGET_ASM_FILE_START
 #define TARGET_ASM_FILE_START aarch64_start_file
 
+#undef TARGET_ASM_FILE_END
+#define TARGET_ASM_FILE_END aarch64_file_end
+
 #undef TARGET_ASM_OUTPUT_MI_THUNK
 #define TARGET_ASM_OUTPUT_MI_THUNK aarch64_output_mi_thunk
 
@@ -14108,6 +14339,12 @@  aarch64_optab_supported_p (int op, machine_mode mode1, machine_mode,
 #undef TARGET_FRAME_POINTER_REQUIRED
 #define TARGET_FRAME_POINTER_REQUIRED aarch64_frame_pointer_required
 
+#undef TARGET_EXTRA_LIVE_ON_ENTRY
+#define TARGET_EXTRA_LIVE_ON_ENTRY aarch64_live_on_entry
+
+#undef TARGET_INTERNAL_ARG_POINTER
+#define TARGET_INTERNAL_ARG_POINTER aarch64_internal_arg_pointer
+
 #undef TARGET_GIMPLE_FOLD_BUILTIN
 #define TARGET_GIMPLE_FOLD_BUILTIN aarch64_gimple_fold_builtin
 
diff --git a/gcc/config/aarch64/aarch64.h b/gcc/config/aarch64/aarch64.h
index 19caf9f..2b47327 100644
--- a/gcc/config/aarch64/aarch64.h
+++ b/gcc/config/aarch64/aarch64.h
@@ -587,6 +587,9 @@  struct GTY (()) aarch64_frame
   unsigned wb_candidate2;
 
   bool laid_out;
+
+  /* Alternative internal arg pointer for -fsplit-stack.  */
+  rtx split_stack_arg_pointer;
 };
 
 typedef struct GTY (()) machine_function
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c95258b..6eb7cd4 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -130,6 +130,7 @@ 
     UNSPEC_VSTRUCTDUMMY
     UNSPEC_SP_SET
     UNSPEC_SP_TEST
+    UNSPEC_STACK_CHECK
     UNSPEC_RSQRT
     UNSPEC_RSQRTE
     UNSPEC_RSQRTS
@@ -144,6 +145,7 @@ 
     UNSPECV_SET_FPSR		; Represent assign of FPSR content.
     UNSPECV_BLOCKAGE		; Represent a blockage
     UNSPECV_PROBE_STACK_RANGE	; Represent stack range probing.
+    UNSPECV_SPLIT_STACK_RETURN  ; Represent a camouflaged return
   ]
 )
 
@@ -5381,3 +5383,33 @@ 
 
 ;; ldp/stp peephole patterns
 (include "aarch64-ldpstp.md")
+
+;; -fsplit-stack prologue: done by aarch64_expand_split_stack_prologue.
+(define_expand "split_stack_prologue"
+  [(const_int 0)]
+  ""
+{
+  aarch64_expand_split_stack_prologue ();
+  DONE;
+})
+
+;; A "ret" wrapped in an unspec_volatile, which the middle-end does not see.
+(define_insn "split_stack_return"
+  [(unspec_volatile [(const_int 0)] UNSPECV_SPLIT_STACK_RETURN)]
+  ""
+  "ret"
+  [(set_attr "type" "branch")])
+
+;; Jump to operand 1 if operand 0 bytes are available on the stack.  The
+;; C body emits the whole sequence and ends with DONE.
+(define_expand "split_stack_space_check"
+  [(set (match_dup 2) (compare:CC (match_dup 3) (match_dup 2)))
+   (set (pc) (if_then_else
+	      (geu (match_dup 4) (const_int 0))
+	      (label_ref (match_operand 1))
+	      (pc)))]
+  ""
+{
+  aarch64_split_stack_space_check (operands[0], operands[1]);
+  DONE;
+})
diff --git a/gcc/testsuite/gcc.dg/split-3.c b/gcc/testsuite/gcc.dg/split-3.c
index 64bbb8c..5ba7616 100644
--- a/gcc/testsuite/gcc.dg/split-3.c
+++ b/gcc/testsuite/gcc.dg/split-3.c
@@ -40,6 +40,7 @@  down (int i, ...)
       || va_arg (ap, int) != 9
       || va_arg (ap, int) != 10)
     abort ();
+  va_end (ap);
 
   if (i > 0)
     {
diff --git a/gcc/testsuite/gcc.dg/split-6.c b/gcc/testsuite/gcc.dg/split-6.c
index b32cf8d..b3016ba 100644
--- a/gcc/testsuite/gcc.dg/split-6.c
+++ b/gcc/testsuite/gcc.dg/split-6.c
@@ -37,6 +37,7 @@  down (int i, ...)
       || va_arg (ap, int) != 9
       || va_arg (ap, int) != 10)
     abort ();
+  va_end (ap);
 
   if (i > 0)
     {
diff --git a/libgcc/config.host b/libgcc/config.host
index 4ccf25d..18f49f1 100644
--- a/libgcc/config.host
+++ b/libgcc/config.host
@@ -336,6 +336,7 @@  aarch64*-*-linux*)
 	md_unwind_header=aarch64/linux-unwind.h
 	tmake_file="${tmake_file} ${cpu_type}/t-aarch64"
 	tmake_file="${tmake_file} ${cpu_type}/t-softfp t-softfp t-crtfm"
+	tmake_file="${tmake_file} t-stack aarch64/t-stack-aarch64"
 	;;
 alpha*-*-linux*)
 	tmake_file="${tmake_file} alpha/t-alpha alpha/t-ieee t-crtfm alpha/t-linux"
diff --git a/libgcc/config/aarch64/morestack-c.c b/libgcc/config/aarch64/morestack-c.c
new file mode 100644
index 0000000..8df7895
--- /dev/null
+++ b/libgcc/config/aarch64/morestack-c.c
@@ -0,0 +1,95 @@ 
+/* AArch64 support for -fsplit-stack.
+ * Copyright (C) 2016 Free Software Foundation, Inc.
+ *
+ * This file is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 3, or (at your option) any
+ * later version.
+ *
+ * This file is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Under Section 7 of GPL version 3, you are granted additional
+ * permissions described in the GCC Runtime Library Exception, version
+ * 3.1, as published by the Free Software Foundation.
+ *
+ * You should have received a copy of the GNU General Public License and
+ * a copy of the GCC Runtime Library Exception along with this program;
+ * see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+ * <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef inhibit_libc
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include "generic-morestack.h"
+
+/* This is based on the GLIBC definition (version 2.24).  There is no need
+   to keep it in sync, since new fields are added at the end of the structure
+   and do not change the '__private_ss' layout.  */
+typedef struct
+{
+  void *dtv;
+  void *private;
+  void *__private_ss;
+} tcbhead_t;
+
+#define INITIAL_STACK_SIZE  0x4000
+#define BACKOFF             0x1000
+
+void __generic_morestack_set_initial_sp (void *sp, size_t len);
+void *__morestack_get_guard (void);
+void __morestack_set_guard (void *);
+void *__morestack_make_guard (void *stack, size_t size);
+void __morestack_load_mmap (void);
+
+/* We declare it as weak so that it fails either at stack linking or
+   at runtime if GLIBC does not have the required TCB field.  */
+extern void __tcb_private_ss (void) __attribute__ ((weak));
+
+/* Initialize the stack guard when the program starts or when a new thread
+   starts.  This is called from a constructor in the .ctors section.  */
+void
+__stack_split_initialize (void)
+{
+  /* Weak symbol: a glibc without the TCB field makes this call fail.  */
+  __tcb_private_ss ();
+  register void* sp __asm__ ("sp");
+  tcbhead_t *tcb = ((tcbhead_t *) __builtin_thread_pointer ());
+  tcb->__private_ss = (void*)((uintptr_t)sp - INITIAL_STACK_SIZE);
+  __generic_morestack_set_initial_sp (sp, INITIAL_STACK_SIZE);
+}
+
+/* Read the split-stack guard (__private_ss) through the thread pointer.  */
+void *
+__morestack_get_guard (void)
+{
+  const tcbhead_t *head = __builtin_thread_pointer ();
+  return head->__private_ss;
+}
+
+/* Store PTR as the split-stack guard (__private_ss) via the thread pointer.  */
+void
+__morestack_set_guard (void *ptr)
+{
+  tcbhead_t *head = __builtin_thread_pointer ();
+  head->__private_ss = ptr;
+}
+
+/* Return the stack guard for STACK and SIZE: STACK - SIZE + BACKOFF.  */
+void *
+__morestack_make_guard (void *stack, size_t size)
+{
+  return (void *) ((uintptr_t) stack + BACKOFF - size);
+}
+
+/* Make __stack_split_initialize a high priority constructor.  */
+static void (*const ctors []) 
+  __attribute__ ((used, section (".ctors.65535"), aligned (sizeof (void *))))
+  = { __stack_split_initialize, __morestack_load_mmap };
+
+#endif /* !defined (inhibit_libc) */
diff --git a/libgcc/config/aarch64/morestack.S b/libgcc/config/aarch64/morestack.S
new file mode 100644
index 0000000..5bbac4c
--- /dev/null
+++ b/libgcc/config/aarch64/morestack.S
@@ -0,0 +1,269 @@ 
+# AArch64 support for -fsplit-stack.
+# Copyright (C) 2016 Free Software Foundation, Inc.
+
+# This file is part of GCC.
+
+# GCC is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free
+# Software Foundation; either version 3, or (at your option) any later
+# version.
+
+# GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+# WARRANTY; without even the implied warranty of MERCHANTABILITY or
+# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+# for more details.
+
+# Under Section 7 of GPL version 3, you are granted additional
+# permissions described in the GCC Runtime Library Exception, version
+# 3.1, as published by the Free Software Foundation.
+
+# You should have received a copy of the GNU General Public License and
+# a copy of the GCC Runtime Library Exception along with this program;
+# see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+# <http://www.gnu.org/licenses/>.
+
+/* Define an entry point visible from C.  */
+#define ENTRY(name)						\
+  .globl name;							\
+  .type name,%function;						\
+  .align 4;							\
+  name##:
+
+#define END(name)						\
+  .size name,.-name
+
+
+#define MORESTACK_FRAMESIZE	112
+/* Offset based on function stack to get its argument from __morestack
+   frame.  */
+#define STACKFRAME_BASE		(-MORESTACK_FRAMESIZE - 16)
+/* Offset from __morestack frame where the new stack size is saved and
+   passed to __generic_morestack.  */
+#define NEWSTACK_SAVE		88
+/* Offset from __morestack frame where the arguments size is saved and
+   passed to __generic_morestack.  */
+#define ARGS_SIZE_SAVE		80
+
+#define BACKOFF			0x2000
+# Large excess allocated when calling non-split-stack code.
+#define NON_SPLIT_STACK		0x100000
+
+# TCB offset of __private_ss
+#define TCB_PRIVATE_SS		#16
+
+	.text
+ENTRY(__morestack_non_split)
+	.cfi_startproc
+# Variant entry point: bump x11 by NON_SPLIT_STACK, then run __morestack.
+# NOTE(review): x11 is documented as the args copy size -- confirm register.
+	add	x11, x11, NON_SPLIT_STACK
+	.cfi_endproc
+END(__morestack_non_split)
+# Fall through into __morestack
+
+# This function is called with non-standard calling conventions.  On entry
+# x10 is the requested stack pointer.  The split-stack prologue is in the
+# form:
+#
+#	mrs    x9, tpidr_el0
+#	mov    x10, -<required stack allocation>
+#	add    x10, sp, x10
+#	ldr    x9, [x9, 16]
+#	cmp    x10, x9
+#	b.cs   enough
+#	str    x30, [sp, -16]!
+#	mov    x11, <required arguments copy size>
+#	bl     __morestack
+#	ldr    x30, [sp], 16
+#	ret
+# enough:
+#
+# The normal function prologue follows here, with a small addition at the
+# end to set up the argument pointer.  The argument pointer is set up with:
+#
+#	mov     x11, <required stack allocation>
+#	sub	sp, sp, <required stack allocation>
+#	add	x10, x29, x11
+#	b.cs    function
+#	mov     x10, x28
+# function:
+#
+# Note that the argument registers (x0-x7) are saved and restored around the
+# runtime calls.  Before calling back into the function, the carry flag is
+# cleared (cmp x30, x9 with x9 = x30 + 8), so the prologue's b.cs is not
+# taken and the argument pointer x10 is loaded from x28.
+
+ENTRY(__morestack)
+.LFB1:
+	.cfi_startproc
+
+#ifdef __PIC__
+	.cfi_personality 0x9b,DW.ref.__gcc_personality_v0
+	.cfi_lsda 0x1b,.LLSDA1
+#else
+	.cfi_personality 0x3,__gcc_personality_v0
+	.cfi_lsda 0x3,.LLSDA1
+#endif
+
+	# Calculate requested stack size.
+	sub	x12, sp, x10
+	# Save parameters
+	stp	x29, x30, [sp, -MORESTACK_FRAMESIZE]!
+	.cfi_def_cfa_offset MORESTACK_FRAMESIZE
+	.cfi_offset 29, -MORESTACK_FRAMESIZE
+	.cfi_offset 30, -MORESTACK_FRAMESIZE+8
+	add	x29, sp, 0
+	.cfi_def_cfa_register 29
+	# Adjust the requested stack size for the frame pointer save.
+	add	x12, x12, 16
+	stp	x0, x1, [sp, 16]
+	stp	x2, x3, [sp, 32]
+	add	x12, x12, BACKOFF
+	stp	x4, x5, [sp, 48]
+	stp	x6, x7, [sp, 64]
+	stp	x11, x12, [sp, 80]
+	str	x28, [sp, 96]
+
+	# Set up in x28 the function's initial frame pointer.  Its value will
+	# be copied to the function's argument pointer.
+	add	x28, sp, MORESTACK_FRAMESIZE + 16
+
+	# void __morestack_block_signals (void)
+	bl	__morestack_block_signals
+
+	# void *__generic_morestack (size_t *pframe_size,
+	#			     void *old_stack,
+	#			     size_t param_size)
+	# pframe_size: is the size of the required stack frame (the function
+	#	       amount of space remaining on the allocated stack).
+	# old_stack: points at the parameters on the old stack
+	# param_size: size in bytes of parameters to copy to the new stack.
+	add	x0, x28, STACKFRAME_BASE + NEWSTACK_SAVE
+	mov	x1, x28
+	ldr	x2, [sp, ARGS_SIZE_SAVE]
+	bl	__generic_morestack
+
+	# Start using new stack
+	stp	x29, x30, [x0, -16]!
+	mov	sp, x0
+
+	# Set __private_ss stack guard for the new stack.
+	ldr	x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, 16
+	sub	x0, x0, x9
+.LEHB0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	# void __morestack_unblock_signals (void)
+	bl	__morestack_unblock_signals
+
+	# Set up for a call to the target function.
+	#ldp	x29, x30, [x28, STACKFRAME_BASE]
+	ldr	x30, [x28, STACKFRAME_BASE + 8]
+	ldp	x0, x1, [x28, STACKFRAME_BASE + 16]
+	ldp	x2, x3, [x28, STACKFRAME_BASE + 32]
+	ldp	x4, x5, [x28, STACKFRAME_BASE + 48]
+	ldp	x6, x7, [x28, STACKFRAME_BASE + 64]
+	add	x9, x30, 8
+	cmp	x30, x9
+	blr	x9
+
+	stp	x0, x1, [x28, STACKFRAME_BASE + 16]
+	stp	x2, x3, [x28, STACKFRAME_BASE + 32]
+	stp	x4, x5, [x28, STACKFRAME_BASE + 48]
+	stp	x6, x7, [x28, STACKFRAME_BASE + 64]
+
+	bl	__morestack_block_signals
+
+	# void *__generic_releasestack (size_t *pavailable)
+	add	x0, x28, STACKFRAME_BASE + NEWSTACK_SAVE
+	bl	__generic_releasestack
+
+	# Reset __private_ss stack guard to value for old stack
+	ldr	x9, [x28, STACKFRAME_BASE + NEWSTACK_SAVE]
+	add	x0, x0, BACKOFF
+	sub	x0, x0, x9
+
+	# Update TCB split stack field
+.LEHE0:
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+
+	bl __morestack_unblock_signals
+
+	# Use old stack again.
+	sub	sp, x28, 16
+
+	ldp	x0, x1, [x28, STACKFRAME_BASE + 16]
+	ldp	x2, x3, [x28, STACKFRAME_BASE + 32]
+	ldp	x4, x5, [x28, STACKFRAME_BASE + 48]
+	ldp	x6, x7, [x28, STACKFRAME_BASE + 64]
+	ldp	x29, x30, [x28, STACKFRAME_BASE]
+	ldr	x28, [x28, STACKFRAME_BASE + 96]
+
+	.cfi_remember_state
+	.cfi_restore 30
+	.cfi_restore 29
+	.cfi_def_cfa 31, 0
+
+	ret
+
+# This is the cleanup code called by the stack unwinder when
+# unwinding through code between .LEHB0 and .LEHE0 above.
+cleanup:
+	.cfi_restore_state
+	str	x0, [x28, STACKFRAME_BASE]
+	# size_t __generic_findstack (void *stack)
+	mov	x0, x28
+	bl	__generic_findstack
+	sub	x0, x28, x0
+	add	x0, x0, BACKOFF
+	# Restore tcbhead_t.__private_ss
+	mrs	x1, tpidr_el0
+	str	x0, [x1, TCB_PRIVATE_SS]
+	ldr	x0, [x28, STACKFRAME_BASE]
+	b	_Unwind_Resume
+        .cfi_endproc
+END(__morestack)
+
+	.section .gcc_except_table,"a",@progbits
+	.align 4
+.LLSDA1:
+	# @LPStart format (omit)
+        .byte   0xff
+	# @TType format (omit)
+        .byte   0xff
+	# Call-site format (uleb128)
+        .byte   0x1
+	# Call-site table length
+        .uleb128 .LLSDACSE1-.LLSDACSB1
+.LLSDACSB1:
+	# region 0 start
+        .uleb128 .LEHB0-.LFB1
+	# length
+        .uleb128 .LEHE0-.LEHB0
+	# landing pad
+        .uleb128 cleanup-.LFB1
+	# no action (ie a cleanup)
+        .uleb128 0
+.LLSDACSE1:
+
+
+	.global __gcc_personality_v0
+#ifdef __PIC__
+	# Build a position independent reference to the personality function.
+	.hidden DW.ref.__gcc_personality_v0
+	.weak   DW.ref.__gcc_personality_v0
+	.section .data.DW.ref.__gcc_personality_v0,"awG",@progbits,DW.ref.__gcc_personality_v0,comdat
+	.type   DW.ref.__gcc_personality_v0, @object
+	.align 3
+DW.ref.__gcc_personality_v0:
+	.size   DW.ref.__gcc_personality_v0, 8
+	.quad   __gcc_personality_v0
+#endif
+
+	.section .note.GNU-stack,"",@progbits
+	.section .note.GNU-split-stack,"",@progbits
+	.section .note.GNU-no-split-stack,"",@progbits
diff --git a/libgcc/config/aarch64/t-stack-aarch64 b/libgcc/config/aarch64/t-stack-aarch64
new file mode 100644
index 0000000..4babb4e
--- /dev/null
+++ b/libgcc/config/aarch64/t-stack-aarch64
@@ -0,0 +1,3 @@ 
+# Makefile fragment to support -fsplit-stack for aarch64.
+LIB2ADD_ST += $(srcdir)/config/aarch64/morestack.S \
+	      $(srcdir)/config/aarch64/morestack-c.c
diff --git a/libgcc/generic-morestack.c b/libgcc/generic-morestack.c
index b8eec4e..fe7092b 100644
--- a/libgcc/generic-morestack.c
+++ b/libgcc/generic-morestack.c
@@ -943,6 +943,7 @@  __splitstack_find (void *segment_arg, void *sp, size_t *len,
       nsp -= 2 * 160;
 #elif defined __s390__
       nsp -= 2 * 96;
+#elif defined __aarch64__
 #else
 #error "unrecognized target"
 #endif