
[2/4,AArch64] SVE load/store_lanes support

Message ID 87shdox0q7.fsf@linaro.org
State New
Series Add SVE support for load/store_lanes

Commit Message

Richard Sandiford Nov. 8, 2017, 3:13 p.m. UTC
This patch adds support for SVE LD[234], ST[234] and associated
structure modes.  Unlike Advanced SIMD, these modes are extra-long
vector modes instead of integer modes.
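
A minimal C sketch (not part of the patch) of the kind of loop these
patterns target; the function and array names are illustrative only.
With the vec_load_lanes pattern available, building this at -O3
-march=armv8-a+sve should let the vectorizer implement the interleaved
load of src[2*i] and src[2*i+1] with a single LD2W that deinterleaves
into two consecutive Z registers, instead of falling back to permutes
or scalar code:

/* Two-way interleaved access: even elements go to re[], odd to im[].  */
void
split_complex (float *restrict re, float *restrict im,
               const float *restrict src, int n)
{
  for (int i = 0; i < n; ++i)
    {
      re[i] = src[2 * i];      /* even elements -> first subvector  */
      im[i] = src[2 * i + 1];  /* odd elements  -> second subvector */
    }
  /* The mirror-image loop (writing src[2*i] and src[2*i+1] from re[]
     and im[]) would use the vec_store_lanes pattern, i.e. ST2W.  */
}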


2017-11-08  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* config/aarch64/aarch64-modes.def: Define x2, x3 and x4 vector
	modes for SVE.
	* config/aarch64/aarch64-protos.h
	(aarch64_sve_struct_memory_operand_p): Declare.
	* config/aarch64/iterators.md (SVE_STRUCT): New mode iterator.
	(vector_count, insn_length, VSINGLE, vsingle): New mode attributes.
	(VPRED, vpred): Handle SVE structure modes.
	* config/aarch64/constraints.md (Utx): New constraint.
	* config/aarch64/predicates.md (aarch64_sve_struct_memory_operand)
	(aarch64_sve_struct_nonimmediate_operand): New predicates.
	* config/aarch64/aarch64.md (UNSPEC_LDN, UNSPEC_STN): New unspecs.
	* config/aarch64/aarch64-sve.md (mov<mode>, *aarch64_sve_mov<mode>_le)
	(*aarch64_sve_mov<mode>_be, pred_mov<mode>): New patterns for
	structure modes.  Split into pieces after RA.
	(vec_load_lanes<mode><vsingle>, vec_mask_load_lanes<mode><vsingle>)
	(vec_store_lanes<mode><vsingle>, vec_mask_store_lanes<mode><vsingle>):
	New patterns.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle
	SVE structure modes.
	(aarch64_classify_address): Likewise.
	(sizetochar): Move earlier in file.
	(aarch64_print_operand): Handle SVE register lists.
	(aarch64_array_mode): New function.
	(aarch64_sve_struct_memory_operand_p): Likewise.
	(TARGET_ARRAY_MODE): Redefine.

Comments

Richard Sandiford Jan. 6, 2018, 7:45 p.m. UTC | #1
Both a ping and a repost with the new VNx names.  See:

   https://gcc.gnu.org/ml/gcc-patches/2017-11/msg00592.html

for the full series.

Thanks,
Richard

---

This patch adds support for SVE LD[234], ST[234] and associated
structure modes.  Unlike Advanced SIMD, these modes are extra-long
vector modes instead of integer modes.

2017-11-06  Richard Sandiford  <richard.sandiford@linaro.org>
	    Alan Hayward  <alan.hayward@arm.com>
	    David Sherwood  <david.sherwood@arm.com>

gcc/
	* config/aarch64/aarch64-modes.def: Define x2, x3 and x4 vector
	modes for SVE.
	* config/aarch64/aarch64-protos.h
	(aarch64_sve_struct_memory_operand_p): Declare.
	* config/aarch64/iterators.md (SVE_STRUCT): New mode iterator.
	(vector_count, insn_length, VSINGLE, vsingle): New mode attributes.
	(VPRED, vpred): Handle SVE structure modes.
	* config/aarch64/constraints.md (Utx): New constraint.
	* config/aarch64/predicates.md (aarch64_sve_struct_memory_operand)
	(aarch64_sve_struct_nonimmediate_operand): New predicates.
	* config/aarch64/aarch64.md (UNSPEC_LDN, UNSPEC_STN): New unspecs.
	* config/aarch64/aarch64-sve.md (mov<mode>, *aarch64_sve_mov<mode>_le)
	(*aarch64_sve_mov<mode>_be, pred_mov<mode>): New patterns for
	structure modes.  Split into pieces after RA.
	(vec_load_lanes<mode><vsingle>, vec_mask_load_lanes<mode><vsingle>)
	(vec_store_lanes<mode><vsingle>, vec_mask_store_lanes<mode><vsingle>):
	New patterns.
	* config/aarch64/aarch64.c (aarch64_classify_vector_mode): Handle
	SVE structure modes.
	(aarch64_classify_address): Likewise.
	(sizetochar): Move earlier in file.
	(aarch64_print_operand): Handle SVE register lists.
	(aarch64_array_mode): New function.
	(aarch64_sve_struct_memory_operand_p): Likewise.
	(TARGET_ARRAY_MODE): Redefine.

Index: gcc/config/aarch64/aarch64-modes.def
===================================================================
--- gcc/config/aarch64/aarch64-modes.def	2017-12-22 16:00:58.471012631 +0000
+++ gcc/config/aarch64/aarch64-modes.def	2017-12-22 16:01:42.042358758 +0000
@@ -87,6 +87,9 @@ INT_MODE (XI, 64);
 /* Give SVE vectors the names normally used for 256-bit vectors.
    The actual number depends on command-line flags.  */
 SVE_MODES (1, VNx16, VNx8, VNx4, VNx2)
+SVE_MODES (2, VNx32, VNx16, VNx8, VNx4)
+SVE_MODES (3, VNx48, VNx24, VNx12, VNx6)
+SVE_MODES (4, VNx64, VNx32, VNx16, VNx8)
 
 /* Quad float: 128-bit floating mode for long doubles.  */
 FLOAT_MODE (TF, 16, ieee_quad_format);
Index: gcc/config/aarch64/aarch64-protos.h
===================================================================
--- gcc/config/aarch64/aarch64-protos.h	2017-12-22 16:00:58.471012631 +0000
+++ gcc/config/aarch64/aarch64-protos.h	2017-12-22 16:01:42.043358720 +0000
@@ -432,6 +432,7 @@ rtx aarch64_simd_gen_const_vector_dup (m
 bool aarch64_simd_mem_operand_p (rtx);
 bool aarch64_sve_ld1r_operand_p (rtx);
 bool aarch64_sve_ldr_operand_p (rtx);
+bool aarch64_sve_struct_memory_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	2017-12-22 16:00:58.477012402 +0000
+++ gcc/config/aarch64/iterators.md	2017-12-22 16:01:42.045358644 +0000
@@ -250,6 +250,14 @@ (define_mode_iterator VMUL_CHANGE_NLANES
 (define_mode_iterator SVE_ALL [VNx16QI VNx8HI VNx4SI VNx2DI
 			       VNx8HF VNx4SF VNx2DF])
 
+;; All SVE vector structure modes.
+(define_mode_iterator SVE_STRUCT [VNx32QI VNx16HI VNx8SI VNx4DI
+				  VNx16HF VNx8SF VNx4DF
+				  VNx48QI VNx24HI VNx12SI VNx6DI
+				  VNx24HF VNx12SF VNx6DF
+				  VNx64QI VNx32HI VNx16SI VNx8DI
+				  VNx32HF VNx16SF VNx8DF])
+
 ;; All SVE vector modes that have 8-bit or 16-bit elements.
 (define_mode_iterator SVE_BH [VNx16QI VNx8HI VNx8HF])
 
@@ -587,9 +595,16 @@ (define_mode_attr Vetype [(V8QI "b") (V1
 
 ;; Equivalent of "size" for a vector element.
 (define_mode_attr Vesize [(VNx16QI "b")
-			  (VNx8HI  "h") (VNx8HF "h")
-			  (VNx4SI  "w") (VNx4SF "w")
-			  (VNx2DI  "d") (VNx2DF "d")])
+			  (VNx8HI  "h") (VNx8HF  "h")
+			  (VNx4SI  "w") (VNx4SF  "w")
+			  (VNx2DI  "d") (VNx2DF  "d")
+			  (VNx32QI "b") (VNx48QI "b") (VNx64QI "b")
+			  (VNx16HI "h") (VNx24HI "h") (VNx32HI "h")
+			  (VNx16HF "h") (VNx24HF "h") (VNx32HF "h")
+			  (VNx8SI  "w") (VNx12SI "w") (VNx16SI "w")
+			  (VNx8SF  "w") (VNx12SF "w") (VNx16SF "w")
+			  (VNx4DI  "d") (VNx6DI  "d") (VNx8DI  "d")
+			  (VNx4DF  "d") (VNx6DF  "d") (VNx8DF  "d")])
 
 ;; Vetype is used everywhere in scheduling type and assembly output,
 ;; sometimes they are not the same, for example HF modes on some
@@ -957,17 +972,93 @@ (define_mode_attr insn_count [(OI "8") (
 ;; No need of iterator for -fPIC as it use got_lo12 for both modes.
 (define_mode_attr got_modifier [(SI "gotpage_lo14") (DI "gotpage_lo15")])
 
-;; The predicate mode associated with an SVE data mode.
+;; The number of subvectors in an SVE_STRUCT.
+(define_mode_attr vector_count [(VNx32QI "2") (VNx16HI "2")
+				(VNx8SI  "2") (VNx4DI  "2")
+				(VNx16HF "2") (VNx8SF  "2") (VNx4DF "2")
+				(VNx48QI "3") (VNx24HI "3")
+				(VNx12SI "3") (VNx6DI  "3")
+				(VNx24HF "3") (VNx12SF "3") (VNx6DF "3")
+				(VNx64QI "4") (VNx32HI "4")
+				(VNx16SI "4") (VNx8DI  "4")
+				(VNx32HF "4") (VNx16SF "4") (VNx8DF "4")])
+
+;; The number of instruction bytes needed for an SVE_STRUCT move.  This is
+;; equal to vector_count * 4.
+(define_mode_attr insn_length [(VNx32QI "8")  (VNx16HI "8")
+			       (VNx8SI  "8")  (VNx4DI  "8")
+			       (VNx16HF "8")  (VNx8SF  "8")  (VNx4DF "8")
+			       (VNx48QI "12") (VNx24HI "12")
+			       (VNx12SI "12") (VNx6DI  "12")
+			       (VNx24HF "12") (VNx12SF "12") (VNx6DF "12")
+			       (VNx64QI "16") (VNx32HI "16")
+			       (VNx16SI "16") (VNx8DI  "16")
+			       (VNx32HF "16") (VNx16SF "16") (VNx8DF "16")])
+
+;; The type of a subvector in an SVE_STRUCT.
+(define_mode_attr VSINGLE [(VNx32QI "VNx16QI")
+			   (VNx16HI "VNx8HI") (VNx16HF "VNx8HF")
+			   (VNx8SI "VNx4SI") (VNx8SF "VNx4SF")
+			   (VNx4DI "VNx2DI") (VNx4DF "VNx2DF")
+			   (VNx48QI "VNx16QI")
+			   (VNx24HI "VNx8HI") (VNx24HF "VNx8HF")
+			   (VNx12SI "VNx4SI") (VNx12SF "VNx4SF")
+			   (VNx6DI "VNx2DI") (VNx6DF "VNx2DF")
+			   (VNx64QI "VNx16QI")
+			   (VNx32HI "VNx8HI") (VNx32HF "VNx8HF")
+			   (VNx16SI "VNx4SI") (VNx16SF "VNx4SF")
+			   (VNx8DI "VNx2DI") (VNx8DF "VNx2DF")])
+
+;; ...and again in lower case.
+(define_mode_attr vsingle [(VNx32QI "vnx16qi")
+			   (VNx16HI "vnx8hi") (VNx16HF "vnx8hf")
+			   (VNx8SI "vnx4si") (VNx8SF "vnx4sf")
+			   (VNx4DI "vnx2di") (VNx4DF "vnx2df")
+			   (VNx48QI "vnx16qi")
+			   (VNx24HI "vnx8hi") (VNx24HF "vnx8hf")
+			   (VNx12SI "vnx4si") (VNx12SF "vnx4sf")
+			   (VNx6DI "vnx2di") (VNx6DF "vnx2df")
+			   (VNx64QI "vnx16qi")
+			   (VNx32HI "vnx8hi") (VNx32HF "vnx8hf")
+			   (VNx16SI "vnx4si") (VNx16SF "vnx4sf")
+			   (VNx8DI "vnx2di") (VNx8DF "vnx2df")])
+
+;; The predicate mode associated with an SVE data mode.  For structure modes
+;; this is equivalent to the <VPRED> of the subvector mode.
 (define_mode_attr VPRED [(VNx16QI "VNx16BI")
 			 (VNx8HI "VNx8BI") (VNx8HF "VNx8BI")
 			 (VNx4SI "VNx4BI") (VNx4SF "VNx4BI")
-			 (VNx2DI "VNx2BI") (VNx2DF "VNx2BI")])
+			 (VNx2DI "VNx2BI") (VNx2DF "VNx2BI")
+			 (VNx32QI "VNx16BI")
+			 (VNx16HI "VNx8BI") (VNx16HF "VNx8BI")
+			 (VNx8SI "VNx4BI") (VNx8SF "VNx4BI")
+			 (VNx4DI "VNx2BI") (VNx4DF "VNx2BI")
+			 (VNx48QI "VNx16BI")
+			 (VNx24HI "VNx8BI") (VNx24HF "VNx8BI")
+			 (VNx12SI "VNx4BI") (VNx12SF "VNx4BI")
+			 (VNx6DI "VNx2BI") (VNx6DF "VNx2BI")
+			 (VNx64QI "VNx16BI")
+			 (VNx32HI "VNx8BI") (VNx32HF "VNx8BI")
+			 (VNx16SI "VNx4BI") (VNx16SF "VNx4BI")
+			 (VNx8DI "VNx2BI") (VNx8DF "VNx2BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(VNx16QI "vnx16bi")
 			 (VNx8HI "vnx8bi") (VNx8HF "vnx8bi")
 			 (VNx4SI "vnx4bi") (VNx4SF "vnx4bi")
-			 (VNx2DI "vnx2bi") (VNx2DF "vnx2bi")])
+			 (VNx2DI "vnx2bi") (VNx2DF "vnx2bi")
+			 (VNx32QI "vnx16bi")
+			 (VNx16HI "vnx8bi") (VNx16HF "vnx8bi")
+			 (VNx8SI "vnx4bi") (VNx8SF "vnx4bi")
+			 (VNx4DI "vnx2bi") (VNx4DF "vnx2bi")
+			 (VNx48QI "vnx16bi")
+			 (VNx24HI "vnx8bi") (VNx24HF "vnx8bi")
+			 (VNx12SI "vnx4bi") (VNx12SF "vnx4bi")
+			 (VNx6DI "vnx2bi") (VNx6DF "vnx2bi")
+			 (VNx64QI "vnx16bi")
+			 (VNx32HI "vnx8bi") (VNx32HF "vnx8bi")
+			 (VNx16SI "vnx4bi") (VNx16SF "vnx4bi")
+			 (VNx8DI "vnx2bi") (VNx8DF "vnx2bi")])
 
 ;; -------------------------------------------------------------------
 ;; Code Iterators
Index: gcc/config/aarch64/constraints.md
===================================================================
--- gcc/config/aarch64/constraints.md	2017-12-22 16:00:58.476012440 +0000
+++ gcc/config/aarch64/constraints.md	2017-12-22 16:01:42.045358644 +0000
@@ -237,6 +237,12 @@ (define_memory_constraint "Uty"
   (and (match_code "mem")
        (match_test "aarch64_sve_ld1r_operand_p (op)")))
 
+(define_memory_constraint "Utx"
+  "@internal
+   An address valid for SVE structure mov patterns (as distinct from
+   LD[234] and ST[234] patterns)."
+  (match_operand 0 "aarch64_sve_struct_memory_operand"))
+
 (define_constraint "Ufc"
   "A floating point constant which can be used with an\
    FMOV immediate operation."
Index: gcc/config/aarch64/predicates.md
===================================================================
--- gcc/config/aarch64/predicates.md	2017-12-22 16:00:58.477012402 +0000
+++ gcc/config/aarch64/predicates.md	2017-12-22 16:01:42.045358644 +0000
@@ -482,6 +482,14 @@ (define_predicate "aarch64_sve_general_o
 	    (match_operand 0 "aarch64_sve_ldr_operand")
 	    (match_test "aarch64_mov_operand_p (op, mode)"))))
 
+(define_predicate "aarch64_sve_struct_memory_operand"
+  (and (match_code "mem")
+       (match_test "aarch64_sve_struct_memory_operand_p (op)")))
+
+(define_predicate "aarch64_sve_struct_nonimmediate_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "aarch64_sve_struct_memory_operand")))
+
 ;; Doesn't include immediates, since those are handled by the move
 ;; patterns instead.
 (define_predicate "aarch64_sve_dup_operand"
Index: gcc/config/aarch64/aarch64.md
===================================================================
--- gcc/config/aarch64/aarch64.md	2017-12-22 16:00:58.476012440 +0000
+++ gcc/config/aarch64/aarch64.md	2017-12-22 16:01:42.045358644 +0000
@@ -161,6 +161,8 @@ (define_c_enum "unspec" [
     UNSPEC_PACK
     UNSPEC_FLOAT_CONVERT
     UNSPEC_WHILE_LO
+    UNSPEC_LDN
+    UNSPEC_STN
 ])
 
 (define_c_enum "unspecv" [
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md	2017-12-22 16:00:58.471012631 +0000
+++ gcc/config/aarch64/aarch64-sve.md	2017-12-22 16:01:42.043358720 +0000
@@ -189,6 +189,105 @@ (define_insn "maskstore<mode><vpred>"
   "st1<Vesize>\t%1.<Vetype>, %2, %0"
 )
 
+;; SVE structure moves.
+(define_expand "mov<mode>"
+  [(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
+	(match_operand:SVE_STRUCT 1 "general_operand"))]
+  "TARGET_SVE"
+  {
+    /* Big-endian loads and stores need to be done via LD1 and ST1;
+       see the comment at the head of the file for details.  */
+    if ((MEM_P (operands[0]) || MEM_P (operands[1]))
+	&& BYTES_BIG_ENDIAN)
+      {
+	gcc_assert (can_create_pseudo_p ());
+	aarch64_expand_sve_mem_move (operands[0], operands[1], <VPRED>mode);
+	DONE;
+      }
+
+    if (CONSTANT_P (operands[1]))
+      {
+	aarch64_expand_mov_immediate (operands[0], operands[1]);
+	DONE;
+      }
+  }
+)
+
+;; Unpredicated structure moves (little-endian).
+(define_insn "*aarch64_sve_mov<mode>_le"
+  [(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
+	(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
+  "TARGET_SVE && !BYTES_BIG_ENDIAN"
+  "#"
+  [(set_attr "length" "<insn_length>")]
+)
+
+;; Unpredicated structure moves (big-endian).  Memory accesses require
+;; secondary reloads.
+(define_insn "*aarch64_sve_mov<mode>_le"
+  [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w, w")
+	(match_operand:SVE_STRUCT 1 "aarch64_nonmemory_operand" "w, Dn"))]
+  "TARGET_SVE && BYTES_BIG_ENDIAN"
+  "#"
+  [(set_attr "length" "<insn_length>")]
+)
+
+;; Split unpredicated structure moves into pieces.  This is the same
+;; for both big-endian and little-endian code, although it only needs
+;; to handle memory operands for little-endian code.
+(define_split
+  [(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand")
+	(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand"))]
+  "TARGET_SVE && reload_completed"
+  [(const_int 0)]
+  {
+    rtx dest = operands[0];
+    rtx src = operands[1];
+    if (REG_P (dest) && REG_P (src))
+      aarch64_simd_emit_reg_reg_move (operands, <VSINGLE>mode, <vector_count>);
+    else
+      for (unsigned int i = 0; i < <vector_count>; ++i)
+	{
+	  rtx subdest = simplify_gen_subreg (<VSINGLE>mode, dest, <MODE>mode,
+					     i * BYTES_PER_SVE_VECTOR);
+	  rtx subsrc = simplify_gen_subreg (<VSINGLE>mode, src, <MODE>mode,
+					    i * BYTES_PER_SVE_VECTOR);
+	  emit_insn (gen_rtx_SET (subdest, subsrc));
+	}
+    DONE;
+  }
+)
+
+;; Predicated structure moves.  This works for both endiannesses but in
+;; practice is only useful for big-endian.
+(define_insn_and_split "pred_mov<mode>"
+  [(set (match_operand:SVE_STRUCT 0 "aarch64_sve_struct_nonimmediate_operand" "=w, Utx")
+	(unspec:SVE_STRUCT
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (match_operand:SVE_STRUCT 2 "aarch64_sve_struct_nonimmediate_operand" "Utx, w")]
+	  UNSPEC_MERGE_PTRUE))]
+  "TARGET_SVE
+   && (register_operand (operands[0], <MODE>mode)
+       || register_operand (operands[2], <MODE>mode))"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    for (unsigned int i = 0; i < <vector_count>; ++i)
+      {
+	rtx subdest = simplify_gen_subreg (<VSINGLE>mode, operands[0],
+					   <MODE>mode,
+					   i * BYTES_PER_SVE_VECTOR);
+	rtx subsrc = simplify_gen_subreg (<VSINGLE>mode, operands[2],
+					  <MODE>mode,
+					  i * BYTES_PER_SVE_VECTOR);
+	aarch64_emit_sve_pred_move (subdest, operands[1], subsrc);
+      }
+    DONE;
+  }
+  [(set_attr "length" "<insn_length>")]
+)
+
 (define_expand "mov<mode>"
   [(set (match_operand:PRED_ALL 0 "nonimmediate_operand")
 	(match_operand:PRED_ALL 1 "general_operand"))]
@@ -460,6 +559,60 @@ (define_insn "*vec_series<mode>_plus"
   }
 )
 
+;; Unpredicated LD[234].
+(define_expand "vec_load_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "register_operand")
+	(unspec:SVE_STRUCT
+	  [(match_dup 2)
+	   (match_operand:SVE_STRUCT 1 "memory_operand")]
+	  UNSPEC_LDN))]
+  "TARGET_SVE"
+  {
+    operands[2] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+  }
+)
+
+;; Predicated LD[234].
+(define_insn "vec_mask_load_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
+	(unspec:SVE_STRUCT
+	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
+	   (match_operand:SVE_STRUCT 1 "memory_operand" "m")]
+	  UNSPEC_LDN))]
+  "TARGET_SVE"
+  "ld<vector_count><Vesize>\t%0, %2/z, %1"
+)
+
+;; Unpredicated ST[234].  This is always a full update, so the dependence
+;; on the old value of the memory location (via (match_dup 0)) is redundant.
+;; There doesn't seem to be any obvious benefit to treating the all-true
+;; case differently though.  In particular, it's very unlikely that we'll
+;; only find out during RTL that a store_lanes is dead.
+(define_expand "vec_store_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "memory_operand")
+	(unspec:SVE_STRUCT
+	  [(match_dup 2)
+	   (match_operand:SVE_STRUCT 1 "register_operand")
+	   (match_dup 0)]
+	  UNSPEC_STN))]
+  "TARGET_SVE"
+  {
+    operands[2] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+  }
+)
+
+;; Predicated ST[234].
+(define_insn "vec_mask_store_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "memory_operand" "+m")
+	(unspec:SVE_STRUCT
+	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
+	   (match_operand:SVE_STRUCT 1 "register_operand" "w")
+	   (match_dup 0)]
+	  UNSPEC_STN))]
+  "TARGET_SVE"
+  "st<vector_count><Vesize>\t%1, %2, %0"
+)
+
 (define_expand "vec_perm<mode>"
   [(match_operand:SVE_ALL 0 "register_operand")
    (match_operand:SVE_ALL 1 "register_operand")
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	2017-12-22 16:00:42.829606965 +0000
+++ gcc/config/aarch64/aarch64.c	2017-12-22 16:01:42.044358682 +0000
@@ -1178,9 +1178,15 @@ aarch64_classify_vector_mode (machine_mo
 	  || inner == DImode
 	  || inner == DFmode))
     {
-      if (TARGET_SVE
-	  && known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
-	return VEC_SVE_DATA;
+      if (TARGET_SVE)
+	{
+	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
+	    return VEC_SVE_DATA;
+	  if (known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
+	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
+	      || known_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
+	    return VEC_SVE_DATA | VEC_STRUCT;
+	}
 
       /* This includes V1DF but not V1DI (which doesn't exist).  */
       if (TARGET_SIMD
@@ -1208,6 +1214,18 @@ aarch64_sve_data_mode_p (machine_mode mo
   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
 }
 
+/* Implement target hook TARGET_ARRAY_MODE.  */
+static opt_machine_mode
+aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
+{
+  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
+      && IN_RANGE (nelems, 2, 4))
+    return mode_for_vector (GET_MODE_INNER (mode),
+			    GET_MODE_NUNITS (mode) * nelems);
+
+  return opt_machine_mode ();
+}
+
 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 static bool
 aarch64_array_mode_supported_p (machine_mode mode,
@@ -5778,6 +5796,18 @@ aarch64_classify_address (struct aarch64
 		    ? offset_4bit_signed_scaled_p (mode, offset)
 		    : offset_9bit_signed_scaled_p (mode, offset));
 
+	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
+	    {
+	      poly_int64 end_offset = (offset
+				       + GET_MODE_SIZE (mode)
+				       - BYTES_PER_SVE_VECTOR);
+	      return (type == ADDR_QUERY_M
+		      ? offset_4bit_signed_scaled_p (mode, offset)
+		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
+			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
+							 end_offset)));
+	    }
+
 	  if (vec_flags == VEC_SVE_PRED)
 	    return offset_9bit_signed_scaled_p (mode, offset);
 
@@ -6490,6 +6520,20 @@ aarch64_print_vector_float_operand (FILE
   return true;
 }
 
+/* Return the equivalent letter for size.  */
+static char
+sizetochar (int size)
+{
+  switch (size)
+    {
+    case 64: return 'd';
+    case 32: return 's';
+    case 16: return 'h';
+    case 8 : return 'b';
+    default: gcc_unreachable ();
+    }
+}
+
 /* Print operand X to file F in a target specific manner according to CODE.
    The acceptable formatting commands given by CODE are:
      'c':		An integer or symbol address without a preceding #
@@ -6777,7 +6821,18 @@ aarch64_print_operand (FILE *f, rtx x, i
 	{
 	case REG:
 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
-	    asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
+	    {
+	      if (REG_NREGS (x) == 1)
+		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
+	      else
+		{
+		  char suffix
+		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
+		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
+			       REGNO (x) - V0_REGNUM, suffix,
+			       END_REGNO (x) - V0_REGNUM - 1, suffix);
+		}
+	    }
 	  else
 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
 	  break;
@@ -12952,20 +13007,6 @@ aarch64_final_prescan_insn (rtx_insn *in
 }
 
 
-/* Return the equivalent letter for size.  */
-static char
-sizetochar (int size)
-{
-  switch (size)
-    {
-    case 64: return 'd';
-    case 32: return 's';
-    case 16: return 'h';
-    case 8 : return 'b';
-    default: gcc_unreachable ();
-    }
-}
-
 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
    instruction.  */
 
@@ -13560,6 +13601,28 @@ aarch64_sve_ldr_operand_p (rtx op)
 	  && addr.type == ADDRESS_REG_IMM);
 }
 
+/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
+   We need to be able to access the individual pieces, so the range
+   is different from LD[234] and ST[234].  */
+bool
+aarch64_sve_struct_memory_operand_p (rtx op)
+{
+  if (!MEM_P (op))
+    return false;
+
+  machine_mode mode = GET_MODE (op);
+  struct aarch64_address_info addr;
+  if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
+				 ADDR_QUERY_ANY)
+      || addr.type != ADDRESS_REG_IMM)
+    return false;
+
+  poly_int64 first = addr.const_offset;
+  poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
+  return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
+	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
+}
+
 /* Emit a register copy from operand to operand, taking care not to
    early-clobber source registers in the process.
 
@@ -17629,6 +17692,9 @@ #define TARGET_VECTOR_MODE_SUPPORTED_P a
 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
   aarch64_builtin_support_vector_misalignment
 
+#undef TARGET_ARRAY_MODE
+#define TARGET_ARRAY_MODE aarch64_array_mode
+
 #undef TARGET_ARRAY_MODE_SUPPORTED_P
 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p

Patch

Index: gcc/config/aarch64/aarch64-modes.def
===================================================================
--- gcc/config/aarch64/aarch64-modes.def	2017-11-08 15:05:50.202852894 +0000
+++ gcc/config/aarch64/aarch64-modes.def	2017-11-08 15:06:19.687849905 +0000
@@ -87,6 +87,9 @@  INT_MODE (XI, 64);
 /* Give SVE vectors the names normally used for 256-bit vectors.
    The actual number depends on command-line flags.  */
 SVE_MODES (1, V32, V16, V8, V4)
+SVE_MODES (2, V64, V32, V16, V8)
+SVE_MODES (3, V96, V48, V24, V12)
+SVE_MODES (4, V128, V64, V32, V16)
 
 /* Quad float: 128-bit floating mode for long doubles.  */
 FLOAT_MODE (TF, 16, ieee_quad_format);
Index: gcc/config/aarch64/aarch64-protos.h
===================================================================
--- gcc/config/aarch64/aarch64-protos.h	2017-11-08 15:05:50.204852894 +0000
+++ gcc/config/aarch64/aarch64-protos.h	2017-11-08 15:06:19.687849905 +0000
@@ -432,6 +432,7 @@  rtx aarch64_simd_gen_const_vector_dup (m
 bool aarch64_simd_mem_operand_p (rtx);
 bool aarch64_sve_ld1r_operand_p (rtx);
 bool aarch64_sve_ldr_operand_p (rtx);
+bool aarch64_sve_struct_memory_operand_p (rtx);
 rtx aarch64_simd_vect_par_cnst_half (machine_mode, int, bool);
 rtx aarch64_tls_get_addr (void);
 tree aarch64_fold_builtin (tree, int, tree *, bool);
Index: gcc/config/aarch64/iterators.md
===================================================================
--- gcc/config/aarch64/iterators.md	2017-11-08 15:05:50.216852893 +0000
+++ gcc/config/aarch64/iterators.md	2017-11-08 15:06:19.690849904 +0000
@@ -249,6 +249,11 @@  (define_mode_iterator VMUL_CHANGE_NLANES
 ;; All SVE vector modes.
 (define_mode_iterator SVE_ALL [V32QI V16HI V8SI V4DI V16HF V8SF V4DF])
 
+;; All SVE vector structure modes.
+(define_mode_iterator SVE_STRUCT [V64QI V32HI V16SI V8DI V32HF V16SF V8DF
+				  V96QI V48HI V24SI V12DI V48HF V24SF V12DF
+				  V128QI V64HI V32SI V16DI V64HF V32SF V16DF])
+
 ;; All SVE vector modes that have 8-bit or 16-bit elements.
 (define_mode_iterator SVE_BH [V32QI V16HI V16HF])
 
@@ -588,7 +593,14 @@  (define_mode_attr Vetype [(V8QI "b") (V1
 (define_mode_attr Vesize [(V32QI "b")
 			  (V16HI "h") (V16HF "h")
 			  (V8SI  "w") (V8SF  "w")
-			  (V4DI  "d") (V4DF  "d")])
+			  (V4DI  "d") (V4DF  "d")
+			  (V64QI "b") (V96QI "b") (V128QI "b")
+			  (V32HI "h") (V48HI "h") (V64HI "h")
+			  (V32HF "h") (V48HF "h") (V64HF "h")
+			  (V16SI "w") (V24SI "w") (V32SI "w")
+			  (V16SF "w") (V24SF "w") (V32SF "w")
+			  (V8DI "d")  (V12DI "d") (V16DI "d")
+			  (V8DF "d")  (V12DF "d") (V16DF "d")])
 
 ;; Vetype is used everywhere in scheduling type and assembly output,
 ;; sometimes they are not the same, for example HF modes on some
@@ -946,17 +958,87 @@  (define_mode_attr insn_count [(OI "8") (
 ;; No need of iterator for -fPIC as it use got_lo12 for both modes.
 (define_mode_attr got_modifier [(SI "gotpage_lo14") (DI "gotpage_lo15")])
 
-;; The predicate mode associated with an SVE data mode.
+;; The number of subvectors in an SVE_STRUCT.
+(define_mode_attr vector_count [(V64QI "2") (V32HI "2")
+				(V16SI "2") (V8DI "2")
+				(V32HF "2") (V16SF "2") (V8DF "2")
+				(V96QI "3") (V48HI "3")
+				(V24SI "3") (V12DI "3")
+				(V48HF "3") (V24SF "3") (V12DF "3")
+				(V128QI "4") (V64HI "4")
+				(V32SI "4") (V16DI "4")
+				(V64HF "4") (V32SF "4") (V16DF "4")])
+
+;; The number of instruction bytes needed for an SVE_STRUCT move.  This is
+;; equal to vector_count * 4.
+(define_mode_attr insn_length [(V64QI "8") (V32HI "8")
+			       (V16SI "8") (V8DI "8")
+			       (V32HF "8") (V16SF "8") (V8DF "8")
+			       (V96QI "12") (V48HI "12")
+			       (V24SI "12") (V12DI "12")
+			       (V48HF "12") (V24SF "12") (V12DF "12")
+			       (V128QI "16") (V64HI "16")
+			       (V32SI "16") (V16DI "16")
+			       (V64HF "16") (V32SF "16") (V16DF "16")])
+
+;; The type of a subvector in an SVE_STRUCT.
+(define_mode_attr VSINGLE [(V64QI "V32QI") (V32HI "V16HI")
+			   (V16SI "V8SI") (V8DI "V4DI")
+			   (V32HF "V16HF") (V16SF "V8SF") (V8DF "V4DF")
+			   (V96QI "V32QI") (V48HI "V16HI")
+			   (V24SI "V8SI") (V12DI "V4DI")
+			   (V48HF "V16HF") (V24SF "V8SF") (V12DF "V4DF")
+			   (V128QI "V32QI") (V64HI "V16HI")
+			   (V32SI "V8SI") (V16DI "V4DI")
+			   (V64HF "V16HF") (V32SF "V8SF") (V16DF "V4DF")])
+
+;; ...and again in lower case.
+(define_mode_attr vsingle [(V64QI "v32qi") (V32HI "v16hi")
+			   (V16SI "v8si") (V8DI "v4di")
+			   (V32HF "v16hf") (V16SF "v8sf") (V8DF "v4df")
+			   (V96QI "v32qi") (V48HI "v16hi")
+			   (V24SI "v8si") (V12DI "v4di")
+			   (V48HF "v16hf") (V24SF "v8sf") (V12DF "v4df")
+			   (V128QI "v32qi") (V64HI "v16hi")
+			   (V32SI "v8si") (V16DI "v4di")
+			   (V64HF "v16hf") (V32SF "v8sf") (V16DF "v4df")])
+
+;; The predicate mode associated with an SVE data mode.  For structure modes
+;; this is equivalent to the <VPRED> of the subvector mode.
 (define_mode_attr VPRED [(V32QI "V32BI")
 			 (V16HI "V16BI") (V16HF "V16BI")
 			 (V8SI "V8BI") (V8SF "V8BI")
-			 (V4DI "V4BI") (V4DF "V4BI")])
+			 (V4DI "V4BI") (V4DF "V4BI")
+			 (V64QI "V32BI")
+			 (V32HI "V16BI") (V32HF "V16BI")
+			 (V16SI "V8BI") (V16SF "V8BI")
+			 (V8DI "V4BI") (V8DF "V4BI")
+			 (V96QI "V32BI")
+			 (V48HI "V16BI") (V48HF "V16BI")
+			 (V24SI "V8BI") (V24SF "V8BI")
+			 (V12DI "V4BI") (V12DF "V4BI")
+			 (V128QI "V32BI")
+			 (V64HI "V16BI") (V64HF "V16BI")
+			 (V32SI "V8BI") (V32SF "V8BI")
+			 (V16DI "V4BI") (V16DF "V4BI")])
 
 ;; ...and again in lower case.
 (define_mode_attr vpred [(V32QI "v32bi")
 			 (V16HI "v16bi") (V16HF "v16bi")
 			 (V8SI "v8bi") (V8SF "v8bi")
-			 (V4DI "v4bi") (V4DF "v4bi")])
+			 (V4DI "v4bi") (V4DF "v4bi")
+			 (V64QI "v32bi")
+			 (V32HI "v16bi") (V32HF "v16bi")
+			 (V16SI "v8bi") (V16SF "v8bi")
+			 (V8DI "v4bi") (V8DF "v4bi")
+			 (V96QI "v32bi")
+			 (V48HI "v16bi") (V48HF "v16bi")
+			 (V24SI "v8bi") (V24SF "v8bi")
+			 (V12DI "v4bi") (V12DF "v4bi")
+			 (V128QI "v32bi")
+			 (V64HI "v16bi") (V64HF "v16bi")
+			 (V32SI "v8bi") (V32SF "v8bi")
+			 (V16DI "v4bi") (V16DF "v4bi")])
 
 ;; -------------------------------------------------------------------
 ;; Code Iterators
Index: gcc/config/aarch64/constraints.md
===================================================================
--- gcc/config/aarch64/constraints.md	2017-11-08 15:05:50.215852893 +0000
+++ gcc/config/aarch64/constraints.md	2017-11-08 15:06:19.690849904 +0000
@@ -210,6 +210,12 @@  (define_memory_constraint "Utw"
   (and (match_code "mem")
        (match_test "aarch64_sve_ld1r_operand_p (op)")))
 
+(define_memory_constraint "Utx"
+  "@internal
+   An address valid for SVE structure mov patterns (as distinct from
+   LD[234] and ST[234] patterns)."
+  (match_operand 0 "aarch64_sve_struct_memory_operand"))
+
 (define_constraint "Ufc"
   "A floating point constant which can be used with an\
    FMOV immediate operation."
Index: gcc/config/aarch64/predicates.md
===================================================================
--- gcc/config/aarch64/predicates.md	2017-11-08 15:05:50.216852893 +0000
+++ gcc/config/aarch64/predicates.md	2017-11-08 15:06:19.690849904 +0000
@@ -471,6 +471,14 @@  (define_predicate "aarch64_sve_general_o
 	    (match_operand 0 "aarch64_sve_ldr_operand")
 	    (match_test "aarch64_mov_operand_p (op, mode)"))))
 
+(define_predicate "aarch64_sve_struct_memory_operand"
+  (and (match_code "mem")
+       (match_test "aarch64_sve_struct_memory_operand_p (op)")))
+
+(define_predicate "aarch64_sve_struct_nonimmediate_operand"
+  (ior (match_operand 0 "register_operand")
+       (match_operand 0 "aarch64_sve_struct_memory_operand")))
+
 ;; Doesn't include immediates, since those are handled by the move
 ;; patterns instead.
 (define_predicate "aarch64_sve_dup_operand"
Index: gcc/config/aarch64/aarch64.md
===================================================================
--- gcc/config/aarch64/aarch64.md	2017-11-08 15:05:50.214852893 +0000
+++ gcc/config/aarch64/aarch64.md	2017-11-08 15:06:19.689849905 +0000
@@ -160,6 +160,8 @@  (define_c_enum "unspec" [
     UNSPEC_PACK
     UNSPEC_FLOAT_CONVERT
     UNSPEC_WHILE_LO
+    UNSPEC_LDN
+    UNSPEC_STN
 ])
 
 (define_c_enum "unspecv" [
Index: gcc/config/aarch64/aarch64-sve.md
===================================================================
--- gcc/config/aarch64/aarch64-sve.md	2017-11-08 15:05:50.206852894 +0000
+++ gcc/config/aarch64/aarch64-sve.md	2017-11-08 15:06:19.687849905 +0000
@@ -189,6 +189,105 @@  (define_insn "maskstore<mode><vpred>"
   "st1<Vesize>\t%1.<Vetype>, %2, %0"
 )
 
+;; SVE structure moves.
+(define_expand "mov<mode>"
+  [(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
+	(match_operand:SVE_STRUCT 1 "general_operand"))]
+  "TARGET_SVE"
+  {
+    /* Big-endian loads and stores need to be done via LD1 and ST1;
+       see the comment at the head of the file for details.  */
+    if ((MEM_P (operands[0]) || MEM_P (operands[1]))
+	&& BYTES_BIG_ENDIAN)
+      {
+	gcc_assert (can_create_pseudo_p ());
+	aarch64_expand_sve_mem_move (operands[0], operands[1], <VPRED>mode);
+	DONE;
+      }
+
+    if (CONSTANT_P (operands[1]))
+      {
+	aarch64_expand_mov_immediate (operands[0], operands[1]);
+	DONE;
+      }
+  }
+)
+
+;; Unpredicated structure moves (little-endian).
+(define_insn "*aarch64_sve_mov<mode>_le"
+  [(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
+	(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
+  "TARGET_SVE && !BYTES_BIG_ENDIAN"
+  "#"
+  [(set_attr "length" "<insn_length>")]
+)
+
+;; Unpredicated structure moves (big-endian).  Memory accesses require
+;; secondary reloads.
+(define_insn "*aarch64_sve_mov<mode>_le"
+  [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w, w")
+	(match_operand:SVE_STRUCT 1 "aarch64_nonmemory_operand" "w, Dn"))]
+  "TARGET_SVE && BYTES_BIG_ENDIAN"
+  "#"
+  [(set_attr "length" "<insn_length>")]
+)
+
+;; Split unpredicated structure moves into pieces.  This is the same
+;; for both big-endian and little-endian code, although it only needs
+;; to handle memory operands for little-endian code.
+(define_split
+  [(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand")
+	(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand"))]
+  "TARGET_SVE && reload_completed"
+  [(const_int 0)]
+  {
+    rtx dest = operands[0];
+    rtx src = operands[1];
+    if (REG_P (dest) && REG_P (src))
+      aarch64_simd_emit_reg_reg_move (operands, <VSINGLE>mode, <vector_count>);
+    else
+      for (unsigned int i = 0; i < <vector_count>; ++i)
+	{
+	  rtx subdest = simplify_gen_subreg (<VSINGLE>mode, dest, <MODE>mode,
+					     i * BYTES_PER_SVE_VECTOR);
+	  rtx subsrc = simplify_gen_subreg (<VSINGLE>mode, src, <MODE>mode,
+					    i * BYTES_PER_SVE_VECTOR);
+	  emit_insn (gen_rtx_SET (subdest, subsrc));
+	}
+    DONE;
+  }
+)
+
+;; Predicated structure moves.  This works for both endiannesses but in
+;; practice is only useful for big-endian.
+(define_insn_and_split "pred_mov<mode>"
+  [(set (match_operand:SVE_STRUCT 0 "aarch64_sve_struct_nonimmediate_operand" "=w, Utx")
+	(unspec:SVE_STRUCT
+	  [(match_operand:<VPRED> 1 "register_operand" "Upl, Upl")
+	   (match_operand:SVE_STRUCT 2 "aarch64_sve_struct_nonimmediate_operand" "Utx, w")]
+	  UNSPEC_MERGE_PTRUE))]
+  "TARGET_SVE
+   && (register_operand (operands[0], <MODE>mode)
+       || register_operand (operands[2], <MODE>mode))"
+  "#"
+  "&& reload_completed"
+  [(const_int 0)]
+  {
+    for (unsigned int i = 0; i < <vector_count>; ++i)
+      {
+	rtx subdest = simplify_gen_subreg (<VSINGLE>mode, operands[0],
+					   <MODE>mode,
+					   i * BYTES_PER_SVE_VECTOR);
+	rtx subsrc = simplify_gen_subreg (<VSINGLE>mode, operands[2],
+					  <MODE>mode,
+					  i * BYTES_PER_SVE_VECTOR);
+	aarch64_emit_sve_pred_move (subdest, operands[1], subsrc);
+      }
+    DONE;
+  }
+  [(set_attr "length" "<insn_length>")]
+)
+
 (define_expand "mov<mode>"
   [(set (match_operand:PRED_ALL 0 "nonimmediate_operand")
 	(match_operand:PRED_ALL 1 "general_operand"))]
@@ -447,6 +546,60 @@  (define_insn "*vec_series<mode>_plus"
   }
 )
 
+;; Unpredicated LD[234].
+(define_expand "vec_load_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "register_operand")
+	(unspec:SVE_STRUCT
+	  [(match_dup 2)
+	   (match_operand:SVE_STRUCT 1 "memory_operand")]
+	  UNSPEC_LDN))]
+  "TARGET_SVE"
+  {
+    operands[2] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+  }
+)
+
+;; Predicated LD[234].
+(define_insn "vec_mask_load_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
+	(unspec:SVE_STRUCT
+	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
+	   (match_operand:SVE_STRUCT 1 "memory_operand" "m")]
+	  UNSPEC_LDN))]
+  "TARGET_SVE"
+  "ld<vector_count><Vesize>\t%0, %2/z, %1"
+)
+
+;; Unpredicated ST[234].  This is always a full update, so the dependence
+;; on the old value of the memory location (via (match_dup 0)) is redundant.
+;; There doesn't seem to be any obvious benefit to treating the all-true
+;; case differently though.  In particular, it's very unlikely that we'll
+;; only find out during RTL that a store_lanes is dead.
+(define_expand "vec_store_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "memory_operand")
+	(unspec:SVE_STRUCT
+	  [(match_dup 2)
+	   (match_operand:SVE_STRUCT 1 "register_operand")
+	   (match_dup 0)]
+	  UNSPEC_STN))]
+  "TARGET_SVE"
+  {
+    operands[2] = force_reg (<VPRED>mode, CONSTM1_RTX (<VPRED>mode));
+  }
+)
+
+;; Predicated ST[234].
+(define_insn "vec_mask_store_lanes<mode><vsingle>"
+  [(set (match_operand:SVE_STRUCT 0 "memory_operand" "+m")
+	(unspec:SVE_STRUCT
+	  [(match_operand:<VPRED> 2 "register_operand" "Upl")
+	   (match_operand:SVE_STRUCT 1 "register_operand" "w")
+	   (match_dup 0)]
+	  UNSPEC_STN))]
+  "TARGET_SVE"
+  "st<vector_count><Vesize>\t%1, %2, %0"
+)
+
 (define_expand "vec_perm_const<mode>"
   [(match_operand:SVE_ALL 0 "register_operand")
    (match_operand:SVE_ALL 1 "register_operand")
Index: gcc/config/aarch64/aarch64.c
===================================================================
--- gcc/config/aarch64/aarch64.c	2017-11-08 15:05:50.213852893 +0000
+++ gcc/config/aarch64/aarch64.c	2017-11-08 15:06:19.689849905 +0000
@@ -1179,9 +1179,15 @@  aarch64_classify_vector_mode (machine_mo
 	  || inner == DImode
 	  || inner == DFmode))
     {
-      if (TARGET_SVE
-	  && must_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
-	return VEC_SVE_DATA;
+      if (TARGET_SVE)
+	{
+	  if (must_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR))
+	    return VEC_SVE_DATA;
+	  if (must_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 2)
+	      || must_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 3)
+	      || must_eq (GET_MODE_BITSIZE (mode), BITS_PER_SVE_VECTOR * 4))
+	    return VEC_SVE_DATA | VEC_STRUCT;
+	}
 
       /* This includes V1DF but not V1DI (which doesn't exist).  */
       if (TARGET_SIMD
@@ -1209,6 +1215,18 @@  aarch64_sve_data_mode_p (machine_mode mo
   return aarch64_classify_vector_mode (mode) & VEC_SVE_DATA;
 }
 
+/* Implement target hook TARGET_ARRAY_MODE.  */
+static opt_machine_mode
+aarch64_array_mode (machine_mode mode, unsigned HOST_WIDE_INT nelems)
+{
+  if (aarch64_classify_vector_mode (mode) == VEC_SVE_DATA
+      && IN_RANGE (nelems, 2, 4))
+    return mode_for_vector (GET_MODE_INNER (mode),
+			    GET_MODE_NUNITS (mode) * nelems);
+
+  return opt_machine_mode ();
+}
+
 /* Implement target hook TARGET_ARRAY_MODE_SUPPORTED_P.  */
 static bool
 aarch64_array_mode_supported_p (machine_mode mode,
@@ -5677,6 +5695,18 @@  aarch64_classify_address (struct aarch64
 		    ? offset_4bit_signed_scaled_p (mode, offset)
 		    : offset_9bit_signed_scaled_p (mode, offset));
 
+	  if (vec_flags == (VEC_SVE_DATA | VEC_STRUCT))
+	    {
+	      poly_int64 end_offset = (offset
+				       + GET_MODE_SIZE (mode)
+				       - BYTES_PER_SVE_VECTOR);
+	      return (type == ADDR_QUERY_M
+		      ? offset_4bit_signed_scaled_p (mode, offset)
+		      : (offset_9bit_signed_scaled_p (SVE_BYTE_MODE, offset)
+			 && offset_9bit_signed_scaled_p (SVE_BYTE_MODE,
+							 end_offset)));
+	    }
+
 	  if (vec_flags == VEC_SVE_PRED)
 	    return offset_9bit_signed_scaled_p (mode, offset);
 
@@ -6391,6 +6421,20 @@  aarch64_print_vector_float_operand (FILE
   return true;
 }
 
+/* Return the equivalent letter for size.  */
+static char
+sizetochar (int size)
+{
+  switch (size)
+    {
+    case 64: return 'd';
+    case 32: return 's';
+    case 16: return 'h';
+    case 8 : return 'b';
+    default: gcc_unreachable ();
+    }
+}
+
 /* Print operand X to file F in a target specific manner according to CODE.
    The acceptable formatting commands given by CODE are:
      'c':		An integer or symbol address without a preceding #
@@ -6674,7 +6718,18 @@  aarch64_print_operand (FILE *f, rtx x, i
 	{
 	case REG:
 	  if (aarch64_sve_data_mode_p (GET_MODE (x)))
-	    asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
+	    {
+	      if (REG_NREGS (x) == 1)
+		asm_fprintf (f, "z%d", REGNO (x) - V0_REGNUM);
+	      else
+		{
+		  char suffix
+		    = sizetochar (GET_MODE_UNIT_BITSIZE (GET_MODE (x)));
+		  asm_fprintf (f, "{z%d.%c - z%d.%c}",
+			       REGNO (x) - V0_REGNUM, suffix,
+			       END_REGNO (x) - V0_REGNUM - 1, suffix);
+		}
+	    }
 	  else
 	    asm_fprintf (f, "%s", reg_names [REGNO (x)]);
 	  break;
@@ -12825,20 +12880,6 @@  aarch64_final_prescan_insn (rtx_insn *in
 }
 
 
-/* Return the equivalent letter for size.  */
-static char
-sizetochar (int size)
-{
-  switch (size)
-    {
-    case 64: return 'd';
-    case 32: return 's';
-    case 16: return 'h';
-    case 8 : return 'b';
-    default: gcc_unreachable ();
-    }
-}
-
 /* Return true if BASE_OR_STEP is a valid immediate operand for an SVE INDEX
    instruction.  */
 
@@ -13432,6 +13473,28 @@  aarch64_sve_ldr_operand_p (rtx op)
 	  && addr.type == ADDRESS_REG_IMM);
 }
 
+/* Return true if OP is a valid MEM operand for an SVE_STRUCT mode.
+   We need to be able to access the individual pieces, so the range
+   is different from LD[234] and ST[234].  */
+bool
+aarch64_sve_struct_memory_operand_p (rtx op)
+{
+  if (!MEM_P (op))
+    return false;
+
+  machine_mode mode = GET_MODE (op);
+  struct aarch64_address_info addr;
+  if (!aarch64_classify_address (&addr, XEXP (op, 0), SVE_BYTE_MODE, false,
+				 ADDR_QUERY_ANY)
+      || addr.type != ADDRESS_REG_IMM)
+    return false;
+
+  poly_int64 first = addr.const_offset;
+  poly_int64 last = first + GET_MODE_SIZE (mode) - BYTES_PER_SVE_VECTOR;
+  return (offset_4bit_signed_scaled_p (SVE_BYTE_MODE, first)
+	  && offset_4bit_signed_scaled_p (SVE_BYTE_MODE, last));
+}
+
 /* Emit a register copy from operand to operand, taking care not to
    early-clobber source registers in the process.
 
@@ -17542,6 +17605,9 @@  #define TARGET_VECTOR_MODE_SUPPORTED_P a
 #define TARGET_VECTORIZE_SUPPORT_VECTOR_MISALIGNMENT \
   aarch64_builtin_support_vector_misalignment
 
+#undef TARGET_ARRAY_MODE
+#define TARGET_ARRAY_MODE aarch64_array_mode
+
 #undef TARGET_ARRAY_MODE_SUPPORTED_P
 #define TARGET_ARRAY_MODE_SUPPORTED_P aarch64_array_mode_supported_p