Message ID | 1411069109-31425-3-git-send-email-charles.baylis@linaro.org
---|---
State | New
On 18/09/14 20:38, Charles Baylis wrote: > This patch replaces the inline assembler implementations of the > vld[234](q?)_lane_* intrinsics with new versions which exploit the new builtin > functions added in patch 1. > > Tested (with the rest of the patch series) with make check on aarch64-oe-linux > with qemu, and also causes no regressions in clyon's NEON intrinsics tests. > > <DATE> Charles Baylis <charles.baylis@linaro.org> > > * config/aarch64/arm_neon.h (__LD2_LANE_FUNC): Rewrite using builtins, > update uses to use new macro arguments. > (__LD3_LANE_FUNC): Likewise. > (__LD4_LANE_FUNC): Likewise. > > Change-Id: I3bd5934b5c4f6127088193c1ab12848144d5540a > --- > gcc/config/aarch64/arm_neon.h | 359 ++++++++++++++++++++++++++++-------------- > 1 file changed, 237 insertions(+), 122 deletions(-) > > diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h > index e62c783..c1fcb47 100644 > --- a/gcc/config/aarch64/arm_neon.h > +++ b/gcc/config/aarch64/arm_neon.h > @@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q) > __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q) > __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q) > > -#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix, \ > - lnsuffix, funcsuffix, Q) \ > - __extension__ static __inline rettype \ > - __attribute__ ((__always_inline__)) \ > - vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ > - rettype b, const int c) \ > - { \ > - rettype result; \ > - __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ > - "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t" \ > - "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \ > - : "=Q"(result) \ > - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ > - : "memory", "v16", "v17"); \ > - return result; \ > - } > - > -__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,) > -__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,) > -__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,) > -__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,) > -__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,) > -__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,) > -__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,) > -__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,) > -__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,) > -__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,) > -__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,) > -__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,) > -__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q) > -__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q) > -__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q) > -__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q) > -__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q) > -__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q) > -__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q) > -__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q) > -__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q) > -__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q) > -__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q) > -__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q) > +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \ > + mode, ptrmode, funcsuffix, signedtype) \ > +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ > +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ > +{ \ > + __builtin_aarch64_simd_oi __o; \ > + largetype __temp; \ > + 
__temp.val[0] = \ > + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ > + __temp.val[1] = \ > + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ > + __o = __builtin_aarch64_set_qregoi##mode (__o, \ > + (signedtype) __temp.val[0], \ > + 0); \ > + __o = __builtin_aarch64_set_qregoi##mode (__o, \ > + (signedtype) __temp.val[1], \ > + 1); \ > + __o = __builtin_aarch64_ld2_lane##mode ( \ > + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ > + __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \ > + __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \ > + return __b; \ > +} > + > +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf, > + sf, f32, float32x4_t) > +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df, > + df, f64, float64x2_t) > +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8, > + int8x16_t) > +__LD2_LANE_FUNC (poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi, > + p16, int16x8_t) > +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8, > + int8x16_t) > +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16, > + int16x8_t) > +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32, > + int32x4_t) > +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64, > + int64x2_t) > +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8, > + int8x16_t) > +__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi, > + u16, int16x8_t) > +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si, > + u32, int32x4_t) > +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di, > + u64, int64x2_t) > + > +#undef __LD2_LANE_FUNC > +#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \ > +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ > +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ > +{ \ > + union { intype __i; \ > + __builtin_aarch64_simd_oi __o; } __temp = { __b }; \ > + __temp.__o = __builtin_aarch64_ld2_lane##mode ( \ > + (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \ > + return __temp.__i; \ > +} > + > +__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) > +__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) > +__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) > +__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) > +__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) > +__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) > +__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) > +__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) > +__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) > +__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) > +__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) > +__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) > > #define __LD3R_FUNC(rettype, structtype, ptrtype, \ > regsuffix, funcsuffix, Q) \ > @@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q) > __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q) > __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q) > > -#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix, \ > - lnsuffix, funcsuffix, Q) \ > - __extension__ static __inline rettype \ > - __attribute__ ((__always_inline__)) \ > - vld3 ## Q ## _lane_ ## 
funcsuffix (const ptrtype *ptr, \ > - rettype b, const int c) \ > - { \ > - rettype result; \ > - __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ > - "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t" \ > - "st1 {v16." #regsuffix " - v18." #regsuffix "}, %0\n\t" \ > - : "=Q"(result) \ > - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ > - : "memory", "v16", "v17", "v18"); \ > - return result; \ > - } > - > -__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,) > -__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,) > -__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,) > -__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,) > -__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,) > -__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,) > -__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,) > -__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,) > -__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,) > -__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,) > -__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,) > -__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,) > -__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q) > -__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q) > -__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q) > -__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q) > -__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q) > -__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q) > -__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q) > -__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q) > -__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q) > -__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) > -__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) > -__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) > +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, \ > + mode, ptrmode, funcsuffix, signedtype) \ > +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ > +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ > +{ \ > + __builtin_aarch64_simd_ci __o; \ > + largetype __temp; \ > + __temp.val[0] = \ > + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ > + __temp.val[1] = \ > + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ > + __temp.val[2] = \ > + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ > + __o = __builtin_aarch64_set_qregci##mode (__o, \ > + (signedtype) __temp.val[0], \ > + 0); \ > + __o = __builtin_aarch64_set_qregci##mode (__o, \ > + (signedtype) __temp.val[1], \ > + 1); \ > + __o = __builtin_aarch64_set_qregci##mode (__o, \ > + (signedtype) __temp.val[2], \ > + 2); \ > + __o = __builtin_aarch64_ld3_lane##mode ( \ > + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ > + __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \ > + __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \ > + __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \ > + return __b; \ > +} > + > +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf, > + sf, f32, float32x4_t) > +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df, > + df, f64, float64x2_t) > +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8, > + int8x16_t) > +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi, > + p16, int16x8_t) > +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8, > + int8x16_t) > 
+__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16, > + int16x8_t) > +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32, > + int32x4_t) > +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64, > + int64x2_t) > +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8, > + int8x16_t) > +__LD3_LANE_FUNC (uint16x4x3_t, uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi, > + u16, int16x8_t) > +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si, > + u32, int32x4_t) > +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di, > + u64, int64x2_t) > + > +#undef __LD3_LANE_FUNC > +#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \ > +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ > +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ > +{ \ > + union { intype __i; \ > + __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ > + __temp.__o = __builtin_aarch64_ld4_lane##mode ( \ > + (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \ > + return __temp.__i; \ > +} > + > +__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) > +__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) > +__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) > +__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) > +__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) > +__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) > +__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) > +__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) > +__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) > +__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) > +__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) > +__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) > > #define __LD4R_FUNC(rettype, structtype, ptrtype, \ > regsuffix, funcsuffix, Q) \ > @@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q) > __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q) > __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q) > > -#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \ > - lnsuffix, funcsuffix, Q) \ > - __extension__ static __inline rettype \ > - __attribute__ ((__always_inline__)) \ > - vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ > - rettype b, const int c) \ > - { \ > - rettype result; \ > - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ > - "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \ > - "st1 {v16." #regsuffix " - v19." 
#regsuffix "}, %0\n\t" \ > - : "=Q"(result) \ > - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ > - : "memory", "v16", "v17", "v18", "v19"); \ > - return result; \ > - } > > -__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,) > -__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) > -__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) > -__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) > -__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) > -__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) > -__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) > -__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) > -__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) > -__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) > -__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) > -__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) > -__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) > -__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) > -__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) > -__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) > -__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) > -__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) > -__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) > -__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) > -__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) > -__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) > -__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) > -__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) > +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, \ > + mode, ptrmode, funcsuffix, signedtype) \ > +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ > +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ > +{ \ > + __builtin_aarch64_simd_xi __o; \ > + largetype __temp; \ > + __temp.val[0] = \ > + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ > + __temp.val[1] = \ > + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ > + __temp.val[2] = \ > + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ > + __temp.val[3] = \ > + vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \ > + __o = __builtin_aarch64_set_qregxi##mode (__o, \ > + (signedtype) __temp.val[0], \ > + 0); \ > + __o = __builtin_aarch64_set_qregxi##mode (__o, \ > + (signedtype) __temp.val[1], \ > + 1); \ > + __o = __builtin_aarch64_set_qregxi##mode (__o, \ > + (signedtype) __temp.val[2], \ > + 2); \ > + __o = __builtin_aarch64_set_qregxi##mode (__o, \ > + (signedtype) __temp.val[3], \ > + 3); \ > + __o = __builtin_aarch64_ld4_lane##mode ( \ > + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ > + __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \ > + __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \ > + __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \ > + __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \ > + return __b; \ > +} > + > +__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf, > + sf, f32, float32x4_t) > +__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df, > + df, f64, float64x2_t) > +__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8, > + int8x16_t) > +__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi, > + p16, int16x8_t) > +__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8, > + int8x16_t) > 
+__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16, > + int16x8_t) > +__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, v4si, si, s32, > + int32x4_t) > +__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64, > + int64x2_t) > +__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8, > + int8x16_t) > +__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi, > + u16, int16x8_t) > +__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si, > + u32, int32x4_t) > +__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di, > + u64, int64x2_t) > + > +#undef __LD4_LANE_FUNC > +#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \ > +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ > +vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ > +{ \ > + union { intype __i; \ > + __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ > + __temp.__o = __builtin_aarch64_ld4_lane##mode ( \ > + (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \ > + return __temp.__i; \ > +} > + The reason we avoided using type-punning using unions was that reload would get confused with potential subreg(mem) that could be introduced because of memory xfer caused by unions and large int modes. As a result, we would get incorrect or sub-optimal code. But this seems to have fixed itself. :-) Because this involves xfers between large int modes and CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test what impact your patch has with C_C_M_C removed, so that it will be easier to fix the fallout once we remove C_C_M_C eventually. To test this you will need Richard's patch set https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html. Same for your other 2 patches in this series(3,4). Thanks, Tejas. > +__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) > +__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) > +__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) > +__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) > +__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) > +__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) > +__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) > +__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) > +__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) > +__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) > +__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) > +__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) > > #define __ST2_LANE_FUNC(intype, largetype, ptrtype, \ > mode, ptr_mode, funcsuffix, signedtype) \ > -- > 1.9.1 > >
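
For reference, hand-expanding the patch's new __LD2_LANE_FUNC macro for the s32 case gives roughly the following D-register variant (comments added here; whitespace is approximate, and the __builtin_aarch64_* calls are the GCC-internal builtins introduced by the series):

__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
vld2_lane_s32 (const int32_t * __ptr, int32x2x2_t __b, const int __c)
{
  __builtin_aarch64_simd_oi __o;
  int32x4x2_t __temp;
  /* Widen each 64-bit input vector to 128 bits, zero-filling the top half.  */
  __temp.val[0] = vcombine_s32 (__b.val[0], vcreate_s32 (0));
  __temp.val[1] = vcombine_s32 (__b.val[1], vcreate_s32 (0));
  /* Pack the two Q registers into the OImode register-list value.  */
  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[0], 0);
  __o = __builtin_aarch64_set_qregoiv4si (__o, (int32x4_t) __temp.val[1], 1);
  /* One LD2 (single structure) load into lane __c of both registers.  */
  __o = __builtin_aarch64_ld2_lanev4si (
          (__builtin_aarch64_simd_si *) __ptr, __o, __c);
  /* Extract the low 64 bits of each register back into the result pair.  */
  __b.val[0] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 0);
  __b.val[1] = (int32x2_t) __builtin_aarch64_get_dregoidi (__o, 1);
  return __b;
}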
On 19 September 2014 12:21, Tejas Belagod <tejas.belagod@arm.com> wrote:
> The reason we avoided using type-punning using unions was that reload would
> get confused with potential subreg(mem) that could be introduced because of
> memory xfer caused by unions and large int modes. As a result, we would get
> incorrect or sub-optimal code. But this seems to have fixed itself. :-)
>
> Because this involves xfers between large int modes and
> CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test
> what impact your patch has with C_C_M_C removed, so that it will be easier
> to fix the fallout once we remove C_C_M_C eventually. To test this you will
> need Richard's patch set
> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html.
>
> Same for your other 2 patches in this series(3,4).

I tried those patches, and altered aarch64_cannot_change_mode_class to return false for all cases.

However, this does not avoid the unnecessary moves.

Taking a really simple test case:

#include <arm_neon.h>

int32x2x2_t xvld2_s32(int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
  return ret;
}

(disabling scheduling for clarity)
$ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns -fno-schedule-insns2
...
xvld2_s32:
        ld2     {v2.2s - v3.2s}, [x0]
        orr     v0.8b, v2.8b, v2.8b
        orr     v1.8b, v3.8b, v3.8b
        ret
...

The reason is apparent in the rtl dump from ira:
...
Allocno a0r73 of FP_REGS(32) has 31 avail. regs 33-63, node: 33-63 (confl regs = 0-32 64 65)
...
(insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ])
        (reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64}
     (expr_list:REG_DEAD (reg:DI 0 x0 [ __a ])
        (nil)))
(note 3 2 6 2 NOTE_INSN_FUNCTION_BEG)
(insn 6 3 20 2 (set (reg/v:OI 73 [ __o ])
        (subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [
                            (mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8])
                        ] UNSPEC_LD2)
                    (vec_duplicate:V2SI (const_int 0 [0])))
                (vec_concat:V4SI (unspec:V2SI [
                            (mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8])
                        ] UNSPEC_LD2)
                    (vec_duplicate:V2SI (const_int 0 [0])))) 0))
        simd.c:8 2149 {aarch64_ld2v2si_dreg}
     (expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ])
        (nil)))
(insn 20 6 21 2 (set (reg:V2SI 32 v0)
        (subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778 {*aarch64_simd_movv2si}
     (nil))
(insn 21 20 22 2 (set (reg:V2SI 33 v1)
        (subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778 {*aarch64_simd_movv2si}
     (expr_list:REG_DEAD (reg/v:OI 73 [ __o ])
        (nil)))
(insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1
     (nil))
(insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1
     (nil))

The register allocator considers r73 to conflict with v0, because they are simultaneously live after insn 20. Without the 2nd use of r73 (e.g. if the write to ret.val[1] is replaced with vdup_n_s32(0)), the allocator does do the right thing with the subreg and allocates r73 to {v0,v1}.

I haven't read all of the old threads relating to Richard's patches yet, but I don't see why they would affect this issue.

I don't think the register allocator is able to resolve this unless the conversion between the __builtin_simd type and the int32x4x2_t type is done as a single operation. However, type-punning is not possible with the arrays of 64-bit vectors, as the arrays are not the same size as the corresponding __builtin_simd types, and any solution for those would probably help with the q variants too.
Maybe the solution is to pass the NEON intrinsic types directly to the builtins? Is there a reason that it wasn't done that way before?

Thanks
Charles
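
To put numbers on the size mismatch mentioned above, here is a sketch of a compile-time check; it assumes the usual layouts (a D-register pair occupying 16 bytes, and __builtin_aarch64_simd_oi being a 256-bit OImode value of 32 bytes) and only compiles for an AArch64 target:

#include <arm_neon.h>

/* int32x2x2_t is two 8-byte D-register vectors back to back: 16 bytes.
   __builtin_aarch64_simd_oi is an OImode (256-bit) value, i.e. two Q
   registers: 32 bytes.  Overlaying one on the other in a union therefore
   cannot map the 64-bit pair type onto the register-list value directly.  */
typedef char assert_pair_is_16_bytes[sizeof (int32x2x2_t) == 16 ? 1 : -1];
typedef char assert_oi_is_32_bytes[sizeof (__builtin_aarch64_simd_oi) == 32 ? 1 : -1];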
On 26/09/14 02:16, Charles Baylis wrote: > On 19 September 2014 12:21, Tejas Belagod <tejas.belagod@arm.com> wrote: >> The reason we avoided using type-punning using unions was that reload would >> get confused with potential subreg(mem) that could be introduced because of >> memory xfer caused by unions and large int modes. As a result, we would get >> incorrect or sub-optimal code. But this seems to have fixed itself. :-) >> >> Because this involves xfers between large int modes and >> CANNOT_CHANGE_MODE_CLASS has some impact on it, it would be good to test >> what impact your patch has with C_C_M_C removed, so that it will be easier >> to fix the fallout once we remove C_C_M_C eventually. To test this you will >> need Richard's patch set >> https://gcc.gnu.org/ml/gcc-patches/2014-09/msg01440.html. >> >> Same for your other 2 patches in this series(3,4). > > I tried those patches, and altered aarch64_cannot_change_mode_class to > return false for all cases. > > However, this does not avoid the unnecessary moves. > > Taking a really simple test case: > > #include <arm_neon.h> > > int32x2x2_t xvld2_s32(int32_t *__a) > { > int32x2x2_t ret; > __builtin_aarch64_simd_oi __o; > __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a); > ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0); > ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1); > return ret; > } > > (disabling scheduling for clarity) > $ aarch64-oe-linux-gcc -O2 -S -o - simd.c -fno-schedule-insns > -fno-schedule-insns2 > ... > xvld2_s32: > ld2 {v2.2s - v3.2s}, [x0] > orr v0.8b, v2.8b, v2.8b > orr v1.8b, v3.8b, v3.8b > ret > ... > > > The reason is apparent in the rtl dump from ira: > ... > Allocno a0r73 of FP_REGS(32) has 31 avail. regs 33-63, node: > 33-63 (confl regs = 0-32 64 65) > ... > (insn 2 4 3 2 (set (reg/v/f:DI 79 [ __a ]) > (reg:DI 0 x0 [ __a ])) simd.c:5 34 {*movdi_aarch64} > (expr_list:REG_DEAD (reg:DI 0 x0 [ __a ]) > (nil))) > (note 3 2 6 2 NOTE_INSN_FUNCTION_BEG) > (insn 6 3 20 2 (set (reg/v:OI 73 [ __o ]) > (subreg:OI (vec_concat:V8SI (vec_concat:V4SI (unspec:V2SI [ > (mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8]) > ] UNSPEC_LD2) > (vec_duplicate:V2SI (const_int 0 [0]))) > (vec_concat:V4SI (unspec:V2SI [ > (mem:TI (reg/v/f:DI 79 [ __a ]) [0 S16 A8]) > ] UNSPEC_LD2) > (vec_duplicate:V2SI (const_int 0 [0])))) 0)) > simd.c:8 2149 {aarch64_ld2v2si_dreg} > (expr_list:REG_DEAD (reg/v/f:DI 79 [ __a ]) > (nil))) > (insn 20 6 21 2 (set (reg:V2SI 32 v0) > (subreg:V2SI (reg/v:OI 73 [ __o ]) 0)) simd.c:12 778 > {*aarch64_simd_movv2si} > (nil)) > (insn 21 20 22 2 (set (reg:V2SI 33 v1) > (subreg:V2SI (reg/v:OI 73 [ __o ]) 16)) simd.c:12 778 > {*aarch64_simd_movv2si} > (expr_list:REG_DEAD (reg/v:OI 73 [ __o ]) > (nil))) > (insn 22 21 23 2 (use (reg:V2SI 32 v0)) simd.c:12 -1 > (nil)) > (insn 23 22 0 2 (use (reg:V2SI 33 v1)) simd.c:12 -1 > (nil)) > > The register allocator considers r73 to conflict with v0, because they > are simultaneously live after insn 20. Without the 2nd use of v73 (eg > if the write to res.val[1] is replaced with vdup_n_s32(0) ) then the > allocator does do the right thing with the subreg and allocates v73 to > {v0,v1}. > > I haven't read all of the old threads relating to Richard's patches > yet, but I don't see why they would affect this issue. > > I don't think the register allocator is able to resolve this unless > the conversion between the __builtin_simd type and the int32x4x2_t > type is done as a single operation. 
>

For this piece of code,

#include "arm_neon.h"

int32x2x2_t xvld2_s32(int32_t *__a)
{
  union { int32x2x2_t __i;
          __builtin_aarch64_simd_oi __o; } __temp;
  __temp.__o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  return __temp.__i;
}

int32x2x2_t yvld2_s32(int32_t *__a)
{
  int32x2x2_t ret;
  __builtin_aarch64_simd_oi __o;
  __o = __builtin_aarch64_ld2v2si ((const __builtin_aarch64_simd_si *) __a);
  ret.val[0] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 0);
  ret.val[1] = (int32x2_t) __builtin_aarch64_get_dregoiv2si (__o, 1);
  return ret;
}

currently my gcc HEAD generates at -O3:

xvld2_s32:
        ld2     {v0.2s - v1.2s}, [x0]
        sub     sp, sp, #64
        st1     {v0.16b - v1.16b}, [sp]
        ldr     x1, [sp]
        ldr     x0, [sp, 8]
        add     sp, sp, 64
        ins     v0.d[0], x1
        ins     v1.d[0], x0
        ret
....
yvld2_s32:
        ld2     {v2.2s - v3.2s}, [x0]
        orr     v1.8b, v3.8b, v3.8b
        orr     v0.8b, v2.8b, v2.8b
        ret

If we use type-punning, there are unnecessary spills that are generated, which is also incorrect for BE because of the way we spill (st1 {v0.16b - v1.16b}, [sp]) and restore. The implementation without type-punning seems to give a more optimal result. Did your patches improve on the spills for the type-punning solution?

> However, type-punning is not possible with the arrays of 64-bit
> vectors, as the arrays are not the same size as the corresponding
> __builtin_simd types, and any solution for those would probably help
> with the q variants too.

That is because we fill zero-extended D-reg values into 128-bit regs and pack them into a large int mode (e.g. OI). We don't have large int modes made up purely of D-regs because we run into ambiguities such as 4 D-regs being an OImode and 2 Q-regs also being an OImode.

> Maybe the solution is to pass the NEON
> intrinsic types directly to the builtins? Is there a reason that it
> wasn't done that way before?
>

How do you mean? Do you mean pass a loaded value int32x2x2_t into a __builtin? How will that work?

If you mean why we don't pass an int32x2x2_t into a builtin as a structure, I don't think that would work, as it is a struct type which would correspond to a BLK mode, but we need RTL patterns with reg-lists to work with large int modes for the regalloc to allocate consecutive regs for the reglists.

Thanks,
Tejas.
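
As a side note for readers following along, the pair types referred to here are plain aggregates in arm_neon.h; the s32 one, for example, is declared along these lines, which is why it takes BLK mode rather than one of the register-list integer modes:

typedef struct int32x2x2_t
{
  int32x2_t val[2];   /* two independent 64-bit vectors, no register-list mode */
} int32x2x2_t;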
On 26 September 2014 13:47, Tejas Belagod <tejas.belagod@arm.com> wrote:
> If we use type-punning, there are unnecessary spills that are generated,
> which is also incorrect for BE because of the way we spill (st1 {v0.16b -
> v1.16b}, [sp]) and restore. The implementation without type-punning seems to
> give a more optimal result. Did your patches improve on the spills for the
> type-punning solution?

OK, this part seems too contentious, so I've respun the vldN_lane parts without the type punning and reposted them. This issue can be resolved separately.

Trying an example like this gives good code with type punning, and poor code without:

void t2(int32_t *p)
{
  int32x4x4_t va = vld4q_s32(p);
  va = vld4q_lane_s32(p + 500, va, 1);
  vst4q_s32(p + 1000, va);
}

With type-punning, good code:

t2:
        ld4     {v0.4s - v3.4s}, [x0]
        add     x2, x0, 2000
        add     x1, x0, 4000
        ld4     {v0.s - v3.s}[1], [x2]
        st4     {v0.4s - v3.4s}, [x1]
        ret

Without type-punning, horrible code:

t2:
        ld4     {v0.4s - v3.4s}, [x0]
        sub     sp, sp, #64
        add     x14, x0, 2000
        add     x0, x0, 4000
        umov    x12, v0.d[0]
        umov    x13, v0.d[1]
        umov    x10, v1.d[0]
        umov    x11, v1.d[1]
        umov    x8, v2.d[0]
        str     x12, [sp]
        umov    x9, v2.d[1]
        str     x13, [sp, 8]
        str     q3, [sp, 48]
        str     x10, [sp, 16]
        str     x11, [sp, 24]
        str     x8, [sp, 32]
        str     x9, [sp, 40]
        ld1     {v0.16b - v3.16b}, [sp]
        ld4     {v0.s - v3.s}[1], [x14]
        umov    x10, v0.d[0]
        umov    x11, v0.d[1]
        umov    x8, v1.d[0]
        umov    x9, v1.d[1]
        umov    x6, v2.d[0]
        str     x10, [sp]
        umov    x7, v2.d[1]
        str     x11, [sp, 8]
        str     q3, [sp, 48]
        str     x8, [sp, 16]
        str     x9, [sp, 24]
        str     x6, [sp, 32]
        str     x7, [sp, 40]
        ld1     {v0.16b - v3.16b}, [sp]
        add     sp, sp, 64
        st4     {v0.4s - v3.4s}, [x0]
        ret

>> Maybe the solution is to pass the NEON
>> intrinsic types directly to the builtins? Is there a reason that it
>> wasn't done that way before?
>
> How do you mean? Do you mean pass a loaded value int32x2x2_t into a
> __builtin? How will that work?
>
> If you mean why we don't pass an int32x2x2_t into a builtin as a structure,
> I don't think that would work, as it is a struct type which would correspond
> to a BLK mode, but we need RTL patterns with reg-lists to work with large int
> modes for the regalloc to allocate consecutive regs for the reglists.

OK, that makes sense. However, something needs to be done to create the __builtin_aarch64_simd_* objects without register moves. Since the existing mechanism causes problems because the lifetimes of the inputs overlap with the lifetimes of the outputs, I think there are these options:

1. represent the construction/deconstruction as a single operation, to avoid overlapping variable liveness in the source.

2. add a pass or peephole which can combine the existing builtins into a single operation, so that the lifetimes are normalised.

3. teach the register allocator how to handle overlapping liveness of a register and a subreg of that register.

Option 1 would require a new builtin interface which somehow handled a whole int32x2x2_t in one operation. Construction is easy (__builtin_aarch64_simd_construct(v.val[0], v.val[1]) or similar). Deconstruction is less obvious.

Option 2 sounds like a hack, but would probably be effective, particularly if it can be done before inlining.

Option 3 would also help with poor code generation for ARM targets with vget_low_*, vget_high_* and vcombine_*.

What do you think is the best approach?

Thanks
Charles
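
For illustration only, option 1 might read something like the sketch below at the intrinsics level. The construct/deconstruct builtins named here are purely hypothetical (no such builtins exist in GCC), and the deconstruction step is exactly the part flagged above as less obvious, since it would have to produce a BLK-mode aggregate in a single operation:

__extension__ static __inline int32x2x2_t __attribute__ ((__always_inline__))
vld2_lane_s32 (const int32_t * __ptr, int32x2x2_t __b, const int __c)
{
  /* Hypothetical builtin: build the whole OImode register list in one
     operation, so no individual lane vector stays live alongside __o.  */
  __builtin_aarch64_simd_oi __o
    = __builtin_aarch64_simd_construct_oi_s32 (__b.val[0], __b.val[1]);
  /* The lane-load builtin from the patch series.  */
  __o = __builtin_aarch64_ld2_lanev4si (
          (__builtin_aarch64_simd_si *) __ptr, __o, __c);
  /* Hypothetical builtin: take the register list apart in one operation as
     well, returning the aggregate directly.  */
  return __builtin_aarch64_simd_deconstruct_oi_s32 (__o);
}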
diff --git a/gcc/config/aarch64/arm_neon.h b/gcc/config/aarch64/arm_neon.h index e62c783..c1fcb47 100644 --- a/gcc/config/aarch64/arm_neon.h +++ b/gcc/config/aarch64/arm_neon.h @@ -11805,47 +11805,79 @@ __LD2R_FUNC (uint16x8x2_t, uint16x2_t, uint16_t, 8h, u16, q) __LD2R_FUNC (uint32x4x2_t, uint32x2_t, uint32_t, 4s, u32, q) __LD2R_FUNC (uint64x2x2_t, uint64x2_t, uint64_t, 2d, u64, q) -#define __LD2_LANE_FUNC(rettype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld2 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - rettype b, const int c) \ - { \ - rettype result; \ - __asm__ ("ld1 {v16." #regsuffix ", v17." #regsuffix "}, %1\n\t" \ - "ld2 {v16." #lnsuffix ", v17." #lnsuffix "}[%3], %2\n\t" \ - "st1 {v16." #regsuffix ", v17." #regsuffix "}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ - : "memory", "v16", "v17"); \ - return result; \ - } - -__LD2_LANE_FUNC (int8x8x2_t, uint8_t, 8b, b, s8,) -__LD2_LANE_FUNC (float32x2x2_t, float32_t, 2s, s, f32,) -__LD2_LANE_FUNC (float64x1x2_t, float64_t, 1d, d, f64,) -__LD2_LANE_FUNC (poly8x8x2_t, poly8_t, 8b, b, p8,) -__LD2_LANE_FUNC (poly16x4x2_t, poly16_t, 4h, h, p16,) -__LD2_LANE_FUNC (int16x4x2_t, int16_t, 4h, h, s16,) -__LD2_LANE_FUNC (int32x2x2_t, int32_t, 2s, s, s32,) -__LD2_LANE_FUNC (int64x1x2_t, int64_t, 1d, d, s64,) -__LD2_LANE_FUNC (uint8x8x2_t, uint8_t, 8b, b, u8,) -__LD2_LANE_FUNC (uint16x4x2_t, uint16_t, 4h, h, u16,) -__LD2_LANE_FUNC (uint32x2x2_t, uint32_t, 2s, s, u32,) -__LD2_LANE_FUNC (uint64x1x2_t, uint64_t, 1d, d, u64,) -__LD2_LANE_FUNC (float32x4x2_t, float32_t, 4s, s, f32, q) -__LD2_LANE_FUNC (float64x2x2_t, float64_t, 2d, d, f64, q) -__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, 16b, b, p8, q) -__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, 8h, h, p16, q) -__LD2_LANE_FUNC (int8x16x2_t, int8_t, 16b, b, s8, q) -__LD2_LANE_FUNC (int16x8x2_t, int16_t, 8h, h, s16, q) -__LD2_LANE_FUNC (int32x4x2_t, int32_t, 4s, s, s32, q) -__LD2_LANE_FUNC (int64x2x2_t, int64_t, 2d, d, s64, q) -__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, 16b, b, u8, q) -__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, 8h, h, u16, q) -__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, 4s, s, u32, q) -__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, 2d, d, u64, q) +#define __LD2_LANE_FUNC(intype, vectype, largetype, ptrtype, \ + mode, ptrmode, funcsuffix, signedtype) \ +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ +vld2_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_oi __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregoi##mode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregoi##mode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_ld2_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregoidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregoidi (__o, 1); \ + return __b; \ +} + +__LD2_LANE_FUNC (float32x2x2_t, float32x2_t, float32x4x2_t, float32_t, v4sf, + sf, f32, float32x4_t) +__LD2_LANE_FUNC (float64x1x2_t, float64x1_t, float64x2x2_t, float64_t, v2df, + df, f64, float64x2_t) +__LD2_LANE_FUNC (poly8x8x2_t, poly8x8_t, poly8x16x2_t, poly8_t, v16qi, qi, p8, + int8x16_t) +__LD2_LANE_FUNC 
(poly16x4x2_t, poly16x4_t, poly16x8x2_t, poly16_t, v8hi, hi, + p16, int16x8_t) +__LD2_LANE_FUNC (int8x8x2_t, int8x8_t, int8x16x2_t, int8_t, v16qi, qi, s8, + int8x16_t) +__LD2_LANE_FUNC (int16x4x2_t, int16x4_t, int16x8x2_t, int16_t, v8hi, hi, s16, + int16x8_t) +__LD2_LANE_FUNC (int32x2x2_t, int32x2_t, int32x4x2_t, int32_t, v4si, si, s32, + int32x4_t) +__LD2_LANE_FUNC (int64x1x2_t, int64x1_t, int64x2x2_t, int64_t, v2di, di, s64, + int64x2_t) +__LD2_LANE_FUNC (uint8x8x2_t, uint8x8_t, uint8x16x2_t, uint8_t, v16qi, qi, u8, + int8x16_t) +__LD2_LANE_FUNC (uint16x4x2_t, uint16x4_t, uint16x8x2_t, uint16_t, v8hi, hi, + u16, int16x8_t) +__LD2_LANE_FUNC (uint32x2x2_t, uint32x2_t, uint32x4x2_t, uint32_t, v4si, si, + u32, int32x4_t) +__LD2_LANE_FUNC (uint64x1x2_t, uint64x1_t, uint64x2x2_t, uint64_t, v2di, di, + u64, int64x2_t) + +#undef __LD2_LANE_FUNC +#define __LD2_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ +vld2q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_oi __o; } __temp = { __b }; \ + __temp.__o = __builtin_aarch64_ld2_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \ + return __temp.__i; \ +} + +__LD2_LANE_FUNC (float32x4x2_t, float32_t, v4sf, sf, f32) +__LD2_LANE_FUNC (float64x2x2_t, float64_t, v2df, df, f64) +__LD2_LANE_FUNC (poly8x16x2_t, poly8_t, v16qi, qi, p8) +__LD2_LANE_FUNC (poly16x8x2_t, poly16_t, v8hi, hi, p16) +__LD2_LANE_FUNC (int8x16x2_t, int8_t, v16qi, qi, s8) +__LD2_LANE_FUNC (int16x8x2_t, int16_t, v8hi, hi, s16) +__LD2_LANE_FUNC (int32x4x2_t, int32_t, v4si, si, s32) +__LD2_LANE_FUNC (int64x2x2_t, int64_t, v2di, di, s64) +__LD2_LANE_FUNC (uint8x16x2_t, uint8_t, v16qi, qi, u8) +__LD2_LANE_FUNC (uint16x8x2_t, uint16_t, v8hi, hi, u16) +__LD2_LANE_FUNC (uint32x4x2_t, uint32_t, v4si, si, u32) +__LD2_LANE_FUNC (uint64x2x2_t, uint64_t, v2di, di, u64) #define __LD3R_FUNC(rettype, structtype, ptrtype, \ regsuffix, funcsuffix, Q) \ @@ -11887,47 +11919,85 @@ __LD3R_FUNC (uint16x8x3_t, uint16x3_t, uint16_t, 8h, u16, q) __LD3R_FUNC (uint32x4x3_t, uint32x3_t, uint32_t, 4s, u32, q) __LD3R_FUNC (uint64x2x3_t, uint64x3_t, uint64_t, 2d, u64, q) -#define __LD3_LANE_FUNC(rettype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld3 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - rettype b, const int c) \ - { \ - rettype result; \ - __asm__ ("ld1 {v16." #regsuffix " - v18." #regsuffix "}, %1\n\t" \ - "ld3 {v16." #lnsuffix " - v18." #lnsuffix "}[%3], %2\n\t" \ - "st1 {v16." #regsuffix " - v18." 
#regsuffix "}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ - : "memory", "v16", "v17", "v18"); \ - return result; \ - } - -__LD3_LANE_FUNC (int8x8x3_t, uint8_t, 8b, b, s8,) -__LD3_LANE_FUNC (float32x2x3_t, float32_t, 2s, s, f32,) -__LD3_LANE_FUNC (float64x1x3_t, float64_t, 1d, d, f64,) -__LD3_LANE_FUNC (poly8x8x3_t, poly8_t, 8b, b, p8,) -__LD3_LANE_FUNC (poly16x4x3_t, poly16_t, 4h, h, p16,) -__LD3_LANE_FUNC (int16x4x3_t, int16_t, 4h, h, s16,) -__LD3_LANE_FUNC (int32x2x3_t, int32_t, 2s, s, s32,) -__LD3_LANE_FUNC (int64x1x3_t, int64_t, 1d, d, s64,) -__LD3_LANE_FUNC (uint8x8x3_t, uint8_t, 8b, b, u8,) -__LD3_LANE_FUNC (uint16x4x3_t, uint16_t, 4h, h, u16,) -__LD3_LANE_FUNC (uint32x2x3_t, uint32_t, 2s, s, u32,) -__LD3_LANE_FUNC (uint64x1x3_t, uint64_t, 1d, d, u64,) -__LD3_LANE_FUNC (float32x4x3_t, float32_t, 4s, s, f32, q) -__LD3_LANE_FUNC (float64x2x3_t, float64_t, 2d, d, f64, q) -__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, 16b, b, p8, q) -__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, 8h, h, p16, q) -__LD3_LANE_FUNC (int8x16x3_t, int8_t, 16b, b, s8, q) -__LD3_LANE_FUNC (int16x8x3_t, int16_t, 8h, h, s16, q) -__LD3_LANE_FUNC (int32x4x3_t, int32_t, 4s, s, s32, q) -__LD3_LANE_FUNC (int64x2x3_t, int64_t, 2d, d, s64, q) -__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, 16b, b, u8, q) -__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, 8h, h, u16, q) -__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, 4s, s, u32, q) -__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, 2d, d, u64, q) +#define __LD3_LANE_FUNC(intype, vectype, largetype, ptrtype, \ + mode, ptrmode, funcsuffix, signedtype) \ +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ +vld3_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_ci __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __temp.val[2] = \ + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregci##mode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregci##mode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_set_qregci##mode (__o, \ + (signedtype) __temp.val[2], \ + 2); \ + __o = __builtin_aarch64_ld3_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregcidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregcidi (__o, 1); \ + __b.val[2] = (vectype) __builtin_aarch64_get_dregcidi (__o, 2); \ + return __b; \ +} + +__LD3_LANE_FUNC (float32x2x3_t, float32x2_t, float32x4x3_t, float32_t, v4sf, + sf, f32, float32x4_t) +__LD3_LANE_FUNC (float64x1x3_t, float64x1_t, float64x2x3_t, float64_t, v2df, + df, f64, float64x2_t) +__LD3_LANE_FUNC (poly8x8x3_t, poly8x8_t, poly8x16x3_t, poly8_t, v16qi, qi, p8, + int8x16_t) +__LD3_LANE_FUNC (poly16x4x3_t, poly16x4_t, poly16x8x3_t, poly16_t, v8hi, hi, + p16, int16x8_t) +__LD3_LANE_FUNC (int8x8x3_t, int8x8_t, int8x16x3_t, int8_t, v16qi, qi, s8, + int8x16_t) +__LD3_LANE_FUNC (int16x4x3_t, int16x4_t, int16x8x3_t, int16_t, v8hi, hi, s16, + int16x8_t) +__LD3_LANE_FUNC (int32x2x3_t, int32x2_t, int32x4x3_t, int32_t, v4si, si, s32, + int32x4_t) +__LD3_LANE_FUNC (int64x1x3_t, int64x1_t, int64x2x3_t, int64_t, v2di, di, s64, + int64x2_t) +__LD3_LANE_FUNC (uint8x8x3_t, uint8x8_t, uint8x16x3_t, uint8_t, v16qi, qi, u8, + int8x16_t) +__LD3_LANE_FUNC (uint16x4x3_t, 
uint16x4_t, uint16x8x3_t, uint16_t, v8hi, hi, + u16, int16x8_t) +__LD3_LANE_FUNC (uint32x2x3_t, uint32x2_t, uint32x4x3_t, uint32_t, v4si, si, + u32, int32x4_t) +__LD3_LANE_FUNC (uint64x1x3_t, uint64x1_t, uint64x2x3_t, uint64_t, v2di, di, + u64, int64x2_t) + +#undef __LD3_LANE_FUNC +#define __LD3_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ +vld3q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ + __temp.__o = __builtin_aarch64_ld4_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \ + return __temp.__i; \ +} + +__LD3_LANE_FUNC (float32x4x3_t, float32_t, v4sf, sf, f32) +__LD3_LANE_FUNC (float64x2x3_t, float64_t, v2df, df, f64) +__LD3_LANE_FUNC (poly8x16x3_t, poly8_t, v16qi, qi, p8) +__LD3_LANE_FUNC (poly16x8x3_t, poly16_t, v8hi, hi, p16) +__LD3_LANE_FUNC (int8x16x3_t, int8_t, v16qi, qi, s8) +__LD3_LANE_FUNC (int16x8x3_t, int16_t, v8hi, hi, s16) +__LD3_LANE_FUNC (int32x4x3_t, int32_t, v4si, si, s32) +__LD3_LANE_FUNC (int64x2x3_t, int64_t, v2di, di, s64) +__LD3_LANE_FUNC (uint8x16x3_t, uint8_t, v16qi, qi, u8) +__LD3_LANE_FUNC (uint16x8x3_t, uint16_t, v8hi, hi, u16) +__LD3_LANE_FUNC (uint32x4x3_t, uint32_t, v4si, si, u32) +__LD3_LANE_FUNC (uint64x2x3_t, uint64_t, v2di, di, u64) #define __LD4R_FUNC(rettype, structtype, ptrtype, \ regsuffix, funcsuffix, Q) \ @@ -11969,47 +12039,92 @@ __LD4R_FUNC (uint16x8x4_t, uint16x4_t, uint16_t, 8h, u16, q) __LD4R_FUNC (uint32x4x4_t, uint32x4_t, uint32_t, 4s, u32, q) __LD4R_FUNC (uint64x2x4_t, uint64x4_t, uint64_t, 2d, u64, q) -#define __LD4_LANE_FUNC(rettype, ptrtype, regsuffix, \ - lnsuffix, funcsuffix, Q) \ - __extension__ static __inline rettype \ - __attribute__ ((__always_inline__)) \ - vld4 ## Q ## _lane_ ## funcsuffix (const ptrtype *ptr, \ - rettype b, const int c) \ - { \ - rettype result; \ - __asm__ ("ld1 {v16." #regsuffix " - v19." #regsuffix "}, %1\n\t" \ - "ld4 {v16." #lnsuffix " - v19." #lnsuffix "}[%3], %2\n\t" \ - "st1 {v16." #regsuffix " - v19." 
#regsuffix "}, %0\n\t" \ - : "=Q"(result) \ - : "Q"(b), "Q"(*(const rettype *)ptr), "i"(c) \ - : "memory", "v16", "v17", "v18", "v19"); \ - return result; \ - } -__LD4_LANE_FUNC (int8x8x4_t, uint8_t, 8b, b, s8,) -__LD4_LANE_FUNC (float32x2x4_t, float32_t, 2s, s, f32,) -__LD4_LANE_FUNC (float64x1x4_t, float64_t, 1d, d, f64,) -__LD4_LANE_FUNC (poly8x8x4_t, poly8_t, 8b, b, p8,) -__LD4_LANE_FUNC (poly16x4x4_t, poly16_t, 4h, h, p16,) -__LD4_LANE_FUNC (int16x4x4_t, int16_t, 4h, h, s16,) -__LD4_LANE_FUNC (int32x2x4_t, int32_t, 2s, s, s32,) -__LD4_LANE_FUNC (int64x1x4_t, int64_t, 1d, d, s64,) -__LD4_LANE_FUNC (uint8x8x4_t, uint8_t, 8b, b, u8,) -__LD4_LANE_FUNC (uint16x4x4_t, uint16_t, 4h, h, u16,) -__LD4_LANE_FUNC (uint32x2x4_t, uint32_t, 2s, s, u32,) -__LD4_LANE_FUNC (uint64x1x4_t, uint64_t, 1d, d, u64,) -__LD4_LANE_FUNC (float32x4x4_t, float32_t, 4s, s, f32, q) -__LD4_LANE_FUNC (float64x2x4_t, float64_t, 2d, d, f64, q) -__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, 16b, b, p8, q) -__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, 8h, h, p16, q) -__LD4_LANE_FUNC (int8x16x4_t, int8_t, 16b, b, s8, q) -__LD4_LANE_FUNC (int16x8x4_t, int16_t, 8h, h, s16, q) -__LD4_LANE_FUNC (int32x4x4_t, int32_t, 4s, s, s32, q) -__LD4_LANE_FUNC (int64x2x4_t, int64_t, 2d, d, s64, q) -__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, 16b, b, u8, q) -__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, 8h, h, u16, q) -__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, 4s, s, u32, q) -__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, 2d, d, u64, q) +#define __LD4_LANE_FUNC(intype, vectype, largetype, ptrtype, \ + mode, ptrmode, funcsuffix, signedtype) \ +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ +vld4_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + __builtin_aarch64_simd_xi __o; \ + largetype __temp; \ + __temp.val[0] = \ + vcombine_##funcsuffix (__b.val[0], vcreate_##funcsuffix (0)); \ + __temp.val[1] = \ + vcombine_##funcsuffix (__b.val[1], vcreate_##funcsuffix (0)); \ + __temp.val[2] = \ + vcombine_##funcsuffix (__b.val[2], vcreate_##funcsuffix (0)); \ + __temp.val[3] = \ + vcombine_##funcsuffix (__b.val[3], vcreate_##funcsuffix (0)); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[0], \ + 0); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[1], \ + 1); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[2], \ + 2); \ + __o = __builtin_aarch64_set_qregxi##mode (__o, \ + (signedtype) __temp.val[3], \ + 3); \ + __o = __builtin_aarch64_ld4_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __o, __c); \ + __b.val[0] = (vectype) __builtin_aarch64_get_dregxidi (__o, 0); \ + __b.val[1] = (vectype) __builtin_aarch64_get_dregxidi (__o, 1); \ + __b.val[2] = (vectype) __builtin_aarch64_get_dregxidi (__o, 2); \ + __b.val[3] = (vectype) __builtin_aarch64_get_dregxidi (__o, 3); \ + return __b; \ +} + +__LD4_LANE_FUNC (float32x2x4_t, float32x2_t, float32x4x4_t, float32_t, v4sf, + sf, f32, float32x4_t) +__LD4_LANE_FUNC (float64x1x4_t, float64x1_t, float64x2x4_t, float64_t, v2df, + df, f64, float64x2_t) +__LD4_LANE_FUNC (poly8x8x4_t, poly8x8_t, poly8x16x4_t, poly8_t, v16qi, qi, p8, + int8x16_t) +__LD4_LANE_FUNC (poly16x4x4_t, poly16x4_t, poly16x8x4_t, poly16_t, v8hi, hi, + p16, int16x8_t) +__LD4_LANE_FUNC (int8x8x4_t, int8x8_t, int8x16x4_t, int8_t, v16qi, qi, s8, + int8x16_t) +__LD4_LANE_FUNC (int16x4x4_t, int16x4_t, int16x8x4_t, int16_t, v8hi, hi, s16, + int16x8_t) +__LD4_LANE_FUNC (int32x2x4_t, int32x2_t, int32x4x4_t, int32_t, 
v4si, si, s32, + int32x4_t) +__LD4_LANE_FUNC (int64x1x4_t, int64x1_t, int64x2x4_t, int64_t, v2di, di, s64, + int64x2_t) +__LD4_LANE_FUNC (uint8x8x4_t, uint8x8_t, uint8x16x4_t, uint8_t, v16qi, qi, u8, + int8x16_t) +__LD4_LANE_FUNC (uint16x4x4_t, uint16x4_t, uint16x8x4_t, uint16_t, v8hi, hi, + u16, int16x8_t) +__LD4_LANE_FUNC (uint32x2x4_t, uint32x2_t, uint32x4x4_t, uint32_t, v4si, si, + u32, int32x4_t) +__LD4_LANE_FUNC (uint64x1x4_t, uint64x1_t, uint64x2x4_t, uint64_t, v2di, di, + u64, int64x2_t) + +#undef __LD4_LANE_FUNC +#define __LD4_LANE_FUNC(intype, ptrtype, mode, ptrmode, funcsuffix) \ +__extension__ static __inline intype __attribute__ ((__always_inline__)) \ +vld4q_lane_##funcsuffix (const ptrtype * __ptr, intype __b, const int __c) \ +{ \ + union { intype __i; \ + __builtin_aarch64_simd_xi __o; } __temp = { __b }; \ + __temp.__o = __builtin_aarch64_ld4_lane##mode ( \ + (__builtin_aarch64_simd_##ptrmode *) __ptr, __temp.__o, __c); \ + return __temp.__i; \ +} + +__LD4_LANE_FUNC (float32x4x4_t, float32_t, v4sf, sf, f32) +__LD4_LANE_FUNC (float64x2x4_t, float64_t, v2df, df, f64) +__LD4_LANE_FUNC (poly8x16x4_t, poly8_t, v16qi, qi, p8) +__LD4_LANE_FUNC (poly16x8x4_t, poly16_t, v8hi, hi, p16) +__LD4_LANE_FUNC (int8x16x4_t, int8_t, v16qi, qi, s8) +__LD4_LANE_FUNC (int16x8x4_t, int16_t, v8hi, hi, s16) +__LD4_LANE_FUNC (int32x4x4_t, int32_t, v4si, si, s32) +__LD4_LANE_FUNC (int64x2x4_t, int64_t, v2di, di, s64) +__LD4_LANE_FUNC (uint8x16x4_t, uint8_t, v16qi, qi, u8) +__LD4_LANE_FUNC (uint16x8x4_t, uint16_t, v8hi, hi, u16) +__LD4_LANE_FUNC (uint32x4x4_t, uint32_t, v4si, si, u32) +__LD4_LANE_FUNC (uint64x2x4_t, uint64_t, v2di, di, u64) #define __ST2_LANE_FUNC(intype, largetype, ptrtype, \ mode, ptr_mode, funcsuffix, signedtype) \