| Message ID | 7b1f910a-628f-089a-eed3-23476c1bda9e@arm.com |
| --- | --- |
| State | New |
On Mon, Nov 07, 2016 at 01:39:53PM +0000, Richard Earnshaw (lists) wrote:
> This patch contains an implementation of search_line_fast for the CPP
> lexer.  It's based in part on the AArch32 (ARM) code but incorporates
> new instructions available in AArch64 (reduction add operations) plus
> some tricks for reducing the realignment overheads.  We assume a page
> size of 4k, but that's a safe assumption -- AArch64 systems can never
> have a smaller page size than that: on systems with larger pages we will
> go through the realignment code more often than strictly necessary, but
> it's still likely to be in the noise (less than 0.5% of the time).
> Bootstrapped on aarch64-none-linux-gnu.

Some very minor nits wrt. style for the Advanced SIMD intrinsics, otherwise
OK from me.

>
> +  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);

It is a pedantic point, but these casts are a GNU extension; the "portable"
way to write this would be:

  vreinterpretq_u8_u64 (vdupq_n_u64 (0x8040201008040201ULL));

> +
> +#ifdef __AARCH64EB
> +  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};

This sort of vector initialisation is a bit scary for user programmers, as
we shouldn't generally mix Neon intrinsics with the GNU extensions (for
exactly the reason you have here: keeping BE and LE straight is extra
effort).

This could be written portably as:

  vcombine_u16 (vdup_n_u16 (8), vdup_n_u16 (0));

Or if you prefer to be explicit about the elements:

  int16_t buf[] = {8, 8, 8, 8, 0, 0, 0, 0};
  int16x8_t shift = vld1q_s16 (buf);

> +#else
> +  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
> +#endif
> +
> +  unsigned int found;
> +  const uint8_t *p;
> +  uint8x16_t data;
> +  uint8x16_t t;
> +  uint16x8_t m;
> +  uint8x16_t u, v, w;
> +
> +  /* Align the source pointer.  */
> +  p = (const uint8_t *)((uintptr_t)s & -16);
> +
> +  /* Assuming random string start positions, with a 4k page size we'll take
> +     the slow path about 0.37% of the time.  */
> +  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
> +                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
> +                        < 16, 0))
> +    {
> +      /* Slow path: the string starts near a possible page boundary.  */
> +      uint32_t misalign, mask;
> +
> +      misalign = (uintptr_t)s & 15;
> +      mask = (-1u << misalign) & 0xffff;
> +      data = vld1q_u8 (p);
> +      t = vceqq_u8 (data, repl_nl);
> +      u = vceqq_u8 (data, repl_cr);
> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
> +      t = vorrq_u8 (v, w);

Can you trust the compiler to perform the reassociation here, rather than
doing it manually?  That would let you write this in the more natural form:

  t = vceqq_u8 (data, repl_nl);
  t = vorrq_u8 (t, vceqq_u8 (data, repl_cr));
  t = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
  t = vorrq_u8 (t, vceqq_u8 (data, repl_qm));

> +      t = vandq_u8 (t, xmask);
> +      m = vpaddlq_u8 (t);
> +      m = vshlq_u16 (m, shift);
> +      found = vaddvq_u16 (m);
> +      found &= mask;
> +      if (found)
> +        return (const uchar*)p + __builtin_ctz (found);
> +    }
> +  else
> +    {
> +      data = vld1q_u8 ((const uint8_t *) s);
> +      t = vceqq_u8 (data, repl_nl);
> +      u = vceqq_u8 (data, repl_cr);
> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
> +      t = vorrq_u8 (v, w);
> +      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
> +        goto done;

As above, this cast is a GNU extension:

  if (__builtin_expect (vpaddd_u64 (vreinterpretq_u64_u8 (t)), 0))

> +    }
> +
> +  do
> +    {
> +      p += 16;
> +      data = vld1q_u8 (p);
> +      t = vceqq_u8 (data, repl_nl);
> +      u = vceqq_u8 (data, repl_cr);
> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
> +      t = vorrq_u8 (v, w);
> +    } while (!vpaddd_u64 ((uint64x2_t)t));

Likewise here.

Thanks,
James
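For reference, the two reinterpret casts suggested above can be collected
into a small stand-alone sketch (AArch64 only; the helper names make_xmask
and any_match are purely illustrative and not part of the patch):

  #include <arm_neon.h>

  /* ACLE-portable form of (uint8x16_t) vdupq_n_u64 (...): build the
     repeating 0x01,0x02,...,0x80 per-byte bit mask.  */
  static inline uint8x16_t
  make_xmask (void)
  {
    return vreinterpretq_u8_u64 (vdupq_n_u64 (0x8040201008040201ULL));
  }

  /* ACLE-portable form of vpaddd_u64 ((uint64x2_t) t): non-zero when any
     byte of the comparison result t is set.  */
  static inline uint64_t
  any_match (uint8x16_t t)
  {
    return vpaddd_u64 (vreinterpretq_u64_u8 (t));
  }

vreinterpretq_u8_u64 and vreinterpretq_u64_u8 are standard ACLE intrinsics,
so this form avoids relying on the GNU vector-cast extension.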
On 08/11/16 09:46, James Greenhalgh wrote:
> On Mon, Nov 07, 2016 at 01:39:53PM +0000, Richard Earnshaw (lists) wrote:
>> This patch contains an implementation of search_line_fast for the CPP
>> lexer.  It's based in part on the AArch32 (ARM) code but incorporates
>> new instructions available in AArch64 (reduction add operations) plus
>> some tricks for reducing the realignment overheads.  We assume a page
>> size of 4k, but that's a safe assumption -- AArch64 systems can never
>> have a smaller page size than that: on systems with larger pages we will
>> go through the realignment code more often than strictly necessary, but
>> it's still likely to be in the noise (less than 0.5% of the time).
>> Bootstrapped on aarch64-none-linux-gnu.
>
> Some very minor nits wrt. style for the Advanced SIMD intrinsics, otherwise
> OK from me.
>
>>
>> +  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
>
> It is a pedantic point, but these casts are a GNU extension; the "portable"
> way to write this would be:
>
>   vreinterpretq_u8_u64 (vdupq_n_u64 (0x8040201008040201ULL));

We've used GNU-style casts in the original code and never encountered
problems.  I personally find the reinterpret casts less readable.

>
>> +
>> +#ifdef __AARCH64EB
>> +  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
>
> This sort of vector initialisation is a bit scary for user programmers, as
> we shouldn't generally mix Neon intrinsics with the GNU extensions (for
> exactly the reason you have here: keeping BE and LE straight is extra
> effort).
>
> This could be written portably as:
>
>   vcombine_u16 (vdup_n_u16 (8), vdup_n_u16 (0));
>

Nice idea, but that's the wrong way around, and fixing it currently
generates *terrible* code.

> Or if you prefer to be explicit about the elements:
>
>   int16_t buf[] = {8, 8, 8, 8, 0, 0, 0, 0};
>   int16x8_t shift = vld1q_s16 (buf);
>
>> +#else
>> +  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
>> +#endif
>> +
>> +  unsigned int found;
>> +  const uint8_t *p;
>> +  uint8x16_t data;
>> +  uint8x16_t t;
>> +  uint16x8_t m;
>> +  uint8x16_t u, v, w;
>> +
>> +  /* Align the source pointer.  */
>> +  p = (const uint8_t *)((uintptr_t)s & -16);
>> +
>> +  /* Assuming random string start positions, with a 4k page size we'll take
>> +     the slow path about 0.37% of the time.  */
>> +  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
>> +                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
>> +                        < 16, 0))
>> +    {
>> +      /* Slow path: the string starts near a possible page boundary.  */
>> +      uint32_t misalign, mask;
>> +
>> +      misalign = (uintptr_t)s & 15;
>> +      mask = (-1u << misalign) & 0xffff;
>> +      data = vld1q_u8 (p);
>> +      t = vceqq_u8 (data, repl_nl);
>> +      u = vceqq_u8 (data, repl_cr);
>> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
>> +      t = vorrq_u8 (v, w);
>
> Can you trust the compiler to perform the reassociation here, rather than
> doing it manually?  That would let you write this in the more natural form:
>
>   t = vceqq_u8 (data, repl_nl);
>   t = vorrq_u8 (t, vceqq_u8 (data, repl_cr));
>   t = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>   t = vorrq_u8 (t, vceqq_u8 (data, repl_qm));
>

Maybe, but we have plenty of spare registers (this is target-specific code,
so I know what's happening).  Either way, the reassoc code is currently
messing with this and serializing the VORRQ operations.

>> +      t = vandq_u8 (t, xmask);
>> +      m = vpaddlq_u8 (t);
>> +      m = vshlq_u16 (m, shift);
>> +      found = vaddvq_u16 (m);
>> +      found &= mask;
>> +      if (found)
>> +        return (const uchar*)p + __builtin_ctz (found);
>> +    }
>> +  else
>> +    {
>> +      data = vld1q_u8 ((const uint8_t *) s);
>> +      t = vceqq_u8 (data, repl_nl);
>> +      u = vceqq_u8 (data, repl_cr);
>> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
>> +      t = vorrq_u8 (v, w);
>> +      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
>> +        goto done;
>
> As above, this cast is a GNU extension:
>
>   if (__builtin_expect (vpaddd_u64 (vreinterpretq_u64_u8 (t)), 0))
>
>> +    }
>> +
>> +  do
>> +    {
>> +      p += 16;
>> +      data = vld1q_u8 (p);
>> +      t = vceqq_u8 (data, repl_nl);
>> +      u = vceqq_u8 (data, repl_cr);
>> +      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>> +      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
>> +      t = vorrq_u8 (v, w);
>> +    } while (!vpaddd_u64 ((uint64x2_t)t));
>
> Likewise here.
>
> Thanks,
> James
>
On Nov 07 2016, "Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> wrote: > This patch contains an implementation of search_line_fast for the CPP > lexer. It's based in part on the AArch32 (ARM) code but incorporates > new instructions available in AArch64 (reduction add operations) plus > some tricks for reducing the realignment overheads. I'm getting erroneous behaviour when building the compiler in ILP32 mode. build/genmatch --gimple ../../gcc/match.pd \ > tmp-gimple-match.c /home/abuild/rpmbuild/BUILD/gcc-7.0.1-r246083/obj-aarch64-suse-linux/gcc/cfn-operators.pd:91:5 error: expected (, got NAME (define_operator_list EXPM1 ^ Some part of the code appears to depend on LP64. When I insert three newlines before this line then the parser goes on further, but reports a similar error later on. Andreas. -- Andreas Schwab, SUSE Labs, schwab@suse.de GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE 1748 E4D4 88E3 0EEA B9D7 "And now for something completely different."
On 20/03/17 14:53, Andreas Schwab wrote:
> On Nov 07 2016, "Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> wrote:
>
>> This patch contains an implementation of search_line_fast for the CPP
>> lexer.  It's based in part on the AArch32 (ARM) code but incorporates
>> new instructions available in AArch64 (reduction add operations) plus
>> some tricks for reducing the realignment overheads.
>
> I'm getting erroneous behaviour when building the compiler in ILP32
> mode.
>
> build/genmatch --gimple ../../gcc/match.pd \
>     > tmp-gimple-match.c
> /home/abuild/rpmbuild/BUILD/gcc-7.0.1-r246083/obj-aarch64-suse-linux/gcc/cfn-operators.pd:91:5 error: expected (, got NAME
> (define_operator_list EXPM1
>     ^
>
> Some part of the code appears to depend on LP64.  When I insert three
> newlines before this line then the parser goes on further, but reports a
> similar error later on.
>
> Andreas.
>

Please file a PR.

I don't have access to an ILP32 run-time environment, so I'm not sure how
I'll be able to check this out.  There are some pointer checks in the code
so it's possible something is going awry.  Can you compare the assembly
output for ILP32 and LP64 to see if there's anything obvious?

R.
On Mar 20 2017, "Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> wrote:

> I don't have access to an ILP32 run-time environment, so I'm not sure
> how I'll be able to check this out.  There are some pointer checks in
> the code so it's possible something is going awry.  Can you compare the
> assembly output for ILP32 and LP64 to see if there's anything obvious?

The problem is here:

      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))

vpaddd_u64 returns a uint64_t value, but __builtin_expect takes a long
(32-bit in ILP32 mode).

Andreas.

	* lex.c (search_line_fast) [__ARM_NEON && __ARM_64BIT_STATE]:
	Convert 64-bit value to boolean before passing to
	__builtin_expect.
-- 
2.12.0

-- 
Andreas Schwab, SUSE Labs, schwab@suse.de
GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE 1748 E4D4 88E3 0EEA B9D7
"And now for something completely different."

diff --git a/libcpp/lex.c b/libcpp/lex.c
index 8a8c79cde7..a431ac8e05 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -821,7 +821,7 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
       v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
       w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
       t = vorrq_u8 (v, w);
-      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
+      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
 	goto done;
     }
 
On 20/03/17 17:27, Andreas Schwab wrote:
> On Mar 20 2017, "Richard Earnshaw (lists)" <Richard.Earnshaw@arm.com> wrote:
>
>> I don't have access to an ILP32 run-time environment, so I'm not sure
>> how I'll be able to check this out.  There are some pointer checks in
>> the code so it's possible something is going awry.  Can you compare the
>> assembly output for ILP32 and LP64 to see if there's anything obvious?
>
> The problem is here:
>
>       if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
>
> vpaddd_u64 returns a uint64_t value, but __builtin_expect takes a long
> (32-bit in ILP32 mode).
>

Yikes!  I'm a bit surprised __builtin_expect doesn't take a bool, but I
guess that's due to needing to support old versions of C that lacked that
data type.  Either way, a silent truncation is very undesirable.

> Andreas.
>
> 	* lex.c (search_line_fast) [__ARM_NEON && __ARM_64BIT_STATE]:
> 	Convert 64-bit value to boolean before passing to
> 	__builtin_expect.

OK.

R.

>
> diff --git a/libcpp/lex.c b/libcpp/lex.c
> index 8a8c79cde7..a431ac8e05 100644
> --- a/libcpp/lex.c
> +++ b/libcpp/lex.c
> @@ -821,7 +821,7 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
>        v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
>        w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
>        t = vorrq_u8 (v, w);
> -      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
> +      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t) != 0, 0))
>  	goto done;
>      }
>
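The truncation Andreas describes can be reproduced with plain scalar code.
The following is a minimal stand-alone sketch (mine, not from the thread),
relying only on the documented fact that __builtin_expect's first argument
has type long:

  #include <stdint.h>
  #include <stdio.h>

  int
  main (void)
  {
    /* A comparison result whose only set bits are above bit 31, as the
       vpaddd_u64 sum can be when the match is in the upper bytes.  */
    uint64_t t = 0xff00000000ULL;

    /* Buggy form: __builtin_expect takes a long, so on an ILP32 target the
       high bits are dropped and the branch is skipped even though t != 0.  */
    if (__builtin_expect (t, 0))
      puts ("taken only when long is 64 bits wide");

    /* Fixed form: reduce to a boolean before the argument conversion.  */
    if (__builtin_expect (t != 0, 0))
      puts ("taken on both LP64 and ILP32");

    return 0;
  }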
diff --git a/libcpp/lex.c b/libcpp/lex.c
index 6f65fa1..cea8848 100644
--- a/libcpp/lex.c
+++ b/libcpp/lex.c
@@ -752,6 +752,101 @@ search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
     }
 }
 
+#elif defined (__ARM_NEON) && defined (__ARM_64BIT_STATE)
+#include "arm_neon.h"
+
+/* This doesn't have to be the exact page size, but no system may use
+   a size smaller than this.  ARMv8 requires a minimum page size of
+   4k.  The impact of being conservative here is a small number of
+   cases will take the slightly slower entry path into the main
+   loop.  */
+
+#define AARCH64_MIN_PAGE_SIZE 4096
+
+static const uchar *
+search_line_fast (const uchar *s, const uchar *end ATTRIBUTE_UNUSED)
+{
+  const uint8x16_t repl_nl = vdupq_n_u8 ('\n');
+  const uint8x16_t repl_cr = vdupq_n_u8 ('\r');
+  const uint8x16_t repl_bs = vdupq_n_u8 ('\\');
+  const uint8x16_t repl_qm = vdupq_n_u8 ('?');
+  const uint8x16_t xmask = (uint8x16_t) vdupq_n_u64 (0x8040201008040201ULL);
+
+#ifdef __AARCH64EB
+  const int16x8_t shift = {8, 8, 8, 8, 0, 0, 0, 0};
+#else
+  const int16x8_t shift = {0, 0, 0, 0, 8, 8, 8, 8};
+#endif
+
+  unsigned int found;
+  const uint8_t *p;
+  uint8x16_t data;
+  uint8x16_t t;
+  uint16x8_t m;
+  uint8x16_t u, v, w;
+
+  /* Align the source pointer.  */
+  p = (const uint8_t *)((uintptr_t)s & -16);
+
+  /* Assuming random string start positions, with a 4k page size we'll take
+     the slow path about 0.37% of the time.  */
+  if (__builtin_expect ((AARCH64_MIN_PAGE_SIZE
+                         - (((uintptr_t) s) & (AARCH64_MIN_PAGE_SIZE - 1)))
+                        < 16, 0))
+    {
+      /* Slow path: the string starts near a possible page boundary.  */
+      uint32_t misalign, mask;
+
+      misalign = (uintptr_t)s & 15;
+      mask = (-1u << misalign) & 0xffff;
+      data = vld1q_u8 (p);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+      t = vandq_u8 (t, xmask);
+      m = vpaddlq_u8 (t);
+      m = vshlq_u16 (m, shift);
+      found = vaddvq_u16 (m);
+      found &= mask;
+      if (found)
+        return (const uchar*)p + __builtin_ctz (found);
+    }
+  else
+    {
+      data = vld1q_u8 ((const uint8_t *) s);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+      if (__builtin_expect (vpaddd_u64 ((uint64x2_t)t), 0))
+        goto done;
+    }
+
+  do
+    {
+      p += 16;
+      data = vld1q_u8 (p);
+      t = vceqq_u8 (data, repl_nl);
+      u = vceqq_u8 (data, repl_cr);
+      v = vorrq_u8 (t, vceqq_u8 (data, repl_bs));
+      w = vorrq_u8 (u, vceqq_u8 (data, repl_qm));
+      t = vorrq_u8 (v, w);
+    } while (!vpaddd_u64 ((uint64x2_t)t));
+
+done:
+  /* Now that we've found the terminating substring, work out precisely where
+     we need to stop.  */
+  t = vandq_u8 (t, xmask);
+  m = vpaddlq_u8 (t);
+  m = vshlq_u16 (m, shift);
+  found = vaddvq_u16 (m);
+  return (((((uintptr_t) p) < (uintptr_t) s) ? s : (const uchar *)p)
+          + __builtin_ctz (found));
+}
+
 #elif defined (__ARM_NEON)
 #include "arm_neon.h"
 
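As background on the reduction the patch uses: the xmask / vpaddlq_u8 /
vshlq_u16 / vaddvq_u16 sequence collapses the 16 per-byte comparison
results into a 16-bit value with bit i set exactly when byte i matched,
which is why __builtin_ctz (found) yields the offset of the first special
character.  A rough scalar model of that step for the little-endian byte
layout (my own illustration, not code from the patch; the __AARCH64EB shift
constant exists to get the same bit order on big-endian):

  #include <stdint.h>

  /* Each byte of cmp is 0x00 or 0xff (a vceqq_u8 result).  AND with the
     repeating 0x01,0x02,...,0x80 pattern and sum, lifting the upper eight
     bytes into bits 8..15, mirroring the <<8 on the upper four 16-bit
     lanes in the vector version.  */
  static unsigned int
  byte_mask_to_bitmap (const uint8_t cmp[16])
  {
    static const uint8_t xmask[16] = { 0x01, 0x02, 0x04, 0x08,
                                       0x10, 0x20, 0x40, 0x80,
                                       0x01, 0x02, 0x04, 0x08,
                                       0x10, 0x20, 0x40, 0x80 };
    unsigned int found = 0;

    for (int i = 0; i < 16; i++)
      /* Each i contributes a distinct bit, so the sum acts as an OR.  */
      found += (cmp[i] & xmask[i]) << (i < 8 ? 0 : 8);

    return found;   /* First match is at __builtin_ctz (found).  */
  }

For example, if only cmp[3] is 0xff the function returns 0x0008, and
__builtin_ctz (0x0008) is 3.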