Message ID | 20191017044232.27601-4-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | target/arm vector improvements | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > The gvec form will be needed for implementing SVE2. Hmm I'm seeing a failure against: aarch32-all-v80/insn_VMULL__INC.risu.bin From: https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH/download?path=%2F&files=aarch32-all-v80.tar.xz And some others. But this seems to be broken in master as well so I don't know if this is a regression or because I have my -cpu wrong for qemu-arm for something recorded on a cortex-a53 in aarch32. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/arm/helper.h | 4 +--- > target/arm/neon_helper.c | 30 ------------------------------ > target/arm/translate-a64.c | 28 +++------------------------- > target/arm/translate.c | 16 ++-------------- > target/arm/vec_helper.c | 33 +++++++++++++++++++++++++++++++++ > 5 files changed, 39 insertions(+), 72 deletions(-) > > diff --git a/target/arm/helper.h b/target/arm/helper.h > index 800446e537..d954399b7e 100644 > --- a/target/arm/helper.h > +++ b/target/arm/helper.h > @@ -555,9 +555,6 @@ DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) > DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) > DEF_HELPER_2(dc_zva, void, env, i64) > > -DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64) > -DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64) > - > DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG, > void, ptr, ptr, ptr, ptr, i32) > DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG, > @@ -689,6 +686,7 @@ DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) > DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) > > DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) > +DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) > > #ifdef TARGET_AARCH64 > #include 
"helper-a64.h" > diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c > index 9e7a9a1ac5..6a107da0e1 100644 > --- a/target/arm/neon_helper.c > +++ b/target/arm/neon_helper.c > @@ -2152,33 +2152,3 @@ void HELPER(neon_zip16)(void *vd, void *vm) > rm[0] = m0; > rd[0] = d0; > } > - > -/* Helper function for 64 bit polynomial multiply case: > - * perform PolynomialMult(op1, op2) and return either the top or > - * bottom half of the 128 bit result. > - */ > -uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2) > -{ > - int bitnum; > - uint64_t res = 0; > - > - for (bitnum = 0; bitnum < 64; bitnum++) { > - if (op1 & (1ULL << bitnum)) { > - res ^= op2 << bitnum; > - } > - } > - return res; > -} > -uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2) > -{ > - int bitnum; > - uint64_t res = 0; > - > - /* bit 0 of op1 can't influence the high 64 bits at all */ > - for (bitnum = 1; bitnum < 64; bitnum++) { > - if (op1 & (1ULL << bitnum)) { > - res ^= op2 >> (64 - bitnum); > - } > - } > - return res; > -} > diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c > index 04e25cfe06..12588d18df 100644 > --- a/target/arm/translate-a64.c > +++ b/target/arm/translate-a64.c > @@ -10598,30 +10598,6 @@ static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size, > clear_vec_high(s, is_q, rd); > } > > -static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm) > -{ > - /* PMULL of 64 x 64 -> 128 is an odd special case because it > - * is the only three-reg-diff instruction which produces a > - * 128-bit wide result from a single operation. However since > - * it's possible to calculate the two halves more or less > - * separately we just use two helper calls. 
> - */ > - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); > - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); > - TCGv_i64 tcg_res = tcg_temp_new_i64(); > - > - read_vec_element(s, tcg_op1, rn, is_q, MO_64); > - read_vec_element(s, tcg_op2, rm, is_q, MO_64); > - gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2); > - write_vec_element(s, tcg_res, rd, 0, MO_64); > - gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2); > - write_vec_element(s, tcg_res, rd, 1, MO_64); > - > - tcg_temp_free_i64(tcg_op1); > - tcg_temp_free_i64(tcg_op2); > - tcg_temp_free_i64(tcg_res); > -} > - > /* AdvSIMD three different > * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 > * +---+---+---+-----------+------+---+------+--------+-----+------+------+ > @@ -10686,7 +10662,9 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) > if (!fp_access_check(s)) { > return; > } > - handle_pmull_64(s, is_q, rd, rn, rm); > + /* The Q field specifies lo/hi half input for this insn. */ > + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, > + gen_helper_gvec_pmull_q); > return; > } > goto is_widening; > diff --git a/target/arm/translate.c b/target/arm/translate.c > index b66a2f6b71..4e34249672 100644 > --- a/target/arm/translate.c > +++ b/target/arm/translate.c > @@ -5877,23 +5877,11 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) > * outside the loop below as it only performs a single pass. 
> */ > if (op == 14 && size == 2) { > - TCGv_i64 tcg_rn, tcg_rm, tcg_rd; > - > if (!dc_isar_feature(aa32_pmull, s)) { > return 1; > } > - tcg_rn = tcg_temp_new_i64(); > - tcg_rm = tcg_temp_new_i64(); > - tcg_rd = tcg_temp_new_i64(); > - neon_load_reg64(tcg_rn, rn); > - neon_load_reg64(tcg_rm, rm); > - gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm); > - neon_store_reg64(tcg_rd, rd); > - gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm); > - neon_store_reg64(tcg_rd, rd + 1); > - tcg_temp_free_i64(tcg_rn); > - tcg_temp_free_i64(tcg_rm); > - tcg_temp_free_i64(tcg_rd); > + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16, > + 0, gen_helper_gvec_pmull_q); > return 0; > } > > diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c > index d401282c6f..5c1074374e 100644 > --- a/target/arm/vec_helper.c > +++ b/target/arm/vec_helper.c > @@ -1164,3 +1164,36 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) > } > clear_tail(d, opr_sz, simd_maxsz(desc)); > } > + > +/* > + * 64x64->128 polynomial multiply. > + * Because of the lanes are not accessed in strict columns, > + * this probably cannot be turned into a generic helper. > + */ > +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) > +{ > + intptr_t i, j, opr_sz = simd_oprsz(desc); > + intptr_t hi = simd_data(desc); > + uint64_t *d = vd, *n = vn, *m = vm; > + > + for (i = 0; i < opr_sz / 8; i += 2) { > + uint64_t nn = n[i + hi]; > + uint64_t mm = m[i + hi]; > + uint64_t rhi = 0; > + uint64_t rlo = 0; > + > + /* Bit 0 can only influence the low 64-bit result. */ > + if (nn & 1) { > + rlo = mm; > + } > + > + for (j = 1; j < 64; ++j) { > + uint64_t mask = -((nn >> j) & 1); > + rlo ^= (mm << j) & mask; > + rhi ^= (mm >> (64 - j)) & mask; > + } > + d[i] = rlo; > + d[i + 1] = rhi; > + } > + clear_tail(d, opr_sz, simd_maxsz(desc)); > +} -- Alex Bennée
Alex Bennée <alex.bennee@linaro.org> writes: > Richard Henderson <richard.henderson@linaro.org> writes: > >> The gvec form will be needed for implementing SVE2. > > Hmm I'm seeing a failure against: > > aarch32-all-v80/insn_VMULL__INC.risu.bin I take it back, after monkey patching cortex-a53 into qemu-arm it passes. Reviewed-by: Alex Bennée <alex.bennee@linaro.org> Tested-by: Alex Bennée <alex.bennee@linaro.org> > > From: > > https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH > https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH/download?path=%2F&files=aarch32-all-v80.tar.xz > > And some others. But this seems to be broken in master as well so I > don't know if this is a regression or because I have my -cpu wrong for > qemu-arm for something recorded on a cortex-a53 in aarch32. > >> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> >> --- >> target/arm/helper.h | 4 +--- >> target/arm/neon_helper.c | 30 ------------------------------ >> target/arm/translate-a64.c | 28 +++------------------------- >> target/arm/translate.c | 16 ++-------------- >> target/arm/vec_helper.c | 33 +++++++++++++++++++++++++++++++++ >> 5 files changed, 39 insertions(+), 72 deletions(-) >> >> diff --git a/target/arm/helper.h b/target/arm/helper.h >> index 800446e537..d954399b7e 100644 >> --- a/target/arm/helper.h >> +++ b/target/arm/helper.h >> @@ -555,9 +555,6 @@ DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) >> DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) >> DEF_HELPER_2(dc_zva, void, env, i64) >> >> -DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64) >> -DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64) >> - >> DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG, >> void, ptr, ptr, ptr, ptr, i32) >> DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG, >> @@ -689,6 +686,7 @@ DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) >> 
DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) >> >> DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) >> +DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) >> >> #ifdef TARGET_AARCH64 >> #include "helper-a64.h" >> diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c >> index 9e7a9a1ac5..6a107da0e1 100644 >> --- a/target/arm/neon_helper.c >> +++ b/target/arm/neon_helper.c >> @@ -2152,33 +2152,3 @@ void HELPER(neon_zip16)(void *vd, void *vm) >> rm[0] = m0; >> rd[0] = d0; >> } >> - >> -/* Helper function for 64 bit polynomial multiply case: >> - * perform PolynomialMult(op1, op2) and return either the top or >> - * bottom half of the 128 bit result. >> - */ >> -uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2) >> -{ >> - int bitnum; >> - uint64_t res = 0; >> - >> - for (bitnum = 0; bitnum < 64; bitnum++) { >> - if (op1 & (1ULL << bitnum)) { >> - res ^= op2 << bitnum; >> - } >> - } >> - return res; >> -} >> -uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2) >> -{ >> - int bitnum; >> - uint64_t res = 0; >> - >> - /* bit 0 of op1 can't influence the high 64 bits at all */ >> - for (bitnum = 1; bitnum < 64; bitnum++) { >> - if (op1 & (1ULL << bitnum)) { >> - res ^= op2 >> (64 - bitnum); >> - } >> - } >> - return res; >> -} >> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c >> index 04e25cfe06..12588d18df 100644 >> --- a/target/arm/translate-a64.c >> +++ b/target/arm/translate-a64.c >> @@ -10598,30 +10598,6 @@ static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size, >> clear_vec_high(s, is_q, rd); >> } >> >> -static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm) >> -{ >> - /* PMULL of 64 x 64 -> 128 is an odd special case because it >> - * is the only three-reg-diff instruction which produces a >> - * 128-bit wide result from a single operation. 
However since >> - * it's possible to calculate the two halves more or less >> - * separately we just use two helper calls. >> - */ >> - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); >> - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); >> - TCGv_i64 tcg_res = tcg_temp_new_i64(); >> - >> - read_vec_element(s, tcg_op1, rn, is_q, MO_64); >> - read_vec_element(s, tcg_op2, rm, is_q, MO_64); >> - gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2); >> - write_vec_element(s, tcg_res, rd, 0, MO_64); >> - gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2); >> - write_vec_element(s, tcg_res, rd, 1, MO_64); >> - >> - tcg_temp_free_i64(tcg_op1); >> - tcg_temp_free_i64(tcg_op2); >> - tcg_temp_free_i64(tcg_res); >> -} >> - >> /* AdvSIMD three different >> * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 >> * +---+---+---+-----------+------+---+------+--------+-----+------+------+ >> @@ -10686,7 +10662,9 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) >> if (!fp_access_check(s)) { >> return; >> } >> - handle_pmull_64(s, is_q, rd, rn, rm); >> + /* The Q field specifies lo/hi half input for this insn. */ >> + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, >> + gen_helper_gvec_pmull_q); >> return; >> } >> goto is_widening; >> diff --git a/target/arm/translate.c b/target/arm/translate.c >> index b66a2f6b71..4e34249672 100644 >> --- a/target/arm/translate.c >> +++ b/target/arm/translate.c >> @@ -5877,23 +5877,11 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) >> * outside the loop below as it only performs a single pass. 
>> */ >> if (op == 14 && size == 2) { >> - TCGv_i64 tcg_rn, tcg_rm, tcg_rd; >> - >> if (!dc_isar_feature(aa32_pmull, s)) { >> return 1; >> } >> - tcg_rn = tcg_temp_new_i64(); >> - tcg_rm = tcg_temp_new_i64(); >> - tcg_rd = tcg_temp_new_i64(); >> - neon_load_reg64(tcg_rn, rn); >> - neon_load_reg64(tcg_rm, rm); >> - gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm); >> - neon_store_reg64(tcg_rd, rd); >> - gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm); >> - neon_store_reg64(tcg_rd, rd + 1); >> - tcg_temp_free_i64(tcg_rn); >> - tcg_temp_free_i64(tcg_rm); >> - tcg_temp_free_i64(tcg_rd); >> + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16, >> + 0, gen_helper_gvec_pmull_q); >> return 0; >> } >> >> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c >> index d401282c6f..5c1074374e 100644 >> --- a/target/arm/vec_helper.c >> +++ b/target/arm/vec_helper.c >> @@ -1164,3 +1164,36 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) >> } >> clear_tail(d, opr_sz, simd_maxsz(desc)); >> } >> + >> +/* >> + * 64x64->128 polynomial multiply. >> + * Because of the lanes are not accessed in strict columns, >> + * this probably cannot be turned into a generic helper. >> + */ >> +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) >> +{ >> + intptr_t i, j, opr_sz = simd_oprsz(desc); >> + intptr_t hi = simd_data(desc); >> + uint64_t *d = vd, *n = vn, *m = vm; >> + >> + for (i = 0; i < opr_sz / 8; i += 2) { >> + uint64_t nn = n[i + hi]; >> + uint64_t mm = m[i + hi]; >> + uint64_t rhi = 0; >> + uint64_t rlo = 0; >> + >> + /* Bit 0 can only influence the low 64-bit result. */ >> + if (nn & 1) { >> + rlo = mm; >> + } >> + >> + for (j = 1; j < 64; ++j) { >> + uint64_t mask = -((nn >> j) & 1); >> + rlo ^= (mm << j) & mask; >> + rhi ^= (mm >> (64 - j)) & mask; >> + } >> + d[i] = rlo; >> + d[i + 1] = rhi; >> + } >> + clear_tail(d, opr_sz, simd_maxsz(desc)); >> +} -- Alex Bennée
diff --git a/target/arm/helper.h b/target/arm/helper.h index 800446e537..d954399b7e 100644 --- a/target/arm/helper.h +++ b/target/arm/helper.h @@ -555,9 +555,6 @@ DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32) DEF_HELPER_2(dc_zva, void, env, i64) -DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64) -DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64) - DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG, @@ -689,6 +686,7 @@ DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) +DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32) #ifdef TARGET_AARCH64 #include "helper-a64.h" diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c index 9e7a9a1ac5..6a107da0e1 100644 --- a/target/arm/neon_helper.c +++ b/target/arm/neon_helper.c @@ -2152,33 +2152,3 @@ void HELPER(neon_zip16)(void *vd, void *vm) rm[0] = m0; rd[0] = d0; } - -/* Helper function for 64 bit polynomial multiply case: - * perform PolynomialMult(op1, op2) and return either the top or - * bottom half of the 128 bit result. 
- */ -uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2) -{ - int bitnum; - uint64_t res = 0; - - for (bitnum = 0; bitnum < 64; bitnum++) { - if (op1 & (1ULL << bitnum)) { - res ^= op2 << bitnum; - } - } - return res; -} -uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2) -{ - int bitnum; - uint64_t res = 0; - - /* bit 0 of op1 can't influence the high 64 bits at all */ - for (bitnum = 1; bitnum < 64; bitnum++) { - if (op1 & (1ULL << bitnum)) { - res ^= op2 >> (64 - bitnum); - } - } - return res; -} diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 04e25cfe06..12588d18df 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -10598,30 +10598,6 @@ static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size, clear_vec_high(s, is_q, rd); } -static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm) -{ - /* PMULL of 64 x 64 -> 128 is an odd special case because it - * is the only three-reg-diff instruction which produces a - * 128-bit wide result from a single operation. However since - * it's possible to calculate the two halves more or less - * separately we just use two helper calls. 
- */ - TCGv_i64 tcg_op1 = tcg_temp_new_i64(); - TCGv_i64 tcg_op2 = tcg_temp_new_i64(); - TCGv_i64 tcg_res = tcg_temp_new_i64(); - - read_vec_element(s, tcg_op1, rn, is_q, MO_64); - read_vec_element(s, tcg_op2, rm, is_q, MO_64); - gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2); - write_vec_element(s, tcg_res, rd, 0, MO_64); - gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2); - write_vec_element(s, tcg_res, rd, 1, MO_64); - - tcg_temp_free_i64(tcg_op1); - tcg_temp_free_i64(tcg_op2); - tcg_temp_free_i64(tcg_res); -} - /* AdvSIMD three different * 31 30 29 28 24 23 22 21 20 16 15 12 11 10 9 5 4 0 * +---+---+---+-----------+------+---+------+--------+-----+------+------+ @@ -10686,7 +10662,9 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn) if (!fp_access_check(s)) { return; } - handle_pmull_64(s, is_q, rd, rn, rm); + /* The Q field specifies lo/hi half input for this insn. */ + gen_gvec_op3_ool(s, true, rd, rn, rm, is_q, + gen_helper_gvec_pmull_q); return; } goto is_widening; diff --git a/target/arm/translate.c b/target/arm/translate.c index b66a2f6b71..4e34249672 100644 --- a/target/arm/translate.c +++ b/target/arm/translate.c @@ -5877,23 +5877,11 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn) * outside the loop below as it only performs a single pass. 
*/ if (op == 14 && size == 2) { - TCGv_i64 tcg_rn, tcg_rm, tcg_rd; - if (!dc_isar_feature(aa32_pmull, s)) { return 1; } - tcg_rn = tcg_temp_new_i64(); - tcg_rm = tcg_temp_new_i64(); - tcg_rd = tcg_temp_new_i64(); - neon_load_reg64(tcg_rn, rn); - neon_load_reg64(tcg_rm, rm); - gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm); - neon_store_reg64(tcg_rd, rd); - gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm); - neon_store_reg64(tcg_rd, rd + 1); - tcg_temp_free_i64(tcg_rn); - tcg_temp_free_i64(tcg_rm); - tcg_temp_free_i64(tcg_rd); + tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16, + 0, gen_helper_gvec_pmull_q); return 0; } diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c index d401282c6f..5c1074374e 100644 --- a/target/arm/vec_helper.c +++ b/target/arm/vec_helper.c @@ -1164,3 +1164,36 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc) } clear_tail(d, opr_sz, simd_maxsz(desc)); } + +/* + * 64x64->128 polynomial multiply. + * Because the lanes are not accessed in strict columns, + * this probably cannot be turned into a generic helper. + */ +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc) +{ + intptr_t i, j, opr_sz = simd_oprsz(desc); + intptr_t hi = simd_data(desc); + uint64_t *d = vd, *n = vn, *m = vm; + + for (i = 0; i < opr_sz / 8; i += 2) { + uint64_t nn = n[i + hi]; + uint64_t mm = m[i + hi]; + uint64_t rhi = 0; + uint64_t rlo = 0; + + /* Bit 0 can only influence the low 64-bit result. */ + if (nn & 1) { + rlo = mm; + } + + for (j = 1; j < 64; ++j) { + uint64_t mask = -((nn >> j) & 1); + rlo ^= (mm << j) & mask; + rhi ^= (mm >> (64 - j)) & mask; + } + d[i] = rlo; + d[i + 1] = rhi; + } + clear_tail(d, opr_sz, simd_maxsz(desc)); +}
The gvec form will be needed for implementing SVE2. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/helper.h | 4 +--- target/arm/neon_helper.c | 30 ------------------------------ target/arm/translate-a64.c | 28 +++------------------------- target/arm/translate.c | 16 ++-------------- target/arm/vec_helper.c | 33 +++++++++++++++++++++++++++++++++ 5 files changed, 39 insertions(+), 72 deletions(-) -- 2.17.1