diff mbox series

[3/4] target/arm: Convert PMULL.64 to gvec

Message ID 20191017044232.27601-4-richard.henderson@linaro.org
State Superseded
Headers show
Series target/arm vector improvements | expand

Commit Message

Richard Henderson Oct. 17, 2019, 4:42 a.m. UTC
The gvec form will be needed for implementing SVE2.

Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

---
 target/arm/helper.h        |  4 +---
 target/arm/neon_helper.c   | 30 ------------------------------
 target/arm/translate-a64.c | 28 +++-------------------------
 target/arm/translate.c     | 16 ++--------------
 target/arm/vec_helper.c    | 33 +++++++++++++++++++++++++++++++++
 5 files changed, 39 insertions(+), 72 deletions(-)

-- 
2.17.1

Comments

Alex Bennée Oct. 18, 2019, 12:24 p.m. UTC | #1
Richard Henderson <richard.henderson@linaro.org> writes:

> The gvec form will be needed for implementing SVE2.


Hmm I'm seeing a failure against:

  aarch32-all-v80/insn_VMULL__INC.risu.bin

From:

  https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH
  https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH/download?path=%2F&files=aarch32-all-v80.tar.xz

And some others. But this seems to be broken in master as well so I
don't know if this is a regression or because I have my -cpu wrong for
qemu-arm for something recorded on a cortex-a53 in aarch32.

>

> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

> ---

>  target/arm/helper.h        |  4 +---

>  target/arm/neon_helper.c   | 30 ------------------------------

>  target/arm/translate-a64.c | 28 +++-------------------------

>  target/arm/translate.c     | 16 ++--------------

>  target/arm/vec_helper.c    | 33 +++++++++++++++++++++++++++++++++

>  5 files changed, 39 insertions(+), 72 deletions(-)

>

> diff --git a/target/arm/helper.h b/target/arm/helper.h

> index 800446e537..d954399b7e 100644

> --- a/target/arm/helper.h

> +++ b/target/arm/helper.h

> @@ -555,9 +555,6 @@ DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)

>  DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)

>  DEF_HELPER_2(dc_zva, void, env, i64)

>

> -DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)

> -DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)

> -

>  DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG,

>                     void, ptr, ptr, ptr, ptr, i32)

>  DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG,

> @@ -689,6 +686,7 @@ DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

>  DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

>

>  DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

> +DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

>

>  #ifdef TARGET_AARCH64

>  #include "helper-a64.h"

> diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c

> index 9e7a9a1ac5..6a107da0e1 100644

> --- a/target/arm/neon_helper.c

> +++ b/target/arm/neon_helper.c

> @@ -2152,33 +2152,3 @@ void HELPER(neon_zip16)(void *vd, void *vm)

>      rm[0] = m0;

>      rd[0] = d0;

>  }

> -

> -/* Helper function for 64 bit polynomial multiply case:

> - * perform PolynomialMult(op1, op2) and return either the top or

> - * bottom half of the 128 bit result.

> - */

> -uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)

> -{

> -    int bitnum;

> -    uint64_t res = 0;

> -

> -    for (bitnum = 0; bitnum < 64; bitnum++) {

> -        if (op1 & (1ULL << bitnum)) {

> -            res ^= op2 << bitnum;

> -        }

> -    }

> -    return res;

> -}

> -uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)

> -{

> -    int bitnum;

> -    uint64_t res = 0;

> -

> -    /* bit 0 of op1 can't influence the high 64 bits at all */

> -    for (bitnum = 1; bitnum < 64; bitnum++) {

> -        if (op1 & (1ULL << bitnum)) {

> -            res ^= op2 >> (64 - bitnum);

> -        }

> -    }

> -    return res;

> -}

> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c

> index 04e25cfe06..12588d18df 100644

> --- a/target/arm/translate-a64.c

> +++ b/target/arm/translate-a64.c

> @@ -10598,30 +10598,6 @@ static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size,

>      clear_vec_high(s, is_q, rd);

>  }

>

> -static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm)

> -{

> -    /* PMULL of 64 x 64 -> 128 is an odd special case because it

> -     * is the only three-reg-diff instruction which produces a

> -     * 128-bit wide result from a single operation. However since

> -     * it's possible to calculate the two halves more or less

> -     * separately we just use two helper calls.

> -     */

> -    TCGv_i64 tcg_op1 = tcg_temp_new_i64();

> -    TCGv_i64 tcg_op2 = tcg_temp_new_i64();

> -    TCGv_i64 tcg_res = tcg_temp_new_i64();

> -

> -    read_vec_element(s, tcg_op1, rn, is_q, MO_64);

> -    read_vec_element(s, tcg_op2, rm, is_q, MO_64);

> -    gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2);

> -    write_vec_element(s, tcg_res, rd, 0, MO_64);

> -    gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2);

> -    write_vec_element(s, tcg_res, rd, 1, MO_64);

> -

> -    tcg_temp_free_i64(tcg_op1);

> -    tcg_temp_free_i64(tcg_op2);

> -    tcg_temp_free_i64(tcg_res);

> -}

> -

>  /* AdvSIMD three different

>   *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0

>   * +---+---+---+-----------+------+---+------+--------+-----+------+------+

> @@ -10686,7 +10662,9 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)

>              if (!fp_access_check(s)) {

>                  return;

>              }

> -            handle_pmull_64(s, is_q, rd, rn, rm);

> +            /* The Q field specifies lo/hi half input for this insn.  */

> +            gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,

> +                             gen_helper_gvec_pmull_q);

>              return;

>          }

>          goto is_widening;

> diff --git a/target/arm/translate.c b/target/arm/translate.c

> index b66a2f6b71..4e34249672 100644

> --- a/target/arm/translate.c

> +++ b/target/arm/translate.c

> @@ -5877,23 +5877,11 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)

>                   * outside the loop below as it only performs a single pass.

>                   */

>                  if (op == 14 && size == 2) {

> -                    TCGv_i64 tcg_rn, tcg_rm, tcg_rd;

> -

>                      if (!dc_isar_feature(aa32_pmull, s)) {

>                          return 1;

>                      }

> -                    tcg_rn = tcg_temp_new_i64();

> -                    tcg_rm = tcg_temp_new_i64();

> -                    tcg_rd = tcg_temp_new_i64();

> -                    neon_load_reg64(tcg_rn, rn);

> -                    neon_load_reg64(tcg_rm, rm);

> -                    gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm);

> -                    neon_store_reg64(tcg_rd, rd);

> -                    gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm);

> -                    neon_store_reg64(tcg_rd, rd + 1);

> -                    tcg_temp_free_i64(tcg_rn);

> -                    tcg_temp_free_i64(tcg_rm);

> -                    tcg_temp_free_i64(tcg_rd);

> +                    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,

> +                                       0, gen_helper_gvec_pmull_q);

>                      return 0;

>                  }

>

> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c

> index d401282c6f..5c1074374e 100644

> --- a/target/arm/vec_helper.c

> +++ b/target/arm/vec_helper.c

> @@ -1164,3 +1164,36 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)

>      }

>      clear_tail(d, opr_sz, simd_maxsz(desc));

>  }

> +

> +/*

> + * 64x64->128 polynomial multiply.

> + * Because the lanes are not accessed in strict columns,

> + * this probably cannot be turned into a generic helper.

> + */

> +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)

> +{

> +    intptr_t i, j, opr_sz = simd_oprsz(desc);

> +    intptr_t hi = simd_data(desc);

> +    uint64_t *d = vd, *n = vn, *m = vm;

> +

> +    for (i = 0; i < opr_sz / 8; i += 2) {

> +        uint64_t nn = n[i + hi];

> +        uint64_t mm = m[i + hi];

> +        uint64_t rhi = 0;

> +        uint64_t rlo = 0;

> +

> +        /* Bit 0 can only influence the low 64-bit result.  */

> +        if (nn & 1) {

> +            rlo = mm;

> +        }

> +

> +        for (j = 1; j < 64; ++j) {

> +            uint64_t mask = -((nn >> j) & 1);

> +            rlo ^= (mm << j) & mask;

> +            rhi ^= (mm >> (64 - j)) & mask;

> +        }

> +        d[i] = rlo;

> +        d[i + 1] = rhi;

> +    }

> +    clear_tail(d, opr_sz, simd_maxsz(desc));

> +}



--
Alex Bennée
Alex Bennée Oct. 18, 2019, 1:40 p.m. UTC | #2
Alex Bennée <alex.bennee@linaro.org> writes:

> Richard Henderson <richard.henderson@linaro.org> writes:

>

>> The gvec form will be needed for implementing SVE2.

>

> Hmm I'm seeing a failure against:

>

>   aarch32-all-v80/insn_VMULL__INC.risu.bin


I take it back, after monkey patching cortex-a53 into qemu-arm it
passes.

Reviewed-by: Alex Bennée <alex.bennee@linaro.org>

Tested-by: Alex Bennée <alex.bennee@linaro.org>


>

> From:

>

>   https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH

>   https://fileserver.linaro.org/owncloud/index.php/s/hvEXM2eJ3uZVhlH/download?path=%2F&files=aarch32-all-v80.tar.xz

>

> And some others. But this seems to be broken in master as well so I

> don't know if this is a regression or because I have my -cpu wrong for

> qemu-arm for something recorded on a cortex-a53 in aarch32.

>

>>

>> Signed-off-by: Richard Henderson <richard.henderson@linaro.org>

>> ---

>>  target/arm/helper.h        |  4 +---

>>  target/arm/neon_helper.c   | 30 ------------------------------

>>  target/arm/translate-a64.c | 28 +++-------------------------

>>  target/arm/translate.c     | 16 ++--------------

>>  target/arm/vec_helper.c    | 33 +++++++++++++++++++++++++++++++++

>>  5 files changed, 39 insertions(+), 72 deletions(-)

>>

>> diff --git a/target/arm/helper.h b/target/arm/helper.h

>> index 800446e537..d954399b7e 100644

>> --- a/target/arm/helper.h

>> +++ b/target/arm/helper.h

>> @@ -555,9 +555,6 @@ DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)

>>  DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)

>>  DEF_HELPER_2(dc_zva, void, env, i64)

>>

>> -DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)

>> -DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)

>> -

>>  DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG,

>>                     void, ptr, ptr, ptr, ptr, i32)

>>  DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG,

>> @@ -689,6 +686,7 @@ DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

>>  DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

>>

>>  DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

>> +DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)

>>

>>  #ifdef TARGET_AARCH64

>>  #include "helper-a64.h"

>> diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c

>> index 9e7a9a1ac5..6a107da0e1 100644

>> --- a/target/arm/neon_helper.c

>> +++ b/target/arm/neon_helper.c

>> @@ -2152,33 +2152,3 @@ void HELPER(neon_zip16)(void *vd, void *vm)

>>      rm[0] = m0;

>>      rd[0] = d0;

>>  }

>> -

>> -/* Helper function for 64 bit polynomial multiply case:

>> - * perform PolynomialMult(op1, op2) and return either the top or

>> - * bottom half of the 128 bit result.

>> - */

>> -uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)

>> -{

>> -    int bitnum;

>> -    uint64_t res = 0;

>> -

>> -    for (bitnum = 0; bitnum < 64; bitnum++) {

>> -        if (op1 & (1ULL << bitnum)) {

>> -            res ^= op2 << bitnum;

>> -        }

>> -    }

>> -    return res;

>> -}

>> -uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)

>> -{

>> -    int bitnum;

>> -    uint64_t res = 0;

>> -

>> -    /* bit 0 of op1 can't influence the high 64 bits at all */

>> -    for (bitnum = 1; bitnum < 64; bitnum++) {

>> -        if (op1 & (1ULL << bitnum)) {

>> -            res ^= op2 >> (64 - bitnum);

>> -        }

>> -    }

>> -    return res;

>> -}

>> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c

>> index 04e25cfe06..12588d18df 100644

>> --- a/target/arm/translate-a64.c

>> +++ b/target/arm/translate-a64.c

>> @@ -10598,30 +10598,6 @@ static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size,

>>      clear_vec_high(s, is_q, rd);

>>  }

>>

>> -static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm)

>> -{

>> -    /* PMULL of 64 x 64 -> 128 is an odd special case because it

>> -     * is the only three-reg-diff instruction which produces a

>> -     * 128-bit wide result from a single operation. However since

>> -     * it's possible to calculate the two halves more or less

>> -     * separately we just use two helper calls.

>> -     */

>> -    TCGv_i64 tcg_op1 = tcg_temp_new_i64();

>> -    TCGv_i64 tcg_op2 = tcg_temp_new_i64();

>> -    TCGv_i64 tcg_res = tcg_temp_new_i64();

>> -

>> -    read_vec_element(s, tcg_op1, rn, is_q, MO_64);

>> -    read_vec_element(s, tcg_op2, rm, is_q, MO_64);

>> -    gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2);

>> -    write_vec_element(s, tcg_res, rd, 0, MO_64);

>> -    gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2);

>> -    write_vec_element(s, tcg_res, rd, 1, MO_64);

>> -

>> -    tcg_temp_free_i64(tcg_op1);

>> -    tcg_temp_free_i64(tcg_op2);

>> -    tcg_temp_free_i64(tcg_res);

>> -}

>> -

>>  /* AdvSIMD three different

>>   *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0

>>   * +---+---+---+-----------+------+---+------+--------+-----+------+------+

>> @@ -10686,7 +10662,9 @@ static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)

>>              if (!fp_access_check(s)) {

>>                  return;

>>              }

>> -            handle_pmull_64(s, is_q, rd, rn, rm);

>> +            /* The Q field specifies lo/hi half input for this insn.  */

>> +            gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,

>> +                             gen_helper_gvec_pmull_q);

>>              return;

>>          }

>>          goto is_widening;

>> diff --git a/target/arm/translate.c b/target/arm/translate.c

>> index b66a2f6b71..4e34249672 100644

>> --- a/target/arm/translate.c

>> +++ b/target/arm/translate.c

>> @@ -5877,23 +5877,11 @@ static int disas_neon_data_insn(DisasContext *s, uint32_t insn)

>>                   * outside the loop below as it only performs a single pass.

>>                   */

>>                  if (op == 14 && size == 2) {

>> -                    TCGv_i64 tcg_rn, tcg_rm, tcg_rd;

>> -

>>                      if (!dc_isar_feature(aa32_pmull, s)) {

>>                          return 1;

>>                      }

>> -                    tcg_rn = tcg_temp_new_i64();

>> -                    tcg_rm = tcg_temp_new_i64();

>> -                    tcg_rd = tcg_temp_new_i64();

>> -                    neon_load_reg64(tcg_rn, rn);

>> -                    neon_load_reg64(tcg_rm, rm);

>> -                    gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm);

>> -                    neon_store_reg64(tcg_rd, rd);

>> -                    gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm);

>> -                    neon_store_reg64(tcg_rd, rd + 1);

>> -                    tcg_temp_free_i64(tcg_rn);

>> -                    tcg_temp_free_i64(tcg_rm);

>> -                    tcg_temp_free_i64(tcg_rd);

>> +                    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,

>> +                                       0, gen_helper_gvec_pmull_q);

>>                      return 0;

>>                  }

>>

>> diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c

>> index d401282c6f..5c1074374e 100644

>> --- a/target/arm/vec_helper.c

>> +++ b/target/arm/vec_helper.c

>> @@ -1164,3 +1164,36 @@ void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)

>>      }

>>      clear_tail(d, opr_sz, simd_maxsz(desc));

>>  }

>> +

>> +/*

>> + * 64x64->128 polynomial multiply.

>> + * Because the lanes are not accessed in strict columns,

>> + * this probably cannot be turned into a generic helper.

>> + */

>> +void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)

>> +{

>> +    intptr_t i, j, opr_sz = simd_oprsz(desc);

>> +    intptr_t hi = simd_data(desc);

>> +    uint64_t *d = vd, *n = vn, *m = vm;

>> +

>> +    for (i = 0; i < opr_sz / 8; i += 2) {

>> +        uint64_t nn = n[i + hi];

>> +        uint64_t mm = m[i + hi];

>> +        uint64_t rhi = 0;

>> +        uint64_t rlo = 0;

>> +

>> +        /* Bit 0 can only influence the low 64-bit result.  */

>> +        if (nn & 1) {

>> +            rlo = mm;

>> +        }

>> +

>> +        for (j = 1; j < 64; ++j) {

>> +            uint64_t mask = -((nn >> j) & 1);

>> +            rlo ^= (mm << j) & mask;

>> +            rhi ^= (mm >> (64 - j)) & mask;

>> +        }

>> +        d[i] = rlo;

>> +        d[i + 1] = rhi;

>> +    }

>> +    clear_tail(d, opr_sz, simd_maxsz(desc));

>> +}



--
Alex Bennée
diff mbox series

Patch

diff --git a/target/arm/helper.h b/target/arm/helper.h
index 800446e537..d954399b7e 100644
--- a/target/arm/helper.h
+++ b/target/arm/helper.h
@@ -555,9 +555,6 @@  DEF_HELPER_FLAGS_3(crc32, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)
 DEF_HELPER_FLAGS_3(crc32c, TCG_CALL_NO_RWG_SE, i32, i32, i32, i32)
 DEF_HELPER_2(dc_zva, void, env, i64)
 
-DEF_HELPER_FLAGS_2(neon_pmull_64_lo, TCG_CALL_NO_RWG_SE, i64, i64, i64)
-DEF_HELPER_FLAGS_2(neon_pmull_64_hi, TCG_CALL_NO_RWG_SE, i64, i64, i64)
-
 DEF_HELPER_FLAGS_5(gvec_qrdmlah_s16, TCG_CALL_NO_RWG,
                    void, ptr, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_5(gvec_qrdmlsh_s16, TCG_CALL_NO_RWG,
@@ -689,6 +686,7 @@  DEF_HELPER_FLAGS_4(gvec_ushl_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 DEF_HELPER_FLAGS_4(gvec_ushl_h, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 DEF_HELPER_FLAGS_4(gvec_pmul_b, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
+DEF_HELPER_FLAGS_4(gvec_pmull_q, TCG_CALL_NO_RWG, void, ptr, ptr, ptr, i32)
 
 #ifdef TARGET_AARCH64
 #include "helper-a64.h"
diff --git a/target/arm/neon_helper.c b/target/arm/neon_helper.c
index 9e7a9a1ac5..6a107da0e1 100644
--- a/target/arm/neon_helper.c
+++ b/target/arm/neon_helper.c
@@ -2152,33 +2152,3 @@  void HELPER(neon_zip16)(void *vd, void *vm)
     rm[0] = m0;
     rd[0] = d0;
 }
-
-/* Helper function for 64 bit polynomial multiply case:
- * perform PolynomialMult(op1, op2) and return either the top or
- * bottom half of the 128 bit result.
- */
-uint64_t HELPER(neon_pmull_64_lo)(uint64_t op1, uint64_t op2)
-{
-    int bitnum;
-    uint64_t res = 0;
-
-    for (bitnum = 0; bitnum < 64; bitnum++) {
-        if (op1 & (1ULL << bitnum)) {
-            res ^= op2 << bitnum;
-        }
-    }
-    return res;
-}
-uint64_t HELPER(neon_pmull_64_hi)(uint64_t op1, uint64_t op2)
-{
-    int bitnum;
-    uint64_t res = 0;
-
-    /* bit 0 of op1 can't influence the high 64 bits at all */
-    for (bitnum = 1; bitnum < 64; bitnum++) {
-        if (op1 & (1ULL << bitnum)) {
-            res ^= op2 >> (64 - bitnum);
-        }
-    }
-    return res;
-}
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c
index 04e25cfe06..12588d18df 100644
--- a/target/arm/translate-a64.c
+++ b/target/arm/translate-a64.c
@@ -10598,30 +10598,6 @@  static void handle_3rd_narrowing(DisasContext *s, int is_q, int is_u, int size,
     clear_vec_high(s, is_q, rd);
 }
 
-static void handle_pmull_64(DisasContext *s, int is_q, int rd, int rn, int rm)
-{
-    /* PMULL of 64 x 64 -> 128 is an odd special case because it
-     * is the only three-reg-diff instruction which produces a
-     * 128-bit wide result from a single operation. However since
-     * it's possible to calculate the two halves more or less
-     * separately we just use two helper calls.
-     */
-    TCGv_i64 tcg_op1 = tcg_temp_new_i64();
-    TCGv_i64 tcg_op2 = tcg_temp_new_i64();
-    TCGv_i64 tcg_res = tcg_temp_new_i64();
-
-    read_vec_element(s, tcg_op1, rn, is_q, MO_64);
-    read_vec_element(s, tcg_op2, rm, is_q, MO_64);
-    gen_helper_neon_pmull_64_lo(tcg_res, tcg_op1, tcg_op2);
-    write_vec_element(s, tcg_res, rd, 0, MO_64);
-    gen_helper_neon_pmull_64_hi(tcg_res, tcg_op1, tcg_op2);
-    write_vec_element(s, tcg_res, rd, 1, MO_64);
-
-    tcg_temp_free_i64(tcg_op1);
-    tcg_temp_free_i64(tcg_op2);
-    tcg_temp_free_i64(tcg_res);
-}
-
 /* AdvSIMD three different
  *   31  30  29 28       24 23  22  21 20  16 15    12 11 10 9    5 4    0
  * +---+---+---+-----------+------+---+------+--------+-----+------+------+
@@ -10686,7 +10662,9 @@  static void disas_simd_three_reg_diff(DisasContext *s, uint32_t insn)
             if (!fp_access_check(s)) {
                 return;
             }
-            handle_pmull_64(s, is_q, rd, rn, rm);
+            /* The Q field specifies lo/hi half input for this insn.  */
+            gen_gvec_op3_ool(s, true, rd, rn, rm, is_q,
+                             gen_helper_gvec_pmull_q);
             return;
         }
         goto is_widening;
diff --git a/target/arm/translate.c b/target/arm/translate.c
index b66a2f6b71..4e34249672 100644
--- a/target/arm/translate.c
+++ b/target/arm/translate.c
@@ -5877,23 +5877,11 @@  static int disas_neon_data_insn(DisasContext *s, uint32_t insn)
                  * outside the loop below as it only performs a single pass.
                  */
                 if (op == 14 && size == 2) {
-                    TCGv_i64 tcg_rn, tcg_rm, tcg_rd;
-
                     if (!dc_isar_feature(aa32_pmull, s)) {
                         return 1;
                     }
-                    tcg_rn = tcg_temp_new_i64();
-                    tcg_rm = tcg_temp_new_i64();
-                    tcg_rd = tcg_temp_new_i64();
-                    neon_load_reg64(tcg_rn, rn);
-                    neon_load_reg64(tcg_rm, rm);
-                    gen_helper_neon_pmull_64_lo(tcg_rd, tcg_rn, tcg_rm);
-                    neon_store_reg64(tcg_rd, rd);
-                    gen_helper_neon_pmull_64_hi(tcg_rd, tcg_rn, tcg_rm);
-                    neon_store_reg64(tcg_rd, rd + 1);
-                    tcg_temp_free_i64(tcg_rn);
-                    tcg_temp_free_i64(tcg_rm);
-                    tcg_temp_free_i64(tcg_rd);
+                    tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, 16, 16,
+                                       0, gen_helper_gvec_pmull_q);
                     return 0;
                 }
 
diff --git a/target/arm/vec_helper.c b/target/arm/vec_helper.c
index d401282c6f..5c1074374e 100644
--- a/target/arm/vec_helper.c
+++ b/target/arm/vec_helper.c
@@ -1164,3 +1164,36 @@  void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
     }
     clear_tail(d, opr_sz, simd_maxsz(desc));
 }
+
+/*
+ * 64x64->128 polynomial multiply.
+ * Because the lanes are not accessed in strict columns,
+ * this probably cannot be turned into a generic helper.
+ */
+void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
+{
+    intptr_t i, j, opr_sz = simd_oprsz(desc);
+    intptr_t hi = simd_data(desc);
+    uint64_t *d = vd, *n = vn, *m = vm;
+
+    for (i = 0; i < opr_sz / 8; i += 2) {
+        uint64_t nn = n[i + hi];
+        uint64_t mm = m[i + hi];
+        uint64_t rhi = 0;
+        uint64_t rlo = 0;
+
+        /* Bit 0 can only influence the low 64-bit result.  */
+        if (nn & 1) {
+            rlo = mm;
+        }
+
+        for (j = 1; j < 64; ++j) {
+            uint64_t mask = -((nn >> j) & 1);
+            rlo ^= (mm << j) & mask;
+            rhi ^= (mm >> (64 - j)) & mask;
+        }
+        d[i] = rlo;
+        d[i + 1] = rhi;
+    }
+    clear_tail(d, opr_sz, simd_maxsz(desc));
+}