Message ID | 20180425012300.14698-10-richard.henderson@linaro.org |
---|---|
State | Superseded |
Headers | show |
Series | target/arm: Fixups for ARM_FEATURE_V8_FP16 | expand |
Richard Henderson <richard.henderson@linaro.org> writes: > We missed all of the scalar fp16 fma operations. > > Signed-off-by: Richard Henderson <richard.henderson@linaro.org> > --- > target/arm/translate-a64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ > 1 file changed, 48 insertions(+) > > diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c > index 11b90b7eb0..0cb1fc4d67 100644 > --- a/target/arm/translate-a64.c > +++ b/target/arm/translate-a64.c > @@ -5154,6 +5154,44 @@ static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1, > tcg_temp_free_i64(tcg_res); > } > > +/* Floating-point data-processing (3 source) - half precision */ > +static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1, > + int rd, int rn, int rm, int ra) > +{ > + TCGv_i32 tcg_op1, tcg_op2, tcg_op3; > + TCGv_i32 tcg_res = tcg_temp_new_i32(); > + TCGv_ptr fpst = get_fpstatus_ptr(true); > + > + tcg_op1 = read_fp_hreg(s, rn); > + tcg_op2 = read_fp_hreg(s, rm); > + tcg_op3 = read_fp_hreg(s, ra); > + > + /* These are fused multiply-add, and must be done as one > + * floating point operation with no rounding between the > + * multiplication and addition steps. I got confused first time reading this as we cover F[N]M[ADD|SUB]. Perhaps that is better enumerated at the top of the function? Anyway: Reviewed-by: Alex Bennée <alex.bennee@linaro.org> > + * NB that doing the negations here as separate steps is > + * correct : an input NaN should come out with its sign bit > + * flipped if it is a negated-input. > + */ > + if (o1 == true) { > + tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000); > + } > + > + if (o0 != o1) { > + tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); > + } > + > + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); > + > + write_fp_sreg(s, rd, tcg_res); > + > + tcg_temp_free_ptr(fpst); > + tcg_temp_free_i32(tcg_op1); > + tcg_temp_free_i32(tcg_op2); > + tcg_temp_free_i32(tcg_op3); > + tcg_temp_free_i32(tcg_res); > +} > + > /* Floating point data-processing (3 source) > * 31 30 29 28 24 23 22 21 20 16 15 14 10 9 5 4 0 > * +---+---+---+-----------+------+----+------+----+------+------+------+ > @@ -5183,6 +5221,16 @@ static void disas_fp_3src(DisasContext *s, uint32_t insn) > } > handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra); > break; > + case 3: > + if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) { > + unallocated_encoding(s); > + return; > + } > + if (!fp_access_check(s)) { > + return; > + } > + handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra); > + break; > default: > unallocated_encoding(s); > } -- Alex Bennée
On 05/01/2018 04:21 AM, Alex Bennée wrote: > > Richard Henderson <richard.henderson@linaro.org> writes: > >> We missed all of the scalar fp16 fma operations. >> >> Signed-off-by: Richard Henderson <richard.henderson@linaro.org> >> --- >> target/arm/translate-a64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ >> 1 file changed, 48 insertions(+) >> >> diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c >> index 11b90b7eb0..0cb1fc4d67 100644 >> --- a/target/arm/translate-a64.c >> +++ b/target/arm/translate-a64.c >> @@ -5154,6 +5154,44 @@ static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1, >> tcg_temp_free_i64(tcg_res); >> } >> >> +/* Floating-point data-processing (3 source) - half precision */ >> +static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1, >> + int rd, int rn, int rm, int ra) >> +{ >> + TCGv_i32 tcg_op1, tcg_op2, tcg_op3; >> + TCGv_i32 tcg_res = tcg_temp_new_i32(); >> + TCGv_ptr fpst = get_fpstatus_ptr(true); >> + >> + tcg_op1 = read_fp_hreg(s, rn); >> + tcg_op2 = read_fp_hreg(s, rm); >> + tcg_op3 = read_fp_hreg(s, ra); >> + >> + /* These are fused multiply-add, and must be done as one >> + * floating point operation with no rounding between the >> + * multiplication and addition steps. > > I got confused first time reading this as we cover F[N]M[ADD|SUB]. > Perhaps that is better enumerated at the top of the function? *shrug* It's an exact copy of the beginnings of the single and double functions. Is it really that confusing? r~
diff --git a/target/arm/translate-a64.c b/target/arm/translate-a64.c index 11b90b7eb0..0cb1fc4d67 100644 --- a/target/arm/translate-a64.c +++ b/target/arm/translate-a64.c @@ -5154,6 +5154,44 @@ static void handle_fp_3src_double(DisasContext *s, bool o0, bool o1, tcg_temp_free_i64(tcg_res); } +/* Floating-point data-processing (3 source) - half precision */ +static void handle_fp_3src_half(DisasContext *s, bool o0, bool o1, + int rd, int rn, int rm, int ra) +{ + TCGv_i32 tcg_op1, tcg_op2, tcg_op3; + TCGv_i32 tcg_res = tcg_temp_new_i32(); + TCGv_ptr fpst = get_fpstatus_ptr(true); + + tcg_op1 = read_fp_hreg(s, rn); + tcg_op2 = read_fp_hreg(s, rm); + tcg_op3 = read_fp_hreg(s, ra); + + /* These are fused multiply-add, and must be done as one + * floating point operation with no rounding between the + * multiplication and addition steps. + * NB that doing the negations here as separate steps is + * correct : an input NaN should come out with its sign bit + * flipped if it is a negated-input. + */ + if (o1 == true) { + tcg_gen_xori_i32(tcg_op3, tcg_op3, 0x8000); + } + + if (o0 != o1) { + tcg_gen_xori_i32(tcg_op1, tcg_op1, 0x8000); + } + + gen_helper_advsimd_muladdh(tcg_res, tcg_op1, tcg_op2, tcg_op3, fpst); + + write_fp_sreg(s, rd, tcg_res); + + tcg_temp_free_ptr(fpst); + tcg_temp_free_i32(tcg_op1); + tcg_temp_free_i32(tcg_op2); + tcg_temp_free_i32(tcg_op3); + tcg_temp_free_i32(tcg_res); +} + /* Floating point data-processing (3 source) * 31 30 29 28 24 23 22 21 20 16 15 14 10 9 5 4 0 * +---+---+---+-----------+------+----+------+----+------+------+------+ @@ -5183,6 +5221,16 @@ static void disas_fp_3src(DisasContext *s, uint32_t insn) } handle_fp_3src_double(s, o0, o1, rd, rn, rm, ra); break; + case 3: + if (!arm_dc_feature(s, ARM_FEATURE_V8_FP16)) { + unallocated_encoding(s); + return; + } + if (!fp_access_check(s)) { + return; + } + handle_fp_3src_half(s, o0, o1, rd, rn, rm, ra); + break; default: unallocated_encoding(s); }
We missed all of the scalar fp16 fma operations. Signed-off-by: Richard Henderson <richard.henderson@linaro.org> --- target/arm/translate-a64.c | 48 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) -- 2.14.3