Message ID | 5652EB5D.8040002@arm.com |
---|---|
State | Superseded |
Headers | show |
On Mon, Nov 23, 2015 at 10:33:01AM +0000, Kyrill Tkachov wrote: > > On 12/11/15 12:05, James Greenhalgh wrote: > >On Tue, Nov 03, 2015 at 03:43:24PM +0000, Kyrill Tkachov wrote: > >>Hi all, > >> > >>Bootstrapped and tested on aarch64. > >> > >>Ok for trunk? > >Comments in-line. > > > > Here's an updated patch according to your comments. > Sorry it took so long to respin it, had other things to deal with, with > stage1 closing... > > I've indented the sample code sequences and used valid mnemonics. > These patterns can only match during combine, so I'd expect them to always > split during combine or immediately after, but I don't think that's a documented > guarantee so I've gated them on !reload_completed. > > I've used IN_RANGE in the predicate.md hunk and added scan-assembler checks > in the tests. > > Is this ok? > > Thanks, > Kyrill > > 2015-11-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com> > > * config/aarch64/aarch64.md (*condjump): Rename to... > (condjump): ... This. > (*compare_condjump<mode>): New define_insn_and_split. > (*compare_cstore<mode>_insn): Likewise. > (*cstore<mode>_insn): Rename to... > (cstore<mode>_insn): ... This. > * config/aarch64/iterators.md (CMP): Handle ne code. > * config/aarch64/predicates.md (aarch64_imm24): New predicate. > > 2015-11-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com> > > * gcc.target/aarch64/cmpimm_branch_1.c: New test. > * gcc.target/aarch64/cmpimm_cset_1.c: Likewise. 
> > commit bb44feed4e6beaae25d9bdffa45073dc61c65838 > Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com> > Date: Mon Sep 21 10:56:47 2015 +0100 > > [AArch64] Improve comparison with complex immediates > > diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md > index 11f6387..3e57d08 100644 > --- a/gcc/config/aarch64/aarch64.md > +++ b/gcc/config/aarch64/aarch64.md > @@ -372,7 +372,7 @@ (define_expand "mod<mode>3" > } > ) > > -(define_insn "*condjump" > +(define_insn "condjump" > [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" > [(match_operand 1 "cc_register" "") (const_int 0)]) > (label_ref (match_operand 2 "" "")) > @@ -397,6 +397,41 @@ (define_insn "*condjump" > (const_int 1)))] > ) > > +;; For a 24-bit immediate CST we can optimize the compare for equality > +;; and branch sequence from: > +;; mov x0, #imm1 > +;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ > +;; cmp x1, x0 > +;; b<ne,eq> .Label > +;; into the shorter: > +;; sub x0, x0, #(CST & 0xfff000) > +;; subs x0, x0, #(CST & 0x000fff) sub x0, x1, #(CST....) ? The transform doesn't make sense otherwise. 
> +;; b<ne,eq> .Label > +(define_insn_and_split "*compare_condjump<mode>" > + [(set (pc) (if_then_else (EQL > + (match_operand:GPI 0 "register_operand" "r") > + (match_operand:GPI 1 "aarch64_imm24" "n")) > + (label_ref:P (match_operand 2 "" "")) > + (pc)))] > + "!aarch64_move_imm (INTVAL (operands[1]), <MODE>mode) > + && !aarch64_plus_operand (operands[1], <MODE>mode) > + && !reload_completed" > + "#" > + "&& true" > + [(const_int 0)] > + { > + HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff; > + HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000; > + rtx tmp = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_add<mode>3 (tmp, operands[0], GEN_INT (-hi_imm))); > + emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); > + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); > + rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); > + emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2])); > + DONE; > + } > +) > + > (define_expand "casesi" > [(match_operand:SI 0 "register_operand" "") ; Index > (match_operand:SI 1 "const_int_operand" "") ; Lower bound > @@ -2901,7 +2936,7 @@ (define_expand "cstore<mode>4" > " > ) > > -(define_insn "*cstore<mode>_insn" > +(define_insn "aarch64_cstore<mode>" > [(set (match_operand:ALLI 0 "register_operand" "=r") > (match_operator:ALLI 1 "aarch64_comparison_operator" > [(match_operand 2 "cc_register" "") (const_int 0)]))] > @@ -2910,6 +2945,40 @@ (define_insn "*cstore<mode>_insn" > [(set_attr "type" "csel")] > ) > > +;; For a 24-bit immediate CST we can optimize the compare for equality > +;; and branch sequence from: > +;; mov x0, #imm1 > +;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ > +;; cmp x1, x0 > +;; cset x2, <ne,eq> > +;; into the shorter: > +;; sub x0, x0, #(CST & 0xfff000) > +;; subs x0, x0, #(CST & 0x000fff) > +;; cset x1, <ne, eq>. Please fix the register allocation in your shorter sequence, these are not equivalent. 
> +(define_insn_and_split "*compare_cstore<mode>_insn" > + [(set (match_operand:GPI 0 "register_operand" "=r") > + (EQL:GPI (match_operand:GPI 1 "register_operand" "r") > + (match_operand:GPI 2 "aarch64_imm24" "n")))] > + "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode) > + && !aarch64_plus_operand (operands[2], <MODE>mode) > + && !reload_completed" > + "#" > + "&& true" > + [(const_int 0)] > + { > + HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff; > + HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000; > + rtx tmp = gen_reg_rtx (<MODE>mode); > + emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm))); > + emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); > + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); > + rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); > + emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg)); > + DONE; > + } > + [(set_attr "type" "csel")] > +) > + > ;; zero_extend version of the above > (define_insn "*cstoresi_insn_uxtw" > [(set (match_operand:DI 0 "register_operand" "=r") > diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md > index c2eb7de..422bc87 100644 > --- a/gcc/config/aarch64/iterators.md > +++ b/gcc/config/aarch64/iterators.md > @@ -824,7 +824,8 @@ (define_code_attr cmp_2 [(lt "1") (le "1") (eq "2") (ge "2") (gt "2") > (ltu "1") (leu "1") (geu "2") (gtu "2")]) > > (define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT") > - (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")]) > + (ltu "LTU") (leu "LEU") (ne "NE") (geu "GEU") > + (gtu "GTU")]) > > (define_code_attr fix_trunc_optab [(fix "fix_trunc") > (unsigned_fix "fixuns_trunc")]) > diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md > index e7f76e0..c0c3ff5 100644 > --- a/gcc/config/aarch64/predicates.md > +++ b/gcc/config/aarch64/predicates.md > @@ -145,6 +145,11 @@ (define_predicate "aarch64_imm3" > (and (match_code "const_int") > 
(match_test "(unsigned HOST_WIDE_INT) INTVAL (op) <= 4"))) > > +;; An immediate that fits into 24 bits. > +(define_predicate "aarch64_imm24" > + (and (match_code "const_int") > + (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)"))) > + > (define_predicate "aarch64_pwr_imm3" > (and (match_code "const_int") > (match_test "INTVAL (op) != 0 > diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c > new file mode 100644 > index 0000000..7ad736b > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c > @@ -0,0 +1,26 @@ > +/* { dg-do compile } */ > +/* { dg-options "-save-temps -O2" } */ > + > +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp. */ > + > +void g (void); > +void > +foo (int x) > +{ > + if (x != 0x123456) > + g (); > +} > + > +void > +fool (long long x) > +{ > + if (x != 0x123456) > + g (); > +} > + > +/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */ > +/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */ > +/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */ > +/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */ > +/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */ > +/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */ > diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c > new file mode 100644 > index 0000000..6a03cc9 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c > @@ -0,0 +1,23 @@ > +/* { dg-do compile } */ > +/* { dg-options "-save-temps -O2" } */ > + > +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp. */ > + > +int > +foo (int x) > +{ > + return x == 0x123456; > +} > + > +long > +fool (long x) > +{ > + return x == 0x123456; > +} > + This test will be broken for ILP32. This should be long long. OK with those comments fixed. Thanks, James
On 23/11/15 14:58, James Greenhalgh wrote: > On Mon, Nov 23, 2015 at 10:33:01AM +0000, Kyrill Tkachov wrote: >> On 12/11/15 12:05, James Greenhalgh wrote: >>> On Tue, Nov 03, 2015 at 03:43:24PM +0000, Kyrill Tkachov wrote: >>>> Hi all, >>>> >>>> Bootstrapped and tested on aarch64. >>>> >>>> Ok for trunk? >>> Comments in-line. >>> >> Here's an updated patch according to your comments. >> Sorry it took so long to respin it, had other things to deal with with >> stage1 closing... >> >> I've indented the sample code sequences and used valid mnemonics. >> These patterns can only match during combine, so I'd expect them to always >> split during combine or immediately after, but I don't think that's a documented >> guarantee so I've gated them on !reload_completed. >> >> I've used IN_RANGE in the predicate.md hunk and added scan-assembler checks >> in the tests. >> >> Is this ok? >> >> Thanks, >> Kyrill >> >> 2015-11-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com> >> >> * config/aarch64/aarch64.md (*condjump): Rename to... >> (condjump): ... This. >> (*compare_condjump<mode>): New define_insn_and_split. >> (*compare_cstore<mode>_insn): Likewise. >> (*cstore<mode>_insn): Rename to... >> (cstore<mode>_insn): ... This. >> * config/aarch64/iterators.md (CMP): Handle ne code. >> * config/aarch64/predicates.md (aarch64_imm24): New predicate. >> >> 2015-11-20 Kyrylo Tkachov <kyrylo.tkachov@arm.com> >> >> * gcc.target/aarch64/cmpimm_branch_1.c: New test. >> * gcc.target/aarch64/cmpimm_cset_1.c: Likewise. 
>> >> commit bb44feed4e6beaae25d9bdffa45073dc61c65838 >> Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com> >> Date: Mon Sep 21 10:56:47 2015 +0100 >> >> [AArch64] Improve comparison with complex immediates >> >> diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md >> index 11f6387..3e57d08 100644 >> --- a/gcc/config/aarch64/aarch64.md >> +++ b/gcc/config/aarch64/aarch64.md >> @@ -372,7 +372,7 @@ (define_expand "mod<mode>3" >> } >> ) >> >> -(define_insn "*condjump" >> +(define_insn "condjump" >> [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" >> [(match_operand 1 "cc_register" "") (const_int 0)]) >> (label_ref (match_operand 2 "" "")) >> @@ -397,6 +397,41 @@ (define_insn "*condjump" >> (const_int 1)))] >> ) >> >> +;; For a 24-bit immediate CST we can optimize the compare for equality >> +;; and branch sequence from: >> +;; mov x0, #imm1 >> +;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ >> +;; cmp x1, x0 >> +;; b<ne,eq> .Label >> +;; into the shorter: >> +;; sub x0, x0, #(CST & 0xfff000) >> +;; subs x0, x0, #(CST & 0x000fff) > sub x0, x1, #(CST....) ? > > The transform doesn't make sense otherwise. Doh, yes. The source should be x1 of course. 
Kyrill > >> +;; b<ne,eq> .Label >> +(define_insn_and_split "*compare_condjump<mode>" >> + [(set (pc) (if_then_else (EQL >> + (match_operand:GPI 0 "register_operand" "r") >> + (match_operand:GPI 1 "aarch64_imm24" "n")) >> + (label_ref:P (match_operand 2 "" "")) >> + (pc)))] >> + "!aarch64_move_imm (INTVAL (operands[1]), <MODE>mode) >> + && !aarch64_plus_operand (operands[1], <MODE>mode) >> + && !reload_completed" >> + "#" >> + "&& true" >> + [(const_int 0)] >> + { >> + HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff; >> + HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000; >> + rtx tmp = gen_reg_rtx (<MODE>mode); >> + emit_insn (gen_add<mode>3 (tmp, operands[0], GEN_INT (-hi_imm))); >> + emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); >> + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); >> + rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); >> + emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2])); >> + DONE; >> + } >> +) >> + >> (define_expand "casesi" >> [(match_operand:SI 0 "register_operand" "") ; Index >> (match_operand:SI 1 "const_int_operand" "") ; Lower bound >> @@ -2901,7 +2936,7 @@ (define_expand "cstore<mode>4" >> " >> ) >> >> -(define_insn "*cstore<mode>_insn" >> +(define_insn "aarch64_cstore<mode>" >> [(set (match_operand:ALLI 0 "register_operand" "=r") >> (match_operator:ALLI 1 "aarch64_comparison_operator" >> [(match_operand 2 "cc_register" "") (const_int 0)]))] >> @@ -2910,6 +2945,40 @@ (define_insn "*cstore<mode>_insn" >> [(set_attr "type" "csel")] >> ) >> >> +;; For a 24-bit immediate CST we can optimize the compare for equality >> +;; and branch sequence from: >> +;; mov x0, #imm1 >> +;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ >> +;; cmp x1, x0 >> +;; cset x2, <ne,eq> >> +;; into the shorter: >> +;; sub x0, x0, #(CST & 0xfff000) >> +;; subs x0, x0, #(CST & 0x000fff) >> +;; cset x1, <ne, eq>. > Please fix the register allocation in your shorter sequence, these > are not equivalent. 
> >> +(define_insn_and_split "*compare_cstore<mode>_insn" >> + [(set (match_operand:GPI 0 "register_operand" "=r") >> + (EQL:GPI (match_operand:GPI 1 "register_operand" "r") >> + (match_operand:GPI 2 "aarch64_imm24" "n")))] >> + "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode) >> + && !aarch64_plus_operand (operands[2], <MODE>mode) >> + && !reload_completed" >> + "#" >> + "&& true" >> + [(const_int 0)] >> + { >> + HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff; >> + HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000; >> + rtx tmp = gen_reg_rtx (<MODE>mode); >> + emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm))); >> + emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); >> + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); >> + rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); >> + emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg)); >> + DONE; >> + } >> + [(set_attr "type" "csel")] >> +) >> + >> ;; zero_extend version of the above >> (define_insn "*cstoresi_insn_uxtw" >> [(set (match_operand:DI 0 "register_operand" "=r") >> diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md >> index c2eb7de..422bc87 100644 >> --- a/gcc/config/aarch64/iterators.md >> +++ b/gcc/config/aarch64/iterators.md >> @@ -824,7 +824,8 @@ (define_code_attr cmp_2 [(lt "1") (le "1") (eq "2") (ge "2") (gt "2") >> (ltu "1") (leu "1") (geu "2") (gtu "2")]) >> >> (define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT") >> - (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")]) >> + (ltu "LTU") (leu "LEU") (ne "NE") (geu "GEU") >> + (gtu "GTU")]) >> >> (define_code_attr fix_trunc_optab [(fix "fix_trunc") >> (unsigned_fix "fixuns_trunc")]) >> diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md >> index e7f76e0..c0c3ff5 100644 >> --- a/gcc/config/aarch64/predicates.md >> +++ b/gcc/config/aarch64/predicates.md >> @@ -145,6 +145,11 @@ (define_predicate 
"aarch64_imm3" >> (and (match_code "const_int") >> (match_test "(unsigned HOST_WIDE_INT) INTVAL (op) <= 4"))) >> >> +;; An immediate that fits into 24 bits. >> +(define_predicate "aarch64_imm24" >> + (and (match_code "const_int") >> + (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)"))) >> + >> (define_predicate "aarch64_pwr_imm3" >> (and (match_code "const_int") >> (match_test "INTVAL (op) != 0 >> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c >> new file mode 100644 >> index 0000000..7ad736b >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c >> @@ -0,0 +1,26 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-save-temps -O2" } */ >> + >> +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp. */ >> + >> +void g (void); >> +void >> +foo (int x) >> +{ >> + if (x != 0x123456) >> + g (); >> +} >> + >> +void >> +fool (long long x) >> +{ >> + if (x != 0x123456) >> + g (); >> +} >> + >> +/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */ >> +/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */ >> +/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */ >> +/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */ >> +/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */ >> +/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */ >> diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c >> new file mode 100644 >> index 0000000..6a03cc9 >> --- /dev/null >> +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c >> @@ -0,0 +1,23 @@ >> +/* { dg-do compile } */ >> +/* { dg-options "-save-temps -O2" } */ >> + >> +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp. 
*/ >> + >> +int >> +foo (int x) >> +{ >> + return x == 0x123456; >> +} >> + >> +long >> +fool (long x) >> +{ >> + return x == 0x123456; >> +} >> + > This test will be broken for ILP32. This should be long long. > > OK with those comments fixed. Thanks, I'll prepare an updated version. Kyrill > Thanks, > James >
commit bb44feed4e6beaae25d9bdffa45073dc61c65838 Author: Kyrylo Tkachov <kyrylo.tkachov@arm.com> Date: Mon Sep 21 10:56:47 2015 +0100 [AArch64] Improve comparison with complex immediates diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md index 11f6387..3e57d08 100644 --- a/gcc/config/aarch64/aarch64.md +++ b/gcc/config/aarch64/aarch64.md @@ -372,7 +372,7 @@ (define_expand "mod<mode>3" } ) -(define_insn "*condjump" +(define_insn "condjump" [(set (pc) (if_then_else (match_operator 0 "aarch64_comparison_operator" [(match_operand 1 "cc_register" "") (const_int 0)]) (label_ref (match_operand 2 "" "")) @@ -397,6 +397,41 @@ (define_insn "*condjump" (const_int 1)))] ) +;; For a 24-bit immediate CST we can optimize the compare for equality +;; and branch sequence from: +;; mov x0, #imm1 +;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ +;; cmp x1, x0 +;; b<ne,eq> .Label +;; into the shorter: +;; sub x0, x0, #(CST & 0xfff000) +;; subs x0, x0, #(CST & 0x000fff) +;; b<ne,eq> .Label +(define_insn_and_split "*compare_condjump<mode>" + [(set (pc) (if_then_else (EQL + (match_operand:GPI 0 "register_operand" "r") + (match_operand:GPI 1 "aarch64_imm24" "n")) + (label_ref:P (match_operand 2 "" "")) + (pc)))] + "!aarch64_move_imm (INTVAL (operands[1]), <MODE>mode) + && !aarch64_plus_operand (operands[1], <MODE>mode) + && !reload_completed" + "#" + "&& true" + [(const_int 0)] + { + HOST_WIDE_INT lo_imm = UINTVAL (operands[1]) & 0xfff; + HOST_WIDE_INT hi_imm = UINTVAL (operands[1]) & 0xfff000; + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_add<mode>3 (tmp, operands[0], GEN_INT (-hi_imm))); + emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); + rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); + emit_jump_insn (gen_condjump (cmp_rtx, cc_reg, operands[2])); + DONE; + } +) + (define_expand "casesi" [(match_operand:SI 0 "register_operand" "") ; Index (match_operand:SI 1 
"const_int_operand" "") ; Lower bound @@ -2901,7 +2936,7 @@ (define_expand "cstore<mode>4" " ) -(define_insn "*cstore<mode>_insn" +(define_insn "aarch64_cstore<mode>" [(set (match_operand:ALLI 0 "register_operand" "=r") (match_operator:ALLI 1 "aarch64_comparison_operator" [(match_operand 2 "cc_register" "") (const_int 0)]))] @@ -2910,6 +2945,40 @@ (define_insn "*cstore<mode>_insn" [(set_attr "type" "csel")] ) +;; For a 24-bit immediate CST we can optimize the compare for equality +;; and branch sequence from: +;; mov x0, #imm1 +;; movk x0, #imm2, lsl 16 /* x0 contains CST. */ +;; cmp x1, x0 +;; cset x2, <ne,eq> +;; into the shorter: +;; sub x0, x0, #(CST & 0xfff000) +;; subs x0, x0, #(CST & 0x000fff) +;; cset x1, <ne, eq>. +(define_insn_and_split "*compare_cstore<mode>_insn" + [(set (match_operand:GPI 0 "register_operand" "=r") + (EQL:GPI (match_operand:GPI 1 "register_operand" "r") + (match_operand:GPI 2 "aarch64_imm24" "n")))] + "!aarch64_move_imm (INTVAL (operands[2]), <MODE>mode) + && !aarch64_plus_operand (operands[2], <MODE>mode) + && !reload_completed" + "#" + "&& true" + [(const_int 0)] + { + HOST_WIDE_INT lo_imm = UINTVAL (operands[2]) & 0xfff; + HOST_WIDE_INT hi_imm = UINTVAL (operands[2]) & 0xfff000; + rtx tmp = gen_reg_rtx (<MODE>mode); + emit_insn (gen_add<mode>3 (tmp, operands[1], GEN_INT (-hi_imm))); + emit_insn (gen_add<mode>3_compare0 (tmp, tmp, GEN_INT (-lo_imm))); + rtx cc_reg = gen_rtx_REG (CC_NZmode, CC_REGNUM); + rtx cmp_rtx = gen_rtx_fmt_ee (<EQL:CMP>, <MODE>mode, cc_reg, const0_rtx); + emit_insn (gen_aarch64_cstore<mode> (operands[0], cmp_rtx, cc_reg)); + DONE; + } + [(set_attr "type" "csel")] +) + ;; zero_extend version of the above (define_insn "*cstoresi_insn_uxtw" [(set (match_operand:DI 0 "register_operand" "=r") diff --git a/gcc/config/aarch64/iterators.md b/gcc/config/aarch64/iterators.md index c2eb7de..422bc87 100644 --- a/gcc/config/aarch64/iterators.md +++ b/gcc/config/aarch64/iterators.md @@ -824,7 +824,8 @@ (define_code_attr 
cmp_2 [(lt "1") (le "1") (eq "2") (ge "2") (gt "2") (ltu "1") (leu "1") (geu "2") (gtu "2")]) (define_code_attr CMP [(lt "LT") (le "LE") (eq "EQ") (ge "GE") (gt "GT") - (ltu "LTU") (leu "LEU") (geu "GEU") (gtu "GTU")]) + (ltu "LTU") (leu "LEU") (ne "NE") (geu "GEU") + (gtu "GTU")]) (define_code_attr fix_trunc_optab [(fix "fix_trunc") (unsigned_fix "fixuns_trunc")]) diff --git a/gcc/config/aarch64/predicates.md b/gcc/config/aarch64/predicates.md index e7f76e0..c0c3ff5 100644 --- a/gcc/config/aarch64/predicates.md +++ b/gcc/config/aarch64/predicates.md @@ -145,6 +145,11 @@ (define_predicate "aarch64_imm3" (and (match_code "const_int") (match_test "(unsigned HOST_WIDE_INT) INTVAL (op) <= 4"))) +;; An immediate that fits into 24 bits. +(define_predicate "aarch64_imm24" + (and (match_code "const_int") + (match_test "IN_RANGE (UINTVAL (op), 0, 0xffffff)"))) + (define_predicate "aarch64_pwr_imm3" (and (match_code "const_int") (match_test "INTVAL (op) != 0 diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c new file mode 100644 index 0000000..7ad736b --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_branch_1.c @@ -0,0 +1,26 @@ +/* { dg-do compile } */ +/* { dg-options "-save-temps -O2" } */ + +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp. 
*/ + +void g (void); +void +foo (int x) +{ + if (x != 0x123456) + g (); +} + +void +fool (long long x) +{ + if (x != 0x123456) + g (); +} + +/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */ +/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */ +/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */ +/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */ +/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */ +/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */ diff --git a/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c new file mode 100644 index 0000000..6a03cc9 --- /dev/null +++ b/gcc/testsuite/gcc.target/aarch64/cmpimm_cset_1.c @@ -0,0 +1,23 @@ +/* { dg-do compile } */ +/* { dg-options "-save-temps -O2" } */ + +/* Test that we emit a sub+subs sequence rather than mov+movk+cmp. */ + +int +foo (int x) +{ + return x == 0x123456; +} + +long +fool (long x) +{ + return x == 0x123456; +} + +/* { dg-final { scan-assembler-not "cmp\tw\[0-9\]*.*" } } */ +/* { dg-final { scan-assembler-not "cmp\tx\[0-9\]*.*" } } */ +/* { dg-final { scan-assembler-times "sub\tw\[0-9\]+.*" 1 } } */ +/* { dg-final { scan-assembler-times "sub\tx\[0-9\]+.*" 1 } } */ +/* { dg-final { scan-assembler-times "subs\tw\[0-9\]+.*" 1 } } */ +/* { dg-final { scan-assembler-times "subs\tx\[0-9\]+.*" 1 } } */