diff mbox

[RFC] Fix PR rtl-optimization/59461

Message ID 3031993.Iryjn4xZby@polaris
State New
Headers show

Commit Message

Eric Botcazou Nov. 7, 2016, 10:39 a.m. UTC
It's a missed optimization of a redundant zero-extension on the SPARC, which 
originally comes from PR rtl-optimization/58295 for ARM.  The extension is 
eliminated on the ARM because the load is explicitly zero-extended in RTL;
on the SPARC the load is implicitly zero-extended by means of LOAD_EXTEND_OP 
and the combiner is blocked by limitations of the nonzero_bits machinery.

The approach is two-pronged:
 1. it lifts a limitation in reg_nonzero_bits_for_combine that was recently 
added (https://gcc.gnu.org/ml/gcc-patches/2013-11/msg03782.html) and that 
prevents the combiner from reasoning on larger modes under certain circumstances.
 2. it makes nonzero_bits1 propagate results from inner REGs to paradoxical 
SUBREGs if both WORD_REGISTER_OPERATIONS and LOAD_EXTEND_OP are set.

This also eliminates quite a few zero-extensions in the compile.exp testsuite 
at -O2 on the SPARC.  Tested on x86-64/Linux and SPARC/Solaris.


2016-11-07  Eric Botcazou  <ebotcazou@adacore.com>

	PR rtl-optimization/59461
	* doc/rtl.texi (paradoxical subregs): Add missing word.
	* combine.c (reg_nonzero_bits_for_combine): Do not discard results
	in modes with precision larger than that of last_set_mode.
	* rtlanal.c (nonzero_bits1) <SUBREG>: If WORD_REGISTER_OPERATIONS is
	set and LOAD_EXTEND_OP is appropriate, propagate results from inner
	REGs to paradoxical SUBREGs.
	(num_sign_bit_copies1) <SUBREG>: Likewise.  Check that the mode is not
	larger than a word before invoking LOAD_EXTEND_OP on it.


2016-11-07  Eric Botcazou  <ebotcazou@adacore.com>

	* gcc.target/sparc/pr59461.c: New test.

-- 
Eric Botcazou

Comments

Eric Botcazou Nov. 11, 2016, 10:40 p.m. UTC | #1
> This also eliminate quite a few zero-extensions in the compile.exp testsuite
> at -O2 on the SPARC.  Tested on x86-64/Linux and SPARC/Solaris.
> 
> 
> 2016-11-07  Eric Botcazou  <ebotcazou@adacore.com>
> 
> 	PR rtl-optimization/59461
> 	* doc/rtl.texi (paradoxical subregs): Add missing word.
> 	* combine.c (reg_nonzero_bits_for_combine): Do not discard results
> 	in modes with precision larger than that of last_set_mode.
> 	* rtlanal.c (nonzero_bits1) <SUBREG>: If WORD_REGISTER_OPERATIONS is
> 	set and LOAD_EXTEND_OP is appropriate, propagate results from inner
> 	REGs to paradoxical SUBREGs.
> 	(num_sign_bit_copies1) <SUBREG>: Likewise.  Check that the mode is not
> 	larger than a word before invoking LOAD_EXTEND_OP on it.


I have installed it after testing on ARM/EABI and IA-64/Linux.

-- 
Eric Botcazou
diff mbox

Patch

Index: doc/rtl.texi
===================================================================
--- doc/rtl.texi	(revision 241856)
+++ doc/rtl.texi	(working copy)
@@ -1882,7 +1882,7 @@  When used as an rvalue, the low-order bi
 taken from @var{reg} while the high-order bits may or may not be
 defined.
 
-The high-order bits of rvalues are in the following circumstances:
+The high-order bits of rvalues are defined in the following circumstances:
 
 @itemize
 @item @code{subreg}s of @code{mem}
Index: combine.c
===================================================================
--- combine.c	(revision 241856)
+++ combine.c	(working copy)
@@ -9878,18 +9878,17 @@  reg_nonzero_bits_for_combine (const_rtx
 		  (DF_LR_IN (ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb),
 		   REGNO (x)))))
     {
-      unsigned HOST_WIDE_INT mask = rsp->last_set_nonzero_bits;
-
-      if (GET_MODE_PRECISION (rsp->last_set_mode) < GET_MODE_PRECISION (mode))
-	/* We don't know anything about the upper bits.  */
-	mask |= GET_MODE_MASK (mode) ^ GET_MODE_MASK (rsp->last_set_mode);
-
-      *nonzero &= mask;
+      /* Note that, even if the precision of last_set_mode is lower than that
+	 of mode, record_value_for_reg invoked nonzero_bits on the register
+	 with nonzero_bits_mode (because last_set_mode is necessarily integral
+	 and HWI_COMPUTABLE_MODE_P in this case) so bits in nonzero_bits_mode
+	 are all valid, hence in mode too since nonzero_bits_mode is defined
+	 to the largest HWI_COMPUTABLE_MODE_P mode.  */
+      *nonzero &= rsp->last_set_nonzero_bits;
       return NULL;
     }
 
   tem = get_last_value (x);
-
   if (tem)
     {
       if (SHORT_IMMEDIATES_SIGN_EXTEND)
@@ -9898,7 +9897,8 @@  reg_nonzero_bits_for_combine (const_rtx
 
       return tem;
     }
-  else if (nonzero_sign_valid && rsp->nonzero_bits)
+
+  if (nonzero_sign_valid && rsp->nonzero_bits)
     {
       unsigned HOST_WIDE_INT mask = rsp->nonzero_bits;
 
Index: rtlanal.c
===================================================================
--- rtlanal.c	(revision 241856)
+++ rtlanal.c	(working copy)
@@ -4242,7 +4242,7 @@  cached_nonzero_bits (const_rtx x, machin
 /* Given an expression, X, compute which bits in X can be nonzero.
    We don't care about bits outside of those defined in MODE.
 
-   For most X this is simply GET_MODE_MASK (GET_MODE (MODE)), but if X is
+   For most X this is simply GET_MODE_MASK (GET_MODE (X)), but if X is
    an arithmetic operation, we can do better.  */
 
 static unsigned HOST_WIDE_INT
@@ -4549,18 +4549,17 @@  nonzero_bits1 (const_rtx x, machine_mode
       /* If this is a SUBREG formed for a promoted variable that has
 	 been zero-extended, we know that at least the high-order bits
 	 are zero, though others might be too.  */
-
       if (SUBREG_PROMOTED_VAR_P (x) && SUBREG_PROMOTED_UNSIGNED_P (x))
 	nonzero = GET_MODE_MASK (GET_MODE (x))
 		  & cached_nonzero_bits (SUBREG_REG (x), GET_MODE (x),
 					 known_x, known_mode, known_ret);
 
-      inner_mode = GET_MODE (SUBREG_REG (x));
       /* If the inner mode is a single word for both the host and target
 	 machines, we can compute this from which bits of the inner
 	 object might be nonzero.  */
+      inner_mode = GET_MODE (SUBREG_REG (x));
       if (GET_MODE_PRECISION (inner_mode) <= BITS_PER_WORD
-	  && (GET_MODE_PRECISION (inner_mode) <= HOST_BITS_PER_WIDE_INT))
+	  && GET_MODE_PRECISION (inner_mode) <= HOST_BITS_PER_WIDE_INT)
 	{
 	  nonzero &= cached_nonzero_bits (SUBREG_REG (x), mode,
 					  known_x, known_mode, known_ret);
@@ -4568,19 +4567,17 @@  nonzero_bits1 (const_rtx x, machine_mode
           /* On many CISC machines, accessing an object in a wider mode
 	     causes the high-order bits to become undefined.  So they are
 	     not known to be zero.  */
-	  if (!WORD_REGISTER_OPERATIONS
-	      /* If this is a typical RISC machine, we only have to worry
-		 about the way loads are extended.  */
-	      || ((LOAD_EXTEND_OP (inner_mode) == SIGN_EXTEND
-		     ? val_signbit_known_set_p (inner_mode, nonzero)
-		     : LOAD_EXTEND_OP (inner_mode) != ZERO_EXTEND)
-		   || !MEM_P (SUBREG_REG (x))))
-	    {
-	      if (GET_MODE_PRECISION (GET_MODE (x))
+	  if ((!WORD_REGISTER_OPERATIONS
+	       /* If this is a typical RISC machine, we only have to worry
+		  about the way loads are extended.  */
+		|| (LOAD_EXTEND_OP (inner_mode) == SIGN_EXTEND
+		    ? val_signbit_known_set_p (inner_mode, nonzero)
+		    : LOAD_EXTEND_OP (inner_mode) != ZERO_EXTEND)
+		|| (!MEM_P (SUBREG_REG (x)) && !REG_P (SUBREG_REG (x))))
+	      && GET_MODE_PRECISION (GET_MODE (x))
 		  > GET_MODE_PRECISION (inner_mode))
-		nonzero |= (GET_MODE_MASK (GET_MODE (x))
-			    & ~GET_MODE_MASK (inner_mode));
-	    }
+	    nonzero
+	      |= (GET_MODE_MASK (GET_MODE (x)) & ~GET_MODE_MASK (inner_mode));
 	}
       break;
 
@@ -4785,6 +4782,7 @@  num_sign_bit_copies1 (const_rtx x, machi
 {
   enum rtx_code code = GET_CODE (x);
   unsigned int bitwidth = GET_MODE_PRECISION (mode);
+  machine_mode inner_mode;
   int num0, num1, result;
   unsigned HOST_WIDE_INT nonzero;
 
@@ -4892,13 +4890,13 @@  num_sign_bit_copies1 (const_rtx x, machi
 	}
 
       /* For a smaller object, just ignore the high bits.  */
-      if (bitwidth <= GET_MODE_PRECISION (GET_MODE (SUBREG_REG (x))))
+      inner_mode = GET_MODE (SUBREG_REG (x));
+      if (bitwidth <= GET_MODE_PRECISION (inner_mode))
 	{
 	  num0 = cached_num_sign_bit_copies (SUBREG_REG (x), VOIDmode,
 					     known_x, known_mode, known_ret);
-	  return MAX (1, (num0
-			  - (int) (GET_MODE_PRECISION (GET_MODE (SUBREG_REG (x)))
-				   - bitwidth)));
+	  return
+	    MAX (1, num0 - (int) (GET_MODE_PRECISION (inner_mode) - bitwidth));
 	}
 
       /* For paradoxical SUBREGs on machines where all register operations
@@ -4912,9 +4910,10 @@  num_sign_bit_copies1 (const_rtx x, machi
 	 to the stack.  */
 
       if (WORD_REGISTER_OPERATIONS
+	  && GET_MODE_PRECISION (inner_mode) <= BITS_PER_WORD
+	  && LOAD_EXTEND_OP (inner_mode) == SIGN_EXTEND
 	  && paradoxical_subreg_p (x)
-	  && LOAD_EXTEND_OP (GET_MODE (SUBREG_REG (x))) == SIGN_EXTEND
-	  && MEM_P (SUBREG_REG (x)))
+	  && (MEM_P (SUBREG_REG (x)) || REG_P (SUBREG_REG (x))))
 	return cached_num_sign_bit_copies (SUBREG_REG (x), mode,
 					   known_x, known_mode, known_ret);
       break;