Issue 6099055: [PATCH] Take branch misprediction effects into account when RTL loop unrolling

Can't Edit
Can't Publish+Mail
Start Review

Created:
13 years, 6 months ago by tejohnson

Modified:
11 years, 1 month ago

Reviewers:
pinskia, richard.guenther, andi, stevenb.gcc, davidxl, izamyatin

CC:
gcc-patches_gcc.gnu.org

Base URL:
svn+ssh://gcc.gnu.org/svn/gcc/trunk/gcc/

Visibility:
Public.

Patch Set 2 : [PATCH] Take branch misprediction effects into account when RTL loop unrolling #

Patch Set 3 : [PATCH] Take branch misprediction effects into account when RTL loop unrolling #

Created: 13 years, 6 months ago

Download [raw] [tar.bz2]

	Unified diffs	Side-by-side diffs	Delta from patch set	Stats (+247 lines, -17 lines)			Patch
M	cfgloop.h	View	1 2	4 chunks	+15 lines, -2 lines	0 comments	Download
M	cfgloop.c	View	1 2	1 chunk	+84 lines, -10 lines	0 comments	Download
M	doc/invoke.texi	View	1	1 chunk	+6 lines, -0 lines	0 comments	Download
M	loop-iv.c	View	1 2	1 chunk	+6 lines, -2 lines	0 comments	Download
M	loop-unroll.c	View	1 2	7 chunks	+127 lines, -3 lines	0 comments	Download
M	params.def	View	1	1 chunk	+9 lines, -0 lines	0 comments	Download

Messages

Total messages: 24

Expand All Messages | Collapse All Messages

tejohnson

This patch adds heuristics to limit unrolling in loops with branches that may increase branch ...

13 years, 6 months ago (2012-04-24 21:26:49 UTC) #1

This patch adds heuristics to limit unrolling in loops with branches that may
increase
branch mispredictions. It affects loops that are not frequently iterated, and
that are
nested within a hot region of code that already contains many branch
instructions.

Performance tested with both internal benchmarks and with SPEC 2000/2006 on a
variety
of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD
Opteron systems.
This improves performance of an internal search indexing benchmark by close to
2% on
all the tested Intel platforms.  It also consistently improves 445.gobmk (with
FDO feedback
where unrolling kicks in) by close to 1% on AMD Opteron. Other performance
effects are
neutral.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?

Thanks,
Teresa

2012-04-24   Teresa Johnson  <tejohnson@google.com>

	* loop-unroll.c (loop_has_call): New function.
	(loop_has_FP_comp): Ditto.
	(compute_weighted_branches): Ditto.
	(max_unroll_with_branches): Ditto.
	(decide_unroll_constant_iterations): Add heuristic to avoid
        increasing branch mispredicts when unrolling.
	(decide_unroll_runtime_iterations): Ditto.
	* params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.

Index: loop-unroll.c
===================================================================
--- loop-unroll.c	(revision 186783)
+++ loop-unroll.c	(working copy)
@@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
 					     basic_block);
 static rtx get_expansion (struct var_to_expand *);
 
+/* Determine whether LOOP contains call.  */
+static bool
+loop_has_call(struct loop *loop)
+{
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          if (CALL_P (insn))
+            {
+              free (body);
+              return true;
+            }
+        }
+    }
+  free (body);
+  return false;
+}
+
+/* Determine whether LOOP contains floating-point computation.  */
+static bool
+loop_has_FP_comp(struct loop *loop)
+{
+  rtx set, dest;
+  basic_block *body, bb;
+  unsigned i;
+  rtx insn;
+
+  body = get_loop_body (loop);
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      bb = body[i];
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          set = single_set (insn);
+          if (!set)
+            continue;
+
+          dest = SET_DEST (set);
+          if (FLOAT_MODE_P (GET_MODE (dest)))
+            {
+              free (body);
+              return true;
+            }
+        }
+    }
+  free (body);
+  return false;
+}
+
+/* Compute the number of branches in LOOP, weighted by execution counts.  */
+static float
+compute_weighted_branches(struct loop *loop)
+{
+  int header_count = loop->header->count;
+  unsigned i;
+  float n;
+  basic_block * body;
+
+  /* If no profile feedback data exists, don't limit unrolling  */
+  if (header_count == 0)
+    return 0.0;
+
+  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
+
+  body = get_loop_body (loop);
+  n = 0.0;
+  for (i = 0; i < loop->num_nodes; i++)
+    {
+      if (EDGE_COUNT (body[i]->succs) >= 2)
+        {
+          /* If this block is executed less frequently than the header (loop
+             entry), then it is weighted based on the ratio of times it is
+             executed compared to the header.  */
+          if (body[i]->count < header_count)
+            n += ((float)body[i]->count)/header_count;
+
+          /* When it is executed more frequently than the header (i.e. it is
+             in a nested inner loop), simply weight the branch at 1.0.  */
+          else
+            n += 1.0;
+        }
+    }
+  free (body);
+
+  return n;
+}
+
+/* Compute the maximum number of times LOOP can be unrolled without exceeding
+   a branch budget, which can increase branch mispredictions. The number of
+   branches is computed by weighting each branch with its expected execution
+   probability through the loop based on profile data. If no profile feedback
+   data exists, simply return the current NUNROLL factor.  */
+static unsigned
+max_unroll_with_branches(struct loop *loop, unsigned nunroll)
+{
+  struct loop *outer;
+  struct niter_desc *outer_desc;
+  int outer_niters = 1;
+  float weighted_outer_branches = 0.0;
+  float weighted_num_branches = compute_weighted_branches (loop);
+
+  /* If there was no profile feedback data, weighted_num_branches will be 0.0
+     and we won't limit unrolling. If the weighted_num_branches is at most 1.0,
+     also don't limit unrolling as the back-edge branch will not be duplicated.
 */
+  if (weighted_num_branches <= 1.0)
+    return nunroll;
+
+  /* Walk up the loop tree until we find a hot outer loop in which the current
+     loop is nested. At that point we will compute the number of times the
+     current loop can be unrolled based on the number of branches in the hot
+     outer loop.  */
+  outer = loop_outer(loop);
+  /* The loop structure contains a fake outermost loop, so this should always
+     be non-NULL for our current loop.  */
+  gcc_assert (outer);
+  /* Detect if this is the fake outermost loop (at which point we are done)
+     by checking its outer loop.  */
+  while (loop_outer(outer))
+    {
+      outer_desc = get_simple_loop_desc (outer);
+
+      if (outer_desc->const_iter)
+        outer_niters *= outer_desc->niter;
+      else if (outer->header->count)
+        outer_niters *= expected_loop_iterations (outer);
+
+      weighted_outer_branches = compute_weighted_branches (outer);
+
+      /* Should have been checked by caller.  */
+      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
+
+      /* If the outer loop has enough iterations to be considered hot, then
+         we can stop our upwards loop tree traversal and examine the current
+         outer loop.  */
+      if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+        {
+          /* Assume that any call will cause the branch budget to be exceeded,
+             and that we can't unroll the current loop without increasing
+             mispredicts.  */
+          if (loop_has_call(outer))
+            return 0;
+
+          /* Otherwise, compute the maximum number of times current loop can be
+             unrolled without exceeding our branch budget. First we subtract
+             off the outer loop's weighted branch count from the budget. Note
+             that this includes the branches in the current loop. This yields
+             the number of branches left in the budget for the unrolled copies.
+             We divide this by the number of branches in the current loop that
+             must be duplicated when we unroll, which is the total weighted
+             number of branches minus the back-edge branch. This yields the
+             number of new loop body copies that can be created by unrolling
+             without exceeding the budget, to which we add 1 to get the unroll
+             factor.  */
+          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
+                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
+        }
+      outer = loop_outer(outer);
+    }
+
+  /* The current loop is not enclosed by a hot enough outer loop in this
+     procedure, since the hot outer loop is inter-procedural, assume that
+     it already contains a significant number of branches, so don't unroll.  */
+  return 0;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -522,6 +696,7 @@ static void
 decide_unroll_constant_iterations (struct loop *loop, int flags)
 {
   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
+  unsigned nunroll_branches;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts. Ignore loops with FP computation as these
+     tend to benefit much more consistently from unrolling.  */
+  if (num_loop_branches (loop) > 1
+      && loop_has_FP_comp(loop)
+      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
+      && desc->niter < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+    {
+      nunroll_branches = max_unroll_with_branches(loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+	    fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* Check whether the loop rolls enough to consider.  */
   if (desc->niter < 2 * nunroll)
     {
@@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
 static void
 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 {
-  unsigned nunroll, nunroll_by_av, i;
+  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts. Ignore loops with FP computation as these
+     tend to benefit much more consistently from unrolling.  */
+  if (num_loop_branches (loop) > 1
+      && loop_has_FP_comp(loop)
+      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
+      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
+    {
+      nunroll_branches = max_unroll_with_branches(loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+            fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* If we have profile feedback, check whether the loop rolls.  */
   if ((loop->header->count
        && expected_loop_iterations (loop) < 2 * nunroll)
Index: params.def
===================================================================
--- params.def	(revision 186783)
+++ params.def	(working copy)
@@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
 
+DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
+	"min-iter-unroll-with-branches",
+	"Minimum iteration count to ignore branch effects when unrolling",
+	50, 0, 0)
+
+DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
+	"unroll-outer-loop-branch-budget",
+	"Maximum number of branches allowed in hot outer loop region after unroll",
+	25, 0, 0)
+
 /* The maximum number of insns of an unswitched loop.  */
 DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
 	"max-unswitch-insns",

--
This patch is available for review at http://codereview.appspot.com/6099055

pinskia_gmail.com

On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > This patch ...

13 years, 6 months ago (2012-04-24 21:31:13 UTC) #2

On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
> This patch adds heuristics to limit unrolling in loops with branches that may
increase
> branch mispredictions. It affects loops that are not frequently iterated, and
that are
> nested within a hot region of code that already contains many branch
instructions.
>
> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a
variety
> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD
Opteron systems.
> This improves performance of an internal search indexing benchmark by close to
2% on
> all the tested Intel platforms.  It also consistently improves 445.gobmk (with
FDO feedback
> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance
effects are
> neutral.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>
> Thanks,
> Teresa
>
> 2012-04-24   Teresa Johnson  <tejohnson@google.com>
>
>        * loop-unroll.c (loop_has_call): New function.
>        (loop_has_FP_comp): Ditto.
>        (compute_weighted_branches): Ditto.
>        (max_unroll_with_branches): Ditto.
>        (decide_unroll_constant_iterations): Add heuristic to avoid
>        increasing branch mispredicts when unrolling.
>        (decide_unroll_runtime_iterations): Ditto.
>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>
> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c       (revision 186783)
> +++ loop-unroll.c       (working copy)
> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>                                             basic_block);
>  static rtx get_expansion (struct var_to_expand *);
>
> +/* Determine whether LOOP contains call.  */
> +static bool
> +loop_has_call(struct loop *loop)
> +{
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          if (CALL_P (insn))
> +            {
> +              free (body);
> +              return true;
> +            }
> +        }
> +    }
> +  free (body);
> +  return false;
> +}
> +
> +/* Determine whether LOOP contains floating-point computation.  */
> +static bool
> +loop_has_FP_comp(struct loop *loop)
> +{
> +  rtx set, dest;
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          set = single_set (insn);
> +          if (!set)
> +            continue;
> +
> +          dest = SET_DEST (set);
> +          if (FLOAT_MODE_P (GET_MODE (dest)))
> +            {
> +              free (body);
> +              return true;
> +            }
> +        }
> +    }
> +  free (body);
> +  return false;
> +}
> +
> +/* Compute the number of branches in LOOP, weighted by execution counts.  */
> +static float
> +compute_weighted_branches(struct loop *loop)
> +{
> +  int header_count = loop->header->count;
> +  unsigned i;
> +  float n;
> +  basic_block * body;
> +
> +  /* If no profile feedback data exists, don't limit unrolling  */
> +  if (header_count == 0)
> +    return 0.0;
> +
> +  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
> +
> +  body = get_loop_body (loop);
> +  n = 0.0;
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      if (EDGE_COUNT (body[i]->succs) >= 2)
> +        {
> +          /* If this block is executed less frequently than the header (loop
> +             entry), then it is weighted based on the ratio of times it is
> +             executed compared to the header.  */
> +          if (body[i]->count < header_count)
> +            n += ((float)body[i]->count)/header_count;

Please don't introduce more floating point usage into the compiler
since it could change between different hosts (sse vs x87 for an
example).
Maybe use a fixed point multiply of 1000 (note use a macro for this
special value though)  like what is used in the rest of the predict
code.


Thanks,
Andrew Pinski


> +
> +          /* When it is executed more frequently than the header (i.e. it is
> +             in a nested inner loop), simply weight the branch at 1.0.  */
> +          else
> +            n += 1.0;
> +        }
> +    }
> +  free (body);
> +
> +  return n;
> +}
> +
> +/* Compute the maximum number of times LOOP can be unrolled without exceeding
> +   a branch budget, which can increase branch mispredictions. The number of
> +   branches is computed by weighting each branch with its expected execution
> +   probability through the loop based on profile data. If no profile feedback
> +   data exists, simply return the current NUNROLL factor.  */
> +static unsigned
> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
> +{
> +  struct loop *outer;
> +  struct niter_desc *outer_desc;
> +  int outer_niters = 1;
> +  float weighted_outer_branches = 0.0;
> +  float weighted_num_branches = compute_weighted_branches (loop);
> +
> +  /* If there was no profile feedback data, weighted_num_branches will be 0.0
> +     and we won't limit unrolling. If the weighted_num_branches is at most
1.0,
> +     also don't limit unrolling as the back-edge branch will not be
duplicated.  */
> +  if (weighted_num_branches <= 1.0)
> +    return nunroll;
> +
> +  /* Walk up the loop tree until we find a hot outer loop in which the
current
> +     loop is nested. At that point we will compute the number of times the
> +     current loop can be unrolled based on the number of branches in the hot
> +     outer loop.  */
> +  outer = loop_outer(loop);
> +  /* The loop structure contains a fake outermost loop, so this should always
> +     be non-NULL for our current loop.  */
> +  gcc_assert (outer);
> +  /* Detect if this is the fake outermost loop (at which point we are done)
> +     by checking its outer loop.  */
> +  while (loop_outer(outer))
> +    {
> +      outer_desc = get_simple_loop_desc (outer);
> +
> +      if (outer_desc->const_iter)
> +        outer_niters *= outer_desc->niter;
> +      else if (outer->header->count)
> +        outer_niters *= expected_loop_iterations (outer);
> +
> +      weighted_outer_branches = compute_weighted_branches (outer);
> +
> +      /* Should have been checked by caller.  */
> +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
> +
> +      /* If the outer loop has enough iterations to be considered hot, then
> +         we can stop our upwards loop tree traversal and examine the current
> +         outer loop.  */
> +      if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> +        {
> +          /* Assume that any call will cause the branch budget to be
exceeded,
> +             and that we can't unroll the current loop without increasing
> +             mispredicts.  */
> +          if (loop_has_call(outer))
> +            return 0;
> +
> +          /* Otherwise, compute the maximum number of times current loop can
be
> +             unrolled without exceeding our branch budget. First we subtract
> +             off the outer loop's weighted branch count from the budget. Note
> +             that this includes the branches in the current loop. This yields
> +             the number of branches left in the budget for the unrolled
copies.
> +             We divide this by the number of branches in the current loop
that
> +             must be duplicated when we unroll, which is the total weighted
> +             number of branches minus the back-edge branch. This yields the
> +             number of new loop body copies that can be created by unrolling
> +             without exceeding the budget, to which we add 1 to get the
unroll
> +             factor.  */
> +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
> +                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
> +        }
> +      outer = loop_outer(outer);
> +    }
> +
> +  /* The current loop is not enclosed by a hot enough outer loop in this
> +     procedure, since the hot outer loop is inter-procedural, assume that
> +     it already contains a significant number of branches, so don't unroll.
 */
> +  return 0;
> +}
> +
>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>  void
>  unroll_and_peel_loops (int flags)
> @@ -522,6 +696,7 @@ static void
>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>  {
>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
> +  unsigned nunroll_branches;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can increase
> +     the number of mispredicts. Ignore loops with FP computation as these
> +     tend to benefit much more consistently from unrolling.  */
> +  if (num_loop_branches (loop) > 1
> +      && loop_has_FP_comp(loop)
> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> +      && desc->niter < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> +    {
> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* Check whether the loop rolls enough to consider.  */
>   if (desc->niter < 2 * nunroll)
>     {
> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
>  static void
>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>  {
> -  unsigned nunroll, nunroll_by_av, i;
> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can increase
> +     the number of mispredicts. Ignore loops with FP computation as these
> +     tend to benefit much more consistently from unrolling.  */
> +  if (num_loop_branches (loop) > 1
> +      && loop_has_FP_comp(loop)
> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> +      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> +    {
> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* If we have profile feedback, check whether the loop rolls.  */
>   if ((loop->header->count
>        && expected_loop_iterations (loop) < 2 * nunroll)
> Index: params.def
> ===================================================================
> --- params.def  (revision 186783)
> +++ params.def  (working copy)
> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>         "The maximum depth of a loop nest we completely peel",
>         8, 0, 0)
>
> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
> +       "min-iter-unroll-with-branches",
> +       "Minimum iteration count to ignore branch effects when unrolling",
> +       50, 0, 0)
> +
> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
> +       "unroll-outer-loop-branch-budget",
> +       "Maximum number of branches allowed in hot outer loop region after
unroll",
> +       25, 0, 0)
> +
>  /* The maximum number of insns of an unswitched loop.  */
>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>        "max-unswitch-insns",
>
> --
> This patch is available for review at http://codereview.appspot.com/6099055

tejohnson

On Tue, Apr 24, 2012 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote: > On Tue, ...

13 years, 6 months ago (2012-04-24 21:36:12 UTC) #3

On Tue, Apr 24, 2012 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:

> On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com>
> wrote:
> > This patch adds heuristics to limit unrolling in loops with branches
> that may increase
> > branch mispredictions. It affects loops that are not frequently
> iterated, and that are
> > nested within a hot region of code that already contains many branch
> instructions.
> >
> > Performance tested with both internal benchmarks and with SPEC 2000/2006
> on a variety
> > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different
> AMD Opteron systems.
> > This improves performance of an internal search indexing benchmark by
> close to 2% on
> > all the tested Intel platforms.  It also consistently improves 445.gobmk
> (with FDO feedback
> > where unrolling kicks in) by close to 1% on AMD Opteron. Other
> performance effects are
> > neutral.
> >
> > Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for
> trunk?
> >
> > Thanks,
> > Teresa
> >
> > 2012-04-24   Teresa Johnson  <tejohnson@google.com>
> >
> >        * loop-unroll.c (loop_has_call): New function.
> >        (loop_has_FP_comp): Ditto.
> >        (compute_weighted_branches): Ditto.
> >        (max_unroll_with_branches): Ditto.
> >        (decide_unroll_constant_iterations): Add heuristic to avoid
> >        increasing branch mispredicts when unrolling.
> >        (decide_unroll_runtime_iterations): Ditto.
> >        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
> >        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
> >
> > Index: loop-unroll.c
> > ===================================================================
> > --- loop-unroll.c       (revision 186783)
> > +++ loop-unroll.c       (working copy)
> > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
> >                                             basic_block);
> >  static rtx get_expansion (struct var_to_expand *);
> >
> > +/* Determine whether LOOP contains call.  */
> > +static bool
> > +loop_has_call(struct loop *loop)
> > +{
> > +  basic_block *body, bb;
> > +  unsigned i;
> > +  rtx insn;
> > +
> > +  body = get_loop_body (loop);
> > +  for (i = 0; i < loop->num_nodes; i++)
> > +    {
> > +      bb = body[i];
> > +
> > +      FOR_BB_INSNS (bb, insn)
> > +        {
> > +          if (CALL_P (insn))
> > +            {
> > +              free (body);
> > +              return true;
> > +            }
> > +        }
> > +    }
> > +  free (body);
> > +  return false;
> > +}
> > +
> > +/* Determine whether LOOP contains floating-point computation.  */
> > +static bool
> > +loop_has_FP_comp(struct loop *loop)
> > +{
> > +  rtx set, dest;
> > +  basic_block *body, bb;
> > +  unsigned i;
> > +  rtx insn;
> > +
> > +  body = get_loop_body (loop);
> > +  for (i = 0; i < loop->num_nodes; i++)
> > +    {
> > +      bb = body[i];
> > +
> > +      FOR_BB_INSNS (bb, insn)
> > +        {
> > +          set = single_set (insn);
> > +          if (!set)
> > +            continue;
> > +
> > +          dest = SET_DEST (set);
> > +          if (FLOAT_MODE_P (GET_MODE (dest)))
> > +            {
> > +              free (body);
> > +              return true;
> > +            }
> > +        }
> > +    }
> > +  free (body);
> > +  return false;
> > +}
> > +
> > +/* Compute the number of branches in LOOP, weighted by execution
> counts.  */
> > +static float
> > +compute_weighted_branches(struct loop *loop)
> > +{
> > +  int header_count = loop->header->count;
> > +  unsigned i;
> > +  float n;
> > +  basic_block * body;
> > +
> > +  /* If no profile feedback data exists, don't limit unrolling  */
> > +  if (header_count == 0)
> > +    return 0.0;
> > +
> > +  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
> > +
> > +  body = get_loop_body (loop);
> > +  n = 0.0;
> > +  for (i = 0; i < loop->num_nodes; i++)
> > +    {
> > +      if (EDGE_COUNT (body[i]->succs) >= 2)
> > +        {
> > +          /* If this block is executed less frequently than the header
> (loop
> > +             entry), then it is weighted based on the ratio of times it
> is
> > +             executed compared to the header.  */
> > +          if (body[i]->count < header_count)
> > +            n += ((float)body[i]->count)/header_count;
>
> Please don't introduce more floating point usage into the compiler
> since it could change between different hosts (sse vs x87 for an
> example).
> Maybe use a fixed point multiply of 1000 (note use a macro for this
> special value though)  like what is used in the rest of the predict
> code.
>

Ok, got it. I will address this in the next version of the patch.

Thanks,
Teresa


>
>
> Thanks,
> Andrew Pinski
>
>
> > +
> > +          /* When it is executed more frequently than the header (i.e.
> it is
> > +             in a nested inner loop), simply weight the branch at 1.0.
>  */
> > +          else
> > +            n += 1.0;
> > +        }
> > +    }
> > +  free (body);
> > +
> > +  return n;
> > +}
> > +
> > +/* Compute the maximum number of times LOOP can be unrolled without
> exceeding
> > +   a branch budget, which can increase branch mispredictions. The
> number of
> > +   branches is computed by weighting each branch with its expected
> execution
> > +   probability through the loop based on profile data. If no profile
> feedback
> > +   data exists, simply return the current NUNROLL factor.  */
> > +static unsigned
> > +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
> > +{
> > +  struct loop *outer;
> > +  struct niter_desc *outer_desc;
> > +  int outer_niters = 1;
> > +  float weighted_outer_branches = 0.0;
> > +  float weighted_num_branches = compute_weighted_branches (loop);
> > +
> > +  /* If there was no profile feedback data, weighted_num_branches will
> be 0.0
> > +     and we won't limit unrolling. If the weighted_num_branches is at
> most 1.0,
> > +     also don't limit unrolling as the back-edge branch will not be
> duplicated.  */
> > +  if (weighted_num_branches <= 1.0)
> > +    return nunroll;
> > +
> > +  /* Walk up the loop tree until we find a hot outer loop in which the
> current
> > +     loop is nested. At that point we will compute the number of times
> the
> > +     current loop can be unrolled based on the number of branches in
> the hot
> > +     outer loop.  */
> > +  outer = loop_outer(loop);
> > +  /* The loop structure contains a fake outermost loop, so this should
> always
> > +     be non-NULL for our current loop.  */
> > +  gcc_assert (outer);
> > +  /* Detect if this is the fake outermost loop (at which point we are
> done)
> > +     by checking its outer loop.  */
> > +  while (loop_outer(outer))
> > +    {
> > +      outer_desc = get_simple_loop_desc (outer);
> > +
> > +      if (outer_desc->const_iter)
> > +        outer_niters *= outer_desc->niter;
> > +      else if (outer->header->count)
> > +        outer_niters *= expected_loop_iterations (outer);
> > +
> > +      weighted_outer_branches = compute_weighted_branches (outer);
> > +
> > +      /* Should have been checked by caller.  */
> > +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) !=
> -1);
> > +
> > +      /* If the outer loop has enough iterations to be considered hot,
> then
> > +         we can stop our upwards loop tree traversal and examine the
> current
> > +         outer loop.  */
> > +      if (outer_niters >= PARAM_VALUE
> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> > +        {
> > +          /* Assume that any call will cause the branch budget to be
> exceeded,
> > +             and that we can't unroll the current loop without
> increasing
> > +             mispredicts.  */
> > +          if (loop_has_call(outer))
> > +            return 0;
> > +
> > +          /* Otherwise, compute the maximum number of times current
> loop can be
> > +             unrolled without exceeding our branch budget. First we
> subtract
> > +             off the outer loop's weighted branch count from the
> budget. Note
> > +             that this includes the branches in the current loop. This
> yields
> > +             the number of branches left in the budget for the unrolled
> copies.
> > +             We divide this by the number of branches in the current
> loop that
> > +             must be duplicated when we unroll, which is the total
> weighted
> > +             number of branches minus the back-edge branch. This yields
> the
> > +             number of new loop body copies that can be created by
> unrolling
> > +             without exceeding the budget, to which we add 1 to get the
> unroll
> > +             factor.  */
> > +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
> > +                  weighted_outer_branches)/(weighted_num_branches - 1)
> + 1;
> > +        }
> > +      outer = loop_outer(outer);
> > +    }
> > +
> > +  /* The current loop is not enclosed by a hot enough outer loop in this
> > +     procedure, since the hot outer loop is inter-procedural, assume
> that
> > +     it already contains a significant number of branches, so don't
> unroll.  */
> > +  return 0;
> > +}
> > +
> >  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
> >  void
> >  unroll_and_peel_loops (int flags)
> > @@ -522,6 +696,7 @@ static void
> >  decide_unroll_constant_iterations (struct loop *loop, int flags)
> >  {
> >   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0,
> n_copies, i;
> > +  unsigned nunroll_branches;
> >   struct niter_desc *desc;
> >
> >   if (!(flags & UAP_UNROLL))
> > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
> >       return;
> >     }
> >
> > +  /* Be careful when unrolling loops with branches inside -- it can
> increase
> > +     the number of mispredicts. Ignore loops with FP computation as
> these
> > +     tend to benefit much more consistently from unrolling.  */
> > +  if (num_loop_branches (loop) > 1
> > +      && loop_has_FP_comp(loop)
> > +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> > +      && desc->niter < (unsigned) PARAM_VALUE
> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> > +    {
> > +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
> > +      if (nunroll > nunroll_branches)
> > +        nunroll = nunroll_branches;
> > +      if (nunroll <= 1)
> > +        {
> > +          if (dump_file)
> > +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
> > +          return;
> > +        }
> > +    }
> > +
> >   /* Check whether the loop rolls enough to consider.  */
> >   if (desc->niter < 2 * nunroll)
> >     {
> > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
> >  static void
> >  decide_unroll_runtime_iterations (struct loop *loop, int flags)
> >  {
> > -  unsigned nunroll, nunroll_by_av, i;
> > +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
> >   struct niter_desc *desc;
> >
> >   if (!(flags & UAP_UNROLL))
> > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
> >       return;
> >     }
> >
> > +  /* Be careful when unrolling loops with branches inside -- it can
> increase
> > +     the number of mispredicts. Ignore loops with FP computation as
> these
> > +     tend to benefit much more consistently from unrolling.  */
> > +  if (num_loop_branches (loop) > 1
> > +      && loop_has_FP_comp(loop)
> > +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> > +      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE
> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> > +    {
> > +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
> > +      if (nunroll > nunroll_branches)
> > +        nunroll = nunroll_branches;
> > +      if (nunroll <= 1)
> > +        {
> > +          if (dump_file)
> > +            fprintf (dump_file, ";; Not unrolling, contains
> branches\n");
> > +          return;
> > +        }
> > +    }
> > +
> >   /* If we have profile feedback, check whether the loop rolls.  */
> >   if ((loop->header->count
> >        && expected_loop_iterations (loop) < 2 * nunroll)
> > Index: params.def
> > ===================================================================
> > --- params.def  (revision 186783)
> > +++ params.def  (working copy)
> > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
> >         "The maximum depth of a loop nest we completely peel",
> >         8, 0, 0)
> >
> > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
> > +       "min-iter-unroll-with-branches",
> > +       "Minimum iteration count to ignore branch effects when
> unrolling",
> > +       50, 0, 0)
> > +
> > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
> > +       "unroll-outer-loop-branch-budget",
> > +       "Maximum number of branches allowed in hot outer loop region
> after unroll",
> > +       25, 0, 0)
> > +
> >  /* The maximum number of insns of an unswitched loop.  */
> >  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
> >        "max-unswitch-insns",
> >
> > --
> > This patch is available for review at
> http://codereview.appspot.com/6099055
>



-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

tejohnson

Resending my response in plain text so it will go through to gcc-patches... On Tue, ...

13 years, 6 months ago (2012-04-24 21:41:04 UTC) #4

Resending my response in plain text so it will go through to gcc-patches...

On Tue, Apr 24, 2012 at 2:36 PM, Teresa Johnson <tejohnson@google.com> wrote:
>
>
>
> On Tue, Apr 24, 2012 at 2:30 PM, Andrew Pinski <pinskia@gmail.com> wrote:
>>
>> On Tue, Apr 24, 2012 at 2:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
>> > This patch adds heuristics to limit unrolling in loops with branches that
may increase
>> > branch mispredictions. It affects loops that are not frequently iterated,
and that are
>> > nested within a hot region of code that already contains many branch
instructions.
>> >
>> > Performance tested with both internal benchmarks and with SPEC 2000/2006 on
a variety
>> > of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD
Opteron systems.
>> > This improves performance of an internal search indexing benchmark by close
to 2% on
>> > all the tested Intel platforms.  It also consistently improves 445.gobmk
(with FDO feedback
>> > where unrolling kicks in) by close to 1% on AMD Opteron. Other performance
effects are
>> > neutral.
>> >
>> > Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>> >
>> > Thanks,
>> > Teresa
>> >
>> > 2012-04-24   Teresa Johnson  <tejohnson@google.com>
>> >
>> >        * loop-unroll.c (loop_has_call): New function.
>> >        (loop_has_FP_comp): Ditto.
>> >        (compute_weighted_branches): Ditto.
>> >        (max_unroll_with_branches): Ditto.
>> >        (decide_unroll_constant_iterations): Add heuristic to avoid
>> >        increasing branch mispredicts when unrolling.
>> >        (decide_unroll_runtime_iterations): Ditto.
>> >        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>> >        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>> >
>> > Index: loop-unroll.c
>> > ===================================================================
>> > --- loop-unroll.c       (revision 186783)
>> > +++ loop-unroll.c       (working copy)
>> > @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>> >                                             basic_block);
>> >  static rtx get_expansion (struct var_to_expand *);
>> >
>> > +/* Determine whether LOOP contains call.  */
>> > +static bool
>> > +loop_has_call(struct loop *loop)
>> > +{
>> > +  basic_block *body, bb;
>> > +  unsigned i;
>> > +  rtx insn;
>> > +
>> > +  body = get_loop_body (loop);
>> > +  for (i = 0; i < loop->num_nodes; i++)
>> > +    {
>> > +      bb = body[i];
>> > +
>> > +      FOR_BB_INSNS (bb, insn)
>> > +        {
>> > +          if (CALL_P (insn))
>> > +            {
>> > +              free (body);
>> > +              return true;
>> > +            }
>> > +        }
>> > +    }
>> > +  free (body);
>> > +  return false;
>> > +}
>> > +
>> > +/* Determine whether LOOP contains floating-point computation.  */
>> > +static bool
>> > +loop_has_FP_comp(struct loop *loop)
>> > +{
>> > +  rtx set, dest;
>> > +  basic_block *body, bb;
>> > +  unsigned i;
>> > +  rtx insn;
>> > +
>> > +  body = get_loop_body (loop);
>> > +  for (i = 0; i < loop->num_nodes; i++)
>> > +    {
>> > +      bb = body[i];
>> > +
>> > +      FOR_BB_INSNS (bb, insn)
>> > +        {
>> > +          set = single_set (insn);
>> > +          if (!set)
>> > +            continue;
>> > +
>> > +          dest = SET_DEST (set);
>> > +          if (FLOAT_MODE_P (GET_MODE (dest)))
>> > +            {
>> > +              free (body);
>> > +              return true;
>> > +            }
>> > +        }
>> > +    }
>> > +  free (body);
>> > +  return false;
>> > +}
>> > +
>> > +/* Compute the number of branches in LOOP, weighted by execution counts.
 */
>> > +static float
>> > +compute_weighted_branches(struct loop *loop)
>> > +{
>> > +  int header_count = loop->header->count;
>> > +  unsigned i;
>> > +  float n;
>> > +  basic_block * body;
>> > +
>> > +  /* If no profile feedback data exists, don't limit unrolling  */
>> > +  if (header_count == 0)
>> > +    return 0.0;
>> > +
>> > +  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
>> > +
>> > +  body = get_loop_body (loop);
>> > +  n = 0.0;
>> > +  for (i = 0; i < loop->num_nodes; i++)
>> > +    {
>> > +      if (EDGE_COUNT (body[i]->succs) >= 2)
>> > +        {
>> > +          /* If this block is executed less frequently than the header
(loop
>> > +             entry), then it is weighted based on the ratio of times it is
>> > +             executed compared to the header.  */
>> > +          if (body[i]->count < header_count)
>> > +            n += ((float)body[i]->count)/header_count;
>>
>> Please don't introduce more floating point usage into the compiler
>> since it could change between different hosts (sse vs x87 for an
>> example).
>> Maybe use a fixed point multiply of 1000 (note use a macro for this
>> special value though)  like what is used in the rest of the predict
>> code.
>
>
> Ok, got it. I will address this in the next version of the patch.
>
> Thanks,
> Teresa
>
>>
>>
>>
>> Thanks,
>> Andrew Pinski
>>
>>
>> > +
>> > +          /* When it is executed more frequently than the header (i.e. it
is
>> > +             in a nested inner loop), simply weight the branch at 1.0.  */
>> > +          else
>> > +            n += 1.0;
>> > +        }
>> > +    }
>> > +  free (body);
>> > +
>> > +  return n;
>> > +}
>> > +
>> > +/* Compute the maximum number of times LOOP can be unrolled without
exceeding
>> > +   a branch budget, which can increase branch mispredictions. The number
of
>> > +   branches is computed by weighting each branch with its expected
execution
>> > +   probability through the loop based on profile data. If no profile
feedback
>> > +   data exists, simply return the current NUNROLL factor.  */
>> > +static unsigned
>> > +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
>> > +{
>> > +  struct loop *outer;
>> > +  struct niter_desc *outer_desc;
>> > +  int outer_niters = 1;
>> > +  float weighted_outer_branches = 0.0;
>> > +  float weighted_num_branches = compute_weighted_branches (loop);
>> > +
>> > +  /* If there was no profile feedback data, weighted_num_branches will be
0.0
>> > +     and we won't limit unrolling. If the weighted_num_branches is at most
1.0,
>> > +     also don't limit unrolling as the back-edge branch will not be
duplicated.  */
>> > +  if (weighted_num_branches <= 1.0)
>> > +    return nunroll;
>> > +
>> > +  /* Walk up the loop tree until we find a hot outer loop in which the
current
>> > +     loop is nested. At that point we will compute the number of times the
>> > +     current loop can be unrolled based on the number of branches in the
hot
>> > +     outer loop.  */
>> > +  outer = loop_outer(loop);
>> > +  /* The loop structure contains a fake outermost loop, so this should
always
>> > +     be non-NULL for our current loop.  */
>> > +  gcc_assert (outer);
>> > +  /* Detect if this is the fake outermost loop (at which point we are
done)
>> > +     by checking its outer loop.  */
>> > +  while (loop_outer(outer))
>> > +    {
>> > +      outer_desc = get_simple_loop_desc (outer);
>> > +
>> > +      if (outer_desc->const_iter)
>> > +        outer_niters *= outer_desc->niter;
>> > +      else if (outer->header->count)
>> > +        outer_niters *= expected_loop_iterations (outer);
>> > +
>> > +      weighted_outer_branches = compute_weighted_branches (outer);
>> > +
>> > +      /* Should have been checked by caller.  */
>> > +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
>> > +
>> > +      /* If the outer loop has enough iterations to be considered hot,
then
>> > +         we can stop our upwards loop tree traversal and examine the
current
>> > +         outer loop.  */
>> > +      if (outer_niters >= PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> > +        {
>> > +          /* Assume that any call will cause the branch budget to be
exceeded,
>> > +             and that we can't unroll the current loop without increasing
>> > +             mispredicts.  */
>> > +          if (loop_has_call(outer))
>> > +            return 0;
>> > +
>> > +          /* Otherwise, compute the maximum number of times current loop
can be
>> > +             unrolled without exceeding our branch budget. First we
subtract
>> > +             off the outer loop's weighted branch count from the budget.
Note
>> > +             that this includes the branches in the current loop. This
yields
>> > +             the number of branches left in the budget for the unrolled
copies.
>> > +             We divide this by the number of branches in the current loop
that
>> > +             must be duplicated when we unroll, which is the total
weighted
>> > +             number of branches minus the back-edge branch. This yields
the
>> > +             number of new loop body copies that can be created by
unrolling
>> > +             without exceeding the budget, to which we add 1 to get the
unroll
>> > +             factor.  */
>> > +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
>> > +                  weighted_outer_branches)/(weighted_num_branches - 1) +
1;
>> > +        }
>> > +      outer = loop_outer(outer);
>> > +    }
>> > +
>> > +  /* The current loop is not enclosed by a hot enough outer loop in this
>> > +     procedure, since the hot outer loop is inter-procedural, assume that
>> > +     it already contains a significant number of branches, so don't
unroll.  */
>> > +  return 0;
>> > +}
>> > +
>> >  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>> >  void
>> >  unroll_and_peel_loops (int flags)
>> > @@ -522,6 +696,7 @@ static void
>> >  decide_unroll_constant_iterations (struct loop *loop, int flags)
>> >  {
>> >   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies,
i;
>> > +  unsigned nunroll_branches;
>> >   struct niter_desc *desc;
>> >
>> >   if (!(flags & UAP_UNROLL))
>> > @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
>> >       return;
>> >     }
>> >
>> > +  /* Be careful when unrolling loops with branches inside -- it can
increase
>> > +     the number of mispredicts. Ignore loops with FP computation as these
>> > +     tend to benefit much more consistently from unrolling.  */
>> > +  if (num_loop_branches (loop) > 1
>> > +      && loop_has_FP_comp(loop)
>> > +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> > +      && desc->niter < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> > +    {
>> > +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> > +      if (nunroll > nunroll_branches)
>> > +        nunroll = nunroll_branches;
>> > +      if (nunroll <= 1)
>> > +        {
>> > +          if (dump_file)
>> > +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> > +          return;
>> > +        }
>> > +    }
>> > +
>> >   /* Check whether the loop rolls enough to consider.  */
>> >   if (desc->niter < 2 * nunroll)
>> >     {
>> > @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
>> >  static void
>> >  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>> >  {
>> > -  unsigned nunroll, nunroll_by_av, i;
>> > +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>> >   struct niter_desc *desc;
>> >
>> >   if (!(flags & UAP_UNROLL))
>> > @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
>> >       return;
>> >     }
>> >
>> > +  /* Be careful when unrolling loops with branches inside -- it can
increase
>> > +     the number of mispredicts. Ignore loops with FP computation as these
>> > +     tend to benefit much more consistently from unrolling.  */
>> > +  if (num_loop_branches (loop) > 1
>> > +      && loop_has_FP_comp(loop)
>> > +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> > +      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> > +    {
>> > +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> > +      if (nunroll > nunroll_branches)
>> > +        nunroll = nunroll_branches;
>> > +      if (nunroll <= 1)
>> > +        {
>> > +          if (dump_file)
>> > +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> > +          return;
>> > +        }
>> > +    }
>> > +
>> >   /* If we have profile feedback, check whether the loop rolls.  */
>> >   if ((loop->header->count
>> >        && expected_loop_iterations (loop) < 2 * nunroll)
>> > Index: params.def
>> > ===================================================================
>> > --- params.def  (revision 186783)
>> > +++ params.def  (working copy)
>> > @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>> >         "The maximum depth of a loop nest we completely peel",
>> >         8, 0, 0)
>> >
>> > +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
>> > +       "min-iter-unroll-with-branches",
>> > +       "Minimum iteration count to ignore branch effects when unrolling",
>> > +       50, 0, 0)
>> > +
>> > +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
>> > +       "unroll-outer-loop-branch-budget",
>> > +       "Maximum number of branches allowed in hot outer loop region after
unroll",
>> > +       25, 0, 0)
>> > +
>> >  /* The maximum number of insns of an unswitched loop.  */
>> >  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>> >        "max-unswitch-insns",
>> >
>> > --
>> > This patch is available for review at http://codereview.appspot.com/6099055
>
>
>
>
> --
> Teresa Johnson | Software Engineer |  tejohnson@google.com |  408-460-2413
>



--
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

stevenb.gcc

On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > * params.def ...

13 years, 6 months ago (2012-04-24 23:38:17 UTC) #5

On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote:

>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.

You should add documentation for these new PARAMs to doc/invoke.texi.

I don't really like these new PARAMs: All other loop PARAMs are based
on the number of insns in a loop, or the maximum number of times a
transformation is applied. Your new
PARAM_MIN_ITER_UNROLL_WITH_BRANCHES is completely different, because
it is a number of iterations. This makes the PARAM value feel even
more arbitrary than all the other PARAMs to some extend already do...

(The only other PARAM like that is PARAM_ALIGN_LOOP_ITERATIONS, and
its default value also looks quite arbitrary...)

> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c       (revision 186783)
> +++ loop-unroll.c       (working copy)
> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>                                             basic_block);
>  static rtx get_expansion (struct var_to_expand *);
>
> +/* Determine whether LOOP contains call.  */
> +static bool
> +loop_has_call(struct loop *loop)
> +{
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          if (CALL_P (insn))
> +            {
> +              free (body);
> +              return true;
> +            }
> +        }
> +    }
> +  free (body);
> +  return false;
> +}
> +
> +/* Determine whether LOOP contains floating-point computation.  */
> +static bool
> +loop_has_FP_comp(struct loop *loop)
> +{
> +  rtx set, dest;
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          set = single_set (insn);
> +          if (!set)
> +            continue;
> +
> +          dest = SET_DEST (set);
> +          if (FLOAT_MODE_P (GET_MODE (dest)))
> +            {
> +              free (body);
> +              return true;

So you only detect single-set FP operations where some insns stores in
a float mode. It wouldn't be very difficult to just walk over all sets
and look for float modes. This is also necessary e.g. for x87 sincos,
as well as various insns on other machines. Your comments say you
don't want to apply the new heuristic to loops containing FP
operations because these loops usually benefit more from unrolling.
Therefore, you should IMHO look at non-single_set() insns also here,
to avoid applying the heuristics to loops containing non-single_set()
FP insns.

> +            }
> +        }
> +    }
> +  free (body);
> +  return false;
> +}

Nit: You are calling loop_has_call and loop_has_FP_comp() twice on
each loop (first for constant iterations and next for runtime
iterations), maybe you can fuse the functions and cache the results
(e.g. with two bitmaps, or put it in the loop description and retrieve
it with get_simple_loop_desc). Actually num_loop_branches()
could/should also be cached. I realize that the loop body walks are
probably not very expensive (and compile time probably isn't a concern
if you're using profile driven optimizations) but they do all add
up...

> +/* Compute the number of branches in LOOP, weighted by execution counts.  */
> +static float
> +compute_weighted_branches(struct loop *loop)

The floating point thing was already mentioned by Andrew. You can use
integer math instead (for examples, look for BB_FREQ_MAX e.g. in
average_num_loop_insns()).

> +  while (loop_outer(outer))
> +    {
> +      outer_desc = get_simple_loop_desc (outer);
> +
> +      if (outer_desc->const_iter)
> +        outer_niters *= outer_desc->niter;
> +      else if (outer->header->count)
> +        outer_niters *= expected_loop_iterations (outer);
> +
> +      weighted_outer_branches = compute_weighted_branches (outer);

Can you delay this computation of "weighted_outer_branches" call to ...

> +      /* Should have been checked by caller.  */
> +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);

Should never even happen. You have set the minimum acceptable value to
0. If you managed to test this code with
PARAM_MIN_ITER_UNROLL_WITH_BRANCHES==-1, I'd like to know how (if you
can do it from the command line, there is a bug in the handling of
acceptable PARAM values :-)

> +      /* If the outer loop has enough iterations to be considered hot, then
> +         we can stop our upwards loop tree traversal and examine the current
> +         outer loop.  */
> +      if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> +        {
> +          /* Assume that any call will cause the branch budget to be
exceeded,
> +             and that we can't unroll the current loop without increasing
> +             mispredicts.  */
> +          if (loop_has_call(outer))
> +            return 0;
> +
> +          /* Otherwise, compute the maximum number of times current loop can
be
> +             unrolled without exceeding our branch budget. First we subtract
> +             off the outer loop's weighted branch count from the budget. Note
> +             that this includes the branches in the current loop. This yields
> +             the number of branches left in the budget for the unrolled
copies.
> +             We divide this by the number of branches in the current loop
that
> +             must be duplicated when we unroll, which is the total weighted
> +             number of branches minus the back-edge branch. This yields the
> +             number of new loop body copies that can be created by unrolling
> +             without exceeding the budget, to which we add 1 to get the
unroll
> +             factor.  */

... somewhere here, where weighted_outer_branches is used?

> +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
> +                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;

You should guard against weighted branches==1.0.

Ciao!
Steven

andi_firstfloor.org

tejohnson@google.com (Teresa Johnson) writes: > This patch adds heuristics to limit unrolling in loops with ...

13 years, 6 months ago (2012-04-25 01:13:58 UTC) #6

davidxl

On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote: > tejohnson@google.com (Teresa ...

13 years, 6 months ago (2012-04-25 04:57:10 UTC) #7

davidxl

http://codereview.appspot.com/6099055/diff/1/loop-unroll.c File loop-unroll.c (right): http://codereview.appspot.com/6099055/diff/1/loop-unroll.c#newcode156 loop-unroll.c:156: static bool An empty line here. http://codereview.appspot.com/6099055/diff/1/loop-unroll.c#newcode182 loop-unroll.c:182: static ...

13 years, 6 months ago (2012-04-25 05:22:14 UTC) #8

andi_firstfloor.org

> Tree level unrollers (cunrolli and cunroll) do complete unroll. At O2, > both of ...

13 years, 6 months ago (2012-04-25 08:27:10 UTC) #9

richard.guenther_gmail.com

On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote: > This patch ...

13 years, 6 months ago (2012-04-25 09:03:43 UTC) #10

On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com> wrote:
> This patch adds heuristics to limit unrolling in loops with branches that may
increase
> branch mispredictions. It affects loops that are not frequently iterated, and
that are
> nested within a hot region of code that already contains many branch
instructions.
>
> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a
variety
> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD
Opteron systems.
> This improves performance of an internal search indexing benchmark by close to
2% on
> all the tested Intel platforms.  It also consistently improves 445.gobmk (with
FDO feedback
> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance
effects are
> neutral.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>
> Thanks,
> Teresa
>
> 2012-04-24   Teresa Johnson  <tejohnson@google.com>
>
>        * loop-unroll.c (loop_has_call): New function.
>        (loop_has_FP_comp): Ditto.
>        (compute_weighted_branches): Ditto.
>        (max_unroll_with_branches): Ditto.
>        (decide_unroll_constant_iterations): Add heuristic to avoid
>        increasing branch mispredicts when unrolling.
>        (decide_unroll_runtime_iterations): Ditto.
>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>
> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c       (revision 186783)
> +++ loop-unroll.c       (working copy)
> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>                                             basic_block);
>  static rtx get_expansion (struct var_to_expand *);
>
> +/* Determine whether LOOP contains call.  */
> +static bool
> +loop_has_call(struct loop *loop)
> +{
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +
> +  body = get_loop_body (loop);

You repeatedly do this and walk over all blocks.  Please think about
compile-time
issues when writing code.

This all looks sort-of target specific to me and I don't see why this
very specialized
patch is a good idea when unrolling does a very poor job deciding what and how
much to unroll generally.

Richard.

> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          if (CALL_P (insn))
> +            {
> +              free (body);
> +              return true;
> +            }
> +        }
> +    }
> +  free (body);
> +  return false;
> +}
> +
> +/* Determine whether LOOP contains floating-point computation.  */
> +static bool
> +loop_has_FP_comp(struct loop *loop)
> +{
> +  rtx set, dest;
> +  basic_block *body, bb;
> +  unsigned i;
> +  rtx insn;
> +
> +  body = get_loop_body (loop);
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      bb = body[i];
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          set = single_set (insn);
> +          if (!set)
> +            continue;
> +
> +          dest = SET_DEST (set);
> +          if (FLOAT_MODE_P (GET_MODE (dest)))
> +            {
> +              free (body);
> +              return true;
> +            }
> +        }
> +    }
> +  free (body);
> +  return false;
> +}
> +
> +/* Compute the number of branches in LOOP, weighted by execution counts.  */
> +static float
> +compute_weighted_branches(struct loop *loop)
> +{
> +  int header_count = loop->header->count;
> +  unsigned i;
> +  float n;
> +  basic_block * body;
> +
> +  /* If no profile feedback data exists, don't limit unrolling  */
> +  if (header_count == 0)
> +    return 0.0;
> +
> +  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
> +
> +  body = get_loop_body (loop);
> +  n = 0.0;
> +  for (i = 0; i < loop->num_nodes; i++)
> +    {
> +      if (EDGE_COUNT (body[i]->succs) >= 2)
> +        {
> +          /* If this block is executed less frequently than the header (loop
> +             entry), then it is weighted based on the ratio of times it is
> +             executed compared to the header.  */
> +          if (body[i]->count < header_count)
> +            n += ((float)body[i]->count)/header_count;
> +
> +          /* When it is executed more frequently than the header (i.e. it is
> +             in a nested inner loop), simply weight the branch at 1.0.  */
> +          else
> +            n += 1.0;
> +        }
> +    }
> +  free (body);
> +
> +  return n;
> +}
> +
> +/* Compute the maximum number of times LOOP can be unrolled without
exceeding
> +   a branch budget, which can increase branch mispredictions. The number of
> +   branches is computed by weighting each branch with its expected execution
> +   probability through the loop based on profile data. If no profile
feedback
> +   data exists, simply return the current NUNROLL factor.  */
> +static unsigned
> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
> +{
> +  struct loop *outer;
> +  struct niter_desc *outer_desc;
> +  int outer_niters = 1;
> +  float weighted_outer_branches = 0.0;
> +  float weighted_num_branches = compute_weighted_branches (loop);
> +
> +  /* If there was no profile feedback data, weighted_num_branches will be
0.0
> +     and we won't limit unrolling. If the weighted_num_branches is at most
1.0,
> +     also don't limit unrolling as the back-edge branch will not be
duplicated.  */
> +  if (weighted_num_branches <= 1.0)
> +    return nunroll;
> +
> +  /* Walk up the loop tree until we find a hot outer loop in which the
current
> +     loop is nested. At that point we will compute the number of times the
> +     current loop can be unrolled based on the number of branches in the hot
> +     outer loop.  */
> +  outer = loop_outer(loop);
> +  /* The loop structure contains a fake outermost loop, so this should
always
> +     be non-NULL for our current loop.  */
> +  gcc_assert (outer);
> +  /* Detect if this is the fake outermost loop (at which point we are done)
> +     by checking its outer loop.  */
> +  while (loop_outer(outer))
> +    {
> +      outer_desc = get_simple_loop_desc (outer);
> +
> +      if (outer_desc->const_iter)
> +        outer_niters *= outer_desc->niter;
> +      else if (outer->header->count)
> +        outer_niters *= expected_loop_iterations (outer);
> +
> +      weighted_outer_branches = compute_weighted_branches (outer);
> +
> +      /* Should have been checked by caller.  */
> +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
> +
> +      /* If the outer loop has enough iterations to be considered hot, then
> +         we can stop our upwards loop tree traversal and examine the current
> +         outer loop.  */
> +      if (outer_niters >= PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> +        {
> +          /* Assume that any call will cause the branch budget to be
exceeded,
> +             and that we can't unroll the current loop without increasing
> +             mispredicts.  */
> +          if (loop_has_call(outer))
> +            return 0;
> +
> +          /* Otherwise, compute the maximum number of times current loop can
be
> +             unrolled without exceeding our branch budget. First we subtract
> +             off the outer loop's weighted branch count from the budget.
Note
> +             that this includes the branches in the current loop. This
yields
> +             the number of branches left in the budget for the unrolled
copies.
> +             We divide this by the number of branches in the current loop
that
> +             must be duplicated when we unroll, which is the total weighted
> +             number of branches minus the back-edge branch. This yields the
> +             number of new loop body copies that can be created by unrolling
> +             without exceeding the budget, to which we add 1 to get the
unroll
> +             factor.  */
> +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
> +                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
> +        }
> +      outer = loop_outer(outer);
> +    }
> +
> +  /* The current loop is not enclosed by a hot enough outer loop in this
> +     procedure, since the hot outer loop is inter-procedural, assume that
> +     it already contains a significant number of branches, so don't unroll.
 */
> +  return 0;
> +}
> +
>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>  void
>  unroll_and_peel_loops (int flags)
> @@ -522,6 +696,7 @@ static void
>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>  {
>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
> +  unsigned nunroll_branches;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can increase
> +     the number of mispredicts. Ignore loops with FP computation as these
> +     tend to benefit much more consistently from unrolling.  */
> +  if (num_loop_branches (loop) > 1
> +      && loop_has_FP_comp(loop)
> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> +      && desc->niter < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> +    {
> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* Check whether the loop rolls enough to consider.  */
>   if (desc->niter < 2 * nunroll)
>     {
> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
>  static void
>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>  {
> -  unsigned nunroll, nunroll_by_av, i;
> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can increase
> +     the number of mispredicts. Ignore loops with FP computation as these
> +     tend to benefit much more consistently from unrolling.  */
> +  if (num_loop_branches (loop) > 1
> +      && loop_has_FP_comp(loop)
> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
> +      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
> +    {
> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* If we have profile feedback, check whether the loop rolls.  */
>   if ((loop->header->count
>        && expected_loop_iterations (loop) < 2 * nunroll)
> Index: params.def
> ===================================================================
> --- params.def  (revision 186783)
> +++ params.def  (working copy)
> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>         "The maximum depth of a loop nest we completely peel",
>         8, 0, 0)
>
> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
> +       "min-iter-unroll-with-branches",
> +       "Minimum iteration count to ignore branch effects when unrolling",
> +       50, 0, 0)
> +
> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
> +       "unroll-outer-loop-branch-budget",
> +       "Maximum number of branches allowed in hot outer loop region after
unroll",
> +       25, 0, 0)
> +
>  /* The maximum number of insns of an unswitched loop.  */
>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>        "max-unswitch-insns",
>
> --
> This patch is available for review at http://codereview.appspot.com/6099055

tejohnson

On Tue, Apr 24, 2012 at 4:38 PM, Steven Bosscher <stevenb.gcc@gmail.com> wrote: > On Tue, ...

13 years, 6 months ago (2012-04-25 15:27:59 UTC) #11

On Tue, Apr 24, 2012 at 4:38 PM, Steven Bosscher <stevenb.gcc@gmail.com> wrote:
> On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com>
wrote:
>
>>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>
> You should add documentation for these new PARAMs to doc/invoke.texi.

Ok, will do.

>
> I don't really like these new PARAMs: All other loop PARAMs are based
> on the number of insns in a loop, or the maximum number of times a
> transformation is applied. Your new
> PARAM_MIN_ITER_UNROLL_WITH_BRANCHES is completely different, because
> it is a number of iterations. This makes the PARAM value feel even
> more arbitrary than all the other PARAMs to some extend already do...

That's true, they are different in what they are checking than some of
the other loop unrolling params. But I need some threshold for
determining when a loop is hot enough that its unrolled branches will
be executed frequently enough to train the branch predictor and also
where the impact on the branch prediction in the outer region of code
is less likely to matter overall. The defaults were chosen so that the
new unrolling limit should only kick in for loops that are not
iterating much anyway, and where the outer hot region has quite a few
branches.

>
> (The only other PARAM like that is PARAM_ALIGN_LOOP_ITERATIONS, and
> its default value also looks quite arbitrary...)
>
>
>> Index: loop-unroll.c
>> ===================================================================
>> --- loop-unroll.c       (revision 186783)
>> +++ loop-unroll.c       (working copy)
>> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>>                                             basic_block);
>>  static rtx get_expansion (struct var_to_expand *);
>>
>> +/* Determine whether LOOP contains call.  */
>> +static bool
>> +loop_has_call(struct loop *loop)
>> +{
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          if (CALL_P (insn))
>> +            {
>> +              free (body);
>> +              return true;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>> +
>> +/* Determine whether LOOP contains floating-point computation.  */
>> +static bool
>> +loop_has_FP_comp(struct loop *loop)
>> +{
>> +  rtx set, dest;
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          set = single_set (insn);
>> +          if (!set)
>> +            continue;
>> +
>> +          dest = SET_DEST (set);
>> +          if (FLOAT_MODE_P (GET_MODE (dest)))
>> +            {
>> +              free (body);
>> +              return true;
>
> So you only detect single-set FP operations where some insns stores in
> a float mode. It wouldn't be very difficult to just walk over all sets
> and look for float modes. This is also necessary e.g. for x87 sincos,
> as well as various insns on other machines. Your comments say you
> don't want to apply the new heuristic to loops containing FP
> operations because these loops usually benefit more from unrolling.
> Therefore, you should IMHO look at non-single_set() insns also here,
> to avoid applying the heuristics to loops containing non-single_set()
> FP insns.

Ok, thanks for the suggestion, I will expand this for the next version
of the patch.

>
>
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>
> Nit: You are calling loop_has_call and loop_has_FP_comp() twice on
> each loop (first for constant iterations and next for runtime
> iterations),

I don't think that is true for loop_has_FP_comp, since it is called in
decide_unroll_constant_iterations and decide_unroll_runtime_iterations
just after we have checked if the loop has a constant number of
iterations, and returned early depending on the result of this check
and which routine we are in. So each inner loop will only reach the
call to loop_has_FP_comp in one of these routines.

In the case of loop_has_call, which is only called for a hot outer
loop, it is true we could invoke that more than once. That would
happen if a hot outer loop contains more than one nested inner loop
with a small iteration count and branches that we attempt to unroll
(it is called at most once per inner loop that we attempt to unroll).
I thought about attempting to cache this info for the outer loop in
the structure returned by get_simple_loop_desc() as you also suggest
below. I was concerned that currently this returns an niter_desc
structure which holds info about the # iterations, and this
information doesn't fit into that category. However, I could go ahead
and add it to that structure and perhaps rename the structure to
something more generic like "loop_desc". What do you think?

The other issue is that we don't need this new information on all
loops where we currently may compute and return an niter_desc
instance, and I didn't want to unnecessarily add a walk over the loop
bodies to compute the new information if we didn't need it. I'm not
sure of the tradeoff between always computing this information when we
do a get_simple_loop_desc vs calling it a couple times each for the
loops where we do need it (only when we have innermost loops with
shorter iteration counts and internal branches that we have decided to
unroll). One way around this would be to set the initial value of this
information in the loop_desc to a default value that means "unknown"
and then compute and cache the information lazily as needed.

> maybe you can fuse the functions

I don't know that it makes sense to fuse them, as they are called on
different loops - the call to loop_has_FP_comp will only be done for
innermost loops that we are attempting to unroll, whereas
loop_has_call is called only for outer loops that contain nested inner
loops. But if I am going to cache the results in the loop desc then it
may not be too expensive to do all the checks in one walk over each
loop, regardless of whether it is an outer or inner loop.

> and cache the results
> (e.g. with two bitmaps, or put it in the loop description and retrieve
> it with get_simple_loop_desc). Actually num_loop_branches()
> could/should also be cached. I realize that the loop body walks are
> probably not very expensive (and compile time probably isn't a concern
> if you're using profile driven optimizations) but they do all add
> up...

True, num_loop_branches could be called a couple times for each
innermost loop we attempt to unroll, because it is also called by
decide_peel_simple and decide_unroll_stupid. If I cache the other info
in a loop_desc as described above, we could cache this as well as the
result of compute_weighted_branches.

>
>
>> +/* Compute the number of branches in LOOP, weighted by execution counts.
 */
>> +static float
>> +compute_weighted_branches(struct loop *loop)
>
> The floating point thing was already mentioned by Andrew. You can use
> integer math instead (for examples, look for BB_FREQ_MAX e.g. in
> average_num_loop_insns()).
>
>
>> +  while (loop_outer(outer))
>> +    {
>> +      outer_desc = get_simple_loop_desc (outer);
>> +
>> +      if (outer_desc->const_iter)
>> +        outer_niters *= outer_desc->niter;
>> +      else if (outer->header->count)
>> +        outer_niters *= expected_loop_iterations (outer);
>> +
>> +      weighted_outer_branches = compute_weighted_branches (outer);
>
> Can you delay this computation of "weighted_outer_branches" call to ...

Yes - thanks for the suggestion.

>
>> +      /* Should have been checked by caller.  */
>> +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
>
> Should never even happen. You have set the minimum acceptable value to
> 0. If you managed to test this code with
> PARAM_MIN_ITER_UNROLL_WITH_BRANCHES==-1, I'd like to know how (if you
> can do it from the command line, there is a bug in the handling of
> acceptable PARAM values :-)

You are right. This should have been removed - at one point I was
thinking of having a value of -1 indicate that the heuristic should
not kick in, but you are right that the minimum value set in
params.def does not support this, and in fact it is not needed at all
as setting this param to 0 will do the same thing. Will fix!

>
>
>> +      /* If the outer loop has enough iterations to be considered hot, then
>> +         we can stop our upwards loop tree traversal and examine the
current
>> +         outer loop.  */
>> +      if (outer_niters >= PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +        {
>> +          /* Assume that any call will cause the branch budget to be
exceeded,
>> +             and that we can't unroll the current loop without increasing
>> +             mispredicts.  */
>> +          if (loop_has_call(outer))
>> +            return 0;
>> +
>> +          /* Otherwise, compute the maximum number of times current loop can
be
>> +             unrolled without exceeding our branch budget. First we
subtract
>> +             off the outer loop's weighted branch count from the budget.
Note
>> +             that this includes the branches in the current loop. This
yields
>> +             the number of branches left in the budget for the unrolled
copies.
>> +             We divide this by the number of branches in the current loop
that
>> +             must be duplicated when we unroll, which is the total weighted
>> +             number of branches minus the back-edge branch. This yields the
>> +             number of new loop body copies that can be created by
unrolling
>> +             without exceeding the budget, to which we add 1 to get the
unroll
>> +             factor.  */
>
> ... somewhere here, where weighted_outer_branches is used?

Yep, thanks.

>
>> +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
>> +                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
>
> You should guard against weighted branches==1.0.

Right - the guard already exists at the top of the routine. I can add
a comment here to that effect.

Thanks!
Teresa

>
> Ciao!
> Steven

-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

tejohnson

On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote: > tejohnson@google.com (Teresa ...

13 years, 6 months ago (2012-04-25 15:36:05 UTC) #12

On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote:
> tejohnson@google.com (Teresa Johnson) writes:
>
>> This patch adds heuristics to limit unrolling in loops with branches that may
increase
>> branch mispredictions. It affects loops that are not frequently iterated, and
that are
>> nested within a hot region of code that already contains many branch
instructions.
>>
>> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a
variety
>> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD
Opteron systems.
>> This improves performance of an internal search indexing benchmark by close
to 2% on
>> all the tested Intel platforms.  It also consistently improves 445.gobmk
(with FDO feedback
>> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance
effects are
>> neutral.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>
> One problem with any unrolling heuristics is currently that gcc has both
> the tree level and the rtl level unroller. The tree one is even on at
> -O3.  So if you tweak anything for one you have to affect both, otherwise the
> other may still do the wrong thing(tm).

It's true that the tree level unroller could benefit from taking
branch mispredict effects into account as well. But since that is only
performing full unrolling of constant trip count loops I suspect that
there will be additional things that need to be considered, such as
whether the full unrolling enables better optimization in the
surrounding code/loop. Hence I wanted to tackle that later.

>
> For some other tweaks I looked into a shared cost model some time ago.
> May be still needed.

Yes, I think it would be good to unify some of the profitability
checks between the two unrolling passes, or at least between the tree
and rtl level full unrollers/peelers.

Teresa

>
> -Andi
>
> --
> ak@linux.intel.com -- Speaking for myself only



-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

tejohnson

On Wed, Apr 25, 2012 at 2:03 AM, Richard Guenther <richard.guenther@gmail.com> wrote: > On Tue, ...

13 years, 6 months ago (2012-04-25 15:48:23 UTC) #13

On Wed, Apr 25, 2012 at 2:03 AM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com>
wrote:
>> This patch adds heuristics to limit unrolling in loops with branches that may
increase
>> branch mispredictions. It affects loops that are not frequently iterated, and
that are
>> nested within a hot region of code that already contains many branch
instructions.
>>
>> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a
variety
>> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD
Opteron systems.
>> This improves performance of an internal search indexing benchmark by close
to 2% on
>> all the tested Intel platforms.  It also consistently improves 445.gobmk
(with FDO feedback
>> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance
effects are
>> neutral.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>>
>> Thanks,
>> Teresa
>>
>> 2012-04-24   Teresa Johnson  <tejohnson@google.com>
>>
>>        * loop-unroll.c (loop_has_call): New function.
>>        (loop_has_FP_comp): Ditto.
>>        (compute_weighted_branches): Ditto.
>>        (max_unroll_with_branches): Ditto.
>>        (decide_unroll_constant_iterations): Add heuristic to avoid
>>        increasing branch mispredicts when unrolling.
>>        (decide_unroll_runtime_iterations): Ditto.
>>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>>
>> Index: loop-unroll.c
>> ===================================================================
>> --- loop-unroll.c       (revision 186783)
>> +++ loop-unroll.c       (working copy)
>> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>>                                             basic_block);
>>  static rtx get_expansion (struct var_to_expand *);
>>
>> +/* Determine whether LOOP contains call.  */
>> +static bool
>> +loop_has_call(struct loop *loop)
>> +{
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>
> You repeatedly do this and walk over all blocks.  Please think about
> compile-time
> issues when writing code.

See my response to Steven where I address this issue and mention some
approaches to reducing the loop body walks. Please let me know if you
have any feedback on that.

>
> This all looks sort-of target specific to me and I don't see why this
> very specialized
> patch is a good idea when unrolling does a very poor job deciding what and
how
> much to unroll generally.

I am hoping this will improve upon the job the unroller does in
deciding when/how to unroll. I didn't think that it was too target
specific as branch mispredictions could affect many targets. Note that
there are already some much more basic checks for the branch
misprediction effects in both decide_peel_simple and
decide_unroll_stupid, for example:

  /* Do not simply peel loops with branches inside -- it increases number
     of mispredicts.  */
  if (num_loop_branches (loop) > 1)
    {
      if (dump_file)
        fprintf (dump_file, ";; Not peeling, contains branches\n");
      return;
    }

It is possible that both of these checks could be made less aggressive
using the approach in this patch, which affects many more loops and
hence I am trying to add some more intelligent checking of whether
branch mispredicts might be triggered.

 Thanks,
Teresa

>
> Richard.
>
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          if (CALL_P (insn))
>> +            {
>> +              free (body);
>> +              return true;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>> +
>> +/* Determine whether LOOP contains floating-point computation.  */
>> +static bool
>> +loop_has_FP_comp(struct loop *loop)
>> +{
>> +  rtx set, dest;
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          set = single_set (insn);
>> +          if (!set)
>> +            continue;
>> +
>> +          dest = SET_DEST (set);
>> +          if (FLOAT_MODE_P (GET_MODE (dest)))
>> +            {
>> +              free (body);
>> +              return true;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>> +
>> +/* Compute the number of branches in LOOP, weighted by execution counts.
 */
>> +static float
>> +compute_weighted_branches(struct loop *loop)
>> +{
>> +  int header_count = loop->header->count;
>> +  unsigned i;
>> +  float n;
>> +  basic_block * body;
>> +
>> +  /* If no profile feedback data exists, don't limit unrolling  */
>> +  if (header_count == 0)
>> +    return 0.0;
>> +
>> +  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
>> +
>> +  body = get_loop_body (loop);
>> +  n = 0.0;
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      if (EDGE_COUNT (body[i]->succs) >= 2)
>> +        {
>> +          /* If this block is executed less frequently than the header
(loop
>> +             entry), then it is weighted based on the ratio of times it is
>> +             executed compared to the header.  */
>> +          if (body[i]->count < header_count)
>> +            n += ((float)body[i]->count)/header_count;
>> +
>> +          /* When it is executed more frequently than the header (i.e. it
is
>> +             in a nested inner loop), simply weight the branch at 1.0.  */
>> +          else
>> +            n += 1.0;
>> +        }
>> +    }
>> +  free (body);
>> +
>> +  return n;
>> +}
>> +
>> +/* Compute the maximum number of times LOOP can be unrolled without
exceeding
>> +   a branch budget, which can increase branch mispredictions. The number of
>> +   branches is computed by weighting each branch with its expected
execution
>> +   probability through the loop based on profile data. If no profile
feedback
>> +   data exists, simply return the current NUNROLL factor.  */
>> +static unsigned
>> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
>> +{
>> +  struct loop *outer;
>> +  struct niter_desc *outer_desc;
>> +  int outer_niters = 1;
>> +  float weighted_outer_branches = 0.0;
>> +  float weighted_num_branches = compute_weighted_branches (loop);
>> +
>> +  /* If there was no profile feedback data, weighted_num_branches will be
0.0
>> +     and we won't limit unrolling. If the weighted_num_branches is at most
1.0,
>> +     also don't limit unrolling as the back-edge branch will not be
duplicated.  */
>> +  if (weighted_num_branches <= 1.0)
>> +    return nunroll;
>> +
>> +  /* Walk up the loop tree until we find a hot outer loop in which the
current
>> +     loop is nested. At that point we will compute the number of times the
>> +     current loop can be unrolled based on the number of branches in the
hot
>> +     outer loop.  */
>> +  outer = loop_outer(loop);
>> +  /* The loop structure contains a fake outermost loop, so this should
always
>> +     be non-NULL for our current loop.  */
>> +  gcc_assert (outer);
>> +  /* Detect if this is the fake outermost loop (at which point we are done)
>> +     by checking its outer loop.  */
>> +  while (loop_outer(outer))
>> +    {
>> +      outer_desc = get_simple_loop_desc (outer);
>> +
>> +      if (outer_desc->const_iter)
>> +        outer_niters *= outer_desc->niter;
>> +      else if (outer->header->count)
>> +        outer_niters *= expected_loop_iterations (outer);
>> +
>> +      weighted_outer_branches = compute_weighted_branches (outer);
>> +
>> +      /* Should have been checked by caller.  */
>> +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
>> +
>> +      /* If the outer loop has enough iterations to be considered hot, then
>> +         we can stop our upwards loop tree traversal and examine the
current
>> +         outer loop.  */
>> +      if (outer_niters >= PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +        {
>> +          /* Assume that any call will cause the branch budget to be
exceeded,
>> +             and that we can't unroll the current loop without increasing
>> +             mispredicts.  */
>> +          if (loop_has_call(outer))
>> +            return 0;
>> +
>> +          /* Otherwise, compute the maximum number of times current loop can
be
>> +             unrolled without exceeding our branch budget. First we
subtract
>> +             off the outer loop's weighted branch count from the budget.
Note
>> +             that this includes the branches in the current loop. This
yields
>> +             the number of branches left in the budget for the unrolled
copies.
>> +             We divide this by the number of branches in the current loop
that
>> +             must be duplicated when we unroll, which is the total weighted
>> +             number of branches minus the back-edge branch. This yields the
>> +             number of new loop body copies that can be created by
unrolling
>> +             without exceeding the budget, to which we add 1 to get the
unroll
>> +             factor.  */
>> +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
>> +                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
>> +        }
>> +      outer = loop_outer(outer);
>> +    }
>> +
>> +  /* The current loop is not enclosed by a hot enough outer loop in this
>> +     procedure, since the hot outer loop is inter-procedural, assume that
>> +     it already contains a significant number of branches, so don't unroll.
 */
>> +  return 0;
>> +}
>> +
>>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>>  void
>>  unroll_and_peel_loops (int flags)
>> @@ -522,6 +696,7 @@ static void
>>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>>  {
>>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies,
i;
>> +  unsigned nunroll_branches;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can
increase
>> +     the number of mispredicts. Ignore loops with FP computation as these
>> +     tend to benefit much more consistently from unrolling.  */
>> +  if (num_loop_branches (loop) > 1
>> +      && loop_has_FP_comp(loop)
>> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> +      && desc->niter < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +    {
>> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* Check whether the loop rolls enough to consider.  */
>>   if (desc->niter < 2 * nunroll)
>>     {
>> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
>>  static void
>>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>>  {
>> -  unsigned nunroll, nunroll_by_av, i;
>> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can
increase
>> +     the number of mispredicts. Ignore loops with FP computation as these
>> +     tend to benefit much more consistently from unrolling.  */
>> +  if (num_loop_branches (loop) > 1
>> +      && loop_has_FP_comp(loop)
>> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> +      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +    {
>> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* If we have profile feedback, check whether the loop rolls.  */
>>   if ((loop->header->count
>>        && expected_loop_iterations (loop) < 2 * nunroll)
>> Index: params.def
>> ===================================================================
>> --- params.def  (revision 186783)
>> +++ params.def  (working copy)
>> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>>         "The maximum depth of a loop nest we completely peel",
>>         8, 0, 0)
>>
>> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
>> +       "min-iter-unroll-with-branches",
>> +       "Minimum iteration count to ignore branch effects when unrolling",
>> +       50, 0, 0)
>> +
>> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
>> +       "unroll-outer-loop-branch-budget",
>> +       "Maximum number of branches allowed in hot outer loop region after
unroll",
>> +       25, 0, 0)
>> +
>>  /* The maximum number of insns of an unswitched loop.  */
>>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>>        "max-unswitch-insns",
>>
>> --
>> This patch is available for review at http://codereview.appspot.com/6099055



-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

davidxl

I think the general mechanism applies to most of the targets. What is needed is ...

13 years, 6 months ago (2012-04-25 16:11:59 UTC) #14

I think the general mechanism applies to most of the targets. What is
needed is target specific parameter (branch budget) tuning which can
be done separately -- there exist a way to do that already.

David

On Wed, Apr 25, 2012 at 2:03 AM, Richard Guenther
<richard.guenther@gmail.com> wrote:
> On Tue, Apr 24, 2012 at 11:26 PM, Teresa Johnson <tejohnson@google.com>
wrote:
>> This patch adds heuristics to limit unrolling in loops with branches that may
increase
>> branch mispredictions. It affects loops that are not frequently iterated, and
that are
>> nested within a hot region of code that already contains many branch
instructions.
>>
>> Performance tested with both internal benchmarks and with SPEC 2000/2006 on a
variety
>> of Intel systems (Core2, Corei7, SandyBridge) and a couple of different AMD
Opteron systems.
>> This improves performance of an internal search indexing benchmark by close
to 2% on
>> all the tested Intel platforms.  It also consistently improves 445.gobmk
(with FDO feedback
>> where unrolling kicks in) by close to 1% on AMD Opteron. Other performance
effects are
>> neutral.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>>
>> Thanks,
>> Teresa
>>
>> 2012-04-24   Teresa Johnson  <tejohnson@google.com>
>>
>>        * loop-unroll.c (loop_has_call): New function.
>>        (loop_has_FP_comp): Ditto.
>>        (compute_weighted_branches): Ditto.
>>        (max_unroll_with_branches): Ditto.
>>        (decide_unroll_constant_iterations): Add heuristic to avoid
>>        increasing branch mispredicts when unrolling.
>>        (decide_unroll_runtime_iterations): Ditto.
>>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>>
>> Index: loop-unroll.c
>> ===================================================================
>> --- loop-unroll.c       (revision 186783)
>> +++ loop-unroll.c       (working copy)
>> @@ -152,6 +152,180 @@ static void combine_var_copies_in_loop_exit (struc
>>                                             basic_block);
>>  static rtx get_expansion (struct var_to_expand *);
>>
>> +/* Determine whether LOOP contains call.  */
>> +static bool
>> +loop_has_call(struct loop *loop)
>> +{
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>
> You repeatedly do this and walk over all blocks.  Please think about
> compile-time
> issues when writing code.
>
> This all looks sort-of target specific to me and I don't see why this
> very specialized
> patch is a good idea when unrolling does a very poor job deciding what and
how
> much to unroll generally.
>
> Richard.
>
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          if (CALL_P (insn))
>> +            {
>> +              free (body);
>> +              return true;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>> +
>> +/* Determine whether LOOP contains floating-point computation.  */
>> +static bool
>> +loop_has_FP_comp(struct loop *loop)
>> +{
>> +  rtx set, dest;
>> +  basic_block *body, bb;
>> +  unsigned i;
>> +  rtx insn;
>> +
>> +  body = get_loop_body (loop);
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      bb = body[i];
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          set = single_set (insn);
>> +          if (!set)
>> +            continue;
>> +
>> +          dest = SET_DEST (set);
>> +          if (FLOAT_MODE_P (GET_MODE (dest)))
>> +            {
>> +              free (body);
>> +              return true;
>> +            }
>> +        }
>> +    }
>> +  free (body);
>> +  return false;
>> +}
>> +
>> +/* Compute the number of branches in LOOP, weighted by execution counts.
 */
>> +static float
>> +compute_weighted_branches(struct loop *loop)
>> +{
>> +  int header_count = loop->header->count;
>> +  unsigned i;
>> +  float n;
>> +  basic_block * body;
>> +
>> +  /* If no profile feedback data exists, don't limit unrolling  */
>> +  if (header_count == 0)
>> +    return 0.0;
>> +
>> +  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
>> +
>> +  body = get_loop_body (loop);
>> +  n = 0.0;
>> +  for (i = 0; i < loop->num_nodes; i++)
>> +    {
>> +      if (EDGE_COUNT (body[i]->succs) >= 2)
>> +        {
>> +          /* If this block is executed less frequently than the header
(loop
>> +             entry), then it is weighted based on the ratio of times it is
>> +             executed compared to the header.  */
>> +          if (body[i]->count < header_count)
>> +            n += ((float)body[i]->count)/header_count;
>> +
>> +          /* When it is executed more frequently than the header (i.e. it
is
>> +             in a nested inner loop), simply weight the branch at 1.0.  */
>> +          else
>> +            n += 1.0;
>> +        }
>> +    }
>> +  free (body);
>> +
>> +  return n;
>> +}
>> +
>> +/* Compute the maximum number of times LOOP can be unrolled without
exceeding
>> +   a branch budget, which can increase branch mispredictions. The number of
>> +   branches is computed by weighting each branch with its expected
execution
>> +   probability through the loop based on profile data. If no profile
feedback
>> +   data exists, simply return the current NUNROLL factor.  */
>> +static unsigned
>> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
>> +{
>> +  struct loop *outer;
>> +  struct niter_desc *outer_desc;
>> +  int outer_niters = 1;
>> +  float weighted_outer_branches = 0.0;
>> +  float weighted_num_branches = compute_weighted_branches (loop);
>> +
>> +  /* If there was no profile feedback data, weighted_num_branches will be
0.0
>> +     and we won't limit unrolling. If the weighted_num_branches is at most
1.0,
>> +     also don't limit unrolling as the back-edge branch will not be
duplicated.  */
>> +  if (weighted_num_branches <= 1.0)
>> +    return nunroll;
>> +
>> +  /* Walk up the loop tree until we find a hot outer loop in which the
current
>> +     loop is nested. At that point we will compute the number of times the
>> +     current loop can be unrolled based on the number of branches in the
hot
>> +     outer loop.  */
>> +  outer = loop_outer(loop);
>> +  /* The loop structure contains a fake outermost loop, so this should
always
>> +     be non-NULL for our current loop.  */
>> +  gcc_assert (outer);
>> +  /* Detect if this is the fake outermost loop (at which point we are done)
>> +     by checking its outer loop.  */
>> +  while (loop_outer(outer))
>> +    {
>> +      outer_desc = get_simple_loop_desc (outer);
>> +
>> +      if (outer_desc->const_iter)
>> +        outer_niters *= outer_desc->niter;
>> +      else if (outer->header->count)
>> +        outer_niters *= expected_loop_iterations (outer);
>> +
>> +      weighted_outer_branches = compute_weighted_branches (outer);
>> +
>> +      /* Should have been checked by caller.  */
>> +      gcc_assert(PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1);
>> +
>> +      /* If the outer loop has enough iterations to be considered hot, then
>> +         we can stop our upwards loop tree traversal and examine the
current
>> +         outer loop.  */
>> +      if (outer_niters >= PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +        {
>> +          /* Assume that any call will cause the branch budget to be
exceeded,
>> +             and that we can't unroll the current loop without increasing
>> +             mispredicts.  */
>> +          if (loop_has_call(outer))
>> +            return 0;
>> +
>> +          /* Otherwise, compute the maximum number of times current loop can
be
>> +             unrolled without exceeding our branch budget. First we
subtract
>> +             off the outer loop's weighted branch count from the budget.
Note
>> +             that this includes the branches in the current loop. This
yields
>> +             the number of branches left in the budget for the unrolled
copies.
>> +             We divide this by the number of branches in the current loop
that
>> +             must be duplicated when we unroll, which is the total weighted
>> +             number of branches minus the back-edge branch. This yields the
>> +             number of new loop body copies that can be created by
unrolling
>> +             without exceeding the budget, to which we add 1 to get the
unroll
>> +             factor.  */
>> +          return (PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET) -
>> +                  weighted_outer_branches)/(weighted_num_branches - 1) + 1;
>> +        }
>> +      outer = loop_outer(outer);
>> +    }
>> +
>> +  /* The current loop is not enclosed by a hot enough outer loop in this
>> +     procedure, since the hot outer loop is inter-procedural, assume that
>> +     it already contains a significant number of branches, so don't unroll.
 */
>> +  return 0;
>> +}
>> +
>>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>>  void
>>  unroll_and_peel_loops (int flags)
>> @@ -522,6 +696,7 @@ static void
>>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>>  {
>>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies,
i;
>> +  unsigned nunroll_branches;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -565,6 +740,25 @@ decide_unroll_constant_iterations (struct loop *lo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can
increase
>> +     the number of mispredicts. Ignore loops with FP computation as these
>> +     tend to benefit much more consistently from unrolling.  */
>> +  if (num_loop_branches (loop) > 1
>> +      && loop_has_FP_comp(loop)
>> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> +      && desc->niter < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +    {
>> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* Check whether the loop rolls enough to consider.  */
>>   if (desc->niter < 2 * nunroll)
>>     {
>> @@ -802,7 +996,7 @@ unroll_loop_constant_iterations (struct loop *loop
>>  static void
>>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>>  {
>> -  unsigned nunroll, nunroll_by_av, i;
>> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -856,6 +1050,25 @@ decide_unroll_runtime_iterations (struct loop *loo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can
increase
>> +     the number of mispredicts. Ignore loops with FP computation as these
>> +     tend to benefit much more consistently from unrolling.  */
>> +  if (num_loop_branches (loop) > 1
>> +      && loop_has_FP_comp(loop)
>> +      && PARAM_VALUE (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES) != -1
>> +      && expected_loop_iterations (loop) < (unsigned) PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES))
>> +    {
>> +      nunroll_branches = max_unroll_with_branches(loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* If we have profile feedback, check whether the loop rolls.  */
>>   if ((loop->header->count
>>        && expected_loop_iterations (loop) < 2 * nunroll)
>> Index: params.def
>> ===================================================================
>> --- params.def  (revision 186783)
>> +++ params.def  (working copy)
>> @@ -312,6 +312,16 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>>         "The maximum depth of a loop nest we completely peel",
>>         8, 0, 0)
>>
>> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
>> +       "min-iter-unroll-with-branches",
>> +       "Minimum iteration count to ignore branch effects when unrolling",
>> +       50, 0, 0)
>> +
>> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
>> +       "unroll-outer-loop-branch-budget",
>> +       "Maximum number of branches allowed in hot outer loop region after
unroll",
>> +       25, 0, 0)
>> +
>>  /* The maximum number of insns of an unswitched loop.  */
>>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>>        "max-unswitch-insns",
>>
>> --
>> This patch is available for review at http://codereview.appspot.com/6099055

izamyatin_gmail.com

Are you sure that tree-level unrollers are turned on at O2? My impression was that ...

13 years, 6 months ago (2012-04-27 07:07:42 UTC) #15

Are you sure that tree-level unrollers are turned on at O2? My
impression was that they work only at O3 or with f[unroll,peel]-loops
flags.

On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote:
> tejohnson@google.com (Teresa Johnson) writes:
>
>> This patch adds heuristics to limit unrolling in loops with branches
>> that may increase branch mispredictions. It affects loops that are
>> not frequently iterated, and that are nested within a hot region of code that
already contains many branch instructions.
>>
>> Performance tested with both internal benchmarks and with SPEC
>> 2000/2006 on a variety of Intel systems (Core2, Corei7, SandyBridge) and a
couple of different AMD Opteron systems.
>> This improves performance of an internal search indexing benchmark by
>> close to 2% on all the tested Intel platforms.  It also consistently
>> improves 445.gobmk (with FDO feedback where unrolling kicks in) by
>> close to 1% on AMD Opteron. Other performance effects are neutral.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>
> One problem with any unrolling heuristics is currently that gcc has
> both the tree level and the rtl level unroller. The tree one is even
> on at -O3.  So if you tweak anything for one you have to affect both,
> otherwise the other may still do the wrong thing(tm).

Tree level unrollers (cunrolli and cunroll) do complete unroll. At O2,
both of them are turned on, but gcc does not allow any code growth --
which makes them pretty useless at O2 (very few loops qualify). The
default max complete peel iteration is also too low compared with both
icc and llvm.  This needs to be tuned.

David

>
> For some other tweaks I looked into a shared cost model some time ago.
> May be still needed.
>
> -Andi
>
> --
> ak@linux.intel.com -- Speaking for myself only

davidxl

On Fri, Apr 27, 2012 at 12:07 AM, Igor Zamyatin <izamyatin@gmail.com> wrote: > Are you ...

13 years, 6 months ago (2012-04-27 07:18:37 UTC) #16

On Fri, Apr 27, 2012 at 12:07 AM, Igor Zamyatin <izamyatin@gmail.com> wrote:
> Are you sure that tree-level unrollers are turned on at O2? My
> impression was that they work only at O3 or with f[unroll,peel]-loops
> flags.

yes they are on but only have effect on tiny loops with very small
trip count. With O3 or with -funroll,peel-loops, the size is allowed
to grow.

David

>
> On Tue, Apr 24, 2012 at 6:13 PM, Andi Kleen <andi@firstfloor.org> wrote:
>> tejohnson@google.com (Teresa Johnson) writes:
>>
>>> This patch adds heuristics to limit unrolling in loops with branches
>>> that may increase branch mispredictions. It affects loops that are
>>> not frequently iterated, and that are nested within a hot region of code
that already contains many branch instructions.
>>>
>>> Performance tested with both internal benchmarks and with SPEC
>>> 2000/2006 on a variety of Intel systems (Core2, Corei7, SandyBridge) and a
couple of different AMD Opteron systems.
>>> This improves performance of an internal search indexing benchmark by
>>> close to 2% on all the tested Intel platforms.  It also consistently
>>> improves 445.gobmk (with FDO feedback where unrolling kicks in) by
>>> close to 1% on AMD Opteron. Other performance effects are neutral.
>>>
>>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Is this ok for trunk?
>>
>> One problem with any unrolling heuristics is currently that gcc has
>> both the tree level and the rtl level unroller. The tree one is even
>> on at -O3.  So if you tweak anything for one you have to affect both,
>> otherwise the other may still do the wrong thing(tm).
>
> Tree level unrollers (cunrolli and cunroll) do complete unroll. At O2,
> both of them are turned on, but gcc does not allow any code growth --
> which makes them pretty useless at O2 (very few loops qualify). The
> default max complete peel iteration is also too low compared with both
> icc and llvm.  This needs to be tuned.
>
> David
>
>>
>> For some other tweaks I looked into a shared cost model some time ago.
>> May be still needed.
>>
>> -Andi
>>
>> --
>> ak@linux.intel.com -- Speaking for myself only

tejohnson

Fixed the stylist suggestions. Other responses below. On Tue, Apr 24, 2012 at 10:22 PM, ...

13 years, 6 months ago (2012-05-01 21:17:23 UTC) #17

tejohnson

Improved patch based on feedback. Main changes are: 1) Improve efficiency by caching loop analysis ...

13 years, 6 months ago (2012-05-01 22:54:30 UTC) #18

Improved patch based on feedback. Main changes are:

1) Improve efficiency by caching loop analysis results in the loop auxiliary
info structure hanging off the loop structure. Renamed this structure from
niter_desc to loop_desc to reflect the broader type of information cached
in the structure. Added a new routine, analyze_loop_insns, to fill in
information about the average and total number of branches, as well
as whether there are any floating point set and call instructions in the loop.
The new routine is invoked when we first create a loop's loop_desc struct,
and the caller (get_simple_loop_desc) has been modified to handle creating
a loop_desc for the fake outermost loop.

2) Improvements to max_unroll_with_branches:
- Treat the fake outermost loop (the procedure body) as we would a hot outer
loop, i.e. compute the max unroll looking at its nested branches, instead of
shutting off unrolling when we reach the fake outermost loop.
- Pull the checks previously done in the caller into the routine (e.g.
whether the loop iterates frequently or contains fp instructions).
- Fix a bug in the previous version that sometimes caused overflow in the 
new unroll factor.

3) Remove float variables, and use integer computation to compute the
average number of branches in the loop.

4) Detect more types of floating point computations in the loop by walking
all set instructions, not just single sets.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Ok for trunk?

Thanks,
Teresa

2012-05-01   Teresa Johnson  <tejohnson@google.com>

	* doc/invoke.texi: Update the documentation with new params.
	* loop-unroll.c (max_unroll_with_branches): New function.
	(loop_exit_at_end_p, decide_peel_once_rolling): Rename niter_desc
        to loop_desc.
	(decide_peel_completely, peel_loop_completely): Ditto.
	(unroll_loop_runtime_iterations, decide_peel_simple): Ditto.
	(peel_loop_simple, decide_unroll_stupid, unroll_loop_stupid): Ditto.
	(decide_unroll_constant_iterations, decide_unroll_runtime_iterations):
        Ditto, and add heuristic to avoid increasing branch mispredicts when
        unrolling.
	* loop-doloop.c (doloop_valid_p): Rename niter_desc to loop_desc.
	(doloop_modify, doloop_optimize): Ditto.
	* loop-iv.c (simplify_using_initial_values): Ditto.
	(canonicalize_iv_subregs, determine_max_iter): Ditto.
	(iv_number_of_iterations, check_simple_exit): Ditto.
	(find_simple_exit, free_simple_loop_desc): Ditto.
	(get_simple_loop_desc): Ditto, invoke new analyze_loop_insns
        function, and add guards to enable this function to work for the
        outermost loop.
	* cfgloop.c (insn_has_fp_set, analyze_loop_insns): New functions.
	* cfgloop.h (struct loop_desc): Renamed from struct niter_desc
        and added new fields to cache additional loop analysis information.
	(find_simple_exit, get_simple_loop_desc, simple_loop_desc): Rename
        niter_desc to loop_desc.
	(analyze_loop_insns): Declare.
	* params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.


Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 187013)
+++ doc/invoke.texi	(working copy)
@@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
 @item max-unswitch-level
 The maximum number of branches unswitched in a single loop.
 
+@item min-iter-unroll-with-branches
+Minimum iteration count to ignore branch effects when unrolling.
+
+@item unroll-outer-loop-branch-budget
+Maximum number of branches allowed in hot outer loop region after unroll.
+
 @item lim-expensive
 The minimum cost of an expensive expression in the loop invariant motion.
 
Index: loop-unroll.c
===================================================================
--- loop-unroll.c	(revision 187013)
+++ loop-unroll.c	(working copy)
@@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
 					     basic_block);
 static rtx get_expansion (struct var_to_expand *);
 
+/* Compute the maximum number of times LOOP can be unrolled without exceeding
+   a branch budget, which can increase branch mispredictions. The number of
+   branches is computed by weighting each branch with its expected execution
+   probability through the loop based on profile data. If no profile feedback
+   data exists, simply return the current NUNROLL factor.  */
+
+static unsigned
+max_unroll_with_branches(struct loop *loop, unsigned nunroll)
+{
+  struct loop *outer;
+  struct loop_desc *outer_desc = 0;
+  int outer_niters = 1;
+  int frequent_iteration_threshold;
+  unsigned branch_budget;
+  struct loop_desc *desc = get_simple_loop_desc (loop);
+
+  /* Ignore loops with FP computation as these tend to benefit much more
+     consistently from unrolling.  */
+  if (desc->has_fp)
+    return nunroll;
+
+  frequent_iteration_threshold = PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
+  if (expected_loop_iterations (loop) >= (unsigned)
frequent_iteration_threshold)
+    return nunroll;
+
+  /* If there was no profile feedback data, av_num_branches will be 0
+     and we won't limit unrolling. If the av_num_branches is at most 1,
+     also don't limit unrolling as the back-edge branch will not be duplicated.
 */
+  if (desc->av_num_branches <= 1)
+    return nunroll;
+
+  /* Walk up the loop tree until we find a hot outer loop in which the current
+     loop is nested. At that point we will compute the number of times the
+     current loop can be unrolled based on the number of branches in the hot
+     outer loop.  */
+  outer = loop_outer (loop);
+  /* The loop structure contains a fake outermost loop, so this should always
+     be non-NULL for our current loop.  */
+  gcc_assert (outer);
+
+  /* Walk up the loop tree until we either find a hot outer loop or hit the
+     fake outermost loop at the root.  */
+  while (true)
+    {
+      outer_desc = get_simple_loop_desc (outer);
+
+      /* Stop if we hit the fake outermost loop at the root of the tree,
+         which includes the whole procedure.  */
+      if (!loop_outer (outer))
+        break;
+
+      if (outer_desc->const_iter)
+        outer_niters *= outer_desc->niter;
+      else if (outer->header->count)
+        outer_niters *= expected_loop_iterations (outer);
+
+      /* If the outer loop has enough iterations to be considered hot, then
+         we can stop our upwards loop tree traversal and examine the current
+         outer loop.  */
+      if (outer_niters >= frequent_iteration_threshold)
+        break;
+
+      outer = loop_outer (outer);
+    }
+
+  gcc_assert(outer);
+
+  /* Assume that any call will cause the branch budget to be exceeded,
+     and that we can't unroll the current loop without increasing
+     mispredicts.  */
+  if (outer_desc->has_call)
+    return 0;
+
+  /* Otherwise, compute the maximum number of times current loop can be
+     unrolled without exceeding our branch budget. First we subtract
+     off the outer loop's average branch count from the budget. Note
+     that this includes the branches in the current loop. This yields
+     the number of branches left in the budget for the unrolled copies.
+     We divide this by the number of branches in the current loop that
+     must be duplicated when we unroll, which is the total average
+     number of branches minus the back-edge branch. This yields the
+     number of new loop body copies that can be created by unrolling
+     without exceeding the budget, to which we add 1 to get the unroll
+     factor. Note that the "outermost loop" may be the whole procedure
+     if we did not find a hot enough enclosing loop.  */
+  branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
+  if (outer_desc->av_num_branches > branch_budget)
+    return 0;
+  /* We already returned early if desc->av_num_branches <= 1.  */
+  return (branch_budget - outer_desc->av_num_branches)
+      / (desc->av_num_branches - 1) + 1;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -211,7 +304,7 @@ unroll_and_peel_loops (int flags)
 static bool
 loop_exit_at_end_p (struct loop *loop)
 {
-  struct niter_desc *desc = get_simple_loop_desc (loop);
+  struct loop_desc *desc = get_simple_loop_desc (loop);
   rtx insn;
 
   if (desc->in_edge->dest != loop->latch)
@@ -321,7 +414,7 @@ decide_unrolling_and_peeling (int flags)
 static void
 decide_peel_once_rolling (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 {
-  struct niter_desc *desc;
+  struct loop_desc *desc;
 
   if (dump_file)
     fprintf (dump_file, "\n;; Considering peeling once rolling loop\n");
@@ -361,7 +454,7 @@ static void
 decide_peel_completely (struct loop *loop, int flags ATTRIBUTE_UNUSED)
 {
   unsigned npeel;
-  struct niter_desc *desc;
+  struct loop_desc *desc;
 
   if (dump_file)
     fprintf (dump_file, "\n;; Considering peeling completely\n");
@@ -459,7 +552,7 @@ peel_loop_completely (struct loop *loop)
   unsigned i;
   VEC (edge, heap) *remove_edges;
   edge ein;
-  struct niter_desc *desc = get_simple_loop_desc (loop);
+  struct loop_desc *desc = get_simple_loop_desc (loop);
   struct opt_info *opt_info = NULL;
 
   npeel = desc->niter;
@@ -522,7 +615,8 @@ static void
 decide_unroll_constant_iterations (struct loop *loop, int flags)
 {
   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
-  struct niter_desc *desc;
+  unsigned nunroll_branches;
+  struct loop_desc *desc;
 
   if (!(flags & UAP_UNROLL))
     {
@@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts.  */
+  if (desc->num_branches > 1)
+    {
+      nunroll_branches = max_unroll_with_branches (loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+	    fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* Check whether the loop rolls enough to consider.  */
   if (desc->niter < 2 * nunroll)
     {
@@ -644,7 +753,7 @@ unroll_loop_constant_iterations (struct loop *loop
   VEC (edge, heap) *remove_edges;
   edge e;
   unsigned max_unroll = loop->lpt_decision.times;
-  struct niter_desc *desc = get_simple_loop_desc (loop);
+  struct loop_desc *desc = get_simple_loop_desc (loop);
   bool exit_at_end = loop_exit_at_end_p (loop);
   struct opt_info *opt_info = NULL;
   bool ok;
@@ -802,8 +911,8 @@ unroll_loop_constant_iterations (struct loop *loop
 static void
 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 {
-  unsigned nunroll, nunroll_by_av, i;
-  struct niter_desc *desc;
+  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
+  struct loop_desc *desc;
 
   if (!(flags & UAP_UNROLL))
     {
@@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts.  */
+  if (desc->num_branches > 1)
+    {
+      nunroll_branches = max_unroll_with_branches (loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+            fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* If we have profile feedback, check whether the loop rolls.  */
   if ((loop->header->count
        && expected_loop_iterations (loop) < 2 * nunroll)
@@ -973,7 +1097,7 @@ unroll_loop_runtime_iterations (struct loop *loop)
   edge e;
   bool extra_zero_check, last_may_exit;
   unsigned max_unroll = loop->lpt_decision.times;
-  struct niter_desc *desc = get_simple_loop_desc (loop);
+  struct loop_desc *desc = get_simple_loop_desc (loop);
   bool exit_at_end = loop_exit_at_end_p (loop);
   struct opt_info *opt_info = NULL;
   bool ok;
@@ -1196,7 +1320,7 @@ static void
 decide_peel_simple (struct loop *loop, int flags)
 {
   unsigned npeel;
-  struct niter_desc *desc;
+  struct loop_desc *desc;
 
   if (!(flags & UAP_PEEL))
     {
@@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
 
   /* Do not simply peel loops with branches inside -- it increases number
      of mispredicts.  */
-  if (num_loop_branches (loop) > 1)
+  if (desc->num_branches > 1)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not peeling, contains branches\n");
@@ -1295,7 +1419,7 @@ peel_loop_simple (struct loop *loop)
 {
   sbitmap wont_exit;
   unsigned npeel = loop->lpt_decision.times;
-  struct niter_desc *desc = get_simple_loop_desc (loop);
+  struct loop_desc *desc = get_simple_loop_desc (loop);
   struct opt_info *opt_info = NULL;
   bool ok;
 
@@ -1349,7 +1473,7 @@ static void
 decide_unroll_stupid (struct loop *loop, int flags)
 {
   unsigned nunroll, nunroll_by_av, i;
-  struct niter_desc *desc;
+  struct loop_desc *desc;
 
   if (!(flags & UAP_UNROLL_ALL))
     {
@@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
 
   /* Do not unroll loops with branches inside -- it increases number
      of mispredicts.  */
-  if (num_loop_branches (loop) > 1)
+  if (desc->num_branches > 1)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not unrolling, contains branches\n");
@@ -1448,7 +1572,7 @@ unroll_loop_stupid (struct loop *loop)
 {
   sbitmap wont_exit;
   unsigned nunroll = loop->lpt_decision.times;
-  struct niter_desc *desc = get_simple_loop_desc (loop);
+  struct loop_desc *desc = get_simple_loop_desc (loop);
   struct opt_info *opt_info = NULL;
   bool ok;
 
Index: loop-doloop.c
===================================================================
--- loop-doloop.c	(revision 187013)
+++ loop-doloop.c	(working copy)
@@ -259,7 +259,7 @@ doloop_condition_get (rtx doloop_pat)
    describes the number of iterations of the loop.  */
 
 static bool
-doloop_valid_p (struct loop *loop, struct niter_desc *desc)
+doloop_valid_p (struct loop *loop, struct loop_desc *desc)
 {
   basic_block *body = get_loop_body (loop), bb;
   rtx insn;
@@ -397,7 +397,7 @@ add_test (rtx cond, edge *e, basic_block dest)
    DOLOOP_SEQ.  COUNT is the number of iterations of the LOOP.  */
 
 static void
-doloop_modify (struct loop *loop, struct niter_desc *desc,
+doloop_modify (struct loop *loop, struct loop_desc *desc,
 	       rtx doloop_seq, rtx condition, rtx count)
 {
   rtx counter_reg;
@@ -605,7 +605,7 @@ doloop_optimize (struct loop *loop)
   rtx condition;
   unsigned level, est_niter;
   int max_cost;
-  struct niter_desc *desc;
+  struct loop_desc *desc;
   unsigned word_mode_size;
   unsigned HOST_WIDE_INT word_mode_max;
 
@@ -751,4 +751,3 @@ doloop_optimize_loops (void)
 #endif
 }
 #endif /* HAVE_doloop_end */
-
Index: loop-iv.c
===================================================================
--- loop-iv.c	(revision 187013)
+++ loop-iv.c	(working copy)
@@ -2022,7 +2022,7 @@ simplify_using_initial_values (struct loop *loop,
 
 static void
 shorten_into_mode (struct rtx_iv *iv, enum machine_mode mode,
-		   enum rtx_code cond, bool signed_p, struct niter_desc *desc)
+		   enum rtx_code cond, bool signed_p, struct loop_desc *desc)
 {
   rtx mmin, mmax, cond_over, cond_under;
 
@@ -2081,7 +2081,7 @@ shorten_into_mode (struct rtx_iv *iv, enum machine
 
 static bool
 canonicalize_iv_subregs (struct rtx_iv *iv0, struct rtx_iv *iv1,
-			 enum rtx_code cond, struct niter_desc *desc)
+			 enum rtx_code cond, struct loop_desc *desc)
 {
   enum machine_mode comp_mode;
   bool signed_p;
@@ -2196,7 +2196,7 @@ canonicalize_iv_subregs (struct rtx_iv *iv0, struc
    expression for the number of iterations, before we tried to simplify it. 
*/
 
 static unsigned HOST_WIDEST_INT
-determine_max_iter (struct loop *loop, struct niter_desc *desc, rtx old_niter)
+determine_max_iter (struct loop *loop, struct loop_desc *desc, rtx old_niter)
 {
   rtx niter = desc->niter_expr;
   rtx mmin, mmax, cmp;
@@ -2244,7 +2244,7 @@ static unsigned HOST_WIDEST_INT
 
 static void
 iv_number_of_iterations (struct loop *loop, rtx insn, rtx condition,
-			 struct niter_desc *desc)
+			 struct loop_desc *desc)
 {
   rtx op0, op1, delta, step, bound, may_xform, tmp, tmp0, tmp1;
   struct rtx_iv iv0, iv1, tmp_iv;
@@ -2803,7 +2803,7 @@ fail:
    into DESC.  */
 
 static void
-check_simple_exit (struct loop *loop, edge e, struct niter_desc *desc)
+check_simple_exit (struct loop *loop, edge e, struct loop_desc *desc)
 {
   basic_block exit_bb;
   rtx condition, at;
@@ -2850,12 +2850,12 @@ static void
 /* Finds a simple exit of LOOP and stores its description into DESC.  */
 
 void
-find_simple_exit (struct loop *loop, struct niter_desc *desc)
+find_simple_exit (struct loop *loop, struct loop_desc *desc)
 {
   unsigned i;
   basic_block *body;
   edge e;
-  struct niter_desc act;
+  struct loop_desc act;
   bool any = false;
   edge_iterator ei;
 
@@ -2937,19 +2937,23 @@ void
 /* Creates a simple loop description of LOOP if it was not computed
    already.  */
 
-struct niter_desc *
+struct loop_desc *
 get_simple_loop_desc (struct loop *loop)
 {
-  struct niter_desc *desc = simple_loop_desc (loop);
+  struct loop_desc *desc = simple_loop_desc (loop);
 
   if (desc)
     return desc;
 
   /* At least desc->infinite is not always initialized by
      find_simple_loop_exit.  */
-  desc = XCNEW (struct niter_desc);
-  iv_analysis_loop_init (loop);
-  find_simple_exit (loop, desc);
+  desc = XCNEW (struct loop_desc);
+  if (loop->latch != EXIT_BLOCK_PTR)
+    {
+      iv_analysis_loop_init (loop);
+      find_simple_exit (loop, desc);
+    }
+  analyze_loop_insns (loop, desc);
   loop->aux = desc;
 
   if (desc->simple_p && (desc->assumptions || desc->infinite))
@@ -2995,7 +2999,7 @@ get_simple_loop_desc (struct loop *loop)
 void
 free_simple_loop_desc (struct loop *loop)
 {
-  struct niter_desc *desc = simple_loop_desc (loop);
+  struct loop_desc *desc = simple_loop_desc (loop);
 
   if (!desc)
     return;
Index: cfgloop.c
===================================================================
--- cfgloop.c	(revision 187013)
+++ cfgloop.c	(working copy)
@@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
   return edges;
 }
 
-/* Counts the number of conditional branches inside LOOP.  */
+/* Determine if INSN is a floating point set.  */
 
-unsigned
-num_loop_branches (const struct loop *loop)
+static bool
+insn_has_fp_set(rtx insn)
 {
-  unsigned i, n;
-  basic_block * body;
+  int i;
+  rtx pat = PATTERN(insn);
+  if (GET_CODE (pat) == SET)
+    return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
+  else if (GET_CODE (pat) == PARALLEL)
+    {
+      for (i = 0; i < XVECLEN (pat, 0); i++)
+        {
+          rtx sub = XVECEXP (pat, 0, i);
+          if (GET_CODE (sub) == SET)
+            return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
+        }
+    }
+  return false;
+}
 
-  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
+/* Analyzes the instructions inside LOOP, updating the DESC. Currently counts
+   the number of conditional branch instructions, calls and fp instructions,
+   as well as the average number of branches executed per iteration.  */
 
+void
+analyze_loop_insns (const struct loop *loop, struct loop_desc *desc)
+{
+  unsigned i, nbranch;
+  gcov_type weighted_nbranch;
+  bool has_call, has_fp;
+  basic_block * body, bb;
+  rtx insn;
+  gcov_type header_count = loop->header->count;
+
+  nbranch = weighted_nbranch = 0;
+  has_call = has_fp = false;
+
   body = get_loop_body (loop);
-  n = 0;
   for (i = 0; i < loop->num_nodes; i++)
-    if (EDGE_COUNT (body[i]->succs) >= 2)
-      n++;
+    {
+      bb = body[i];
+
+      if (EDGE_COUNT (bb->succs) >= 2)
+        {
+          nbranch++;
+
+          /* If this block is executed less frequently than the header (loop
+             entry), then it is weighted based on its execution count, which
+             will be turned into a ratio compared to the loop header below. */
+          if (bb->count < header_count)
+            weighted_nbranch += bb->count;
+
+          /* When it is executed more frequently than the header (i.e. it is
+             in a nested inner loop), simply weight the branch the same as the
+             header execution count, so that it will contribute 1 branch to
+             the ratio computed below. */
+          else
+            weighted_nbranch += header_count;
+        }
+
+      /* No need to iterate through the instructions below if
+         both flags have already been set.  */
+      if (has_call && has_fp)
+        continue;
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          if (!INSN_P (insn))
+            continue;
+
+          if (!has_call)
+            has_call = CALL_P (insn);
+
+          if (!has_fp)
+            has_fp = insn_has_fp_set (insn);
+        }
+    }
   free (body);
 
-  return n;
+  desc->num_branches = nbranch;
+  /* Now divide the weights computed above by the loop header execution count,
+     to compute the average number of branches through the loop. By adding
+     header_count/2 to the numerator we round to nearest with integer
+     division.  */
+  if (header_count  != 0)
+    desc->av_num_branches
+        = (weighted_nbranch + header_count/2) / header_count;
+  else
+    desc->av_num_branches = 0;
+  desc->has_call = has_call;
+  desc->has_fp = has_fp;
 }
 
 /* Adds basic block BB to LOOP.  */
Index: cfgloop.h
===================================================================
--- cfgloop.h	(revision 187013)
+++ cfgloop.h	(working copy)
@@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
 
 extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
 edge single_exit (const struct loop *);
-extern unsigned num_loop_branches (const struct loop *);
 
 extern edge loop_preheader_edge (const struct loop *);
 extern edge loop_latch_edge (const struct loop *);
@@ -359,9 +358,10 @@ struct rtx_iv
 };
 
 /* The description of an exit from the loop and of the number of iterations
-   till we take the exit.  */
+   till we take the exit. Also includes other information used primarily
+   by the loop unroller.  */
 
-struct niter_desc
+struct loop_desc
 {
   /* The edge out of the loop.  */
   edge out_edge;
@@ -400,6 +400,18 @@ struct rtx_iv
 
   /* The number of iterations of the loop.  */
   rtx niter_expr;
+
+  /* The number of branches in the loop.  */
+  unsigned num_branches;
+
+  /* The number of executed branches per iteration.  */
+  unsigned av_num_branches;
+
+  /* Whether the loop contains a call instruction.  */
+  bool has_call;
+
+  /* Whether the loop contains fp instructions.  */
+  bool has_fp;
 };
 
 extern void iv_analysis_loop_init (struct loop *);
@@ -408,16 +420,17 @@ extern bool iv_analyze_result (rtx, rtx, struct rt
 extern bool iv_analyze_expr (rtx, rtx, enum machine_mode, struct rtx_iv *);
 extern rtx get_iv_value (struct rtx_iv *, rtx);
 extern bool biv_p (rtx, rtx);
-extern void find_simple_exit (struct loop *, struct niter_desc *);
+extern void find_simple_exit (struct loop *, struct loop_desc *);
 extern void iv_analysis_done (void);
 
-extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
+extern struct loop_desc *get_simple_loop_desc (struct loop *loop);
 extern void free_simple_loop_desc (struct loop *loop);
+void analyze_loop_insns (const struct loop *, struct loop_desc *desc);
 
-static inline struct niter_desc *
+static inline struct loop_desc *
 simple_loop_desc (struct loop *loop)
 {
-  return (struct niter_desc *) loop->aux;
+  return (struct loop_desc *) loop->aux;
 }
 
 /* Accessors for the loop structures.  */
Index: params.def
===================================================================
--- params.def	(revision 187013)
+++ params.def	(working copy)
@@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
 
+DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
+	"min-iter-unroll-with-branches",
+	"Minimum iteration count to ignore branch effects when unrolling",
+	50, 0, 0)
+DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
+	"unroll-outer-loop-branch-budget",
+	"Maximum number of branches allowed in hot outer loop region after unroll",
+	25, 0, 0)
+
 /* The maximum number of insns of an unswitched loop.  */
 DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
 	"max-unswitch-insns",

--
This patch is available for review at http://codereview.appspot.com/6099055

davidxl

It might be better to separate the data structure name change (niter_desc to loop_desc) into ...

13 years, 6 months ago (2012-05-04 16:39:17 UTC) #19

tejohnson

On David's suggestion, I have removed the changes that rename niter_desc to loop_desc from this ...

13 years, 6 months ago (2012-05-04 22:41:41 UTC) #20

On David's suggestion, I have removed the changes that rename niter_desc to
loop_desc from this patch to focus the patch on the unrolling changes. I can
submit a cleanup patch to do the renaming as soon as this one goes in.

Bootstrapped and tested on x86_64-unknown-linux-gnu.  Ok for trunk?

Thanks,
Teresa

Here is the new description of improvements from the original patch:

Improved patch based on feedback. Main changes are:

1) Improve efficiency by caching loop analysis results in the loop auxiliary
info structure hanging off the loop structure. Added a new routine,
analyze_loop_insns, to fill in information about the average and total number
of branches, as well as whether there are any floating point set and call
instructions in the loop. The new routine is invoked when we first create a
loop's niter_desc struct, and the caller (get_simple_loop_desc) has been
modified to handle creating a niter_desc for the fake outermost loop.

2) Improvements to max_unroll_with_branches:
- Treat the fake outermost loop (the procedure body) as we would a hot outer
loop, i.e. compute the max unroll looking at its nested branches, instead of
shutting off unrolling when we reach the fake outermost loop.
- Pull the checks previously done in the caller into the routine (e.g.
whether the loop iterates frequently or contains fp instructions).
- Fix a bug in the previous version that sometimes caused overflow in the
new unroll factor.

3) Remove float variables, and use integer computation to compute the
average number of branches in the loop.

4) Detect more types of floating point computations in the loop by walking
all set instructions, not just single sets.

2012-05-04   Teresa Johnson  <tejohnson@google.com>

	* doc/invoke.texi: Update the documentation with new params.
	* loop-unroll.c (max_unroll_with_branches): New function.
	(decide_unroll_constant_iterations, decide_unroll_runtime_iterations):
        Add heuristic to avoid increasing branch mispredicts when unrolling.
	(decide_peel_simple, decide_unroll_stupid): Retrieve number of
        branches from niter_desc instead of via function that walks loop.
	* loop-iv.c (get_simple_loop_desc): Invoke new analyze_loop_insns
        function, and add guards to enable this function to work for the
        outermost loop.
	* cfgloop.c (insn_has_fp_set, analyze_loop_insns): New functions.
        (num_loop_branches): Remove.
	* cfgloop.h (struct loop_desc): Added new fields to cache additional
        loop analysis information.
        (num_loop_branches): Remove.
	(analyze_loop_insns): Declare.
	* params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.

Index: doc/invoke.texi
===================================================================
--- doc/invoke.texi	(revision 187013)
+++ doc/invoke.texi	(working copy)
@@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
 @item max-unswitch-level
 The maximum number of branches unswitched in a single loop.
 
+@item min-iter-unroll-with-branches
+Minimum iteration count to ignore branch effects when unrolling.
+
+@item unroll-outer-loop-branch-budget
+Maximum number of branches allowed in hot outer loop region after unroll.
+
 @item lim-expensive
 The minimum cost of an expensive expression in the loop invariant motion.
 
Index: loop-unroll.c
===================================================================
--- loop-unroll.c	(revision 187013)
+++ loop-unroll.c	(working copy)
@@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
 					     basic_block);
 static rtx get_expansion (struct var_to_expand *);
 
+/* Compute the maximum number of times LOOP can be unrolled without exceeding
+   a branch budget, which can increase branch mispredictions. The number of
+   branches is computed by weighting each branch with its expected execution
+   probability through the loop based on profile data. If no profile feedback
+   data exists, simply return the current NUNROLL factor.  */
+
+static unsigned
+max_unroll_with_branches(struct loop *loop, unsigned nunroll)
+{
+  struct loop *outer;
+  struct niter_desc *outer_desc = 0;
+  int outer_niters = 1;
+  int frequent_iteration_threshold;
+  unsigned branch_budget;
+  struct niter_desc *desc = get_simple_loop_desc (loop);
+
+  /* Ignore loops with FP computation as these tend to benefit much more
+     consistently from unrolling.  */
+  if (desc->has_fp)
+    return nunroll;
+
+  frequent_iteration_threshold = PARAM_VALUE
(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
+  if (expected_loop_iterations (loop) >= (unsigned)
frequent_iteration_threshold)
+    return nunroll;
+
+  /* If there was no profile feedback data, av_num_branches will be 0
+     and we won't limit unrolling. If the av_num_branches is at most 1,
+     also don't limit unrolling as the back-edge branch will not be duplicated.
 */
+  if (desc->av_num_branches <= 1)
+    return nunroll;
+
+  /* Walk up the loop tree until we find a hot outer loop in which the current
+     loop is nested. At that point we will compute the number of times the
+     current loop can be unrolled based on the number of branches in the hot
+     outer loop.  */
+  outer = loop_outer (loop);
+  /* The loop structure contains a fake outermost loop, so this should always
+     be non-NULL for our current loop.  */
+  gcc_assert (outer);
+
+  /* Walk up the loop tree until we either find a hot outer loop or hit the
+     fake outermost loop at the root.  */
+  while (true)
+    {
+      outer_desc = get_simple_loop_desc (outer);
+
+      /* Stop if we hit the fake outermost loop at the root of the tree,
+         which includes the whole procedure.  */
+      if (!loop_outer (outer))
+        break;
+
+      if (outer_desc->const_iter)
+        outer_niters *= outer_desc->niter;
+      else if (outer->header->count)
+        outer_niters *= expected_loop_iterations (outer);
+
+      /* If the outer loop has enough iterations to be considered hot, then
+         we can stop our upwards loop tree traversal and examine the current
+         outer loop.  */
+      if (outer_niters >= frequent_iteration_threshold)
+        break;
+
+      outer = loop_outer (outer);
+    }
+
+  gcc_assert(outer);
+
+  /* Assume that any call will cause the branch budget to be exceeded,
+     and that we can't unroll the current loop without increasing
+     mispredicts.  */
+  if (outer_desc->has_call)
+    return 0;
+
+  /* Otherwise, compute the maximum number of times current loop can be
+     unrolled without exceeding our branch budget. First we subtract
+     off the outer loop's average branch count from the budget. Note
+     that this includes the branches in the current loop. This yields
+     the number of branches left in the budget for the unrolled copies.
+     We divide this by the number of branches in the current loop that
+     must be duplicated when we unroll, which is the total average
+     number of branches minus the back-edge branch. This yields the
+     number of new loop body copies that can be created by unrolling
+     without exceeding the budget, to which we add 1 to get the unroll
+     factor. Note that the "outermost loop" may be the whole procedure
+     if we did not find a hot enough enclosing loop.  */
+  branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
+  if (outer_desc->av_num_branches > branch_budget)
+    return 0;
+  /* We already returned early if desc->av_num_branches <= 1.  */
+  return (branch_budget - outer_desc->av_num_branches)
+      / (desc->av_num_branches - 1) + 1;
+}
+
 /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
 void
 unroll_and_peel_loops (int flags)
@@ -522,6 +615,7 @@ static void
 decide_unroll_constant_iterations (struct loop *loop, int flags)
 {
   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
+  unsigned nunroll_branches;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts.  */
+  if (desc->num_branches > 1)
+    {
+      nunroll_branches = max_unroll_with_branches (loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+	    fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* Check whether the loop rolls enough to consider.  */
   if (desc->niter < 2 * nunroll)
     {
@@ -802,7 +911,7 @@ unroll_loop_constant_iterations (struct loop *loop
 static void
 decide_unroll_runtime_iterations (struct loop *loop, int flags)
 {
-  unsigned nunroll, nunroll_by_av, i;
+  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
   struct niter_desc *desc;
 
   if (!(flags & UAP_UNROLL))
@@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
       return;
     }
 
+  /* Be careful when unrolling loops with branches inside -- it can increase
+     the number of mispredicts.  */
+  if (desc->num_branches > 1)
+    {
+      nunroll_branches = max_unroll_with_branches (loop, nunroll);
+      if (nunroll > nunroll_branches)
+        nunroll = nunroll_branches;
+      if (nunroll <= 1)
+        {
+          if (dump_file)
+            fprintf (dump_file, ";; Not unrolling, contains branches\n");
+          return;
+        }
+    }
+
   /* If we have profile feedback, check whether the loop rolls.  */
   if ((loop->header->count
        && expected_loop_iterations (loop) < 2 * nunroll)
@@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
 
   /* Do not simply peel loops with branches inside -- it increases number
      of mispredicts.  */
-  if (num_loop_branches (loop) > 1)
+  if (desc->num_branches > 1)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not peeling, contains branches\n");
@@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
 
   /* Do not unroll loops with branches inside -- it increases number
      of mispredicts.  */
-  if (num_loop_branches (loop) > 1)
+  if (desc->num_branches > 1)
     {
       if (dump_file)
 	fprintf (dump_file, ";; Not unrolling, contains branches\n");
Index: loop-iv.c
===================================================================
--- loop-iv.c	(revision 187013)
+++ loop-iv.c	(working copy)
@@ -2948,8 +2948,12 @@ get_simple_loop_desc (struct loop *loop)
   /* At least desc->infinite is not always initialized by
      find_simple_loop_exit.  */
   desc = XCNEW (struct niter_desc);
-  iv_analysis_loop_init (loop);
-  find_simple_exit (loop, desc);
+  if (loop->latch != EXIT_BLOCK_PTR)
+    {
+      iv_analysis_loop_init (loop);
+      find_simple_exit (loop, desc);
+    }
+  analyze_loop_insns (loop, desc);
   loop->aux = desc;
 
   if (desc->simple_p && (desc->assumptions || desc->infinite))
Index: cfgloop.c
===================================================================
--- cfgloop.c	(revision 187013)
+++ cfgloop.c	(working copy)
@@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
   return edges;
 }
 
-/* Counts the number of conditional branches inside LOOP.  */
+/* Determine if INSN is a floating point set.  */
 
-unsigned
-num_loop_branches (const struct loop *loop)
+static bool
+insn_has_fp_set(rtx insn)
 {
-  unsigned i, n;
-  basic_block * body;
+  int i;
+  rtx pat = PATTERN(insn);
+  if (GET_CODE (pat) == SET)
+    return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
+  else if (GET_CODE (pat) == PARALLEL)
+    {
+      for (i = 0; i < XVECLEN (pat, 0); i++)
+        {
+          rtx sub = XVECEXP (pat, 0, i);
+          if (GET_CODE (sub) == SET)
+            return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
+        }
+    }
+  return false;
+}
 
-  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
+/* Analyzes the instructions inside LOOP, updating the DESC. Currently counts
+   the number of conditional branch instructions, calls and fp instructions,
+   as well as the average number of branches executed per iteration.  */
 
+void
+analyze_loop_insns (const struct loop *loop, struct niter_desc *desc)
+{
+  unsigned i, nbranch;
+  gcov_type weighted_nbranch;
+  bool has_call, has_fp;
+  basic_block * body, bb;
+  rtx insn;
+  gcov_type header_count = loop->header->count;
+
+  nbranch = weighted_nbranch = 0;
+  has_call = has_fp = false;
+
   body = get_loop_body (loop);
-  n = 0;
   for (i = 0; i < loop->num_nodes; i++)
-    if (EDGE_COUNT (body[i]->succs) >= 2)
-      n++;
+    {
+      bb = body[i];
+
+      if (EDGE_COUNT (bb->succs) >= 2)
+        {
+          nbranch++;
+
+          /* If this block is executed less frequently than the header (loop
+             entry), then it is weighted based on its execution count, which
+             will be turned into a ratio compared to the loop header below. */
+          if (bb->count < header_count)
+            weighted_nbranch += bb->count;
+
+          /* When it is executed more frequently than the header (i.e. it is
+             in a nested inner loop), simply weight the branch the same as the
+             header execution count, so that it will contribute 1 branch to
+             the ratio computed below. */
+          else
+            weighted_nbranch += header_count;
+        }
+
+      /* No need to iterate through the instructions below if
+         both flags have already been set.  */
+      if (has_call && has_fp)
+        continue;
+
+      FOR_BB_INSNS (bb, insn)
+        {
+          if (!INSN_P (insn))
+            continue;
+
+          if (!has_call)
+            has_call = CALL_P (insn);
+
+          if (!has_fp)
+            has_fp = insn_has_fp_set (insn);
+        }
+    }
   free (body);
 
-  return n;
+  desc->num_branches = nbranch;
+  /* Now divide the weights computed above by the loop header execution count,
+     to compute the average number of branches through the loop. By adding
+     header_count/2 to the numerator we round to nearest with integer
+     division.  */
+  if (header_count  != 0)
+    desc->av_num_branches
+        = (weighted_nbranch + header_count/2) / header_count;
+  else
+    desc->av_num_branches = 0;
+  desc->has_call = has_call;
+  desc->has_fp = has_fp;
 }
 
 /* Adds basic block BB to LOOP.  */
Index: cfgloop.h
===================================================================
--- cfgloop.h	(revision 187013)
+++ cfgloop.h	(working copy)
@@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
 
 extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
 edge single_exit (const struct loop *);
-extern unsigned num_loop_branches (const struct loop *);
 
 extern edge loop_preheader_edge (const struct loop *);
 extern edge loop_latch_edge (const struct loop *);
@@ -359,7 +358,8 @@ struct rtx_iv
 };
 
 /* The description of an exit from the loop and of the number of iterations
-   till we take the exit.  */
+   till we take the exit. Also includes other information used primarily
+   by the loop unroller.  */
 
 struct niter_desc
 {
@@ -400,6 +400,18 @@ struct niter_desc
 
   /* The number of iterations of the loop.  */
   rtx niter_expr;
+
+  /* The number of branches in the loop.  */
+  unsigned num_branches;
+
+  /* The number of executed branches per iteration.  */
+  unsigned av_num_branches;
+
+  /* Whether the loop contains a call instruction.  */
+  bool has_call;
+
+  /* Whether the loop contains fp instructions.  */
+  bool has_fp;
 };
 
 extern void iv_analysis_loop_init (struct loop *);
@@ -413,6 +425,7 @@ extern void iv_analysis_done (void);
 
 extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
 extern void free_simple_loop_desc (struct loop *loop);
+void analyze_loop_insns (const struct loop *, struct niter_desc *desc);
 
 static inline struct niter_desc *
 simple_loop_desc (struct loop *loop)
Index: params.def
===================================================================
--- params.def	(revision 187013)
+++ params.def	(working copy)
@@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
 	 "The maximum depth of a loop nest we completely peel",
 	 8, 0, 0)
 
+DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
+	"min-iter-unroll-with-branches",
+	"Minimum iteration count to ignore branch effects when unrolling",
+	50, 0, 0)
+DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
+	"unroll-outer-loop-branch-budget",
+	"Maximum number of branches allowed in hot outer loop region after unroll",
+	25, 0, 0)
+
 /* The maximum number of insns of an unswitched loop.  */
 DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
 	"max-unswitch-insns",

--
This patch is available for review at http://codereview.appspot.com/6099055

tejohnson

Ping? Teresa On Fri, May 4, 2012 at 3:41 PM, Teresa Johnson <tejohnson@google.com> wrote: > ...

13 years, 5 months ago (2012-05-11 13:09:44 UTC) #21

Ping?
Teresa

On Fri, May 4, 2012 at 3:41 PM, Teresa Johnson <tejohnson@google.com> wrote:

> On David's suggestion, I have removed the changes that rename niter_desc to
> loop_desc from this patch to focus the patch on the unrolling changes. I
> can
> submit a cleanup patch to do the renaming as soon as this one goes in.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Ok for trunk?
>
> Thanks,
> Teresa
>
> Here is the new description of improvements from the original patch:
>
> Improved patch based on feedback. Main changes are:
>
> 1) Improve efficiency by caching loop analysis results in the loop
> auxiliary
> info structure hanging off the loop structure. Added a new routine,
> analyze_loop_insns, to fill in information about the average and total
> number
> of branches, as well as whether there are any floating point set and call
> instructions in the loop. The new routine is invoked when we first create a
> loop's niter_desc struct, and the caller (get_simple_loop_desc) has been
> modified to handle creating a niter_desc for the fake outermost loop.
>
> 2) Improvements to max_unroll_with_branches:
> - Treat the fake outermost loop (the procedure body) as we would a hot
> outer
> loop, i.e. compute the max unroll looking at its nested branches, instead
> of
> shutting off unrolling when we reach the fake outermost loop.
> - Pull the checks previously done in the caller into the routine (e.g.
> whether the loop iterates frequently or contains fp instructions).
> - Fix a bug in the previous version that sometimes caused overflow in the
> new unroll factor.
>
> 3) Remove float variables, and use integer computation to compute the
> average number of branches in the loop.
>
> 4) Detect more types of floating point computations in the loop by walking
> all set instructions, not just single sets.
>
> 2012-05-04   Teresa Johnson  <tejohnson@google.com>
>
>        * doc/invoke.texi: Update the documentation with new params.
>        * loop-unroll.c (max_unroll_with_branches): New function.
>        (decide_unroll_constant_iterations,
> decide_unroll_runtime_iterations):
>        Add heuristic to avoid increasing branch mispredicts when unrolling.
>        (decide_peel_simple, decide_unroll_stupid): Retrieve number of
>        branches from niter_desc instead of via function that walks loop.
>        * loop-iv.c (get_simple_loop_desc): Invoke new analyze_loop_insns
>        function, and add guards to enable this function to work for the
>        outermost loop.
>        * cfgloop.c (insn_has_fp_set, analyze_loop_insns): New functions.
>        (num_loop_branches): Remove.
>        * cfgloop.h (struct loop_desc): Added new fields to cache additional
>        loop analysis information.
>        (num_loop_branches): Remove.
>        (analyze_loop_insns): Declare.
>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>
> Index: doc/invoke.texi
> ===================================================================
> --- doc/invoke.texi     (revision 187013)
> +++ doc/invoke.texi     (working copy)
> @@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
>  @item max-unswitch-level
>  The maximum number of branches unswitched in a single loop.
>
> +@item min-iter-unroll-with-branches
> +Minimum iteration count to ignore branch effects when unrolling.
> +
> +@item unroll-outer-loop-branch-budget
> +Maximum number of branches allowed in hot outer loop region after unroll.
> +
>  @item lim-expensive
>  The minimum cost of an expensive expression in the loop invariant motion.
>
> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c       (revision 187013)
> +++ loop-unroll.c       (working copy)
> @@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
>                                             basic_block);
>  static rtx get_expansion (struct var_to_expand *);
>
> +/* Compute the maximum number of times LOOP can be unrolled without
> exceeding
> +   a branch budget, which can increase branch mispredictions. The number
> of
> +   branches is computed by weighting each branch with its expected
> execution
> +   probability through the loop based on profile data. If no profile
> feedback
> +   data exists, simply return the current NUNROLL factor.  */
> +
> +static unsigned
> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
> +{
> +  struct loop *outer;
> +  struct niter_desc *outer_desc = 0;
> +  int outer_niters = 1;
> +  int frequent_iteration_threshold;
> +  unsigned branch_budget;
> +  struct niter_desc *desc = get_simple_loop_desc (loop);
> +
> +  /* Ignore loops with FP computation as these tend to benefit much more
> +     consistently from unrolling.  */
> +  if (desc->has_fp)
> +    return nunroll;
> +
> +  frequent_iteration_threshold = PARAM_VALUE
> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
> +  if (expected_loop_iterations (loop) >= (unsigned)
> frequent_iteration_threshold)
> +    return nunroll;
> +
> +  /* If there was no profile feedback data, av_num_branches will be 0
> +     and we won't limit unrolling. If the av_num_branches is at most 1,
> +     also don't limit unrolling as the back-edge branch will not be
> duplicated.  */
> +  if (desc->av_num_branches <= 1)
> +    return nunroll;
> +
> +  /* Walk up the loop tree until we find a hot outer loop in which the
> current
> +     loop is nested. At that point we will compute the number of times the
> +     current loop can be unrolled based on the number of branches in the
> hot
> +     outer loop.  */
> +  outer = loop_outer (loop);
> +  /* The loop structure contains a fake outermost loop, so this should
> always
> +     be non-NULL for our current loop.  */
> +  gcc_assert (outer);
> +
> +  /* Walk up the loop tree until we either find a hot outer loop or hit
> the
> +     fake outermost loop at the root.  */
> +  while (true)
> +    {
> +      outer_desc = get_simple_loop_desc (outer);
> +
> +      /* Stop if we hit the fake outermost loop at the root of the tree,
> +         which includes the whole procedure.  */
> +      if (!loop_outer (outer))
> +        break;
> +
> +      if (outer_desc->const_iter)
> +        outer_niters *= outer_desc->niter;
> +      else if (outer->header->count)
> +        outer_niters *= expected_loop_iterations (outer);
> +
> +      /* If the outer loop has enough iterations to be considered hot,
> then
> +         we can stop our upwards loop tree traversal and examine the
> current
> +         outer loop.  */
> +      if (outer_niters >= frequent_iteration_threshold)
> +        break;
> +
> +      outer = loop_outer (outer);
> +    }
> +
> +  gcc_assert(outer);
> +
> +  /* Assume that any call will cause the branch budget to be exceeded,
> +     and that we can't unroll the current loop without increasing
> +     mispredicts.  */
> +  if (outer_desc->has_call)
> +    return 0;
> +
> +  /* Otherwise, compute the maximum number of times current loop can be
> +     unrolled without exceeding our branch budget. First we subtract
> +     off the outer loop's average branch count from the budget. Note
> +     that this includes the branches in the current loop. This yields
> +     the number of branches left in the budget for the unrolled copies.
> +     We divide this by the number of branches in the current loop that
> +     must be duplicated when we unroll, which is the total average
> +     number of branches minus the back-edge branch. This yields the
> +     number of new loop body copies that can be created by unrolling
> +     without exceeding the budget, to which we add 1 to get the unroll
> +     factor. Note that the "outermost loop" may be the whole procedure
> +     if we did not find a hot enough enclosing loop.  */
> +  branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
> +  if (outer_desc->av_num_branches > branch_budget)
> +    return 0;
> +  /* We already returned early if desc->av_num_branches <= 1.  */
> +  return (branch_budget - outer_desc->av_num_branches)
> +      / (desc->av_num_branches - 1) + 1;
> +}
> +
>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>  void
>  unroll_and_peel_loops (int flags)
> @@ -522,6 +615,7 @@ static void
>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>  {
>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies,
> i;
> +  unsigned nunroll_branches;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can
> increase
> +     the number of mispredicts.  */
> +  if (desc->num_branches > 1)
> +    {
> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* Check whether the loop rolls enough to consider.  */
>   if (desc->niter < 2 * nunroll)
>     {
> @@ -802,7 +911,7 @@ unroll_loop_constant_iterations (struct loop *loop
>  static void
>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>  {
> -  unsigned nunroll, nunroll_by_av, i;
> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can
> increase
> +     the number of mispredicts.  */
> +  if (desc->num_branches > 1)
> +    {
> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* If we have profile feedback, check whether the loop rolls.  */
>   if ((loop->header->count
>        && expected_loop_iterations (loop) < 2 * nunroll)
> @@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
>
>   /* Do not simply peel loops with branches inside -- it increases number
>      of mispredicts.  */
> -  if (num_loop_branches (loop) > 1)
> +  if (desc->num_branches > 1)
>     {
>       if (dump_file)
>        fprintf (dump_file, ";; Not peeling, contains branches\n");
> @@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
>
>   /* Do not unroll loops with branches inside -- it increases number
>      of mispredicts.  */
> -  if (num_loop_branches (loop) > 1)
> +  if (desc->num_branches > 1)
>     {
>       if (dump_file)
>        fprintf (dump_file, ";; Not unrolling, contains branches\n");
> Index: loop-iv.c
> ===================================================================
> --- loop-iv.c   (revision 187013)
> +++ loop-iv.c   (working copy)
> @@ -2948,8 +2948,12 @@ get_simple_loop_desc (struct loop *loop)
>   /* At least desc->infinite is not always initialized by
>      find_simple_loop_exit.  */
>   desc = XCNEW (struct niter_desc);
> -  iv_analysis_loop_init (loop);
> -  find_simple_exit (loop, desc);
> +  if (loop->latch != EXIT_BLOCK_PTR)
> +    {
> +      iv_analysis_loop_init (loop);
> +      find_simple_exit (loop, desc);
> +    }
> +  analyze_loop_insns (loop, desc);
>   loop->aux = desc;
>
>   if (desc->simple_p && (desc->assumptions || desc->infinite))
> Index: cfgloop.c
> ===================================================================
> --- cfgloop.c   (revision 187013)
> +++ cfgloop.c   (working copy)
> @@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
>   return edges;
>  }
>
> -/* Counts the number of conditional branches inside LOOP.  */
> +/* Determine if INSN is a floating point set.  */
>
> -unsigned
> -num_loop_branches (const struct loop *loop)
> +static bool
> +insn_has_fp_set(rtx insn)
>  {
> -  unsigned i, n;
> -  basic_block * body;
> +  int i;
> +  rtx pat = PATTERN(insn);
> +  if (GET_CODE (pat) == SET)
> +    return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
> +  else if (GET_CODE (pat) == PARALLEL)
> +    {
> +      for (i = 0; i < XVECLEN (pat, 0); i++)
> +        {
> +          rtx sub = XVECEXP (pat, 0, i);
> +          if (GET_CODE (sub) == SET)
> +            return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
> +        }
> +    }
> +  return false;
> +}
>
> -  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
> +/* Analyzes the instructions inside LOOP, updating the DESC. Currently
> counts
> +   the number of conditional branch instructions, calls and fp
> instructions,
> +   as well as the average number of branches executed per iteration.  */
>
> +void
> +analyze_loop_insns (const struct loop *loop, struct niter_desc *desc)
> +{
> +  unsigned i, nbranch;
> +  gcov_type weighted_nbranch;
> +  bool has_call, has_fp;
> +  basic_block * body, bb;
> +  rtx insn;
> +  gcov_type header_count = loop->header->count;
> +
> +  nbranch = weighted_nbranch = 0;
> +  has_call = has_fp = false;
> +
>   body = get_loop_body (loop);
> -  n = 0;
>   for (i = 0; i < loop->num_nodes; i++)
> -    if (EDGE_COUNT (body[i]->succs) >= 2)
> -      n++;
> +    {
> +      bb = body[i];
> +
> +      if (EDGE_COUNT (bb->succs) >= 2)
> +        {
> +          nbranch++;
> +
> +          /* If this block is executed less frequently than the header
> (loop
> +             entry), then it is weighted based on its execution count,
> which
> +             will be turned into a ratio compared to the loop header
> below. */
> +          if (bb->count < header_count)
> +            weighted_nbranch += bb->count;
> +
> +          /* When it is executed more frequently than the header (i.e. it
> is
> +             in a nested inner loop), simply weight the branch the same
> as the
> +             header execution count, so that it will contribute 1 branch
> to
> +             the ratio computed below. */
> +          else
> +            weighted_nbranch += header_count;
> +        }
> +
> +      /* No need to iterate through the instructions below if
> +         both flags have already been set.  */
> +      if (has_call && has_fp)
> +        continue;
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          if (!INSN_P (insn))
> +            continue;
> +
> +          if (!has_call)
> +            has_call = CALL_P (insn);
> +
> +          if (!has_fp)
> +            has_fp = insn_has_fp_set (insn);
> +        }
> +    }
>   free (body);
>
> -  return n;
> +  desc->num_branches = nbranch;
> +  /* Now divide the weights computed above by the loop header execution
> count,
> +     to compute the average number of branches through the loop. By adding
> +     header_count/2 to the numerator we round to nearest with integer
> +     division.  */
> +  if (header_count  != 0)
> +    desc->av_num_branches
> +        = (weighted_nbranch + header_count/2) / header_count;
> +  else
> +    desc->av_num_branches = 0;
> +  desc->has_call = has_call;
> +  desc->has_fp = has_fp;
>  }
>
>  /* Adds basic block BB to LOOP.  */
> Index: cfgloop.h
> ===================================================================
> --- cfgloop.h   (revision 187013)
> +++ cfgloop.h   (working copy)
> @@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
>
>  extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
>  edge single_exit (const struct loop *);
> -extern unsigned num_loop_branches (const struct loop *);
>
>  extern edge loop_preheader_edge (const struct loop *);
>  extern edge loop_latch_edge (const struct loop *);
> @@ -359,7 +358,8 @@ struct rtx_iv
>  };
>
>  /* The description of an exit from the loop and of the number of
> iterations
> -   till we take the exit.  */
> +   till we take the exit. Also includes other information used primarily
> +   by the loop unroller.  */
>
>  struct niter_desc
>  {
> @@ -400,6 +400,18 @@ struct niter_desc
>
>   /* The number of iterations of the loop.  */
>   rtx niter_expr;
> +
> +  /* The number of branches in the loop.  */
> +  unsigned num_branches;
> +
> +  /* The number of executed branches per iteration.  */
> +  unsigned av_num_branches;
> +
> +  /* Whether the loop contains a call instruction.  */
> +  bool has_call;
> +
> +  /* Whether the loop contains fp instructions.  */
> +  bool has_fp;
>  };
>
>  extern void iv_analysis_loop_init (struct loop *);
> @@ -413,6 +425,7 @@ extern void iv_analysis_done (void);
>
>  extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
>  extern void free_simple_loop_desc (struct loop *loop);
> +void analyze_loop_insns (const struct loop *, struct niter_desc *desc);
>
>  static inline struct niter_desc *
>  simple_loop_desc (struct loop *loop)
> Index: params.def
> ===================================================================
> --- params.def  (revision 187013)
> +++ params.def  (working copy)
> @@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>         "The maximum depth of a loop nest we completely peel",
>         8, 0, 0)
>
> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
> +       "min-iter-unroll-with-branches",
> +       "Minimum iteration count to ignore branch effects when unrolling",
> +       50, 0, 0)
> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
> +       "unroll-outer-loop-branch-budget",
> +       "Maximum number of branches allowed in hot outer loop region after
> unroll",
> +       25, 0, 0)
> +
>  /* The maximum number of insns of an unswitched loop.  */
>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>        "max-unswitch-insns",
>
> --
> This patch is available for review at
> http://codereview.appspot.com/6099055
>



-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

tejohnson

Ping? Teresa On Fri, May 4, 2012 at 3:41 PM, Teresa Johnson <tejohnson@google.com> wrote: > ...

13 years, 5 months ago (2012-05-11 13:12:00 UTC) #22

Ping?
Teresa

On Fri, May 4, 2012 at 3:41 PM, Teresa Johnson <tejohnson@google.com> wrote:
>
> On David's suggestion, I have removed the changes that rename niter_desc
> to
> loop_desc from this patch to focus the patch on the unrolling changes. I
> can
> submit a cleanup patch to do the renaming as soon as this one goes in.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Ok for trunk?
>
> Thanks,
> Teresa
>
> Here is the new description of improvements from the original patch:
>
> Improved patch based on feedback. Main changes are:
>
> 1) Improve efficiency by caching loop analysis results in the loop
> auxiliary
> info structure hanging off the loop structure. Added a new routine,
> analyze_loop_insns, to fill in information about the average and total
> number
> of branches, as well as whether there are any floating point set and call
> instructions in the loop. The new routine is invoked when we first create
> a
> loop's niter_desc struct, and the caller (get_simple_loop_desc) has been
> modified to handle creating a niter_desc for the fake outermost loop.
>
> 2) Improvements to max_unroll_with_branches:
> - Treat the fake outermost loop (the procedure body) as we would a hot
> outer
> loop, i.e. compute the max unroll looking at its nested branches, instead
> of
> shutting off unrolling when we reach the fake outermost loop.
> - Pull the checks previously done in the caller into the routine (e.g.
> whether the loop iterates frequently or contains fp instructions).
> - Fix a bug in the previous version that sometimes caused overflow in the
> new unroll factor.
>
> 3) Remove float variables, and use integer computation to compute the
> average number of branches in the loop.
>
> 4) Detect more types of floating point computations in the loop by walking
> all set instructions, not just single sets.
>
> 2012-05-04   Teresa Johnson  <tejohnson@google.com>
>
>        * doc/invoke.texi: Update the documentation with new params.
>        * loop-unroll.c (max_unroll_with_branches): New function.
>        (decide_unroll_constant_iterations,
> decide_unroll_runtime_iterations):
>        Add heuristic to avoid increasing branch mispredicts when
> unrolling.
>        (decide_peel_simple, decide_unroll_stupid): Retrieve number of
>        branches from niter_desc instead of via function that walks loop.
>        * loop-iv.c (get_simple_loop_desc): Invoke new analyze_loop_insns
>        function, and add guards to enable this function to work for the
>        outermost loop.
>        * cfgloop.c (insn_has_fp_set, analyze_loop_insns): New functions.
>        (num_loop_branches): Remove.
>        * cfgloop.h (struct loop_desc): Added new fields to cache
> additional
>        loop analysis information.
>        (num_loop_branches): Remove.
>        (analyze_loop_insns): Declare.
>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>
> Index: doc/invoke.texi
> ===================================================================
> --- doc/invoke.texi     (revision 187013)
> +++ doc/invoke.texi     (working copy)
> @@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
>  @item max-unswitch-level
>  The maximum number of branches unswitched in a single loop.
>
> +@item min-iter-unroll-with-branches
> +Minimum iteration count to ignore branch effects when unrolling.
> +
> +@item unroll-outer-loop-branch-budget
> +Maximum number of branches allowed in hot outer loop region after unroll.
> +
>  @item lim-expensive
>  The minimum cost of an expensive expression in the loop invariant motion.
>
> Index: loop-unroll.c
> ===================================================================
> --- loop-unroll.c       (revision 187013)
> +++ loop-unroll.c       (working copy)
> @@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
>                                             basic_block);
>  static rtx get_expansion (struct var_to_expand *);
>
> +/* Compute the maximum number of times LOOP can be unrolled without
> exceeding
> +   a branch budget, which can increase branch mispredictions. The number
> of
> +   branches is computed by weighting each branch with its expected
> execution
> +   probability through the loop based on profile data. If no profile
> feedback
> +   data exists, simply return the current NUNROLL factor.  */
> +
> +static unsigned
> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
> +{
> +  struct loop *outer;
> +  struct niter_desc *outer_desc = 0;
> +  int outer_niters = 1;
> +  int frequent_iteration_threshold;
> +  unsigned branch_budget;
> +  struct niter_desc *desc = get_simple_loop_desc (loop);
> +
> +  /* Ignore loops with FP computation as these tend to benefit much more
> +     consistently from unrolling.  */
> +  if (desc->has_fp)
> +    return nunroll;
> +
> +  frequent_iteration_threshold = PARAM_VALUE
> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
> +  if (expected_loop_iterations (loop) >= (unsigned)
> frequent_iteration_threshold)
> +    return nunroll;
> +
> +  /* If there was no profile feedback data, av_num_branches will be 0
> +     and we won't limit unrolling. If the av_num_branches is at most 1,
> +     also don't limit unrolling as the back-edge branch will not be
> duplicated.  */
> +  if (desc->av_num_branches <= 1)
> +    return nunroll;
> +
> +  /* Walk up the loop tree until we find a hot outer loop in which the
> current
> +     loop is nested. At that point we will compute the number of times
> the
> +     current loop can be unrolled based on the number of branches in the
> hot
> +     outer loop.  */
> +  outer = loop_outer (loop);
> +  /* The loop structure contains a fake outermost loop, so this should
> always
> +     be non-NULL for our current loop.  */
> +  gcc_assert (outer);
> +
> +  /* Walk up the loop tree until we either find a hot outer loop or hit
> the
> +     fake outermost loop at the root.  */
> +  while (true)
> +    {
> +      outer_desc = get_simple_loop_desc (outer);
> +
> +      /* Stop if we hit the fake outermost loop at the root of the tree,
> +         which includes the whole procedure.  */
> +      if (!loop_outer (outer))
> +        break;
> +
> +      if (outer_desc->const_iter)
> +        outer_niters *= outer_desc->niter;
> +      else if (outer->header->count)
> +        outer_niters *= expected_loop_iterations (outer);
> +
> +      /* If the outer loop has enough iterations to be considered hot,
> then
> +         we can stop our upwards loop tree traversal and examine the
> current
> +         outer loop.  */
> +      if (outer_niters >= frequent_iteration_threshold)
> +        break;
> +
> +      outer = loop_outer (outer);
> +    }
> +
> +  gcc_assert(outer);
> +
> +  /* Assume that any call will cause the branch budget to be exceeded,
> +     and that we can't unroll the current loop without increasing
> +     mispredicts.  */
> +  if (outer_desc->has_call)
> +    return 0;
> +
> +  /* Otherwise, compute the maximum number of times current loop can be
> +     unrolled without exceeding our branch budget. First we subtract
> +     off the outer loop's average branch count from the budget. Note
> +     that this includes the branches in the current loop. This yields
> +     the number of branches left in the budget for the unrolled copies.
> +     We divide this by the number of branches in the current loop that
> +     must be duplicated when we unroll, which is the total average
> +     number of branches minus the back-edge branch. This yields the
> +     number of new loop body copies that can be created by unrolling
> +     without exceeding the budget, to which we add 1 to get the unroll
> +     factor. Note that the "outermost loop" may be the whole procedure
> +     if we did not find a hot enough enclosing loop.  */
> +  branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
> +  if (outer_desc->av_num_branches > branch_budget)
> +    return 0;
> +  /* We already returned early if desc->av_num_branches <= 1.  */
> +  return (branch_budget - outer_desc->av_num_branches)
> +      / (desc->av_num_branches - 1) + 1;
> +}
> +
>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>  void
>  unroll_and_peel_loops (int flags)
> @@ -522,6 +615,7 @@ static void
>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>  {
>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies,
> i;
> +  unsigned nunroll_branches;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can
> increase
> +     the number of mispredicts.  */
> +  if (desc->num_branches > 1)
> +    {
> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* Check whether the loop rolls enough to consider.  */
>   if (desc->niter < 2 * nunroll)
>     {
> @@ -802,7 +911,7 @@ unroll_loop_constant_iterations (struct loop *loop
>  static void
>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>  {
> -  unsigned nunroll, nunroll_by_av, i;
> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>   struct niter_desc *desc;
>
>   if (!(flags & UAP_UNROLL))
> @@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
>       return;
>     }
>
> +  /* Be careful when unrolling loops with branches inside -- it can
> increase
> +     the number of mispredicts.  */
> +  if (desc->num_branches > 1)
> +    {
> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
> +      if (nunroll > nunroll_branches)
> +        nunroll = nunroll_branches;
> +      if (nunroll <= 1)
> +        {
> +          if (dump_file)
> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
> +          return;
> +        }
> +    }
> +
>   /* If we have profile feedback, check whether the loop rolls.  */
>   if ((loop->header->count
>        && expected_loop_iterations (loop) < 2 * nunroll)
> @@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
>
>   /* Do not simply peel loops with branches inside -- it increases number
>      of mispredicts.  */
> -  if (num_loop_branches (loop) > 1)
> +  if (desc->num_branches > 1)
>     {
>       if (dump_file)
>        fprintf (dump_file, ";; Not peeling, contains branches\n");
> @@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
>
>   /* Do not unroll loops with branches inside -- it increases number
>      of mispredicts.  */
> -  if (num_loop_branches (loop) > 1)
> +  if (desc->num_branches > 1)
>     {
>       if (dump_file)
>        fprintf (dump_file, ";; Not unrolling, contains branches\n");
> Index: loop-iv.c
> ===================================================================
> --- loop-iv.c   (revision 187013)
> +++ loop-iv.c   (working copy)
> @@ -2948,8 +2948,12 @@ get_simple_loop_desc (struct loop *loop)
>   /* At least desc->infinite is not always initialized by
>      find_simple_loop_exit.  */
>   desc = XCNEW (struct niter_desc);
> -  iv_analysis_loop_init (loop);
> -  find_simple_exit (loop, desc);
> +  if (loop->latch != EXIT_BLOCK_PTR)
> +    {
> +      iv_analysis_loop_init (loop);
> +      find_simple_exit (loop, desc);
> +    }
> +  analyze_loop_insns (loop, desc);
>   loop->aux = desc;
>
>   if (desc->simple_p && (desc->assumptions || desc->infinite))
> Index: cfgloop.c
> ===================================================================
> --- cfgloop.c   (revision 187013)
> +++ cfgloop.c   (working copy)
> @@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
>   return edges;
>  }
>
> -/* Counts the number of conditional branches inside LOOP.  */
> +/* Determine if INSN is a floating point set.  */
>
> -unsigned
> -num_loop_branches (const struct loop *loop)
> +static bool
> +insn_has_fp_set(rtx insn)
>  {
> -  unsigned i, n;
> -  basic_block * body;
> +  int i;
> +  rtx pat = PATTERN(insn);
> +  if (GET_CODE (pat) == SET)
> +    return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
> +  else if (GET_CODE (pat) == PARALLEL)
> +    {
> +      for (i = 0; i < XVECLEN (pat, 0); i++)
> +        {
> +          rtx sub = XVECEXP (pat, 0, i);
> +          if (GET_CODE (sub) == SET)
> +            return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
> +        }
> +    }
> +  return false;
> +}
>
> -  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
> +/* Analyzes the instructions inside LOOP, updating the DESC. Currently
> counts
> +   the number of conditional branch instructions, calls and fp
> instructions,
> +   as well as the average number of branches executed per iteration.  */
>
> +void
> +analyze_loop_insns (const struct loop *loop, struct niter_desc *desc)
> +{
> +  unsigned i, nbranch;
> +  gcov_type weighted_nbranch;
> +  bool has_call, has_fp;
> +  basic_block * body, bb;
> +  rtx insn;
> +  gcov_type header_count = loop->header->count;
> +
> +  nbranch = weighted_nbranch = 0;
> +  has_call = has_fp = false;
> +
>   body = get_loop_body (loop);
> -  n = 0;
>   for (i = 0; i < loop->num_nodes; i++)
> -    if (EDGE_COUNT (body[i]->succs) >= 2)
> -      n++;
> +    {
> +      bb = body[i];
> +
> +      if (EDGE_COUNT (bb->succs) >= 2)
> +        {
> +          nbranch++;
> +
> +          /* If this block is executed less frequently than the header
> (loop
> +             entry), then it is weighted based on its execution count,
> which
> +             will be turned into a ratio compared to the loop header
> below. */
> +          if (bb->count < header_count)
> +            weighted_nbranch += bb->count;
> +
> +          /* When it is executed more frequently than the header (i.e. it
> is
> +             in a nested inner loop), simply weight the branch the same
> as the
> +             header execution count, so that it will contribute 1 branch
> to
> +             the ratio computed below. */
> +          else
> +            weighted_nbranch += header_count;
> +        }
> +
> +      /* No need to iterate through the instructions below if
> +         both flags have already been set.  */
> +      if (has_call && has_fp)
> +        continue;
> +
> +      FOR_BB_INSNS (bb, insn)
> +        {
> +          if (!INSN_P (insn))
> +            continue;
> +
> +          if (!has_call)
> +            has_call = CALL_P (insn);
> +
> +          if (!has_fp)
> +            has_fp = insn_has_fp_set (insn);
> +        }
> +    }
>   free (body);
>
> -  return n;
> +  desc->num_branches = nbranch;
> +  /* Now divide the weights computed above by the loop header execution
> count,
> +     to compute the average number of branches through the loop. By
> adding
> +     header_count/2 to the numerator we round to nearest with integer
> +     division.  */
> +  if (header_count  != 0)
> +    desc->av_num_branches
> +        = (weighted_nbranch + header_count/2) / header_count;
> +  else
> +    desc->av_num_branches = 0;
> +  desc->has_call = has_call;
> +  desc->has_fp = has_fp;
>  }
>
>  /* Adds basic block BB to LOOP.  */
> Index: cfgloop.h
> ===================================================================
> --- cfgloop.h   (revision 187013)
> +++ cfgloop.h   (working copy)
> @@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
>
>  extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
>  edge single_exit (const struct loop *);
> -extern unsigned num_loop_branches (const struct loop *);
>
>  extern edge loop_preheader_edge (const struct loop *);
>  extern edge loop_latch_edge (const struct loop *);
> @@ -359,7 +358,8 @@ struct rtx_iv
>  };
>
>  /* The description of an exit from the loop and of the number of
> iterations
> -   till we take the exit.  */
> +   till we take the exit. Also includes other information used primarily
> +   by the loop unroller.  */
>
>  struct niter_desc
>  {
> @@ -400,6 +400,18 @@ struct niter_desc
>
>   /* The number of iterations of the loop.  */
>   rtx niter_expr;
> +
> +  /* The number of branches in the loop.  */
> +  unsigned num_branches;
> +
> +  /* The number of executed branches per iteration.  */
> +  unsigned av_num_branches;
> +
> +  /* Whether the loop contains a call instruction.  */
> +  bool has_call;
> +
> +  /* Whether the loop contains fp instructions.  */
> +  bool has_fp;
>  };
>
>  extern void iv_analysis_loop_init (struct loop *);
> @@ -413,6 +425,7 @@ extern void iv_analysis_done (void);
>
>  extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
>  extern void free_simple_loop_desc (struct loop *loop);
> +void analyze_loop_insns (const struct loop *, struct niter_desc *desc);
>
>  static inline struct niter_desc *
>  simple_loop_desc (struct loop *loop)
> Index: params.def
> ===================================================================
> --- params.def  (revision 187013)
> +++ params.def  (working copy)
> @@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>         "The maximum depth of a loop nest we completely peel",
>         8, 0, 0)
>
> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
> +       "min-iter-unroll-with-branches",
> +       "Minimum iteration count to ignore branch effects when unrolling",
> +       50, 0, 0)
> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
> +       "unroll-outer-loop-branch-budget",
> +       "Maximum number of branches allowed in hot outer loop region after
> unroll",
> +       25, 0, 0)
> +
>  /* The maximum number of insns of an unswitched loop.  */
>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>        "max-unswitch-insns",
>
> --
> This patch is available for review at
> http://codereview.appspot.com/6099055




--
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

tejohnson

Ping? Teresa On Fri, May 11, 2012 at 6:11 AM, Teresa Johnson <tejohnson@google.com> wrote: > ...

13 years, 5 months ago (2012-05-18 14:21:03 UTC) #23

Ping?
Teresa

On Fri, May 11, 2012 at 6:11 AM, Teresa Johnson <tejohnson@google.com> wrote:
> Ping?
> Teresa
>
> On Fri, May 4, 2012 at 3:41 PM, Teresa Johnson <tejohnson@google.com> wrote:
>>
>> On David's suggestion, I have removed the changes that rename niter_desc
>> to
>> loop_desc from this patch to focus the patch on the unrolling changes. I
>> can
>> submit a cleanup patch to do the renaming as soon as this one goes in.
>>
>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Ok for trunk?
>>
>> Thanks,
>> Teresa
>>
>> Here is the new description of improvements from the original patch:
>>
>> Improved patch based on feedback. Main changes are:
>>
>> 1) Improve efficiency by caching loop analysis results in the loop
>> auxiliary
>> info structure hanging off the loop structure. Added a new routine,
>> analyze_loop_insns, to fill in information about the average and total
>> number
>> of branches, as well as whether there are any floating point set and call
>> instructions in the loop. The new routine is invoked when we first create
>> a
>> loop's niter_desc struct, and the caller (get_simple_loop_desc) has been
>> modified to handle creating a niter_desc for the fake outermost loop.
>>
>> 2) Improvements to max_unroll_with_branches:
>> - Treat the fake outermost loop (the procedure body) as we would a hot
>> outer
>> loop, i.e. compute the max unroll looking at its nested branches, instead
>> of
>> shutting off unrolling when we reach the fake outermost loop.
>> - Pull the checks previously done in the caller into the routine (e.g.
>> whether the loop iterates frequently or contains fp instructions).
>> - Fix a bug in the previous version that sometimes caused overflow in the
>> new unroll factor.
>>
>> 3) Remove float variables, and use integer computation to compute the
>> average number of branches in the loop.
>>
>> 4) Detect more types of floating point computations in the loop by walking
>> all set instructions, not just single sets.
>>
>> 2012-05-04   Teresa Johnson  <tejohnson@google.com>
>>
>>        * doc/invoke.texi: Update the documentation with new params.
>>        * loop-unroll.c (max_unroll_with_branches): New function.
>>        (decide_unroll_constant_iterations,
>> decide_unroll_runtime_iterations):
>>        Add heuristic to avoid increasing branch mispredicts when
>> unrolling.
>>        (decide_peel_simple, decide_unroll_stupid): Retrieve number of
>>        branches from niter_desc instead of via function that walks loop.
>>        * loop-iv.c (get_simple_loop_desc): Invoke new analyze_loop_insns
>>        function, and add guards to enable this function to work for the
>>        outermost loop.
>>        * cfgloop.c (insn_has_fp_set, analyze_loop_insns): New functions.
>>        (num_loop_branches): Remove.
>>        * cfgloop.h (struct loop_desc): Added new fields to cache
>> additional
>>        loop analysis information.
>>        (num_loop_branches): Remove.
>>        (analyze_loop_insns): Declare.
>>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>>
>> Index: doc/invoke.texi
>> ===================================================================
>> --- doc/invoke.texi     (revision 187013)
>> +++ doc/invoke.texi     (working copy)
>> @@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
>>  @item max-unswitch-level
>>  The maximum number of branches unswitched in a single loop.
>>
>> +@item min-iter-unroll-with-branches
>> +Minimum iteration count to ignore branch effects when unrolling.
>> +
>> +@item unroll-outer-loop-branch-budget
>> +Maximum number of branches allowed in hot outer loop region after unroll.
>> +
>>  @item lim-expensive
>>  The minimum cost of an expensive expression in the loop invariant motion.
>>
>> Index: loop-unroll.c
>> ===================================================================
>> --- loop-unroll.c       (revision 187013)
>> +++ loop-unroll.c       (working copy)
>> @@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
>>                                             basic_block);
>>  static rtx get_expansion (struct var_to_expand *);
>>
>> +/* Compute the maximum number of times LOOP can be unrolled without
>> exceeding
>> +   a branch budget, which can increase branch mispredictions. The number
>> of
>> +   branches is computed by weighting each branch with its expected
>> execution
>> +   probability through the loop based on profile data. If no profile
>> feedback
>> +   data exists, simply return the current NUNROLL factor.  */
>> +
>> +static unsigned
>> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
>> +{
>> +  struct loop *outer;
>> +  struct niter_desc *outer_desc = 0;
>> +  int outer_niters = 1;
>> +  int frequent_iteration_threshold;
>> +  unsigned branch_budget;
>> +  struct niter_desc *desc = get_simple_loop_desc (loop);
>> +
>> +  /* Ignore loops with FP computation as these tend to benefit much more
>> +     consistently from unrolling.  */
>> +  if (desc->has_fp)
>> +    return nunroll;
>> +
>> +  frequent_iteration_threshold = PARAM_VALUE
>> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
>> +  if (expected_loop_iterations (loop) >= (unsigned)
>> frequent_iteration_threshold)
>> +    return nunroll;
>> +
>> +  /* If there was no profile feedback data, av_num_branches will be 0
>> +     and we won't limit unrolling. If the av_num_branches is at most 1,
>> +     also don't limit unrolling as the back-edge branch will not be
>> duplicated.  */
>> +  if (desc->av_num_branches <= 1)
>> +    return nunroll;
>> +
>> +  /* Walk up the loop tree until we find a hot outer loop in which the
>> current
>> +     loop is nested. At that point we will compute the number of times
>> the
>> +     current loop can be unrolled based on the number of branches in the
>> hot
>> +     outer loop.  */
>> +  outer = loop_outer (loop);
>> +  /* The loop structure contains a fake outermost loop, so this should
>> always
>> +     be non-NULL for our current loop.  */
>> +  gcc_assert (outer);
>> +
>> +  /* Walk up the loop tree until we either find a hot outer loop or hit
>> the
>> +     fake outermost loop at the root.  */
>> +  while (true)
>> +    {
>> +      outer_desc = get_simple_loop_desc (outer);
>> +
>> +      /* Stop if we hit the fake outermost loop at the root of the tree,
>> +         which includes the whole procedure.  */
>> +      if (!loop_outer (outer))
>> +        break;
>> +
>> +      if (outer_desc->const_iter)
>> +        outer_niters *= outer_desc->niter;
>> +      else if (outer->header->count)
>> +        outer_niters *= expected_loop_iterations (outer);
>> +
>> +      /* If the outer loop has enough iterations to be considered hot,
>> then
>> +         we can stop our upwards loop tree traversal and examine the
>> current
>> +         outer loop.  */
>> +      if (outer_niters >= frequent_iteration_threshold)
>> +        break;
>> +
>> +      outer = loop_outer (outer);
>> +    }
>> +
>> +  gcc_assert(outer);
>> +
>> +  /* Assume that any call will cause the branch budget to be exceeded,
>> +     and that we can't unroll the current loop without increasing
>> +     mispredicts.  */
>> +  if (outer_desc->has_call)
>> +    return 0;
>> +
>> +  /* Otherwise, compute the maximum number of times current loop can be
>> +     unrolled without exceeding our branch budget. First we subtract
>> +     off the outer loop's average branch count from the budget. Note
>> +     that this includes the branches in the current loop. This yields
>> +     the number of branches left in the budget for the unrolled copies.
>> +     We divide this by the number of branches in the current loop that
>> +     must be duplicated when we unroll, which is the total average
>> +     number of branches minus the back-edge branch. This yields the
>> +     number of new loop body copies that can be created by unrolling
>> +     without exceeding the budget, to which we add 1 to get the unroll
>> +     factor. Note that the "outermost loop" may be the whole procedure
>> +     if we did not find a hot enough enclosing loop.  */
>> +  branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
>> +  if (outer_desc->av_num_branches > branch_budget)
>> +    return 0;
>> +  /* We already returned early if desc->av_num_branches <= 1.  */
>> +  return (branch_budget - outer_desc->av_num_branches)
>> +      / (desc->av_num_branches - 1) + 1;
>> +}
>> +
>>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>>  void
>>  unroll_and_peel_loops (int flags)
>> @@ -522,6 +615,7 @@ static void
>>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>>  {
>>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies,
>> i;
>> +  unsigned nunroll_branches;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can
>> increase
>> +     the number of mispredicts.  */
>> +  if (desc->num_branches > 1)
>> +    {
>> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* Check whether the loop rolls enough to consider.  */
>>   if (desc->niter < 2 * nunroll)
>>     {
>> @@ -802,7 +911,7 @@ unroll_loop_constant_iterations (struct loop *loop
>>  static void
>>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>>  {
>> -  unsigned nunroll, nunroll_by_av, i;
>> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>>   struct niter_desc *desc;
>>
>>   if (!(flags & UAP_UNROLL))
>> @@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
>>       return;
>>     }
>>
>> +  /* Be careful when unrolling loops with branches inside -- it can
>> increase
>> +     the number of mispredicts.  */
>> +  if (desc->num_branches > 1)
>> +    {
>> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
>> +      if (nunroll > nunroll_branches)
>> +        nunroll = nunroll_branches;
>> +      if (nunroll <= 1)
>> +        {
>> +          if (dump_file)
>> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> +          return;
>> +        }
>> +    }
>> +
>>   /* If we have profile feedback, check whether the loop rolls.  */
>>   if ((loop->header->count
>>        && expected_loop_iterations (loop) < 2 * nunroll)
>> @@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
>>
>>   /* Do not simply peel loops with branches inside -- it increases number
>>      of mispredicts.  */
>> -  if (num_loop_branches (loop) > 1)
>> +  if (desc->num_branches > 1)
>>     {
>>       if (dump_file)
>>        fprintf (dump_file, ";; Not peeling, contains branches\n");
>> @@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
>>
>>   /* Do not unroll loops with branches inside -- it increases number
>>      of mispredicts.  */
>> -  if (num_loop_branches (loop) > 1)
>> +  if (desc->num_branches > 1)
>>     {
>>       if (dump_file)
>>        fprintf (dump_file, ";; Not unrolling, contains branches\n");
>> Index: loop-iv.c
>> ===================================================================
>> --- loop-iv.c   (revision 187013)
>> +++ loop-iv.c   (working copy)
>> @@ -2948,8 +2948,12 @@ get_simple_loop_desc (struct loop *loop)
>>   /* At least desc->infinite is not always initialized by
>>      find_simple_loop_exit.  */
>>   desc = XCNEW (struct niter_desc);
>> -  iv_analysis_loop_init (loop);
>> -  find_simple_exit (loop, desc);
>> +  if (loop->latch != EXIT_BLOCK_PTR)
>> +    {
>> +      iv_analysis_loop_init (loop);
>> +      find_simple_exit (loop, desc);
>> +    }
>> +  analyze_loop_insns (loop, desc);
>>   loop->aux = desc;
>>
>>   if (desc->simple_p && (desc->assumptions || desc->infinite))
>> Index: cfgloop.c
>> ===================================================================
>> --- cfgloop.c   (revision 187013)
>> +++ cfgloop.c   (working copy)
>> @@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
>>   return edges;
>>  }
>>
>> -/* Counts the number of conditional branches inside LOOP.  */
>> +/* Determine if INSN is a floating point set.  */
>>
>> -unsigned
>> -num_loop_branches (const struct loop *loop)
>> +static bool
>> +insn_has_fp_set(rtx insn)
>>  {
>> -  unsigned i, n;
>> -  basic_block * body;
>> +  int i;
>> +  rtx pat = PATTERN(insn);
>> +  if (GET_CODE (pat) == SET)
>> +    return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
>> +  else if (GET_CODE (pat) == PARALLEL)
>> +    {
>> +      for (i = 0; i < XVECLEN (pat, 0); i++)
>> +        {
>> +          rtx sub = XVECEXP (pat, 0, i);
>> +          if (GET_CODE (sub) == SET)
>> +            return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
>> +        }
>> +    }
>> +  return false;
>> +}
>>
>> -  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
>> +/* Analyzes the instructions inside LOOP, updating the DESC. Currently
>> counts
>> +   the number of conditional branch instructions, calls and fp
>> instructions,
>> +   as well as the average number of branches executed per iteration.  */
>>
>> +void
>> +analyze_loop_insns (const struct loop *loop, struct niter_desc *desc)
>> +{
>> +  unsigned i, nbranch;
>> +  gcov_type weighted_nbranch;
>> +  bool has_call, has_fp;
>> +  basic_block * body, bb;
>> +  rtx insn;
>> +  gcov_type header_count = loop->header->count;
>> +
>> +  nbranch = weighted_nbranch = 0;
>> +  has_call = has_fp = false;
>> +
>>   body = get_loop_body (loop);
>> -  n = 0;
>>   for (i = 0; i < loop->num_nodes; i++)
>> -    if (EDGE_COUNT (body[i]->succs) >= 2)
>> -      n++;
>> +    {
>> +      bb = body[i];
>> +
>> +      if (EDGE_COUNT (bb->succs) >= 2)
>> +        {
>> +          nbranch++;
>> +
>> +          /* If this block is executed less frequently than the header
>> (loop
>> +             entry), then it is weighted based on its execution count,
>> which
>> +             will be turned into a ratio compared to the loop header
>> below. */
>> +          if (bb->count < header_count)
>> +            weighted_nbranch += bb->count;
>> +
>> +          /* When it is executed more frequently than the header (i.e. it
>> is
>> +             in a nested inner loop), simply weight the branch the same
>> as the
>> +             header execution count, so that it will contribute 1 branch
>> to
>> +             the ratio computed below. */
>> +          else
>> +            weighted_nbranch += header_count;
>> +        }
>> +
>> +      /* No need to iterate through the instructions below if
>> +         both flags have already been set.  */
>> +      if (has_call && has_fp)
>> +        continue;
>> +
>> +      FOR_BB_INSNS (bb, insn)
>> +        {
>> +          if (!INSN_P (insn))
>> +            continue;
>> +
>> +          if (!has_call)
>> +            has_call = CALL_P (insn);
>> +
>> +          if (!has_fp)
>> +            has_fp = insn_has_fp_set (insn);
>> +        }
>> +    }
>>   free (body);
>>
>> -  return n;
>> +  desc->num_branches = nbranch;
>> +  /* Now divide the weights computed above by the loop header execution
>> count,
>> +     to compute the average number of branches through the loop. By
>> adding
>> +     header_count/2 to the numerator we round to nearest with integer
>> +     division.  */
>> +  if (header_count  != 0)
>> +    desc->av_num_branches
>> +        = (weighted_nbranch + header_count/2) / header_count;
>> +  else
>> +    desc->av_num_branches = 0;
>> +  desc->has_call = has_call;
>> +  desc->has_fp = has_fp;
>>  }
>>
>>  /* Adds basic block BB to LOOP.  */
>> Index: cfgloop.h
>> ===================================================================
>> --- cfgloop.h   (revision 187013)
>> +++ cfgloop.h   (working copy)
>> @@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
>>
>>  extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
>>  edge single_exit (const struct loop *);
>> -extern unsigned num_loop_branches (const struct loop *);
>>
>>  extern edge loop_preheader_edge (const struct loop *);
>>  extern edge loop_latch_edge (const struct loop *);
>> @@ -359,7 +358,8 @@ struct rtx_iv
>>  };
>>
>>  /* The description of an exit from the loop and of the number of
>> iterations
>> -   till we take the exit.  */
>> +   till we take the exit. Also includes other information used primarily
>> +   by the loop unroller.  */
>>
>>  struct niter_desc
>>  {
>> @@ -400,6 +400,18 @@ struct niter_desc
>>
>>   /* The number of iterations of the loop.  */
>>   rtx niter_expr;
>> +
>> +  /* The number of branches in the loop.  */
>> +  unsigned num_branches;
>> +
>> +  /* The number of executed branches per iteration.  */
>> +  unsigned av_num_branches;
>> +
>> +  /* Whether the loop contains a call instruction.  */
>> +  bool has_call;
>> +
>> +  /* Whether the loop contains fp instructions.  */
>> +  bool has_fp;
>>  };
>>
>>  extern void iv_analysis_loop_init (struct loop *);
>> @@ -413,6 +425,7 @@ extern void iv_analysis_done (void);
>>
>>  extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
>>  extern void free_simple_loop_desc (struct loop *loop);
>> +void analyze_loop_insns (const struct loop *, struct niter_desc *desc);
>>
>>  static inline struct niter_desc *
>>  simple_loop_desc (struct loop *loop)
>> Index: params.def
>> ===================================================================
>> --- params.def  (revision 187013)
>> +++ params.def  (working copy)
>> @@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>>         "The maximum depth of a loop nest we completely peel",
>>         8, 0, 0)
>>
>> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
>> +       "min-iter-unroll-with-branches",
>> +       "Minimum iteration count to ignore branch effects when unrolling",
>> +       50, 0, 0)
>> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
>> +       "unroll-outer-loop-branch-budget",
>> +       "Maximum number of branches allowed in hot outer loop region after
>> unroll",
>> +       25, 0, 0)
>> +
>>  /* The maximum number of insns of an unswitched loop.  */
>>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>>        "max-unswitch-insns",
>>
>> --
>> This patch is available for review at
>> http://codereview.appspot.com/6099055
>
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413



-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

tejohnson

13 years, 4 months ago (2012-06-20 01:49:04 UTC) #24

Ping.
Teresa

On Fri, May 18, 2012 at 7:21 AM, Teresa Johnson <tejohnson@google.com> wrote:
> Ping?
> Teresa
>
> On Fri, May 11, 2012 at 6:11 AM, Teresa Johnson <tejohnson@google.com> wrote:
>> Ping?
>> Teresa
>>
>> On Fri, May 4, 2012 at 3:41 PM, Teresa Johnson <tejohnson@google.com> wrote:
>>>
>>> On David's suggestion, I have removed the changes that rename niter_desc
>>> to
>>> loop_desc from this patch to focus the patch on the unrolling changes. I
>>> can
>>> submit a cleanup patch to do the renaming as soon as this one goes in.
>>>
>>> Bootstrapped and tested on x86_64-unknown-linux-gnu.  Ok for trunk?
>>>
>>> Thanks,
>>> Teresa
>>>
>>> Here is the new description of improvements from the original patch:
>>>
>>> Improved patch based on feedback. Main changes are:
>>>
>>> 1) Improve efficiency by caching loop analysis results in the loop
>>> auxiliary
>>> info structure hanging off the loop structure. Added a new routine,
>>> analyze_loop_insns, to fill in information about the average and total
>>> number
>>> of branches, as well as whether there are any floating point set and call
>>> instructions in the loop. The new routine is invoked when we first create
>>> a
>>> loop's niter_desc struct, and the caller (get_simple_loop_desc) has been
>>> modified to handle creating a niter_desc for the fake outermost loop.
>>>
>>> 2) Improvements to max_unroll_with_branches:
>>> - Treat the fake outermost loop (the procedure body) as we would a hot
>>> outer
>>> loop, i.e. compute the max unroll looking at its nested branches, instead
>>> of
>>> shutting off unrolling when we reach the fake outermost loop.
>>> - Pull the checks previously done in the caller into the routine (e.g.
>>> whether the loop iterates frequently or contains fp instructions).
>>> - Fix a bug in the previous version that sometimes caused overflow in the
>>> new unroll factor.
>>>
>>> 3) Remove float variables, and use integer computation to compute the
>>> average number of branches in the loop.
>>>
>>> 4) Detect more types of floating point computations in the loop by walking
>>> all set instructions, not just single sets.
>>>
>>> 2012-05-04   Teresa Johnson  <tejohnson@google.com>
>>>
>>>        * doc/invoke.texi: Update the documentation with new params.
>>>        * loop-unroll.c (max_unroll_with_branches): New function.
>>>        (decide_unroll_constant_iterations,
>>> decide_unroll_runtime_iterations):
>>>        Add heuristic to avoid increasing branch mispredicts when
>>> unrolling.
>>>        (decide_peel_simple, decide_unroll_stupid): Retrieve number of
>>>        branches from niter_desc instead of via function that walks loop.
>>>        * loop-iv.c (get_simple_loop_desc): Invoke new analyze_loop_insns
>>>        function, and add guards to enable this function to work for the
>>>        outermost loop.
>>>        * cfgloop.c (insn_has_fp_set, analyze_loop_insns): New functions.
>>>        (num_loop_branches): Remove.
>>>        * cfgloop.h (struct loop_desc): Added new fields to cache
>>> additional
>>>        loop analysis information.
>>>        (num_loop_branches): Remove.
>>>        (analyze_loop_insns): Declare.
>>>        * params.def (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES): New param.
>>>        (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET): Ditto.
>>>
>>> Index: doc/invoke.texi
>>> ===================================================================
>>> --- doc/invoke.texi     (revision 187013)
>>> +++ doc/invoke.texi     (working copy)
>>> @@ -8842,6 +8842,12 @@ The maximum number of insns of an unswitched loop.
>>>  @item max-unswitch-level
>>>  The maximum number of branches unswitched in a single loop.
>>>
>>> +@item min-iter-unroll-with-branches
>>> +Minimum iteration count to ignore branch effects when unrolling.
>>> +
>>> +@item unroll-outer-loop-branch-budget
>>> +Maximum number of branches allowed in hot outer loop region after unroll.
>>> +
>>>  @item lim-expensive
>>>  The minimum cost of an expensive expression in the loop invariant motion.
>>>
>>> Index: loop-unroll.c
>>> ===================================================================
>>> --- loop-unroll.c       (revision 187013)
>>> +++ loop-unroll.c       (working copy)
>>> @@ -152,6 +152,99 @@ static void combine_var_copies_in_loop_exit (struc
>>>                                             basic_block);
>>>  static rtx get_expansion (struct var_to_expand *);
>>>
>>> +/* Compute the maximum number of times LOOP can be unrolled without
>>> exceeding
>>> +   a branch budget, which can increase branch mispredictions. The number
>>> of
>>> +   branches is computed by weighting each branch with its expected
>>> execution
>>> +   probability through the loop based on profile data. If no profile
>>> feedback
>>> +   data exists, simply return the current NUNROLL factor.  */
>>> +
>>> +static unsigned
>>> +max_unroll_with_branches(struct loop *loop, unsigned nunroll)
>>> +{
>>> +  struct loop *outer;
>>> +  struct niter_desc *outer_desc = 0;
>>> +  int outer_niters = 1;
>>> +  int frequent_iteration_threshold;
>>> +  unsigned branch_budget;
>>> +  struct niter_desc *desc = get_simple_loop_desc (loop);
>>> +
>>> +  /* Ignore loops with FP computation as these tend to benefit much more
>>> +     consistently from unrolling.  */
>>> +  if (desc->has_fp)
>>> +    return nunroll;
>>> +
>>> +  frequent_iteration_threshold = PARAM_VALUE
>>> (PARAM_MIN_ITER_UNROLL_WITH_BRANCHES);
>>> +  if (expected_loop_iterations (loop) >= (unsigned)
>>> frequent_iteration_threshold)
>>> +    return nunroll;
>>> +
>>> +  /* If there was no profile feedback data, av_num_branches will be 0
>>> +     and we won't limit unrolling. If the av_num_branches is at most 1,
>>> +     also don't limit unrolling as the back-edge branch will not be
>>> duplicated.  */
>>> +  if (desc->av_num_branches <= 1)
>>> +    return nunroll;
>>> +
>>> +  /* Walk up the loop tree until we find a hot outer loop in which the
>>> current
>>> +     loop is nested. At that point we will compute the number of times
>>> the
>>> +     current loop can be unrolled based on the number of branches in the
>>> hot
>>> +     outer loop.  */
>>> +  outer = loop_outer (loop);
>>> +  /* The loop structure contains a fake outermost loop, so this should
>>> always
>>> +     be non-NULL for our current loop.  */
>>> +  gcc_assert (outer);
>>> +
>>> +  /* Walk up the loop tree until we either find a hot outer loop or hit
>>> the
>>> +     fake outermost loop at the root.  */
>>> +  while (true)
>>> +    {
>>> +      outer_desc = get_simple_loop_desc (outer);
>>> +
>>> +      /* Stop if we hit the fake outermost loop at the root of the tree,
>>> +         which includes the whole procedure.  */
>>> +      if (!loop_outer (outer))
>>> +        break;
>>> +
>>> +      if (outer_desc->const_iter)
>>> +        outer_niters *= outer_desc->niter;
>>> +      else if (outer->header->count)
>>> +        outer_niters *= expected_loop_iterations (outer);
>>> +
>>> +      /* If the outer loop has enough iterations to be considered hot,
>>> then
>>> +         we can stop our upwards loop tree traversal and examine the
>>> current
>>> +         outer loop.  */
>>> +      if (outer_niters >= frequent_iteration_threshold)
>>> +        break;
>>> +
>>> +      outer = loop_outer (outer);
>>> +    }
>>> +
>>> +  gcc_assert(outer);
>>> +
>>> +  /* Assume that any call will cause the branch budget to be exceeded,
>>> +     and that we can't unroll the current loop without increasing
>>> +     mispredicts.  */
>>> +  if (outer_desc->has_call)
>>> +    return 0;
>>> +
>>> +  /* Otherwise, compute the maximum number of times current loop can be
>>> +     unrolled without exceeding our branch budget. First we subtract
>>> +     off the outer loop's average branch count from the budget. Note
>>> +     that this includes the branches in the current loop. This yields
>>> +     the number of branches left in the budget for the unrolled copies.
>>> +     We divide this by the number of branches in the current loop that
>>> +     must be duplicated when we unroll, which is the total average
>>> +     number of branches minus the back-edge branch. This yields the
>>> +     number of new loop body copies that can be created by unrolling
>>> +     without exceeding the budget, to which we add 1 to get the unroll
>>> +     factor. Note that the "outermost loop" may be the whole procedure
>>> +     if we did not find a hot enough enclosing loop.  */
>>> +  branch_budget = PARAM_VALUE (PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET);
>>> +  if (outer_desc->av_num_branches > branch_budget)
>>> +    return 0;
>>> +  /* We already returned early if desc->av_num_branches <= 1.  */
>>> +  return (branch_budget - outer_desc->av_num_branches)
>>> +      / (desc->av_num_branches - 1) + 1;
>>> +}
>>> +
>>>  /* Unroll and/or peel (depending on FLAGS) LOOPS.  */
>>>  void
>>>  unroll_and_peel_loops (int flags)
>>> @@ -522,6 +615,7 @@ static void
>>>  decide_unroll_constant_iterations (struct loop *loop, int flags)
>>>  {
>>>   unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies,
>>> i;
>>> +  unsigned nunroll_branches;
>>>   struct niter_desc *desc;
>>>
>>>   if (!(flags & UAP_UNROLL))
>>> @@ -565,6 +659,21 @@ decide_unroll_constant_iterations (struct loop *lo
>>>       return;
>>>     }
>>>
>>> +  /* Be careful when unrolling loops with branches inside -- it can
>>> increase
>>> +     the number of mispredicts.  */
>>> +  if (desc->num_branches > 1)
>>> +    {
>>> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
>>> +      if (nunroll > nunroll_branches)
>>> +        nunroll = nunroll_branches;
>>> +      if (nunroll <= 1)
>>> +        {
>>> +          if (dump_file)
>>> +           fprintf (dump_file, ";; Not unrolling, contains branches\n");
>>> +          return;
>>> +        }
>>> +    }
>>> +
>>>   /* Check whether the loop rolls enough to consider.  */
>>>   if (desc->niter < 2 * nunroll)
>>>     {
>>> @@ -802,7 +911,7 @@ unroll_loop_constant_iterations (struct loop *loop
>>>  static void
>>>  decide_unroll_runtime_iterations (struct loop *loop, int flags)
>>>  {
>>> -  unsigned nunroll, nunroll_by_av, i;
>>> +  unsigned nunroll, nunroll_by_av, nunroll_branches, i;
>>>   struct niter_desc *desc;
>>>
>>>   if (!(flags & UAP_UNROLL))
>>> @@ -856,6 +965,21 @@ decide_unroll_runtime_iterations (struct loop *loo
>>>       return;
>>>     }
>>>
>>> +  /* Be careful when unrolling loops with branches inside -- it can
>>> increase
>>> +     the number of mispredicts.  */
>>> +  if (desc->num_branches > 1)
>>> +    {
>>> +      nunroll_branches = max_unroll_with_branches (loop, nunroll);
>>> +      if (nunroll > nunroll_branches)
>>> +        nunroll = nunroll_branches;
>>> +      if (nunroll <= 1)
>>> +        {
>>> +          if (dump_file)
>>> +            fprintf (dump_file, ";; Not unrolling, contains branches\n");
>>> +          return;
>>> +        }
>>> +    }
>>> +
>>>   /* If we have profile feedback, check whether the loop rolls.  */
>>>   if ((loop->header->count
>>>        && expected_loop_iterations (loop) < 2 * nunroll)
>>> @@ -1233,7 +1357,7 @@ decide_peel_simple (struct loop *loop, int flags)
>>>
>>>   /* Do not simply peel loops with branches inside -- it increases number
>>>      of mispredicts.  */
>>> -  if (num_loop_branches (loop) > 1)
>>> +  if (desc->num_branches > 1)
>>>     {
>>>       if (dump_file)
>>>        fprintf (dump_file, ";; Not peeling, contains branches\n");
>>> @@ -1394,7 +1518,7 @@ decide_unroll_stupid (struct loop *loop, int flags
>>>
>>>   /* Do not unroll loops with branches inside -- it increases number
>>>      of mispredicts.  */
>>> -  if (num_loop_branches (loop) > 1)
>>> +  if (desc->num_branches > 1)
>>>     {
>>>       if (dump_file)
>>>        fprintf (dump_file, ";; Not unrolling, contains branches\n");
>>> Index: loop-iv.c
>>> ===================================================================
>>> --- loop-iv.c   (revision 187013)
>>> +++ loop-iv.c   (working copy)
>>> @@ -2948,8 +2948,12 @@ get_simple_loop_desc (struct loop *loop)
>>>   /* At least desc->infinite is not always initialized by
>>>      find_simple_loop_exit.  */
>>>   desc = XCNEW (struct niter_desc);
>>> -  iv_analysis_loop_init (loop);
>>> -  find_simple_exit (loop, desc);
>>> +  if (loop->latch != EXIT_BLOCK_PTR)
>>> +    {
>>> +      iv_analysis_loop_init (loop);
>>> +      find_simple_exit (loop, desc);
>>> +    }
>>> +  analyze_loop_insns (loop, desc);
>>>   loop->aux = desc;
>>>
>>>   if (desc->simple_p && (desc->assumptions || desc->infinite))
>>> Index: cfgloop.c
>>> ===================================================================
>>> --- cfgloop.c   (revision 187013)
>>> +++ cfgloop.c   (working copy)
>>> @@ -1156,24 +1156,98 @@ get_loop_exit_edges (const struct loop *loop)
>>>   return edges;
>>>  }
>>>
>>> -/* Counts the number of conditional branches inside LOOP.  */
>>> +/* Determine if INSN is a floating point set.  */
>>>
>>> -unsigned
>>> -num_loop_branches (const struct loop *loop)
>>> +static bool
>>> +insn_has_fp_set(rtx insn)
>>>  {
>>> -  unsigned i, n;
>>> -  basic_block * body;
>>> +  int i;
>>> +  rtx pat = PATTERN(insn);
>>> +  if (GET_CODE (pat) == SET)
>>> +    return (FLOAT_MODE_P (GET_MODE (SET_DEST (pat))));
>>> +  else if (GET_CODE (pat) == PARALLEL)
>>> +    {
>>> +      for (i = 0; i < XVECLEN (pat, 0); i++)
>>> +        {
>>> +          rtx sub = XVECEXP (pat, 0, i);
>>> +          if (GET_CODE (sub) == SET)
>>> +            return (FLOAT_MODE_P (GET_MODE (SET_DEST (sub))));
>>> +        }
>>> +    }
>>> +  return false;
>>> +}
>>>
>>> -  gcc_assert (loop->latch != EXIT_BLOCK_PTR);
>>> +/* Analyzes the instructions inside LOOP, updating the DESC. Currently
>>> counts
>>> +   the number of conditional branch instructions, calls and fp
>>> instructions,
>>> +   as well as the average number of branches executed per iteration.  */
>>>
>>> +void
>>> +analyze_loop_insns (const struct loop *loop, struct niter_desc *desc)
>>> +{
>>> +  unsigned i, nbranch;
>>> +  gcov_type weighted_nbranch;
>>> +  bool has_call, has_fp;
>>> +  basic_block * body, bb;
>>> +  rtx insn;
>>> +  gcov_type header_count = loop->header->count;
>>> +
>>> +  nbranch = weighted_nbranch = 0;
>>> +  has_call = has_fp = false;
>>> +
>>>   body = get_loop_body (loop);
>>> -  n = 0;
>>>   for (i = 0; i < loop->num_nodes; i++)
>>> -    if (EDGE_COUNT (body[i]->succs) >= 2)
>>> -      n++;
>>> +    {
>>> +      bb = body[i];
>>> +
>>> +      if (EDGE_COUNT (bb->succs) >= 2)
>>> +        {
>>> +          nbranch++;
>>> +
>>> +          /* If this block is executed less frequently than the header
>>> (loop
>>> +             entry), then it is weighted based on its execution count,
>>> which
>>> +             will be turned into a ratio compared to the loop header
>>> below. */
>>> +          if (bb->count < header_count)
>>> +            weighted_nbranch += bb->count;
>>> +
>>> +          /* When it is executed more frequently than the header (i.e. it
>>> is
>>> +             in a nested inner loop), simply weight the branch the same
>>> as the
>>> +             header execution count, so that it will contribute 1 branch
>>> to
>>> +             the ratio computed below. */
>>> +          else
>>> +            weighted_nbranch += header_count;
>>> +        }
>>> +
>>> +      /* No need to iterate through the instructions below if
>>> +         both flags have already been set.  */
>>> +      if (has_call && has_fp)
>>> +        continue;
>>> +
>>> +      FOR_BB_INSNS (bb, insn)
>>> +        {
>>> +          if (!INSN_P (insn))
>>> +            continue;
>>> +
>>> +          if (!has_call)
>>> +            has_call = CALL_P (insn);
>>> +
>>> +          if (!has_fp)
>>> +            has_fp = insn_has_fp_set (insn);
>>> +        }
>>> +    }
>>>   free (body);
>>>
>>> -  return n;
>>> +  desc->num_branches = nbranch;
>>> +  /* Now divide the weights computed above by the loop header execution
>>> count,
>>> +     to compute the average number of branches through the loop. By
>>> adding
>>> +     header_count/2 to the numerator we round to nearest with integer
>>> +     division.  */
>>> +  if (header_count  != 0)
>>> +    desc->av_num_branches
>>> +        = (weighted_nbranch + header_count/2) / header_count;
>>> +  else
>>> +    desc->av_num_branches = 0;
>>> +  desc->has_call = has_call;
>>> +  desc->has_fp = has_fp;
>>>  }
>>>
>>>  /* Adds basic block BB to LOOP.  */
>>> Index: cfgloop.h
>>> ===================================================================
>>> --- cfgloop.h   (revision 187013)
>>> +++ cfgloop.h   (working copy)
>>> @@ -249,7 +249,6 @@ extern basic_block *get_loop_body_in_custom_order
>>>
>>>  extern VEC (edge, heap) *get_loop_exit_edges (const struct loop *);
>>>  edge single_exit (const struct loop *);
>>> -extern unsigned num_loop_branches (const struct loop *);
>>>
>>>  extern edge loop_preheader_edge (const struct loop *);
>>>  extern edge loop_latch_edge (const struct loop *);
>>> @@ -359,7 +358,8 @@ struct rtx_iv
>>>  };
>>>
>>>  /* The description of an exit from the loop and of the number of
>>> iterations
>>> -   till we take the exit.  */
>>> +   till we take the exit. Also includes other information used primarily
>>> +   by the loop unroller.  */
>>>
>>>  struct niter_desc
>>>  {
>>> @@ -400,6 +400,18 @@ struct niter_desc
>>>
>>>   /* The number of iterations of the loop.  */
>>>   rtx niter_expr;
>>> +
>>> +  /* The number of branches in the loop.  */
>>> +  unsigned num_branches;
>>> +
>>> +  /* The number of executed branches per iteration.  */
>>> +  unsigned av_num_branches;
>>> +
>>> +  /* Whether the loop contains a call instruction.  */
>>> +  bool has_call;
>>> +
>>> +  /* Whether the loop contains fp instructions.  */
>>> +  bool has_fp;
>>>  };
>>>
>>>  extern void iv_analysis_loop_init (struct loop *);
>>> @@ -413,6 +425,7 @@ extern void iv_analysis_done (void);
>>>
>>>  extern struct niter_desc *get_simple_loop_desc (struct loop *loop);
>>>  extern void free_simple_loop_desc (struct loop *loop);
>>> +void analyze_loop_insns (const struct loop *, struct niter_desc *desc);
>>>
>>>  static inline struct niter_desc *
>>>  simple_loop_desc (struct loop *loop)
>>> Index: params.def
>>> ===================================================================
>>> --- params.def  (revision 187013)
>>> +++ params.def  (working copy)
>>> @@ -312,6 +312,15 @@ DEFPARAM(PARAM_MAX_UNROLL_ITERATIONS,
>>>         "The maximum depth of a loop nest we completely peel",
>>>         8, 0, 0)
>>>
>>> +DEFPARAM(PARAM_MIN_ITER_UNROLL_WITH_BRANCHES,
>>> +       "min-iter-unroll-with-branches",
>>> +       "Minimum iteration count to ignore branch effects when unrolling",
>>> +       50, 0, 0)
>>> +DEFPARAM(PARAM_UNROLL_OUTER_LOOP_BRANCH_BUDGET,
>>> +       "unroll-outer-loop-branch-budget",
>>> +       "Maximum number of branches allowed in hot outer loop region after
>>> unroll",
>>> +       25, 0, 0)
>>> +
>>>  /* The maximum number of insns of an unswitched loop.  */
>>>  DEFPARAM(PARAM_MAX_UNSWITCH_INSNS,
>>>        "max-unswitch-insns",
>>>
>>> --
>>> This patch is available for review at
>>> http://codereview.appspot.com/6099055
>>
>>
>>
>>
>> --
>> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
>
>
>
> --
> Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413



-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413

Expand All Messages | Collapse All Messages