Rietveld Code Review Tool

Issue 5865043: [google][4.6][i386]Support autocloning for corei7 with -mvarch= option to remove LCP stalls in loops

Created:
12 years, 1 month ago by Sriraman
Modified:
12 years, 1 month ago
Reviewers:
tejohnson, teresa
CC:
davidxl, gcc-patches_gcc.gnu.org
Base URL:
svn+ssh://gcc.gnu.org/svn/gcc/branches/google/gcc-4_6/gcc/
Visibility:
Public.

Patch Set 1

Patch stats: +199 lines, -9 lines
M config/i386/i386.c     4 chunks   +195 lines, -8 lines
M mversn-dispatch.c      2 chunks   +3 lines, -0 lines
M params.def             1 chunk    +1 line, -1 line

Messages

Total messages: 2
Sriraman
This patch adds support to version for corei7 with -mvarch option. The versioning supported is ...
12 years, 1 month ago (2012-03-20 21:04:58 UTC) #1
tejohnson
12 years, 1 month ago (2012-03-20 22:59:39 UTC) #2
On Tue, Mar 20, 2012 at 2:04 PM, Sriraman Tallam <tmsriram@google.com> wrote:
> This patch adds support to version for corei7 with the -mvarch option. The
> versioning supported is for the case where a loop generates an LCP-stalling
> instruction on corei7. In such cases, limiting the unroll factor to try to
> keep the unrolled loop body small enough to fit in corei7's loop stream
> detector can hide LCP stalls in loops. With -mvarch, the function containing
> the loop is multi-versioned and one version is tagged with "tune=corei7" so
> that the unroll factor can be limited on that version.
>
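For illustration, here is a minimal hypothetical C loop of the kind the new
find_himode_assigns check below is meant to catch; the function and its name
are invented for this note, not taken from the patch:

  /* Hypothetical example: every iteration stores an integer constant into
     16-bit (HImode) memory, i.e. a GIMPLE_ASSIGN whose lhs has HImode and
     whose rhs is an INTEGER_CST.  On corei7 this becomes
     "mov word ptr [mem], imm16", which carries a length-changing 0x66
     prefix and is prone to LCP stalls.  */
  void
  fill16 (short *a, int n)
  {
    int i;

    for (i = 0; i < n; i++)
      a[i] = 7;
  }

With -funroll-loops and -mvarch=corei7, the patch would presumably clone the
containing function and tag the corei7 clone with "tune=corei7" so its unroll
factor can be capped to keep the unrolled body within the loop stream detector.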
> Please see http://gcc.gnu.org/ml/gcc-patches/2011-12/msg01230.html for the
> discussion of the -mvarch option.
> Please see http://gcc.gnu.org/ml/gcc-patches/2011-12/msg00123.html for the
> discussion of LCP stalls on corei7.
>
>
> The autocloning framework is only available on the google/gcc-4_6 branch. I am
> working on porting it to trunk.
>
>        * config/i386/i386.c (find_himode_assigns): New function.
>        (mversionable_for_core2_p): Add new param version_number.
>        (mversionable_for_corei7_p): New function.
>        (ix86_mversion_function): Check for corei7 versioning.
>        * params.def (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING): Bump
>        allowed limit to 5000.
>        *  mversn-dispatch.c (do_auto_clone): Reverse fn_ver_addr_chain.
>
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c  (revision 185514)
> +++ config/i386/i386.c  (working copy)
> @@ -26507,6 +26507,132 @@ any_loops_vectorizable_with_load_store (void)
>   return vectorizable_loop_found;
>  }
>
> +/* Returns true if this function finds a loop that contains a possible LCP
> +   stalling instruction on corei7.   This is used to multiversion functions
> +   for corei7.
> +
> +   This function looks for instructions that store a constant into
> +   HImode (16-bit) memory. These require a length-changing prefix and on
> +   corei7 are prone to LCP stalls. These stalls can be avoided if the loop
> +   is streamed from the loop stream detector.  */
> +
> +static bool
> +find_himode_assigns (void)
> +{
> +  gimple_stmt_iterator gsi;
> +  gimple stmt;
> +  enum gimple_code code;
> +  tree lhs/*, rhs*/;

Can rhs be removed?

> +  enum machine_mode mode;
> +  basic_block *body;
> +  unsigned i;
> +  loop_iterator li;
> +  struct loop *loop;
> +  bool found = false;
> +  location_t locus = 0;

locus is dead (assigned but not read).

> +  int stmt_count;
> +  unsigned HOST_WIDE_INT n_unroll, max_unroll;
> +
> +  if (!flag_unroll_loops)
> +    return false;
> +
> +  loop_optimizer_init (LOOPS_NORMAL
> +                       | LOOPS_HAVE_RECORDED_EXITS);
> +  if (number_of_loops () < 1)
> +    return false;
> +
> +  scev_initialize();
> +
> +  if (profile_status == PROFILE_READ)
> +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES_FEEDBACK);
> +  else
> +    max_unroll = PARAM_VALUE (PARAM_MAX_COMPLETELY_PEEL_TIMES);

It might be clearer to rename max_unroll to something like max_peel_times.

> +
> +  FOR_EACH_LOOP (li, loop, LI_ONLY_INNERMOST)
> +    {
> +      tree niter;
> +
> +      /* Will not peel/unroll cold areas.  */
> +      if (optimize_loop_for_size_p (loop))
> +        continue;
> +
> +      /* Can the loop be manipulated?  */
> +      if (!can_duplicate_loop_p (loop))
> +        continue;
> +
> +      niter = number_of_latch_executions (loop);
> +      if (host_integerp (niter, 1))
> +       {
> +         n_unroll = tree_low_cst (niter, 1);
> +         if (n_unroll <= max_unroll)
> +           continue;
> +       }
> +
> +      body = get_loop_body (loop);
> +      found = false;
> +      stmt_count = 0;
> +
> +      for (i = 0; i < loop->num_nodes; i++)
> +       {
> +         for (gsi = gsi_start_bb (body[i]); !gsi_end_p (gsi); gsi_next (&gsi))
> +           {
> +             stmt = gsi_stmt (gsi);
> +             stmt_count++;
> +             if (found)
> +               continue;
> +             code = gimple_code (stmt);
> +             if (code != GIMPLE_ASSIGN)
> +               continue;
> +             lhs = gimple_assign_lhs (stmt);
> +             if (TREE_CODE (lhs) != MEM_REF &&
> +                 TREE_CODE (lhs) != COMPONENT_REF &&
> +                 TREE_CODE (lhs) != ARRAY_REF)
> +               continue;
> +             if (gimple_assign_rhs_code(stmt) != INTEGER_CST)
> +               continue;
> +             mode = TYPE_MODE (TREE_TYPE (lhs));
> +             if (mode == HImode)
> +               {
> +                 locus = gimple_location (stmt);
> +                 found = true;
> +               }
> +          }
> +       }
> +      /* Don't worry about large loops that won't be unrolled anyway. In fact,
> +       * don't worry about unrolling loops that are already over the size of the
> +       * LSD (28 insts). Since instruction counts may be a little off at this
> +       * point, due to downstream transformations, include loops a little bigger
> +       * than the LSD size.
> +       */
> +      if (found && stmt_count < 40)
> +       {
> +         n_unroll = PARAM_VALUE (PARAM_MAX_UNROLLED_INSNS)/stmt_count;
> +         /* Check for a simple peel candidate */
> +         if (!(loop->header->count
> +               && expected_loop_iterations (loop) < 2 * n_unroll))
> +           {
> +             location_t locus2;

locus2 is dead (assigned but not read) and can be removed.

> +             edge exit;
> +             if ((exit = single_exit(loop)) != NULL)
> +               locus2 = gimple_location(last_stmt(exit->src));
> +             else
> +               locus2 = gimple_location(first_stmt(loop->latch));
> +           }
> +         else
> +           found = false;
> +       }
> +      else
> +       found = false;
> +      free (body);
> +    }
> +
> +  scev_finalize();
> +  loop_optimizer_finalize();
> +  return found;
> +}
> +
>  /* This makes the function that chooses the version to execute based
>    on the condition.  This condition function will decide which version
>    of the function to execute.  It should look like this:
> @@ -26601,7 +26727,7 @@ create_mtune_target_opt_node (const char *arch_tun
>    the newly created version, that is tag "tune=core2" on the new version.  */
>
>  static bool
> -mversionable_for_core2_p (tree *optimization_node,
> +mversionable_for_core2_p (int version_number, tree *optimization_node,
>                          tree *cond_func_decl, basic_block *new_bb)
>  {
>   tree predicate_decl;
> @@ -26635,11 +26761,60 @@ static bool
>   *optimization_node = create_mtune_target_opt_node ("core2");
>
>   predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_CORE2];
> -  *new_bb = add_condition_to_bb (*cond_func_decl, 0, *new_bb,
> +  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
>                                 predicate_decl, false);
>   return true;
>  }
>
> +/* Should a version of this function be specially optimized for core2?
> +
> +   This function should have checks to see if there are any opportunities for
> +   core2 specific optimizations, otherwise do not create a clone.  The
> +   following opportunities are checked.

The above 2 references to core2 should be changed to corei7?

> +
> +   * Check if there are any loops that contain LCP stalling instruction on
> +     corei7.
> +
> +   This versioning is triggered only when multi-versioning for corei7 is
> +   requested using -mvarch=corei7.
> +
> +   Return false if no versioning is required.  Return true if a version must
> +   be created.  Generate the *OPTIMIZATION_NODE that must be used to optimize
> +   the newly created version, that is tag "tune=corei7" on the new version.  */
> +
> +static bool
> +mversionable_for_corei7_p (int version_number, tree *optimization_node,
> +                          tree *cond_func_decl, basic_block *new_bb)
> +{
> +  tree predicate_decl;
> +  bool is_mversion_target_corei7 = false;
> +  bool create_version = false;
> +
> +  if (ix86_varch_specified
> +      && (ix86_varch[PROCESSOR_COREI7_64]
> +         || ix86_varch[PROCESSOR_COREI7_32]))
> +    is_mversion_target_corei7 = true;
> +
> +  if (is_mversion_target_corei7 && find_himode_assigns ())
> +    create_version = true;
> +  /* else if XXX: Add more criteria to version for corei7.  */
> +
> +  if (!create_version)
> +    return false;
> +
> +  /* If the condition function's body has not been created, create it now.  */
> +  if (*cond_func_decl == NULL)
> +    *cond_func_decl = make_condition_function (new_bb);
> +
> +  *optimization_node = create_mtune_target_opt_node ("corei7");
> +
> +  predicate_decl = ix86_builtins [(int) IX86_BUILTIN_CPU_IS_INTEL_COREI7];
> +  *new_bb = add_condition_to_bb (*cond_func_decl, version_number, *new_bb,
> +                                predicate_decl, false);
> +  return true;
> +}
> +
> +
>  /* Should this function CURRENT_FUNCTION_DECL be multi-versioned, if so
>    the number of versions to be created (other than the original) is
>    returned.  The outcome of COND_FUNC_DECL will decide the version to be
> @@ -26654,19 +26829,33 @@ ix86_mversion_function (tree fndecl ATTRIBUTE_UNUS
>   basic_block new_bb;
>   tree optimization_node;
>   int num_versions_created = 0;
> -
> +
>   if (ix86_mv_arch_string == NULL)
>     return 0;
>
> -  if (mversionable_for_core2_p (&optimization_node, cond_func_decl, &new_bb))
> -    num_versions_created++;
> +  if (mversionable_for_core2_p (num_versions_created, &optimization_node,
> +                               cond_func_decl, &new_bb))
> +    {
> +      num_versions_created++;
> +      *optimization_node_chain = tree_cons (optimization_node,
> +                                           NULL_TREE,
> +                                           *optimization_node_chain);
> +    }
>
> +  if (mversionable_for_corei7_p (num_versions_created, &optimization_node,
> +                                cond_func_decl, &new_bb))
> +    {
> +      num_versions_created++;
> +      *optimization_node_chain = tree_cons (optimization_node,
> +                                           NULL_TREE,
> +                                           *optimization_node_chain);
> +    }
> +
> +  *optimization_node_chain = nreverse (*optimization_node_chain);

What is the nreverse here and below needed for?

Thanks,
Teresa

> +
>   if (!num_versions_created)
>     return 0;
>
> -  *optimization_node_chain = tree_cons (optimization_node,
> -                                       NULL_TREE, *optimization_node_chain);
> -
>   /* Return the default version as the last stmt in cond_func_decl.  */
>   if (*cond_func_decl != NULL)
>     new_bb = add_condition_to_bb (*cond_func_decl, num_versions_created,
> Index: params.def
> ===================================================================
> --- params.def  (revision 185514)
> +++ params.def  (working copy)
> @@ -1040,7 +1040,7 @@ DEFPARAM (PARAM_PMU_PROFILE_N_ADDRESS,
>  DEFPARAM (PARAM_MAX_FUNCTION_SIZE_FOR_AUTO_CLONING,
>          "autoclone-function-size-limit",
>          "Do not auto clone functions beyond this size.",
> -         450, 0, 100000)
> +         5000, 0, 100000)
>
>  /*
>  Local variables:
> Index: mversn-dispatch.c
> ===================================================================
> --- mversn-dispatch.c   (revision 185514)
> +++ mversn-dispatch.c   (working copy)
> @@ -2423,6 +2423,8 @@ do_auto_clone (void)
>       opt_node = TREE_CHAIN (opt_node);
>     }
>
> +  fn_ver_addr_chain = nreverse (fn_ver_addr_chain);
> +
>   /* The current function is replaced by an ifunc call to the right version.
>      Make another clone for the default.  */
>   default_decl = clone_function (current_function_decl, "autoclone.original");
> @@ -2432,6 +2434,7 @@ do_auto_clone (void)
>   default_ver = build_fold_addr_expr (default_decl);
>   cond_func_addr = build_fold_addr_expr (cond_func_decl);
>
> +
>   /* Get the gimple sequence to replace the current function's body with a
>      ifunc dispatch call to the right version.  */
>   gseq = dispatch_using_ifunc (num_versions, current_function_decl,
>
> --
> This patch is available for review at http://codereview.appspot.com/5865043



-- 
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
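
One plausible reading of the nreverse calls that the review above asks about
(not confirmed in this thread): tree_cons prepends to a TREE_LIST chain, so
building *optimization_node_chain (and fn_ver_addr_chain in do_auto_clone) by
repeated tree_cons leaves the entries in reverse creation order, and the single
nreverse restores creation order so that position N in the chain lines up with
version number N passed to add_condition_to_bb. A self-contained toy model of
that prepend-then-reverse pattern in plain C (the types and names below are
invented for this note, not GCC internals):

  #include <stdio.h>

  /* Toy stand-in for a TREE_LIST chain: cons prepends a node, so repeated
     cons calls build the chain in reverse creation order.  */
  struct node { const char *name; struct node *next; };

  static struct node *
  cons (struct node *n, const char *name, struct node *chain)
  {
    n->name = name;
    n->next = chain;
    return n;
  }

  /* In-place reversal, analogous to nreverse.  */
  static struct node *
  nrev (struct node *chain)
  {
    struct node *prev = NULL;
    while (chain)
      {
        struct node *next = chain->next;
        chain->next = prev;
        prev = chain;
        chain = next;
      }
    return prev;
  }

  int
  main (void)
  {
    struct node a, b;
    struct node *chain = NULL;
    struct node *p;

    chain = cons (&a, "core2", chain);   /* version 0; chain: core2           */
    chain = cons (&b, "corei7", chain);  /* version 1; chain: corei7 -> core2 */
    chain = nrev (chain);                /* chain: core2 -> corei7            */
    for (p = chain; p != NULL; p = p->next)
      printf ("%s\n", p->name);
    return 0;
  }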
