[Patch, i386] Avoid LCP stalls
This patch addresses instructions that incur expensive length-changing
prefix (LCP) stalls on some x86-64 implementations, notably Core2 and
Corei7. Specifically, a move of a 16-bit constant into memory requires a
length-changing prefix and can incur significant penalties. The attached
patch avoids this by forcing such instructions to be split into two: a
move of the corresponding 32-bit constant into a register, and a move of
the register's lower 16 bits into memory.

Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk?

Thanks,
Teresa

2012-03-29  Teresa Johnson  <tejohnson@google.com>

	* config/i386/i386.h (ix86_tune_indices): Add
	X86_TUNE_LCP_STALL.
	* config/i386/i386.md (movhi_internal): Split to
	movhi_internal and movhi_imm_internal.
	* config/i386/i386.c (initial_ix86_tune_features): Initialize
	X86_TUNE_LCP_STALL entry.

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 185920)
+++ config/i386/i386.h	(working copy)
@@ -262,6 +262,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVX,
   X86_TUNE_PARTIAL_REG_STALL,
   X86_TUNE_PARTIAL_FLAG_REG_STALL,
+  X86_TUNE_LCP_STALL,
   X86_TUNE_USE_HIMODE_FIOP,
   X86_TUNE_USE_SIMODE_FIOP,
   X86_TUNE_USE_MOV0,
@@ -340,6 +341,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_L
 #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
 #define TARGET_PARTIAL_FLAG_REG_STALL \
 	ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_LCP_STALL \
+	ix86_tune_features[X86_TUNE_LCP_STALL]
 #define TARGET_USE_HIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
 #define TARGET_USE_SIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
 #define TARGET_USE_MOV0		ix86_tune_features[X86_TUNE_USE_MOV0]
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 185920)
+++ config/i386/i386.md	(working copy)
@@ -2262,9 +2262,19 @@
 	   ]
 	   (const_string "SI")))])
 
+(define_insn "*movhi_imm_internal"
+  [(set (match_operand:HI 0 "memory_operand" "=m")
+        (match_operand:HI 1 "immediate_operand" "n"))]
+  "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
+{
+  return "mov{w}\t{%1, %0|%0, %1}";
+}
+  [(set (attr "type") (const_string "imov"))
+   (set (attr "mode") (const_string "HI"))])
+
 (define_insn "*movhi_internal"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
-	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
+	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 185920)
+++ config/i386/i386.c	(working copy)
@@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86
   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
   m_CORE2I7 | m_GENERIC,
 
+  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
+   * on 16-bit immediate moves into memory on Core2 and Corei7,
+   * which may also affect AMD implementations. */
+  m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE,
+
   /* X86_TUNE_USE_HIMODE_FIOP */
   m_386 | m_486 | m_K6_GEODE,

--
This patch is available for review at http://codereview.appspot.com/5975045
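
For illustration only (the registers here are arbitrary examples, not
from the patch), the rewrite described above looks like this at the
assembly level. The 0x66 operand-size prefix on a movw with a 16-bit
immediate is what makes the instruction length-changing; after the
split, neither instruction is:

    # Before: 16-bit immediate store (LCP stall on Core2/Corei7).
    movw    $1234, (%rdx)

    # After: 32-bit constant into a register, low 16 bits to memory.
    movl    $1234, %eax
    movw    %ax, (%rdx)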
I should add that I have tested performance of this on Core2, Corei7
(Nehalem) and AMD Opteron-based systems. It appears to be
performance-neutral on AMD (only minor perturbations, overall a wash).
For the test case that provoked the optimization, there were nice
improvements on Core2 and Corei7.

Thanks,
Teresa

On Fri, Mar 30, 2012 at 7:18 AM, Teresa Johnson <tejohnson@google.com> wrote:
> This patch addresses instructions that incur expensive length-changing
> prefix (LCP) stalls on some x86-64 implementations, notably Core2 and
> Corei7. Specifically, a move of a 16-bit constant into memory requires
> a length-changing prefix and can incur significant penalties. The
> attached patch avoids this by forcing such instructions to be split
> into two: a move of the corresponding 32-bit constant into a register,
> and a move of the register's lower 16 bits into memory.
>
> Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk?
>
> Thanks,
> Teresa
>
> 2012-03-29  Teresa Johnson  <tejohnson@google.com>
>
>	* config/i386/i386.h (ix86_tune_indices): Add
>	X86_TUNE_LCP_STALL.
>	* config/i386/i386.md (movhi_internal): Split to
>	movhi_internal and movhi_imm_internal.
>	* config/i386/i386.c (initial_ix86_tune_features): Initialize
>	X86_TUNE_LCP_STALL entry.

--
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
Minor update to the patch to remove an unnecessary check in the new
movhi_imm_internal define_insn. Retested successfully.

Teresa

2012-03-29  Teresa Johnson  <tejohnson@google.com>

	* config/i386/i386.h (ix86_tune_indices): Add
	X86_TUNE_LCP_STALL.
	* config/i386/i386.md (movhi_internal): Split to
	movhi_internal and movhi_imm_internal.
	* config/i386/i386.c (initial_ix86_tune_features): Initialize
	X86_TUNE_LCP_STALL entry.

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 185920)
+++ config/i386/i386.h	(working copy)
@@ -262,6 +262,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVX,
   X86_TUNE_PARTIAL_REG_STALL,
   X86_TUNE_PARTIAL_FLAG_REG_STALL,
+  X86_TUNE_LCP_STALL,
   X86_TUNE_USE_HIMODE_FIOP,
   X86_TUNE_USE_SIMODE_FIOP,
   X86_TUNE_USE_MOV0,
@@ -340,6 +341,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_L
 #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
 #define TARGET_PARTIAL_FLAG_REG_STALL \
 	ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_LCP_STALL \
+	ix86_tune_features[X86_TUNE_LCP_STALL]
 #define TARGET_USE_HIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
 #define TARGET_USE_SIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
 #define TARGET_USE_MOV0		ix86_tune_features[X86_TUNE_USE_MOV0]
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 185920)
+++ config/i386/i386.md	(working copy)
@@ -2262,9 +2262,19 @@
 	   ]
 	   (const_string "SI")))])
 
+(define_insn "*movhi_imm_internal"
+  [(set (match_operand:HI 0 "memory_operand" "=m")
+        (match_operand:HI 1 "immediate_operand" "n"))]
+  "!TARGET_LCP_STALL"
+{
+  return "mov{w}\t{%1, %0|%0, %1}";
+}
+  [(set (attr "type") (const_string "imov"))
+   (set (attr "mode") (const_string "HI"))])
+
 (define_insn "*movhi_internal"
   [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
-	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
+	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]
   "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
 {
   switch (get_attr_type (insn))
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 185920)
+++ config/i386/i386.c	(working copy)
@@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86
   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
   m_CORE2I7 | m_GENERIC,
 
+  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
+   * on 16-bit immediate moves into memory on Core2 and Corei7,
+   * which may also affect AMD implementations. */
+  m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE,
+
   /* X86_TUNE_USE_HIMODE_FIOP */
   m_386 | m_486 | m_K6_GEODE,

--
This patch is available for review at http://codereview.appspot.com/5975045
On 03/30/2012 11:03 AM, Teresa Johnson wrote:
> +(define_insn "*movhi_imm_internal"
> +  [(set (match_operand:HI 0 "memory_operand" "=m")
> +        (match_operand:HI 1 "immediate_operand" "n"))]
> +  "!TARGET_LCP_STALL"
> +{
> +  return "mov{w}\t{%1, %0|%0, %1}";
> +}
> +  [(set (attr "type") (const_string "imov"))
> +   (set (attr "mode") (const_string "HI"))])
> +
>  (define_insn "*movhi_internal"
>    [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
> -	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
> +	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]
>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"

For reload to work correctly, all alternatives must remain part of the
same pattern. This issue should be handled with the ISA and ENABLED
attributes.

r~
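
For reference, a rough, untested sketch of that approach: keep the
memory/immediate alternative inside the single movhi pattern and gate it
with the "enabled" attribute. The alternative numbering and exact
attribute wiring below are illustrative assumptions, not the form
anyone posted:

    (define_insn "*movhi_internal"
      [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m,m")
            (match_operand:HI 1 "general_operand"      "r,rn,rm,r,n"))]
      "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
      ;; ... insn body as before ...
      [(set (attr "enabled")
         ;; Disable the mem<-imm alternative on LCP-stalling tunings.
         (if_then_else (eq_attr "alternative" "4")
           (symbol_ref "!TARGET_LCP_STALL")
           (const_int 1)))])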
> Index: config/i386/i386.md
> ===================================================================
> --- config/i386/i386.md	(revision 185920)
> +++ config/i386/i386.md	(working copy)
> @@ -2262,9 +2262,19 @@
>  	   ]
>  	   (const_string "SI")))])
>
> +(define_insn "*movhi_imm_internal"
> +  [(set (match_operand:HI 0 "memory_operand" "=m")
> +        (match_operand:HI 1 "immediate_operand" "n"))]
> +  "!TARGET_LCP_STALL && !(MEM_P (operands[0]) && MEM_P (operands[1]))"
> +{
> +  return "mov{w}\t{%1, %0|%0, %1}";
> +}
> +  [(set (attr "type") (const_string "imov"))
> +   (set (attr "mode") (const_string "HI"))])
> +
>  (define_insn "*movhi_internal"
>    [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
> -	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
> +	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]

If you do this, you will prevent reload from considering the immediate
for rematerialization when the register holding the constant is on the
stack on !TARGET_LCP_STALL machines. The matching pattern for moves
should really handle all available alternatives, so reload is happy.

You can duplicate the pattern, but I think this is much better done as a
post-reload peephole2. I.e. ask for a scratch register and do the
splitting if one is available. This way the optimization won't happen
when there is no register available, and we will also rely on
post-reload cleanups to unify moves of constants, but I think this
should work well. You also want to conditionalize the split on
optimize_insn_for_speed, too.

>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
> {
>   switch (get_attr_type (insn))
> Index: config/i386/i386.c
> ===================================================================
> --- config/i386/i386.c	(revision 185920)
> +++ config/i386/i386.c	(working copy)
> @@ -1964,6 +1964,11 @@ static unsigned int initial_ix86_tune_features[X86
>    /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
>    m_CORE2I7 | m_GENERIC,
>
> +  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
> +   * on 16-bit immediate moves into memory on Core2 and Corei7,
> +   * which may also affect AMD implementations. */
> +  m_CORE2I7 | m_GENERIC | m_AMD_MULTIPLE,

Is this supposed to help AMD? (At least the pre-Bulldozer designs should
not care about length-changing prefixes that much, because they tag
sizes in the cache.) If not, I would suggest enabling it only for the
Cores and generic.

Honza
On 03/30/2012 11:11 AM, Richard Henderson wrote:
> On 03/30/2012 11:03 AM, Teresa Johnson wrote:
>> +(define_insn "*movhi_imm_internal"
>> +  [(set (match_operand:HI 0 "memory_operand" "=m")
>> +        (match_operand:HI 1 "immediate_operand" "n"))]
>> +  "!TARGET_LCP_STALL"
>> +{
>> +  return "mov{w}\t{%1, %0|%0, %1}";
>> +}
>> +  [(set (attr "type") (const_string "imov"))
>> +   (set (attr "mode") (const_string "HI"))])
>> +
>>  (define_insn "*movhi_internal"
>>    [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
>> -	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
>> +	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]
>>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>
> For reload to work correctly, all alternatives must remain part of the
> same pattern. This issue should be handled with the ISA and ENABLED
> attributes.

I'll also ask if this should better be handled with a peephole2.

While movw $1234,(%eax) might be expensive, is it so expensive that we
*must* force the use of a free register? Might it be better only to
split the insn in two if and only if a free register exists?

That can easily be done with a peephole2 pattern...

r~
On Fri, Mar 30, 2012 at 8:19 AM, Richard Henderson <rth@redhat.com> wrote:
> On 03/30/2012 11:11 AM, Richard Henderson wrote:
>> On 03/30/2012 11:03 AM, Teresa Johnson wrote:
>>> +(define_insn "*movhi_imm_internal"
>>> +  [(set (match_operand:HI 0 "memory_operand" "=m")
>>> +        (match_operand:HI 1 "immediate_operand" "n"))]
>>> +  "!TARGET_LCP_STALL"
>>> +{
>>> +  return "mov{w}\t{%1, %0|%0, %1}";
>>> +}
>>> +  [(set (attr "type") (const_string "imov"))
>>> +   (set (attr "mode") (const_string "HI"))])
>>> +
>>>  (define_insn "*movhi_internal"
>>>    [(set (match_operand:HI 0 "nonimmediate_operand" "=r,r,r,m")
>>> -	(match_operand:HI 1 "general_operand" "r,rn,rm,rn"))]
>>> +	(match_operand:HI 1 "general_operand" "r,rn,rm,r"))]
>>>    "!(MEM_P (operands[0]) && MEM_P (operands[1]))"
>>
>> For reload to work correctly, all alternatives must remain part of the
>> same pattern. This issue should be handled with the ISA and ENABLED
>> attributes.
>
> I'll also ask if this should better be handled with a peephole2.
>
> While movw $1234,(%eax) might be expensive, is it so expensive that we
> *must* force the use of a free register? Might it be better only to
> split the insn in two if and only if a free register exists?
>
> That can easily be done with a peephole2 pattern...
>

Here is a very old LCP patch with peephole2. It may need some updates.

--
H.J.
Hi Richard, Jan and H.J.,

Thanks for all the quick responses and suggestions.

I had tested my patch when tuning for an arch without the LCP stalls,
but it didn't hit an issue in reload because it didn't require
rematerialization. Thanks for pointing out this issue.

Regarding the penalty, it can be >=6 cycles for core2/corei7, so I
thought it would be best to force the splitting even when that would
force the use of a new register, but it is possible that the peephole2
approach will work just fine in the majority of cases. Thanks for the
peephole2 patch, H.J., I will test that solution out for the case I was
trying to solve.

Regarding the penalty on AMD, reading Agner's guide suggested that this
could be a problem on Bulldozer, but only if there are >3 prefixes, and
I'm not sure how often that will occur for this type of instruction in
practice. I will look into removing AMD from the handled cases.

Will respond later after trying the peephole2 approach.

Teresa

On Fri, Mar 30, 2012 at 9:23 AM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Fri, Mar 30, 2012 at 8:19 AM, Richard Henderson <rth@redhat.com> wrote:
>> On 03/30/2012 11:11 AM, Richard Henderson wrote:
>>> For reload to work correctly, all alternatives must remain part of the
>>> same pattern. This issue should be handled with the ISA and ENABLED
>>> attributes.
>>
>> I'll also ask if this should better be handled with a peephole2.
>>
>> While movw $1234,(%eax) might be expensive, is it so expensive that we
>> *must* force the use of a free register? Might it be better only to
>> split the insn in two if and only if a free register exists?
>>
>> That can easily be done with a peephole2 pattern...
>>
> Here is a very old LCP patch with peephole2. It may need some updates.
>
> --
> H.J.

--
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
> Hi Richard, Jan and H.J.,
>
> Thanks for all the quick responses and suggestions.
>
> I had tested my patch when tuning for an arch without the LCP stalls,
> but it didn't hit an issue in reload because it didn't require
> rematerialization. Thanks for pointing out this issue.
>
> Regarding the penalty, it can be >=6 cycles for core2/corei7, so I

6 cycles is indeed quite severe and may pay for an extra spill. I guess
the easiest way is to benchmark the peephole variant and see what comes
first. You may be able to see the differences better in 32-bit mode due
to register pressure issues.

> thought it would be best to force the splitting even when that would
> force the use of a new register, but it is possible that the peephole2
> approach will work just fine in the majority of cases. Thanks for the
> peephole2 patch, H.J., I will test that solution out for the case I was
> trying to solve.
>
> Regarding the penalty on AMD, reading Agner's guide suggested that this
> could be a problem on Bulldozer, but only if there are >3 prefixes, and
> I'm not sure how often that will occur for this type of

I cannot think of a case where the MOV instruction in question would
have 3 prefixes. It can have a size override and a REX prefix, but REX
usually does not count. You may try to benchmark Bulldozer, but I would
be surprised if there were any benefits.

We need to run some benchmarks for the generic/generic32 models on AMD
machines anyway. I would guess that this transformation should be safe.
The cost of an extra register move is not high compared to the 16-bit
store overhead. Harsha?

Honza
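
To make the prefix counting concrete, here are the encodings in
question (registers chosen arbitrarily for illustration). It is the
0x66 operand-size prefix combined with the 16-bit immediate that is
length-changing; REX adds a byte but, as noted above, usually does not
count the same way:

    66 c7 00 d2 04        movw  $1234, (%rax)   # one prefix (0x66)
    66 41 c7 00 d2 04     movw  $1234, (%r8)    # 0x66 plus REX.B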
New patch to avoid LCP stalls based on feedback from earlier patch. I
modified H.J.'s old patch to perform the peephole2 to split immediate
moves to HImode memory. This is now enabled for Core2, Corei7 and
Generic.

I verified that this enables the splitting to occur in the case that
originally motivated the optimization. If we subsequently find
situations where LCP stalls are hurting performance but an extra
register is required to perform the splitting, then we can revisit
whether this should be performed earlier.

I also measured SPEC 2000/2006 performance using Generic64 on an AMD
Opteron and the results were neutral.

Bootstrapped and tested on x86_64-unknown-linux-gnu. Is this ok for trunk?

Thanks,
Teresa

2012-04-04  Teresa Johnson  <tejohnson@google.com>

	* config/i386/i386.h (ix86_tune_indices): Add
	X86_TUNE_LCP_STALL.
	* config/i386/i386.md (move immediate to memory peephole2):
	Add cases for HImode move when LCP stall avoidance is needed.
	* config/i386/i386.c (initial_ix86_tune_features): Initialize
	X86_TUNE_LCP_STALL entry.

Index: config/i386/i386.h
===================================================================
--- config/i386/i386.h	(revision 185920)
+++ config/i386/i386.h	(working copy)
@@ -262,6 +262,7 @@ enum ix86_tune_indices {
   X86_TUNE_MOVX,
   X86_TUNE_PARTIAL_REG_STALL,
   X86_TUNE_PARTIAL_FLAG_REG_STALL,
+  X86_TUNE_LCP_STALL,
   X86_TUNE_USE_HIMODE_FIOP,
   X86_TUNE_USE_SIMODE_FIOP,
   X86_TUNE_USE_MOV0,
@@ -340,6 +341,8 @@ extern unsigned char ix86_tune_features[X86_TUNE_L
 #define TARGET_PARTIAL_REG_STALL ix86_tune_features[X86_TUNE_PARTIAL_REG_STALL]
 #define TARGET_PARTIAL_FLAG_REG_STALL \
 	ix86_tune_features[X86_TUNE_PARTIAL_FLAG_REG_STALL]
+#define TARGET_LCP_STALL \
+	ix86_tune_features[X86_TUNE_LCP_STALL]
 #define TARGET_USE_HIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_HIMODE_FIOP]
 #define TARGET_USE_SIMODE_FIOP	ix86_tune_features[X86_TUNE_USE_SIMODE_FIOP]
 #define TARGET_USE_MOV0		ix86_tune_features[X86_TUNE_USE_MOV0]
Index: config/i386/i386.md
===================================================================
--- config/i386/i386.md	(revision 185920)
+++ config/i386/i386.md	(working copy)
@@ -16977,9 +16977,11 @@
    (set (match_operand:SWI124 0 "memory_operand") (const_int 0))]
   "optimize_insn_for_speed_p ()
-   && !TARGET_USE_MOV0
-   && TARGET_SPLIT_LONG_MOVES
-   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn
+   && ((TARGET_LCP_STALL
+	&& GET_MODE (operands[0]) == HImode)
+       || (!TARGET_USE_MOV0
+	   && TARGET_SPLIT_LONG_MOVES
+	   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))
    && peep2_regno_dead_p (0, FLAGS_REG)"
   [(parallel [(set (match_dup 2) (const_int 0))
	       (clobber (reg:CC FLAGS_REG))])
@@ -16991,8 +16993,10 @@
    (set (match_operand:SWI124 0 "memory_operand")
	(match_operand:SWI124 1 "immediate_operand"))]
   "optimize_insn_for_speed_p ()
-   && TARGET_SPLIT_LONG_MOVES
-   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn"
+   && ((TARGET_LCP_STALL
+	&& GET_MODE (operands[0]) == HImode)
+       || (TARGET_SPLIT_LONG_MOVES
+	   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))"
   [(set (match_dup 2) (match_dup 1))
    (set (match_dup 0) (match_dup 2))])
Index: config/i386/i386.c
===================================================================
--- config/i386/i386.c	(revision 185920)
+++ config/i386/i386.c	(working copy)
@@ -1964,6 +1964,10 @@ static unsigned int initial_ix86_tune_features[X86
   /* X86_TUNE_PARTIAL_FLAG_REG_STALL */
   m_CORE2I7 | m_GENERIC,
 
+  /* X86_TUNE_LCP_STALL: Avoid an expensive length-changing prefix stall
+   * on 16-bit immediate moves into memory on Core2 and Corei7. */
+  m_CORE2I7 | m_GENERIC,
+
   /* X86_TUNE_USE_HIMODE_FIOP */
   m_386 | m_486 | m_K6_GEODE,

--
This patch is available for review at http://codereview.appspot.com/5975045
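
For illustration only (the hard register numbers are hypothetical), the
added peephole cases perform roughly this RTL rewrite when peephole2
finds a free scratch register:

    ;; Before: 16-bit immediate store, which assembles to an LCP-prone movw.
    (set (mem:HI (reg:DI 0 ax)) (const_int 1234))

    ;; After: constant into the scratch register, then a plain register store.
    (set (reg:HI 1 dx) (const_int 1234))
    (set (mem:HI (reg:DI 0 ax)) (reg:HI 1 dx))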
On Wed, Apr 4, 2012 at 5:07 PM, Teresa Johnson <tejohnson@google.com> wrote:
> New patch to avoid LCP stalls based on feedback from earlier patch. I
> modified H.J.'s old patch to perform the peephole2 to split immediate
> moves to HImode memory. This is now enabled for Core2, Corei7 and
> Generic.
>
> I verified that this enables the splitting to occur in the case that
> originally motivated the optimization. If we subsequently find
> situations where LCP stalls are hurting performance but an extra
> register is required to perform the splitting, then we can revisit
> whether this should be performed earlier.
>
> I also measured SPEC 2000/2006 performance using Generic64 on an AMD
> Opteron and the results were neutral.
>

What are the performance impacts on Core i7? I didn't notice any
significant changes when I worked on it for Core 2.

Thanks.

--
H.J.
http://codereview.appspot.com/5975045/diff/6001/config/i386/i386.md
File config/i386/i386.md (right):

http://codereview.appspot.com/5975045/diff/6001/config/i386/i386.md#newcode16974
config/i386/i386.md:16974: ;; gets too big.
The comments may need to be updated.
On Wed, Apr 4, 2012 at 5:39 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Wed, Apr 4, 2012 at 5:07 PM, Teresa Johnson <tejohnson@google.com> wrote:
>> New patch to avoid LCP stalls based on feedback from earlier patch. I
>> modified H.J.'s old patch to perform the peephole2 to split immediate
>> moves to HImode memory. This is now enabled for Core2, Corei7 and
>> Generic.
>>
>> I verified that this enables the splitting to occur in the case that
>> originally motivated the optimization. If we subsequently find
>> situations where LCP stalls are hurting performance but an extra
>> register is required to perform the splitting, then we can revisit
>> whether this should be performed earlier.
>>
>> I also measured SPEC 2000/2006 performance using Generic64 on an AMD
>> Opteron and the results were neutral.
>>
>
> What are the performance impacts on Core i7? I didn't notice any
> significant changes when I worked on it for Core 2.

One of our street map applications speeds up by almost 5% on Corei7 and
almost 2.5% on Core2 from this optimization. It contains a hot inner
loop with some conditional writes of zero into a short array. The loop
is unrolled so that it does not fit into the LSD, which would have
avoided many of the LCP stalls.

Thanks,
Teresa

> Thanks.
>
> --
> H.J.

--
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413
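
A hypothetical reduction of that pattern (not the actual application
code): each taken store below is a 16-bit immediate move into memory,
and once the unroller copies it past the LSD's capacity, every copy is
an LCP-stall candidate:

    void
    clear_flagged (short *a, const unsigned char *flag, int n)
    {
      int i;
      for (i = 0; i < n; i++)
        if (flag[i])
          a[i] = 0;    /* compiles to a movw $0, mem store */
    }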
Hello!

> New patch to avoid LCP stalls based on feedback from earlier patch. I
> modified H.J.'s old patch to perform the peephole2 to split immediate
> moves to HImode memory. This is now enabled for Core2, Corei7 and
> Generic.

> 2012-04-04  Teresa Johnson  <tejohnson@google.com>
>
>	* config/i386/i386.h (ix86_tune_indices): Add
>	X86_TUNE_LCP_STALL.
>	* config/i386/i386.md (move immediate to memory peephole2):
>	Add cases for HImode move when LCP stall avoidance is needed.
>	* config/i386/i386.c (initial_ix86_tune_features): Initialize
>	X86_TUNE_LCP_STALL entry.

  "optimize_insn_for_speed_p ()
-   && !TARGET_USE_MOV0
-   && TARGET_SPLIT_LONG_MOVES
-   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn
+   && ((TARGET_LCP_STALL
+	&& GET_MODE (operands[0]) == HImode)
+       || (!TARGET_USE_MOV0
+	   && TARGET_SPLIT_LONG_MOVES
+	   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))

Please change the added condition to:

   && ((<MODE>mode == HImode
	&& TARGET_LCP_STALL)
       || (...))

Please also add H.J. as co-author of this change to the ChangeLog.

OK with these changes.

Thanks,
Uros.
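
Folding that requested change into the patch, the first peephole's
condition would read along these lines (a sketch assembled from the
pieces above; the second peephole is analogous, minus the
!TARGET_USE_MOV0 and FLAGS_REG tests):

  "optimize_insn_for_speed_p ()
   && ((<MODE>mode == HImode
	&& TARGET_LCP_STALL)
       || (!TARGET_USE_MOV0
	   && TARGET_SPLIT_LONG_MOVES
	   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))
   && peep2_regno_dead_p (0, FLAGS_REG)"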
Thanks, I will do both and update the comment as suggested by David,
retest and then commit.

Teresa

On Thu, Apr 5, 2012 at 12:41 AM, Uros Bizjak <ubizjak@gmail.com> wrote:
> Hello!
>
>> New patch to avoid LCP stalls based on feedback from earlier patch. I
>> modified H.J.'s old patch to perform the peephole2 to split immediate
>> moves to HImode memory. This is now enabled for Core2, Corei7 and
>> Generic.
>
>> 2012-04-04  Teresa Johnson  <tejohnson@google.com>
>>
>>	* config/i386/i386.h (ix86_tune_indices): Add
>>	X86_TUNE_LCP_STALL.
>>	* config/i386/i386.md (move immediate to memory peephole2):
>>	Add cases for HImode move when LCP stall avoidance is needed.
>>	* config/i386/i386.c (initial_ix86_tune_features): Initialize
>>	X86_TUNE_LCP_STALL entry.
>
> "optimize_insn_for_speed_p ()
> -   && !TARGET_USE_MOV0
> -   && TARGET_SPLIT_LONG_MOVES
> -   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn
> +   && ((TARGET_LCP_STALL
> +	&& GET_MODE (operands[0]) == HImode)
> +       || (!TARGET_USE_MOV0
> +	   && TARGET_SPLIT_LONG_MOVES
> +	   && get_attr_length (insn) >= ix86_cur_cost ()->large_insn))
>
> Please change the added condition to:
>
>    && ((<MODE>mode == HImode
>	&& TARGET_LCP_STALL)
>       || (...))
>
> Please also add H.J. as co-author of this change to the ChangeLog.
>
> OK with these changes.
>
> Thanks,
> Uros.

--
Teresa Johnson | Software Engineer | tejohnson@google.com | 408-460-2413