Index: src/core/SkBitmapProcState_matrixProcs.cpp |
=================================================================== |
--- src/core/SkBitmapProcState_matrixProcs.cpp (revision 4857) |
+++ src/core/SkBitmapProcState_matrixProcs.cpp (working copy) |
@@ -8,6 +8,7 @@ |
#include "SkPerspIter.h" |
#include "SkShader.h" |
#include "SkUtils.h" |
+#include "SkUtilsArm.h" |
// Helper to ensure that when we shift down, we do it w/o sign-extension |
// so the caller doesn't have to manually mask off the top 16 bits |
@@ -67,27 +68,31 @@ |
void decal_nofilter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); |
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count); |
+// Compile neon code paths if needed |
+#if !SK_ARM_NEON_IS_NONE |
+ |
+// These are defined in src/opts/SkBitmapProcState_matrixProcs_neon.cpp |
+extern const SkBitmapProcState::MatrixProc ClampX_ClampY_Procs_neon[]; |
+extern const SkBitmapProcState::MatrixProc RepeatX_RepeatY_Procs_neon[]; |
+ |
+#endif // !SK_ARM_NEON_IS_NONE |
+ |
+// Compile non-neon code path if needed |
+#if !SK_ARM_NEON_IS_ALWAYS |
#define MAKENAME(suffix) ClampX_ClampY ## suffix |
#define TILEX_PROCF(fx, max) SkClampMax((fx) >> 16, max) |
#define TILEY_PROCF(fy, max) SkClampMax((fy) >> 16, max) |
#define TILEX_LOW_BITS(fx, max) (((fx) >> 12) & 0xF) |
#define TILEY_LOW_BITS(fy, max) (((fy) >> 12) & 0xF) |
#define CHECK_FOR_DECAL |
-#if defined(__ARM_HAVE_NEON) |
- #include "SkBitmapProcState_matrix_clamp.h" |
-#else |
- #include "SkBitmapProcState_matrix.h" |
-#endif |
+#include "SkBitmapProcState_matrix.h" |
#define MAKENAME(suffix) RepeatX_RepeatY ## suffix |
#define TILEX_PROCF(fx, max) SK_USHIFT16(((fx) & 0xFFFF) * ((max) + 1)) |
#define TILEY_PROCF(fy, max) SK_USHIFT16(((fy) & 0xFFFF) * ((max) + 1)) |
#define TILEX_LOW_BITS(fx, max) ((((fx) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) |
#define TILEY_LOW_BITS(fy, max) ((((fy) & 0xFFFF) * ((max) + 1) >> 12) & 0xF) |
-#if defined(__ARM_HAVE_NEON) |
- #include "SkBitmapProcState_matrix_repeat.h" |
-#else |
- #include "SkBitmapProcState_matrix.h" |
+#include "SkBitmapProcState_matrix.h" |
#endif |
#define MAKENAME(suffix) GeneralXY ## suffix |
@@ -228,52 +233,6 @@ |
{ |
int i; |
-#if defined(__ARM_HAVE_NEON) |
- if (count >= 8) { |
- /* SkFixed is 16.16 fixed point */ |
- SkFixed dx2 = dx+dx; |
- SkFixed dx4 = dx2+dx2; |
- SkFixed dx8 = dx4+dx4; |
- |
- /* now build fx/fx+dx/fx+2dx/fx+3dx */ |
- SkFixed fx1, fx2, fx3; |
- int32x2_t lower, upper; |
- int32x4_t lbase, hbase; |
- uint16_t *dst16 = (uint16_t *)dst; |
- |
- fx1 = fx+dx; |
- fx2 = fx1+dx; |
- fx3 = fx2+dx; |
- |
- /* avoid an 'lbase unitialized' warning */ |
- lbase = vdupq_n_s32(fx); |
- lbase = vsetq_lane_s32(fx1, lbase, 1); |
- lbase = vsetq_lane_s32(fx2, lbase, 2); |
- lbase = vsetq_lane_s32(fx3, lbase, 3); |
- hbase = vaddq_s32(lbase, vdupq_n_s32(dx4)); |
- |
- /* take upper 16 of each, store, and bump everything */ |
- do { |
- int32x4_t lout, hout; |
- uint16x8_t hi16; |
- |
- lout = lbase; |
- hout = hbase; |
- /* gets hi's of all louts then hi's of all houts */ |
- asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout)); |
- hi16 = vreinterpretq_u16_s32(hout); |
- vst1q_u16(dst16, hi16); |
- |
- /* on to the next */ |
- lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8)); |
- hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8)); |
- dst16 += 8; |
- count -= 8; |
- fx += dx8; |
- } while (count >= 8); |
- dst = (uint32_t *) dst16; |
- } |
-#else |
for (i = (count >> 2); i > 0; --i) |
{ |
*dst++ = pack_two_shorts(fx >> 16, (fx + dx) >> 16); |
@@ -282,7 +241,6 @@ |
fx += dx+dx; |
} |
count &= 3; |
-#endif |
uint16_t* xx = (uint16_t*)dst; |
for (i = count; i > 0; --i) { |
@@ -293,43 +251,7 @@ |
void decal_filter_scale(uint32_t dst[], SkFixed fx, SkFixed dx, int count) |
{ |
-#if defined(__ARM_HAVE_NEON) |
- if (count >= 8) { |
- int32x4_t wide_fx; |
- int32x4_t wide_fx2; |
- int32x4_t wide_dx8 = vdupq_n_s32(dx*8); |
- wide_fx = vdupq_n_s32(fx); |
- wide_fx = vsetq_lane_s32(fx+dx, wide_fx, 1); |
- wide_fx = vsetq_lane_s32(fx+dx+dx, wide_fx, 2); |
- wide_fx = vsetq_lane_s32(fx+dx+dx+dx, wide_fx, 3); |
- |
- wide_fx2 = vaddq_s32(wide_fx, vdupq_n_s32(dx+dx+dx+dx)); |
- |
- while (count >= 8) { |
- int32x4_t wide_out; |
- int32x4_t wide_out2; |
- |
- wide_out = vshlq_n_s32(vshrq_n_s32(wide_fx, 12), 14); |
- wide_out = vorrq_s32(wide_out, |
- vaddq_s32(vshrq_n_s32(wide_fx,16), vdupq_n_s32(1))); |
- |
- wide_out2 = vshlq_n_s32(vshrq_n_s32(wide_fx2, 12), 14); |
- wide_out2 = vorrq_s32(wide_out2, |
- vaddq_s32(vshrq_n_s32(wide_fx2,16), vdupq_n_s32(1))); |
- |
- vst1q_u32(dst, vreinterpretq_u32_s32(wide_out)); |
- vst1q_u32(dst+4, vreinterpretq_u32_s32(wide_out2)); |
- |
- dst += 8; |
- fx += dx*8; |
- wide_fx = vaddq_s32(wide_fx, wide_dx8); |
- wide_fx2 = vaddq_s32(wide_fx2, wide_dx8); |
- count -= 8; |
- } |
- } |
-#endif |
- |
if (count & 1) |
{ |
SkASSERT((fx >> (16 + 14)) == 0); |
@@ -574,7 +496,17 @@ |
// clamp gets special version of filterOne |
fFilterOneX = SK_Fixed1; |
fFilterOneY = SK_Fixed1; |
+#if SK_ARM_NEON_IS_NONE |
return ClampX_ClampY_Procs[index]; |
+#elif SK_ARM_NEON_IS_ALWAYS |
+ return ClampX_ClampY_Procs_neon[index]; |
+#else // SK_ARM_NEON_IS_DYNAMIC |
+ if (sk_cpu_arm_has_neon()) { |
+ return ClampX_ClampY_Procs_neon[index]; |
+ } else { |
+ return ClampX_ClampY_Procs[index]; |
+ } |
+#endif |
} |
// all remaining procs use this form for filterOne |
@@ -584,9 +516,19 @@ |
if (SkShader::kRepeat_TileMode == fTileModeX && |
SkShader::kRepeat_TileMode == fTileModeY) |
{ |
+#if SK_ARM_NEON_IS_NONE |
return RepeatX_RepeatY_Procs[index]; |
+#elif SK_ARM_NEON_IS_ALWAYS |
+ return RepeatX_RepeatY_Procs_neon[index]; |
+#else // SK_ARM_NEON_IS_DYNAMIC |
+ if (sk_cpu_arm_has_neon()) { |
+ return RepeatX_RepeatY_Procs_neon[index]; |
+ } else { |
+ return RepeatX_RepeatY_Procs[index]; |
+ } |
+#endif |
} |
- |
+ |
fTileProcX = choose_tile_proc(fTileModeX); |
fTileProcY = choose_tile_proc(fTileModeY); |
fTileLowBitsProcX = choose_tile_lowbits_proc(fTileModeX); |