src/gui/painting/qdrawhelper_sse2.cpp
changeset 37 758a864f9613
parent 33 3e2da88830cd
equal deleted inserted replaced
36:ef0373b55136 37:758a864f9613
    41 
    41 
    42 #include <private/qdrawhelper_x86_p.h>
    42 #include <private/qdrawhelper_x86_p.h>
    43 
    43 
    44 #ifdef QT_HAVE_SSE2
    44 #ifdef QT_HAVE_SSE2
    45 
    45 
    46 #include <private/qsimd_p.h>
       
    47 #include <private/qdrawingprimitive_sse2_p.h>
    46 #include <private/qdrawingprimitive_sse2_p.h>
    48 #include <private/qpaintengine_raster_p.h>
    47 #include <private/qpaintengine_raster_p.h>
    49 
    48 
    50 QT_BEGIN_NAMESPACE
    49 QT_BEGIN_NAMESPACE
    51 
    50 
   110             const __m128i oneMinusConstAlpha =  _mm_set1_epi16(one_minus_const_alpha);
   109             const __m128i oneMinusConstAlpha =  _mm_set1_epi16(one_minus_const_alpha);
   111             for (int y = 0; y < h; ++y) {
   110             for (int y = 0; y < h; ++y) {
   112                 int x = 0;
   111                 int x = 0;
   113 
   112 
   114                 // First, align dest to 16 bytes:
   113                 // First, align dest to 16 bytes:
   115                 const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
   114                 ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
   116                 const int prologLength = qMin(w, offsetToAlignOn16Bytes);
       
   117                 for (; x < prologLength; ++x) {
       
   118                     quint32 s = src[x];
   115                     quint32 s = src[x];
   119                     s = BYTE_MUL(s, const_alpha);
   116                     s = BYTE_MUL(s, const_alpha);
   120                     dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
   117                     dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
   121                 }
   118                 }
   122 
   119 
   143     }
   140     }
   144 }
   141 }
   145 
   142 
   146 void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
   143 void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
   147 {
   144 {
   148     Q_ASSERT(const_alpha >= 0);
       
   149     Q_ASSERT(const_alpha < 256);
   145     Q_ASSERT(const_alpha < 256);
   150 
   146 
   151     const quint32 *src = (const quint32 *) srcPixels;
   147     const quint32 *src = (const quint32 *) srcPixels;
   152     quint32 *dst = (quint32 *) destPixels;
   148     quint32 *dst = (quint32 *) destPixels;
   153 
   149 
   159         const __m128i alphaMask = _mm_set1_epi32(0xff000000);
   155         const __m128i alphaMask = _mm_set1_epi32(0xff000000);
   160         BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask);
   156         BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask);
   161     } else {
   157     } else {
   162         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
   158         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
   163         BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector);
   159         BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector);
       
   160     }
       
   161 }
       
   162 
       
   163 void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha)
       
   164 {
           // "Plus" composition of `length` pixels from src into dst, in place.
           //   const_alpha == 255: the SSE2 loop stores the per-byte saturating sum
           //                       of src and dst (_mm_adds_epu8).
           //   otherwise:          the same sum is passed, together with the original
           //                       dst pixels, const_alpha and 255 - const_alpha, to
           //                       INTERPOLATE_PIXEL_255_SSE2 before being stored.
           // Both branches share one layout: scalar prologue, 4-pixels-per-iteration
           // SSE2 loop, scalar epilogue. `x` is the running pixel index across all
           // three phases.
       
   165     int x = 0;
       
   166 
       
   167     if (const_alpha == 255) {
       
   168         // 1) Prologue: align destination on 16 bytes
               // NOTE(review): ALIGNMENT_PROLOGUE_16BYTES is defined elsewhere; it
               // presumably expands to a loop header that advances x, so the next
               // statement is its body. Confirm against the macro definition.
       
   169         ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
       
   170             dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
       
   171 
       
   172         // 2) composition with SSE2
               // `x < length - 3` keeps the loop to iterations with four whole
               // pixels left; the remainder is handled by the epilogue.
       
   173         for (; x < length - 3; x += 4) {
                   // src uses the unaligned load; dst uses the aligned load/store,
                   // relying on the prologue above for its alignment.
       
   174             const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
       
   175             const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
       
   176 
       
   177             const __m128i result = _mm_adds_epu8(srcVector, dstVector);
       
   178             _mm_store_si128((__m128i *)&dst[x], result);
       
   179         }
       
   180 
       
   181         // 3) Epilogue:
               // Scalar pass over the pixels the 4-wide loop did not consume.
       
   182         for (; x < length; ++x)
       
   183             dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
       
   184     } else {
       
   185         const int one_minus_const_alpha = 255 - const_alpha;
       
   186         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
       
   187         const __m128i oneMinusConstAlpha =  _mm_set1_epi16(one_minus_const_alpha);
       
   188 
       
   189         // 1) Prologue: align destination on 16 bytes
               // Same prologue macro as the opaque branch, using the const-alpha
               // scalar helper for the leading pixels.
       
   190         ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
       
   191             dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
       
   192 
               // Constants consumed by INTERPOLATE_PIXEL_255_SSE2 below.
       
   193         const __m128i half = _mm_set1_epi16(0x80);
       
   194         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
       
   195         // 2) composition with SSE2
       
   196         for (; x < length - 3; x += 4) {
       
   197             const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
       
   198             const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
       
   199 
       
   200             __m128i result = _mm_adds_epu8(srcVector, dstVector);
                   // NOTE(review): the macro is invoked without a trailing ';' and
                   // `result` is both its first and second argument; presumably the
                   // first argument is the output. Confirm against the definition.
       
   201             INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
       
   202             _mm_store_si128((__m128i *)&dst[x], result);
       
   203         }
       
   204 
       
   205         // 3) Epilogue:
               // Scalar pass over the pixels the 4-wide loop did not consume.
       
   206         for (; x < length; ++x)
       
   207             dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
       
   208     }
       
   209 }
       
   210 
       
   211 void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha)
       
   212 {
           // "Source" composition of `length` pixels from src into dst.
           //   const_alpha == 255: dst is overwritten with src by a single memcpy.
           //   otherwise:          every pixel goes through the 255-based
           //                       interpolation helpers, with src paired with
           //                       const_alpha and dst paired with 255 - const_alpha.
       
   213     if (const_alpha == 255) {
       
   214         ::memcpy(dst, src, length * sizeof(uint));
       
   215     } else {
       
   216         const int ialpha = 255 - const_alpha;
       
   217 
       
   218         int x = 0;
       
   219 
       
   220         // 1) prologue, align on 16 bytes
               // NOTE(review): ALIGNMENT_PROLOGUE_16BYTES is defined elsewhere; it
               // presumably expands to a loop header that advances x, so the next
               // statement is its body. Confirm against the macro definition.
       
   221         ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
       
   222             dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
       
   223 
       
   224         // 2) interpolate pixels with SSE2
               // The four vectors below are the constant operands handed to
               // INTERPOLATE_PIXEL_255_SSE2 on every iteration.
       
   225         const __m128i half = _mm_set1_epi16(0x80);
       
   226         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
       
   227         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
       
   228         const __m128i oneMinusConstAlpha =  _mm_set1_epi16(ialpha);
               // Four pixels per iteration; `x < length - 3` leaves any remainder
               // to the epilogue.
       
   229         for (; x < length - 3; x += 4) {
                   // src uses the unaligned load; dst uses the aligned load/store,
                   // relying on the prologue above for its alignment.
       
   230             const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
       
   231             __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                   // NOTE(review): dstVector is passed as both first and third
                   // argument and then stored, so the macro presumably writes its
                   // result into the first argument. Confirm against the definition.
       
   232             INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
       
   233             _mm_store_si128((__m128i *)&dst[x], dstVector);
       
   234         }
       
   235 
       
   236         // 3) Epilogue
               // Scalar pass over the pixels the 4-wide loop did not consume.
       
   237         for (; x < length; ++x)
       
   238             dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
   164     }
   239     }
   165 }
   240 }
   166 
   241 
   167 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
   242 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
   168 {
   243 {
   233         }
   308         }
   234         for (;x < length; ++x)
   309         for (;x < length; ++x)
   235             destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
   310             destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
   236     }
   311     }
   237 }
   312 }
       
   313 
       
   314 CompositionFunctionSolid qt_functionForModeSolid_onlySSE2[numCompositionFunctions] = {
           // Table of solid-color composition functions, 33 initializers in total.
           // Only the first entry names an SSE2-specific function
           // (comp_func_solid_SourceOver_sse2); every other entry has no _sse2 suffix.
           // NOTE(review): the position of each entry presumably corresponds to a
           // composition-mode index used by callers, so the order must not change.
           // Confirm against the code that indexes this table.
       
   315     comp_func_solid_SourceOver_sse2,
       
   316     comp_func_solid_DestinationOver,
       
   317     comp_func_solid_Clear,
       
   318     comp_func_solid_Source,
       
   319     comp_func_solid_Destination,
       
   320     comp_func_solid_SourceIn,
       
   321     comp_func_solid_DestinationIn,
       
   322     comp_func_solid_SourceOut,
       
   323     comp_func_solid_DestinationOut,
       
   324     comp_func_solid_SourceAtop,
       
   325     comp_func_solid_DestinationAtop,
       
   326     comp_func_solid_XOR,
       
   327     comp_func_solid_Plus,
       
   328     comp_func_solid_Multiply,
       
   329     comp_func_solid_Screen,
       
   330     comp_func_solid_Overlay,
       
   331     comp_func_solid_Darken,
       
   332     comp_func_solid_Lighten,
       
   333     comp_func_solid_ColorDodge,
       
   334     comp_func_solid_ColorBurn,
       
   335     comp_func_solid_HardLight,
       
   336     comp_func_solid_SoftLight,
       
   337     comp_func_solid_Difference,
       
   338     comp_func_solid_Exclusion,
           // The remaining nine entries are the rasterop_solid_* functions.
       
   339     rasterop_solid_SourceOrDestination,
       
   340     rasterop_solid_SourceAndDestination,
       
   341     rasterop_solid_SourceXorDestination,
       
   342     rasterop_solid_NotSourceAndNotDestination,
       
   343     rasterop_solid_NotSourceOrNotDestination,
       
   344     rasterop_solid_NotSourceXorDestination,
       
   345     rasterop_solid_NotSource,
       
   346     rasterop_solid_NotSourceAndDestination,
       
   347     rasterop_solid_SourceAndNotDestination
       
   348 };
       
   349 
       
   350 CompositionFunction qt_functionForMode_onlySSE2[numCompositionFunctions] = {
           // Table of span composition functions, 33 initializers in total, listed
           // in the same operation order as qt_functionForModeSolid_onlySSE2.
           // Three entries name SSE2-specific functions: comp_func_SourceOver_sse2,
           // comp_func_Source_sse2 and comp_func_Plus_sse2. Every other entry has no
           // _sse2 suffix.
           // NOTE(review): the position of each entry presumably corresponds to a
           // composition-mode index used by callers, so the order must not change.
           // Confirm against the code that indexes this table.
       
   351     comp_func_SourceOver_sse2,
       
   352     comp_func_DestinationOver,
       
   353     comp_func_Clear,
       
   354     comp_func_Source_sse2,
       
   355     comp_func_Destination,
       
   356     comp_func_SourceIn,
       
   357     comp_func_DestinationIn,
       
   358     comp_func_SourceOut,
       
   359     comp_func_DestinationOut,
       
   360     comp_func_SourceAtop,
       
   361     comp_func_DestinationAtop,
       
   362     comp_func_XOR,
       
   363     comp_func_Plus_sse2,
       
   364     comp_func_Multiply,
       
   365     comp_func_Screen,
       
   366     comp_func_Overlay,
       
   367     comp_func_Darken,
       
   368     comp_func_Lighten,
       
   369     comp_func_ColorDodge,
       
   370     comp_func_ColorBurn,
       
   371     comp_func_HardLight,
       
   372     comp_func_SoftLight,
       
   373     comp_func_Difference,
       
   374     comp_func_Exclusion,
           // The remaining nine entries are the rasterop_* functions.
       
   375     rasterop_SourceOrDestination,
       
   376     rasterop_SourceAndDestination,
       
   377     rasterop_SourceXorDestination,
       
   378     rasterop_NotSourceAndNotDestination,
       
   379     rasterop_NotSourceOrNotDestination,
       
   380     rasterop_NotSourceXorDestination,
       
   381     rasterop_NotSource,
       
   382     rasterop_NotSourceAndDestination,
       
   383     rasterop_SourceAndNotDestination
       
   384 };
   238 
   385 
   239 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)
   386 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)
   240 {
   387 {
   241     if (count < 3) {
   388     if (count < 3) {
   242         switch (count) {
   389         switch (count) {