src/gui/painting/qdrawhelper_sse2.cpp
changeset 37 758a864f9613
parent 33 3e2da88830cd
--- a/src/gui/painting/qdrawhelper_sse2.cpp	Fri Sep 17 08:34:18 2010 +0300
+++ b/src/gui/painting/qdrawhelper_sse2.cpp	Mon Oct 04 01:19:32 2010 +0300
@@ -43,7 +43,6 @@
 
 #ifdef QT_HAVE_SSE2
 
-#include <private/qsimd_p.h>
 #include <private/qdrawingprimitive_sse2_p.h>
 #include <private/qpaintengine_raster_p.h>
 
@@ -112,9 +111,7 @@
                 int x = 0;
 
                 // First, align dest to 16 bytes:
-                const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
-                const int prologLength = qMin(w, offsetToAlignOn16Bytes);
-                for (; x < prologLength; ++x) {
+                ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) {
                     quint32 s = src[x];
                     s = BYTE_MUL(s, const_alpha);
                     dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
@@ -145,7 +142,6 @@
 
 void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
 {
-    Q_ASSERT(const_alpha >= 0);
     Q_ASSERT(const_alpha < 256);
 
     const quint32 *src = (const quint32 *) srcPixels;
@@ -164,6 +160,85 @@
     }
 }
 
+void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha)
+{
+    int x = 0; // pixel index, carried across the prologue, the SSE2 loop and the epilogue of either branch
+    // SSE2 variant of the Plus composition: combines `length` pixels of src into dst, writing the result to dst.
+    if (const_alpha == 255) {
+        // 1) Prologue: align destination on 16 bytes (the macro supplies the loop header; scalar helper per pixel)
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
+            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
+
+        // 2) composition with SSE2: four pixels per iteration, saturating unsigned add of every byte
+        for (; x < length - 3; x += 4) {
+            const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); // unaligned load: only dst was aligned above
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); // aligned load
+
+            const __m128i result = _mm_adds_epu8(srcVector, dstVector);
+            _mm_store_si128((__m128i *)&dst[x], result);
+        }
+
+        // 3) Epilogue: the pixels left over by the loop above (at most three), one at a time
+        for (; x < length; ++x)
+            dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]);
+    } else {
+        const int one_minus_const_alpha = 255 - const_alpha;
+        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); // const_alpha replicated into every 16-bit lane
+        const __m128i oneMinusConstAlpha =  _mm_set1_epi16(one_minus_const_alpha);
+
+        // 1) Prologue: align destination on 16 bytes (the macro supplies the loop header; scalar helper per pixel)
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
+            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
+
+        const __m128i half = _mm_set1_epi16(0x80); // NOTE(review): presumably the rounding term used by the macro below - confirm
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); // NOTE(review): presumably selects alternate bytes - confirm
+        // 2) composition with SSE2: saturating add as above, then the sum is interpolated with the original dst
+        for (; x < length - 3; x += 4) {
+            const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); // unaligned load: only dst was aligned above
+            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); // aligned load
+
+            __m128i result = _mm_adds_epu8(srcVector, dstVector);
+            INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
+            _mm_store_si128((__m128i *)&dst[x], result);
+        }
+
+        // 3) Epilogue: the pixels left over by the loop above (at most three), one at a time
+        for (; x < length; ++x)
+            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
+    }
+}
+
+void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha)
+{ // SSE2 variant of the Source composition: plain copy at const_alpha == 255, otherwise src and dst are interpolated.
+    if (const_alpha == 255) {
+        ::memcpy(dst, src, length * sizeof(uint)); // fast path: dst becomes a copy of src
+    } else {
+        const int ialpha = 255 - const_alpha; // weight passed alongside dst in the interpolation calls below
+
+        int x = 0; // pixel index, carried across the prologue, the SSE2 loop and the epilogue
+
+        // 1) prologue, align on 16 bytes (the macro supplies the loop header; scalar interpolation per pixel)
+        ALIGNMENT_PROLOGUE_16BYTES(dst, x, length)
+            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
+
+        // 2) interpolate pixels with SSE2, four pixels per iteration
+        const __m128i half = _mm_set1_epi16(0x80); // NOTE(review): presumably the rounding term used by the macro below - confirm
+        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); // NOTE(review): presumably selects alternate bytes - confirm
+        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); // const_alpha replicated into every 16-bit lane
+        const __m128i oneMinusConstAlpha =  _mm_set1_epi16(ialpha);
+        for (; x < length - 3; x += 4) {
+            const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); // unaligned load: only dst was aligned above
+            __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); // aligned load; also receives the macro's result
+            INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half)
+            _mm_store_si128((__m128i *)&dst[x], dstVector);
+        }
+
+        // 3) Epilogue: the pixels left over by the loop above (at most three), one at a time
+        for (; x < length; ++x)
+            dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha);
+    }
+}
+
 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
 {
     if (count < 7) {
@@ -236,6 +311,78 @@
     }
 }
 
+CompositionFunctionSolid qt_functionForModeSolid_onlySSE2[numCompositionFunctions] = { // table of solid-colour composition functions
+    comp_func_solid_SourceOver_sse2, // the only entry in this table that names an _sse2 variant
+    comp_func_solid_DestinationOver,
+    comp_func_solid_Clear,
+    comp_func_solid_Source,
+    comp_func_solid_Destination,
+    comp_func_solid_SourceIn,
+    comp_func_solid_DestinationIn,
+    comp_func_solid_SourceOut,
+    comp_func_solid_DestinationOut,
+    comp_func_solid_SourceAtop,
+    comp_func_solid_DestinationAtop,
+    comp_func_solid_XOR,
+    comp_func_solid_Plus,
+    comp_func_solid_Multiply,
+    comp_func_solid_Screen,
+    comp_func_solid_Overlay,
+    comp_func_solid_Darken,
+    comp_func_solid_Lighten,
+    comp_func_solid_ColorDodge,
+    comp_func_solid_ColorBurn,
+    comp_func_solid_HardLight,
+    comp_func_solid_SoftLight,
+    comp_func_solid_Difference,
+    comp_func_solid_Exclusion,
+    rasterop_solid_SourceOrDestination, // from here on the entries are the rasterop_solid_* functions
+    rasterop_solid_SourceAndDestination,
+    rasterop_solid_SourceXorDestination,
+    rasterop_solid_NotSourceAndNotDestination,
+    rasterop_solid_NotSourceOrNotDestination,
+    rasterop_solid_NotSourceXorDestination,
+    rasterop_solid_NotSource,
+    rasterop_solid_NotSourceAndDestination,
+    rasterop_solid_SourceAndNotDestination
+}; // NOTE(review): entry order presumably matches the composition-mode enum used to index this table - confirm
+
+CompositionFunction qt_functionForMode_onlySSE2[numCompositionFunctions] = { // table of span composition functions
+    comp_func_SourceOver_sse2, // _sse2 variant
+    comp_func_DestinationOver,
+    comp_func_Clear,
+    comp_func_Source_sse2, // _sse2 variant
+    comp_func_Destination,
+    comp_func_SourceIn,
+    comp_func_DestinationIn,
+    comp_func_SourceOut,
+    comp_func_DestinationOut,
+    comp_func_SourceAtop,
+    comp_func_DestinationAtop,
+    comp_func_XOR,
+    comp_func_Plus_sse2, // _sse2 variant
+    comp_func_Multiply,
+    comp_func_Screen,
+    comp_func_Overlay,
+    comp_func_Darken,
+    comp_func_Lighten,
+    comp_func_ColorDodge,
+    comp_func_ColorBurn,
+    comp_func_HardLight,
+    comp_func_SoftLight,
+    comp_func_Difference,
+    comp_func_Exclusion,
+    rasterop_SourceOrDestination, // from here on the entries are the rasterop_* functions
+    rasterop_SourceAndDestination,
+    rasterop_SourceXorDestination,
+    rasterop_NotSourceAndNotDestination,
+    rasterop_NotSourceOrNotDestination,
+    rasterop_NotSourceXorDestination,
+    rasterop_NotSource,
+    rasterop_NotSourceAndDestination,
+    rasterop_SourceAndNotDestination
+}; // NOTE(review): entry order presumably matches the composition-mode enum used to index this table - confirm
+
 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)
 {
     if (count < 3) {