src/gui/painting/qdrawhelper.cpp
changeset 37 758a864f9613
parent 33 3e2da88830cd
--- a/src/gui/painting/qdrawhelper.cpp	Fri Sep 17 08:34:18 2010 +0300
+++ b/src/gui/painting/qdrawhelper.cpp	Mon Oct 04 01:19:32 2010 +0300
@@ -46,7 +46,6 @@
 #include <private/qdrawhelper_armv6_p.h>
 #include <private/qdrawhelper_neon_p.h>
 #include <private/qmath_p.h>
-#include <private/qsimd_p.h>
 #include <qmath.h>
 
 QT_BEGIN_NAMESPACE
@@ -656,6 +655,46 @@
     return buffer;
 }
 
+/** \internal
+  Interpolates 4 argb pixels (tl, tr, bl, br) with the distx and disty factors; idistx and idisty are the complementary weights.
+  distx and disty must be between 0 and 16; the callers in this file pass idistx = 16 - distx and idisty = 16 - disty, so the four weights sum to 256.
+ */
+static inline uint interpolate_4_pixels_16(uint tl, uint tr, uint bl, uint br, int distx, int disty, int idistx, int idisty)
+{
+    uint tlrb = ((tl & 0x00ff00ff) * idistx * idisty); // red/blue bytes of tl, weighted
+    uint tlag = (((tl & 0xff00ff00) >> 8) * idistx * idisty); // alpha/green bytes of tl, moved down to 0x00AA00GG before weighting
+    uint trrb = ((tr & 0x00ff00ff) * distx * idisty);
+    uint trag = (((tr & 0xff00ff00) >> 8) * distx * idisty);
+    uint blrb = ((bl & 0x00ff00ff) * idistx * disty);
+    uint blag = (((bl & 0xff00ff00) >> 8) * idistx * disty);
+    uint brrb = ((br & 0x00ff00ff) * distx * disty);
+    uint brag = (((br & 0xff00ff00) >> 8) * distx * disty);
+    return (((tlrb + trrb + blrb + brrb) >> 8) & 0x00ff00ff) | ((tlag + trag + blag + brag) & 0xff00ff00); // with weights summing to 256, red/blue needs >> 8 while alpha/green already sits in the upper byte of each 16-bit half
+}
+
+// Computes the two sample coordinates v1/v2 along one axis: wraps them modulo max for the tiled blend type, otherwise clamps them to [l1, l2].
+template<TextureBlendType blendType>
+Q_STATIC_TEMPLATE_FUNCTION inline void fetchTransformedBilinear_pixelBounds(int max, int l1, int l2, int &v1, int &v2)
+{
+    if (blendType == BlendTransformedBilinearTiled) {
+        v1 %= max; // the remainder can still be negative when v1 was negative
+        if (v1 < 0) v1 += max;
+        v2 = v1 + 1;
+        v2 %= max; // the neighbour of the last coordinate wraps around to 0
+    } else {
+        if (v1 < l1) {
+            v2 = v1 = l1;
+        } else if (v1 >= l2) { // l2 is inclusive here: both coordinates collapse onto it
+            v2 = v1 = l2;
+        } else {
+            v2 = v1 + 1;
+        }
+    }
+
+    Q_ASSERT(v1 >= 0 && v1 < max);
+    Q_ASSERT(v2 >= 0 && v2 < max);
+}
+
 template<TextureBlendType blendType, QImage::Format format> /* blendType = BlendTransformedBilinear or BlendTransformedBilinearTiled */
 Q_STATIC_TEMPLATE_FUNCTION
 const uint * QT_FASTCALL fetchTransformedBilinear(uint *buffer, const Operator *, const QSpanData *data,
@@ -676,8 +715,8 @@
 
     int image_x1 = data->texture.x1;
     int image_y1 = data->texture.y1;
-    int image_x2 = data->texture.x2;
-    int image_y2 = data->texture.y2;
+    int image_x2 = data->texture.x2 - 1;
+    int image_y2 = data->texture.y2 - 1;
 
     const qreal cx = x + 0.5;
     const qreal cy = y + 0.5;
@@ -696,64 +735,230 @@
 
         fx -= half_point;
         fy -= half_point;
-        while (b < end) {
-            int x1 = (fx >> 16);
-            int x2;
+
+        if (fdy == 0) { //simple scale, no rotation
             int y1 = (fy >> 16);
             int y2;
-
-            if (blendType == BlendTransformedBilinearTiled) {
-                x1 %= image_width;
-                if (x1 < 0) x1 += image_width;
-                x2 = x1 + 1;
-                x2 %= image_width;
-
-                y1 %= image_height;
-                if (y1 < 0) y1 += image_height;
-                y2 = y1 + 1;
-                y2 %= image_height;
-            } else {
-                if (x1 < image_x1) {
-                    x2 = x1 = image_x1;
-                } else if (x1 >= image_x2 - 1) {
-                    x2 = x1 = image_x2 - 1;
+            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+            const uchar *s1 = data->texture.scanLine(y1);
+            const uchar *s2 = data->texture.scanLine(y2);
+
+            if (fdx <= fixed_scale && fdx > 0) { // scale up on X
+                int disty = (fy & 0x0000ffff) >> 8;
+                int idisty = 256 - disty;
+                int x = fx >> 16;
+
+                // The idea is first to do the interpolation between the row s1 and the row s2
+                // into an intermediate buffer, then we interpolate between two pixels of this buffer.
+
+                // intermediate_buffer[0] is a buffer of red-blue component of the pixel, in the form 0x00RR00BB
+                // intermediate_buffer[1] is the alpha-green component of the pixel, in the form 0x00AA00GG
+                quint32 intermediate_buffer[2][buffer_size + 2];
+                // count is the size used in the intermediate_buffer.
+                int count = qCeil(length * data->m11) + 2; //+1 for the last pixel to interpolate with, and +1 for rounding errors.
+                Q_ASSERT(count <= buffer_size + 2); //length is supposed to be <= buffer_size and data->m11 < 1 in this case
+                int f = 0;
+                int lim = count;
+                if (blendType == BlendTransformedBilinearTiled) {
+                    x %= image_width;
+                    if (x < 0) x += image_width;
                 } else {
-                    x2 = x1 + 1;
+                    lim = qMin(count, image_x2-x+1);
+                    if (x < image_x1) {
+                        Q_ASSERT(x <= image_x2);
+                        uint t = fetch(s1, image_x1, data->texture.colorTable);
+                        uint b = fetch(s2, image_x1, data->texture.colorTable);
+                        quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                        quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                        do {
+                            intermediate_buffer[0][f] = rb;
+                            intermediate_buffer[1][f] = ag;
+                            f++;
+                            x++;
+                        } while (x < image_x1 && f < lim);
+                    }
+                }
+
+#if defined(QT_ALWAYS_HAVE_SSE2)
+                if (blendType != BlendTransformedBilinearTiled &&
+                        (format == QImage::Format_ARGB32_Premultiplied || format == QImage::Format_RGB32)) {
+
+                    const __m128i disty_ = _mm_set1_epi16(disty);
+                    const __m128i idisty_ = _mm_set1_epi16(idisty);
+                    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
+
+                    lim -= 3;
+                    for (; f < lim; x += 4, f += 4) {
+                        // Load 4 pixels from s1, and split the alpha-green and red-blue component
+                        __m128i top = _mm_loadu_si128((__m128i*)((const uint *)(s1)+x));
+                        __m128i topAG = _mm_srli_epi16(top, 8);
+                        __m128i topRB = _mm_and_si128(top, colorMask);
+                        // Multiplies each colour component by idisty
+                        topAG = _mm_mullo_epi16 (topAG, idisty_);
+                        topRB = _mm_mullo_epi16 (topRB, idisty_);
+
+                        // Same for the s2 vector
+                        __m128i bottom = _mm_loadu_si128((__m128i*)((const uint *)(s2)+x));
+                        __m128i bottomAG = _mm_srli_epi16(bottom, 8);
+                        __m128i bottomRB = _mm_and_si128(bottom, colorMask);
+                        bottomAG = _mm_mullo_epi16 (bottomAG, disty_);
+                        bottomRB = _mm_mullo_epi16 (bottomRB, disty_);
+
+                        // Add the values, and shift to only keep 8 significant bits per color
+                        __m128i rAG =_mm_add_epi16(topAG, bottomAG);
+                        rAG = _mm_srli_epi16(rAG, 8);
+                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[1][f]), rAG);
+                        __m128i rRB =_mm_add_epi16(topRB, bottomRB);
+                        rRB = _mm_srli_epi16(rRB, 8);
+                        _mm_storeu_si128((__m128i*)(&intermediate_buffer[0][f]), rRB);
+                    }
+                }
+#endif
+                for (; f < count; f++) { // Same as above but without sse2
+                    if (blendType == BlendTransformedBilinearTiled) {
+                        if (x >= image_width) x -= image_width;
+                    } else {
+                        x = qMin(x, image_x2);
+                    }
+
+                    uint t = fetch(s1, x, data->texture.colorTable);
+                    uint b = fetch(s2, x, data->texture.colorTable);
+
+                    intermediate_buffer[0][f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                    intermediate_buffer[1][f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
+                    x++;
                 }
-                if (y1 < image_y1) {
-                    y2 = y1 = image_y1;
-                } else if (y1 >= image_y2 - 1) {
-                    y2 = y1 = image_y2 - 1;
-                } else {
-                    y2 = y1 + 1;
+                // Now interpolate the values from the intermediate_buffer to get the final result.
+                fx &= fixed_scale - 1;
+                Q_ASSERT((fx >> 16) == 0);
+                while (b < end) {
+                    register int x1 = (fx >> 16);
+                    register int x2 = x1 + 1;
+                    Q_ASSERT(x1 >= 0);
+                    Q_ASSERT(x2 < count);
+
+                    register int distx = (fx & 0x0000ffff) >> 8;
+                    register int idistx = 256 - distx;
+                    int rb = ((intermediate_buffer[0][x1] * idistx + intermediate_buffer[0][x2] * distx) >> 8) & 0xff00ff;
+                    int ag = (intermediate_buffer[1][x1] * idistx + intermediate_buffer[1][x2] * distx) & 0xff00ff00;
+                    *b = rb | ag;
+                    b++;
+                    fx += fdx;
+                }
+            } else if ((fdx < 0 && fdx > -(fixed_scale / 8)) || fabs(data->m22) < (1./8.)) { // scale up more than 8x
+                int y1 = (fy >> 16);
+                int y2;
+                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+                const uchar *s1 = data->texture.scanLine(y1);
+                const uchar *s2 = data->texture.scanLine(y2);
+                int disty = (fy & 0x0000ffff) >> 8;
+                int idisty = 256 - disty;
+                while (b < end) {
+                    int x1 = (fx >> 16);
+                    int x2;
+                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
+                    uint tl = fetch(s1, x1, data->texture.colorTable);
+                    uint tr = fetch(s1, x2, data->texture.colorTable);
+                    uint bl = fetch(s2, x1, data->texture.colorTable);
+                    uint br = fetch(s2, x2, data->texture.colorTable);
+
+                    int distx = (fx & 0x0000ffff) >> 8;
+                    int idistx = 256 - distx;
+
+                    uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
+                    uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
+                    *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+
+                    fx += fdx;
+                    ++b;
+                }
+            } else { //scale down
+                int y1 = (fy >> 16);
+                int y2;
+                fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+                const uchar *s1 = data->texture.scanLine(y1);
+                const uchar *s2 = data->texture.scanLine(y2);
+                int disty = (fy & 0x0000ffff) >> 12;
+                int idisty = 16 - disty;
+                while (b < end) {
+                    int x1 = (fx >> 16);
+                    int x2;
+                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
+                    uint tl = fetch(s1, x1, data->texture.colorTable);
+                    uint tr = fetch(s1, x2, data->texture.colorTable);
+                    uint bl = fetch(s2, x1, data->texture.colorTable);
+                    uint br = fetch(s2, x2, data->texture.colorTable);
+                    int distx = (fx & 0x0000ffff) >> 12;
+                    int idistx = 16 - distx;
+                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty, idistx, idisty);
+                    fx += fdx;
+                    ++b;
                 }
             }
-
-            Q_ASSERT(x1 >= 0 && x1 < image_width);
-            Q_ASSERT(x2 >= 0 && x2 < image_width);
-            Q_ASSERT(y1 >= 0 && y1 < image_height);
-            Q_ASSERT(y2 >= 0 && y2 < image_height);
-
-            const uchar *s1 = data->texture.scanLine(y1);
-            const uchar *s2 = data->texture.scanLine(y2);
-
-            uint tl = fetch(s1, x1, data->texture.colorTable);
-            uint tr = fetch(s1, x2, data->texture.colorTable);
-            uint bl = fetch(s2, x1, data->texture.colorTable);
-            uint br = fetch(s2, x2, data->texture.colorTable);
-
-            int distx = (fx & 0x0000ffff) >> 8;
-            int disty = (fy & 0x0000ffff) >> 8;
-            int idistx = 256 - distx;
-            int idisty = 256 - disty;
-
-            uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
-            uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
-            *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
-
-            fx += fdx;
-            fy += fdy;
-            ++b;
+        } else { //rotation
+            if (fabs(data->m11) > 8 || fabs(data->m22) > 8) {
+                //if we are zooming more than 8 times, we use 8bit precision for the position.
+                while (b < end) {
+                    int x1 = (fx >> 16);
+                    int x2;
+                    int y1 = (fy >> 16);
+                    int y2;
+
+                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
+                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+
+                    const uchar *s1 = data->texture.scanLine(y1);
+                    const uchar *s2 = data->texture.scanLine(y2);
+
+                    uint tl = fetch(s1, x1, data->texture.colorTable);
+                    uint tr = fetch(s1, x2, data->texture.colorTable);
+                    uint bl = fetch(s2, x1, data->texture.colorTable);
+                    uint br = fetch(s2, x2, data->texture.colorTable);
+
+                    int distx = (fx & 0x0000ffff) >> 8;
+                    int disty = (fy & 0x0000ffff) >> 8;
+                    int idistx = 256 - distx;
+                    int idisty = 256 - disty;
+
+                    uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
+                    uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
+                    *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
+
+                    fx += fdx;
+                    fy += fdy;
+                    ++b;
+                }
+            } else {
+                //we are zooming less than 8x, use 4bit precision
+                while (b < end) {
+                    int x1 = (fx >> 16);
+                    int x2;
+                    int y1 = (fy >> 16);
+                    int y2;
+
+                    fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
+                    fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
+
+                    const uchar *s1 = data->texture.scanLine(y1);
+                    const uchar *s2 = data->texture.scanLine(y2);
+
+                    uint tl = fetch(s1, x1, data->texture.colorTable);
+                    uint tr = fetch(s1, x2, data->texture.colorTable);
+                    uint bl = fetch(s2, x1, data->texture.colorTable);
+                    uint br = fetch(s2, x2, data->texture.colorTable);
+
+                    int distx = (fx & 0x0000ffff) >> 12;
+                    int disty = (fy & 0x0000ffff) >> 12;
+                    int idistx = 16 - distx;
+                    int idisty = 16 - disty;
+
+                    *b = interpolate_4_pixels_16(tl, tr, bl, br, distx, disty, idistx, idisty);
+
+                    fx += fdx;
+                    fy += fdy;
+                    ++b;
+                }
+            }
         }
     } else {
         const qreal fdx = data->m11;
@@ -779,37 +984,8 @@
             int idistx = 256 - distx;
             int idisty = 256 - disty;
 
-            if (blendType == BlendTransformedBilinearTiled) {
-                x1 %= image_width;
-                if (x1 < 0) x1 += image_width;
-                x2 = x1 + 1;
-                x2 %= image_width;
-
-                y1 %= image_height;
-                if (y1 < 0) y1 += image_height;
-                y2 = y1 + 1;
-                y2 %= image_height;
-            } else {
-                if (x1 < 0) {
-                    x2 = x1 = 0;
-                } else if (x1 >= image_width - 1) {
-                    x2 = x1 = image_width - 1;
-                } else {
-                    x2 = x1 + 1;
-                }
-                if (y1 < 0) {
-                    y2 = y1 = 0;
-                } else if (y1 >= image_height - 1) {
-                    y2 = y1 = image_height - 1;
-                } else {
-                    y2 = y1 + 1;
-                }
-            }
-
-            Q_ASSERT(x1 >= 0 && x1 < image_width);
-            Q_ASSERT(x2 >= 0 && x2 < image_width);
-            Q_ASSERT(y1 >= 0 && y1 < image_height);
-            Q_ASSERT(y2 >= 0 && y2 < image_height);
+            fetchTransformedBilinear_pixelBounds<blendType>(image_width, image_x1, image_x2, x1, x2);
+            fetchTransformedBilinear_pixelBounds<blendType>(image_height, image_y1, image_y2, y1, y2);
 
             const uchar *s1 = data->texture.scanLine(y1);
             const uchar *s2 = data->texture.scanLine(y2);
@@ -1309,12 +1485,12 @@
     }\
 }
 
-static void QT_FASTCALL comp_func_solid_Clear(uint *dest, int length, uint, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Clear(uint *dest, int length, uint, uint const_alpha) // solid-color variant: the unnamed color argument is unused, the work is forwarded to comp_func_Clear_impl
 {
     comp_func_Clear_impl(dest, length, const_alpha);
 }
 
-static void QT_FASTCALL comp_func_Clear(uint *dest, const uint *, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Clear(uint *dest, const uint *, int length, uint const_alpha) // the unnamed source pointer is unused, the work is forwarded to comp_func_Clear_impl
 {
     comp_func_Clear_impl(dest, length, const_alpha);
 }
@@ -1323,7 +1499,7 @@
   result = s
   dest = s * ca + d * cia
 */
-static void QT_FASTCALL comp_func_solid_Source(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Source(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255) {
         QT_MEMFILL_UINT(dest, length, color);
@@ -1338,7 +1514,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Source(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Source(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255) {
         ::memcpy(dest, src, length * sizeof(uint));
@@ -1352,11 +1528,11 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Destination(uint *, int, uint, uint)
-{
-}
-
-static void QT_FASTCALL comp_func_Destination(uint *, const uint *, int, uint)
+void QT_FASTCALL comp_func_solid_Destination(uint *, int, uint, uint)
+{ // empty body: every argument is ignored and nothing is written
+}
+
+void QT_FASTCALL comp_func_Destination(uint *, const uint *, int, uint) // empty body: every argument is ignored and nothing is written
 {
 }
 
@@ -1366,7 +1542,7 @@
        = s * ca + d * (sia * ca + cia)
        = s * ca + d * (1 - sa*ca)
 */
-static void QT_FASTCALL comp_func_solid_SourceOver(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_SourceOver(uint *dest, int length, uint color, uint const_alpha)
 {
     if ((const_alpha & qAlpha(color)) == 255) {
         QT_MEMFILL_UINT(dest, length, color);
@@ -1381,7 +1557,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_SourceOver(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_SourceOver(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1407,7 +1583,7 @@
   dest = (d + s * dia) * ca + d * cia
        = d + s * dia * ca
 */
-static void QT_FASTCALL comp_func_solid_DestinationOver(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_DestinationOver(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha != 255)
         color = BYTE_MUL(color, const_alpha);
@@ -1419,7 +1595,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_DestinationOver(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_DestinationOver(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1442,7 +1618,7 @@
   result = s * da
   dest = s * da * ca + d * cia
 */
-static void QT_FASTCALL comp_func_solid_SourceIn(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_SourceIn(uint *dest, int length, uint color, uint const_alpha)
 {
     PRELOAD_INIT(dest)
     if (const_alpha == 255) {
@@ -1461,7 +1637,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_SourceIn(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_SourceIn(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1485,7 +1661,7 @@
   dest = d * sa * ca + d * cia
        = d * (sa * ca + cia)
 */
-static void QT_FASTCALL comp_func_solid_DestinationIn(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_DestinationIn(uint *dest, int length, uint color, uint const_alpha)
 {
     uint a = qAlpha(color);
     if (const_alpha != 255) {
@@ -1498,7 +1674,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_DestinationIn(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_DestinationIn(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1521,7 +1697,7 @@
   dest = s * dia * ca + d * cia
 */
 
-static void QT_FASTCALL comp_func_solid_SourceOut(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_SourceOut(uint *dest, int length, uint color, uint const_alpha)
 {
     PRELOAD_INIT(dest)
     if (const_alpha == 255) {
@@ -1540,7 +1716,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_SourceOut(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_SourceOut(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1564,7 +1740,7 @@
   dest = d * sia * ca + d * cia
        = d * (sia * ca + cia)
 */
-static void QT_FASTCALL comp_func_solid_DestinationOut(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_DestinationOut(uint *dest, int length, uint color, uint const_alpha)
 {
     uint a = qAlpha(~color);
     if (const_alpha != 255)
@@ -1576,7 +1752,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_DestinationOut(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_DestinationOut(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1600,7 +1776,7 @@
        = s*ca * da + d * (sia*ca + cia)
        = s*ca * da + d * (1 - sa*ca)
 */
-static void QT_FASTCALL comp_func_solid_SourceAtop(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_SourceAtop(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha != 255) {
         color = BYTE_MUL(color, const_alpha);
@@ -1613,7 +1789,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_SourceAtop(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_SourceAtop(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1638,7 +1814,7 @@
   dest = d*sa*ca + s*dia*ca + d *cia
        = s*ca * dia + d * (sa*ca + cia)
 */
-static void QT_FASTCALL comp_func_solid_DestinationAtop(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_DestinationAtop(uint *dest, int length, uint color, uint const_alpha)
 {
     uint a = qAlpha(color);
     if (const_alpha != 255) {
@@ -1653,7 +1829,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_DestinationAtop(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_DestinationAtop(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1681,7 +1857,7 @@
        = s*ca * dia + d * (sia*ca + cia)
        = s*ca * dia + d * (1 - sa*ca)
 */
-static void QT_FASTCALL comp_func_solid_XOR(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_XOR(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha != 255)
         color = BYTE_MUL(color, const_alpha);
@@ -1695,7 +1871,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_XOR(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_XOR(uint *dest, const uint *src, int length, uint const_alpha)
 {
     PRELOAD_INIT2(dest, src)
     if (const_alpha == 255) {
@@ -1715,11 +1891,6 @@
     }
 }
 
-static const uint AMASK = 0xff000000;
-static const uint RMASK = 0x00ff0000;
-static const uint GMASK = 0x0000ff00;
-static const uint BMASK = 0x000000ff;
-
 struct QFullCoverage {
     inline void store(uint *dest, const uint src) const
     {
@@ -1762,14 +1933,12 @@
     for (int i = 0; i < length; ++i) {
         PRELOAD_COND(dest)
         uint d = dest[i];
-#define MIX(mask) (qMin(((qint64(s)&mask) + (qint64(d)&mask)), qint64(mask)))
-        d = (MIX(AMASK) | MIX(RMASK) | MIX(GMASK) | MIX(BMASK));
-#undef MIX
+        d = comp_func_Plus_one_pixel(d, s);
         coverage.store(&dest[i], d);
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Plus(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Plus(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Plus_impl(dest, length, color, QFullCoverage());
@@ -1786,15 +1955,13 @@
         uint d = dest[i];
         uint s = src[i];
 
-#define MIX(mask) (qMin(((qint64(s)&mask) + (qint64(d)&mask)), qint64(mask)))
-        d = (MIX(AMASK) | MIX(RMASK) | MIX(GMASK) | MIX(BMASK));
-#undef MIX
+        d = comp_func_Plus_one_pixel(d, s);
 
         coverage.store(&dest[i], d);
     }
 }
 
-static void QT_FASTCALL comp_func_Plus(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Plus(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Plus_impl(dest, src, length, QFullCoverage());
@@ -1835,7 +2002,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Multiply(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Multiply(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Multiply_impl(dest, length, color, QFullCoverage());
@@ -1866,7 +2033,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Multiply(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Multiply(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Multiply_impl(dest, src, length, QFullCoverage());
@@ -1903,7 +2070,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Screen(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Screen(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Screen_impl(dest, length, color, QFullCoverage());
@@ -1934,7 +2101,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Screen(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Screen(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Screen_impl(dest, src, length, QFullCoverage());
@@ -1982,7 +2149,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Overlay(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Overlay(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Overlay_impl(dest, length, color, QFullCoverage());
@@ -2013,7 +2180,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Overlay(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Overlay(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Overlay_impl(dest, src, length, QFullCoverage());
@@ -2055,7 +2222,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Darken(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Darken(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Darken_impl(dest, length, color, QFullCoverage());
@@ -2086,7 +2253,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Darken(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Darken(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Darken_impl(dest, src, length, QFullCoverage());
@@ -2128,7 +2295,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Lighten(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Lighten(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Lighten_impl(dest, length, color, QFullCoverage());
@@ -2159,7 +2326,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Lighten(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Lighten(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Lighten_impl(dest, src, length, QFullCoverage());
@@ -2211,7 +2378,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_ColorDodge(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_ColorDodge(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_ColorDodge_impl(dest, length, color, QFullCoverage());
@@ -2242,7 +2409,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_ColorDodge(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_ColorDodge(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_ColorDodge_impl(dest, src, length, QFullCoverage());
@@ -2294,7 +2461,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_ColorBurn(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_ColorBurn(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_ColorBurn_impl(dest, length, color, QFullCoverage());
@@ -2325,7 +2492,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_ColorBurn(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_ColorBurn(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_ColorBurn_impl(dest, src, length, QFullCoverage());
@@ -2374,7 +2541,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_HardLight(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_HardLight(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_HardLight_impl(dest, length, color, QFullCoverage());
@@ -2405,7 +2572,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_HardLight(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_HardLight(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_HardLight_impl(dest, src, length, QFullCoverage());
@@ -2465,7 +2632,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_SoftLight(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_SoftLight(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_SoftLight_impl(dest, length, color, QFullCoverage());
@@ -2496,7 +2663,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_SoftLight(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_SoftLight(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_SoftLight_impl(dest, src, length, QFullCoverage());
@@ -2538,7 +2705,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Difference(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Difference(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Difference_impl(dest, length, color, QFullCoverage());
@@ -2569,7 +2736,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Difference(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Difference(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Difference_impl(dest, src, length, QFullCoverage());
@@ -2605,7 +2772,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_solid_Exclusion(uint *dest, int length, uint color, uint const_alpha)
+void QT_FASTCALL comp_func_solid_Exclusion(uint *dest, int length, uint color, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_solid_Exclusion_impl(dest, length, color, QFullCoverage());
@@ -2636,7 +2803,7 @@
     }
 }
 
-static void QT_FASTCALL comp_func_Exclusion(uint *dest, const uint *src, int length, uint const_alpha)
+void QT_FASTCALL comp_func_Exclusion(uint *dest, const uint *src, int length, uint const_alpha)
 {
     if (const_alpha == 255)
         comp_func_Exclusion_impl(dest, src, length, QFullCoverage());
@@ -2649,30 +2816,30 @@
 #  pragma pop
 #endif
 
-static void QT_FASTCALL rasterop_solid_SourceOrDestination(uint *dest,
-                                                           int length,
-                                                           uint color,
-                                                           uint const_alpha)
+void QT_FASTCALL rasterop_solid_SourceOrDestination(uint *dest,
+                                                    int length,
+                                                    uint color,
+                                                    uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--)
         *dest++ |= color;
 }
 
-static void QT_FASTCALL rasterop_SourceOrDestination(uint *dest,
-                                                     const uint *src,
-                                                     int length,
-                                                     uint const_alpha)
+void QT_FASTCALL rasterop_SourceOrDestination(uint *dest,
+                                              const uint *src,
+                                              int length,
+                                              uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--)
         *dest++ |= *src++;
 }
 
-static void QT_FASTCALL rasterop_solid_SourceAndDestination(uint *dest,
-                                                            int length,
-                                                            uint color,
-                                                            uint const_alpha)
+void QT_FASTCALL rasterop_solid_SourceAndDestination(uint *dest,
+                                                     int length,
+                                                     uint color,
+                                                     uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     color |= 0xff000000;
@@ -2680,10 +2847,10 @@
         *dest++ &= color;
 }
 
-static void QT_FASTCALL rasterop_SourceAndDestination(uint *dest,
-                                                      const uint *src,
-                                                      int length,
-                                                      uint const_alpha)
+void QT_FASTCALL rasterop_SourceAndDestination(uint *dest,
+                                               const uint *src,
+                                               int length,
+                                               uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -2692,10 +2859,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_solid_SourceXorDestination(uint *dest,
-                                                            int length,
-                                                            uint color,
-                                                            uint const_alpha)
+void QT_FASTCALL rasterop_solid_SourceXorDestination(uint *dest,
+                                                     int length,
+                                                     uint color,
+                                                     uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     color &= 0x00ffffff;
@@ -2703,10 +2870,10 @@
         *dest++ ^= color;
 }
 
-static void QT_FASTCALL rasterop_SourceXorDestination(uint *dest,
-                                                      const uint *src,
-                                                      int length,
-                                                      uint const_alpha)
+void QT_FASTCALL rasterop_SourceXorDestination(uint *dest,
+                                               const uint *src,
+                                               int length,
+                                               uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -2715,10 +2882,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_solid_NotSourceAndNotDestination(uint *dest,
-                                                                  int length,
-                                                                  uint color,
-                                                                  uint const_alpha)
+void QT_FASTCALL rasterop_solid_NotSourceAndNotDestination(uint *dest,
+                                                           int length,
+                                                           uint color,
+                                                           uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     color = ~color;
@@ -2728,10 +2895,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_NotSourceAndNotDestination(uint *dest,
-                                                            const uint *src,
-                                                            int length,
-                                                            uint const_alpha)
+void QT_FASTCALL rasterop_NotSourceAndNotDestination(uint *dest,
+                                                     const uint *src,
+                                                     int length,
+                                                     uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -2740,10 +2907,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_solid_NotSourceOrNotDestination(uint *dest,
-                                                                 int length,
-                                                                 uint color,
-                                                                 uint const_alpha)
+void QT_FASTCALL rasterop_solid_NotSourceOrNotDestination(uint *dest,
+                                                          int length,
+                                                          uint color,
+                                                          uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     color = ~color | 0xff000000;
@@ -2753,10 +2920,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_NotSourceOrNotDestination(uint *dest,
-                                                           const uint *src,
-                                                           int length,
-                                                           uint const_alpha)
+void QT_FASTCALL rasterop_NotSourceOrNotDestination(uint *dest,
+                                                    const uint *src,
+                                                    int length,
+                                                    uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -2765,10 +2932,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_solid_NotSourceXorDestination(uint *dest,
-                                                               int length,
-                                                               uint color,
-                                                               uint const_alpha)
+void QT_FASTCALL rasterop_solid_NotSourceXorDestination(uint *dest,
+                                                        int length,
+                                                        uint color,
+                                                        uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     color = ~color & 0x00ffffff;
@@ -2778,10 +2945,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_NotSourceXorDestination(uint *dest,
-                                                         const uint *src,
-                                                         int length,
-                                                         uint const_alpha)
+void QT_FASTCALL rasterop_NotSourceXorDestination(uint *dest,
+                                                  const uint *src,
+                                                  int length,
+                                                  uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -2790,25 +2957,25 @@
     }
 }
 
-static void QT_FASTCALL rasterop_solid_NotSource(uint *dest, int length,
-                                                 uint color, uint const_alpha)
+void QT_FASTCALL rasterop_solid_NotSource(uint *dest, int length,
+                                          uint color, uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     qt_memfill(dest, ~color | 0xff000000, length);
 }
 
-static void QT_FASTCALL rasterop_NotSource(uint *dest, const uint *src,
-                                           int length, uint const_alpha)
+void QT_FASTCALL rasterop_NotSource(uint *dest, const uint *src,
+                                    int length, uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--)
         *dest++ = ~(*src++) | 0xff000000;
 }
 
-static void QT_FASTCALL rasterop_solid_NotSourceAndDestination(uint *dest,
-                                                               int length,
-                                                               uint color,
-                                                               uint const_alpha)
+void QT_FASTCALL rasterop_solid_NotSourceAndDestination(uint *dest,
+                                                        int length,
+                                                        uint color,
+                                                        uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     color = ~color | 0xff000000;
@@ -2818,10 +2985,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_NotSourceAndDestination(uint *dest,
-                                                         const uint *src,
-                                                         int length,
-                                                         uint const_alpha)
+void QT_FASTCALL rasterop_NotSourceAndDestination(uint *dest,
+                                                  const uint *src,
+                                                  int length,
+                                                  uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -2830,10 +2997,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_solid_SourceAndNotDestination(uint *dest,
-                                                               int length,
-                                                               uint color,
-                                                               uint const_alpha)
+void QT_FASTCALL rasterop_solid_SourceAndNotDestination(uint *dest,
+                                                        int length,
+                                                        uint color,
+                                                        uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -2842,10 +3009,10 @@
     }
 }
 
-static void QT_FASTCALL rasterop_SourceAndNotDestination(uint *dest,
-                                                         const uint *src,
-                                                         int length,
-                                                         uint const_alpha)
+void QT_FASTCALL rasterop_SourceAndNotDestination(uint *dest,
+                                                  const uint *src,
+                                                  int length,
+                                                  uint const_alpha)
 {
     Q_UNUSED(const_alpha);
     while (length--) {
@@ -5168,239 +5335,6 @@
         blend_tiled_generic<RegularSpans>(count, spans, userData);
 }
 
-
-template <SpanMethod spanMethod, TextureBlendType blendType>  /* blendType must be either BlendTransformedBilinear or BlendTransformedBilinearTiled */
-Q_STATIC_TEMPLATE_FUNCTION void blend_transformed_bilinear_argb(int count, const QSpan *spans, void *userData)
-{
-    QSpanData *data = reinterpret_cast<QSpanData *>(userData);
-    if (data->texture.format != QImage::Format_ARGB32_Premultiplied
-        && data->texture.format != QImage::Format_RGB32) {
-        blend_src_generic<spanMethod>(count, spans, userData);
-        return;
-    }
-
-    CompositionFunction func = functionForMode[data->rasterBuffer->compositionMode];
-    uint buffer[buffer_size];
-
-    const int image_x1 = data->texture.x1;
-    const int image_y1 = data->texture.y1;
-    const int image_x2 = data->texture.x2;
-    const int image_y2 = data->texture.y2;
-    const int image_width = data->texture.width;
-    const int image_height = data->texture.height;
-    const int scanline_offset = data->texture.bytesPerLine / 4;
-
-    if (data->fast_matrix) {
-        // The increment pr x in the scanline
-        int fdx = (int)(data->m11 * fixed_scale);
-        int fdy = (int)(data->m12 * fixed_scale);
-
-        while (count--) {
-            void *t = data->rasterBuffer->scanLine(spans->y);
-
-            uint *target = ((uint *)t) + spans->x;
-            uint *image_bits = (uint *)data->texture.imageData;
-
-            const qreal cx = spans->x + 0.5;
-            const qreal cy = spans->y + 0.5;
-
-            int x = int((data->m21 * cy
-                         + data->m11 * cx + data->dx) * fixed_scale) - half_point;
-            int y = int((data->m22 * cy
-                         + data->m12 * cx + data->dy) * fixed_scale) - half_point;
-
-            int length = spans->len;
-            const int coverage = (data->texture.const_alpha * spans->coverage) >> 8;
-            while (length) {
-                int l = qMin(length, buffer_size);
-                const uint *end = buffer + l;
-                uint *b = buffer;
-                while (b < end) {
-                    int x1 = (x >> 16);
-                    int x2;
-                    int y1 = (y >> 16);
-                    int y2;
-
-                    if (blendType == BlendTransformedBilinearTiled) {
-                        x1 %= image_width;
-                        if (x1 < 0) x1 += image_width;
-                        x2 = x1 + 1;
-                        x2 %= image_width;
-
-                        y1 %= image_height;
-                        if (y1 < 0) y1 += image_height;
-                        y2 = y1 + 1;
-                        y2 %= image_height;
-
-                        Q_ASSERT(x1 >= 0 && x1 < image_width);
-                        Q_ASSERT(x2 >= 0 && x2 < image_width);
-                        Q_ASSERT(y1 >= 0 && y1 < image_height);
-                        Q_ASSERT(y2 >= 0 && y2 < image_height);
-                    } else {
-                        if (x1 < image_x1) {
-                            x2 = x1 = image_x1;
-                        } else if (x1 >= image_x2 - 1) {
-                            x2 = x1 = image_x2 - 1;
-                        } else {
-                            x2 = x1 + 1;
-                        }
-                        if (y1 < image_y1) {
-                            y2 = y1 = image_y1;
-                        } else if (y1 >= image_y2 - 1) {
-                            y2 = y1 = image_y2 - 1;
-                        } else {
-                            y2 = y1 + 1;
-                        }
-                    }
-
-                    int y1_offset = y1 * scanline_offset;
-                    int y2_offset = y2 * scanline_offset;
-
-#if defined(Q_IRIX_GCC3_3_WORKAROUND)
-                    uint tl = gccBug(image_bits[y1_offset + x1]);
-                    uint tr = gccBug(image_bits[y1_offset + x2]);
-                    uint bl = gccBug(image_bits[y2_offset + x1]);
-                    uint br = gccBug(image_bits[y2_offset + x2]);
-#else
-                    uint tl = image_bits[y1_offset + x1];
-                    uint tr = image_bits[y1_offset + x2];
-                    uint bl = image_bits[y2_offset + x1];
-                    uint br = image_bits[y2_offset + x2];
-#endif
-
-                    int distx = (x & 0x0000ffff) >> 8;
-                    int disty = (y & 0x0000ffff) >> 8;
-                    int idistx = 256 - distx;
-                    int idisty = 256 - disty;
-
-                    uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
-                    uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
-                    *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
-                    ++b;
-
-                    x += fdx;
-                    y += fdy;
-                }
-                if (spanMethod == RegularSpans)
-                    func(target, buffer, l, coverage);
-                else
-                    drawBufferSpan(data, buffer, buffer_size,
-                                   spans->x + spans->len - length,
-                                   spans->y, l, coverage);
-                target += l;
-                length -= l;
-            }
-            ++spans;
-        }
-    } else {
-        const qreal fdx = data->m11;
-        const qreal fdy = data->m12;
-        const qreal fdw = data->m13;
-
-        while (count--) {
-            void *t = data->rasterBuffer->scanLine(spans->y);
-
-            uint *target = ((uint *)t) + spans->x;
-            uint *image_bits = (uint *)data->texture.imageData;
-
-            const qreal cx = spans->x + 0.5;
-            const qreal cy = spans->y + 0.5;
-
-            qreal x = data->m21 * cy + data->m11 * cx + data->dx;
-            qreal y = data->m22 * cy + data->m12 * cx + data->dy;
-            qreal w = data->m23 * cy + data->m13 * cx + data->m33;
-
-            int length = spans->len;
-            const int coverage = (data->texture.const_alpha * spans->coverage) >> 8;
-            while (length) {
-                int l = qMin(length, buffer_size);
-                const uint *end = buffer + l;
-                uint *b = buffer;
-                while (b < end) {
-                    const qreal iw = w == 0 ? 1 : 1 / w;
-                    const qreal px = x * iw - 0.5;
-                    const qreal py = y * iw - 0.5;
-
-                    int x1 = int(px) - (px < 0);
-                    int x2;
-                    int y1 = int(py) - (py < 0);
-                    int y2;
-
-                    int distx = int((px - x1) * 256);
-                    int disty = int((py - y1) * 256);
-                    int idistx = 256 - distx;
-                    int idisty = 256 - disty;
-
-                    if (blendType == BlendTransformedBilinearTiled) {
-                        x1 %= image_width;
-                        if (x1 < 0) x1 += image_width;
-                        x2 = x1 + 1;
-                        x2 %= image_width;
-
-                        y1 %= image_height;
-                        if (y1 < 0) y1 += image_height;
-                        y2 = y1 + 1;
-                        y2 %= image_height;
-
-                        Q_ASSERT(x1 >= 0 && x1 < image_width);
-                        Q_ASSERT(x2 >= 0 && x2 < image_width);
-                        Q_ASSERT(y1 >= 0 && y1 < image_height);
-                        Q_ASSERT(y2 >= 0 && y2 < image_height);
-                    } else {
-                        if (x1 < image_x1) {
-                            x2 = x1 = image_x1;
-                        } else if (x1 >= image_x2 - 1) {
-                            x2 = x1 = image_x2 - 1;
-                        } else {
-                            x2 = x1 + 1;
-                        }
-                        if (y1 < image_y1) {
-                            y2 = y1 = image_y1;
-                        } else if (y1 >= image_y2 - 1) {
-                            y2 = y1 = image_y2 - 1;
-                        } else {
-                            y2 = y1 + 1;
-                        }
-                    }
-
-                    int y1_offset = y1 * scanline_offset;
-                    int y2_offset = y2 * scanline_offset;
-
-#if defined(Q_IRIX_GCC3_3_WORKAROUND)
-                    uint tl = gccBug(image_bits[y1_offset + x1]);
-                    uint tr = gccBug(image_bits[y1_offset + x2]);
-                    uint bl = gccBug(image_bits[y2_offset + x1]);
-                    uint br = gccBug(image_bits[y2_offset + x2]);
-#else
-                    uint tl = image_bits[y1_offset + x1];
-                    uint tr = image_bits[y1_offset + x2];
-                    uint bl = image_bits[y2_offset + x1];
-                    uint br = image_bits[y2_offset + x2];
-#endif
-
-                    uint xtop = INTERPOLATE_PIXEL_256(tl, idistx, tr, distx);
-                    uint xbot = INTERPOLATE_PIXEL_256(bl, idistx, br, distx);
-                    *b = INTERPOLATE_PIXEL_256(xtop, idisty, xbot, disty);
-                    ++b;
-
-                    x += fdx;
-                    y += fdy;
-                    w += fdw;
-                }
-                if (spanMethod == RegularSpans)
-                    func(target, buffer, l, coverage);
-                else
-                    drawBufferSpan(data, buffer, buffer_size,
-                                   spans->x + spans->len - length,
-                                   spans->y, l, coverage);
-                target += l;
-                length -= l;
-            }
-            ++spans;
-        }
-    }
-}
-
 template <class DST, class SRC>
 Q_STATIC_TEMPLATE_FUNCTION void blendTransformedBilinear(int count, const QSpan *spans,
                                      void *userData)
@@ -6651,7 +6585,7 @@
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // Indexed8
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // RGB32
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // ARGB32
-        blend_transformed_bilinear_argb<RegularSpans, BlendTransformedBilinear>, // ARGB32_Premultiplied
+        SPANFUNC_POINTER(blend_src_generic, RegularSpans), // ARGB32_Premultiplied
         blend_transformed_bilinear_rgb565,
         blend_transformed_bilinear_argb8565,
         blend_transformed_bilinear_rgb666,
@@ -6670,7 +6604,7 @@
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // Indexed8
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // RGB32
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // ARGB32
-        blend_transformed_bilinear_argb<RegularSpans, BlendTransformedBilinearTiled>, // ARGB32_Premultiplied
+        SPANFUNC_POINTER(blend_src_generic, RegularSpans), // ARGB32_Premultiplied
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // RGB16
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // ARGB8565_Premultiplied
         SPANFUNC_POINTER(blend_src_generic, RegularSpans), // RGB666
@@ -6769,7 +6703,7 @@
         blend_src_generic<CallbackSpans>,   // Indexed8
         blend_src_generic<CallbackSpans>,   // RGB32
         blend_src_generic<CallbackSpans>,   // ARGB32
-        blend_transformed_bilinear_argb<CallbackSpans, BlendTransformedBilinear>, // ARGB32_Premultiplied
+        blend_src_generic<CallbackSpans>, // ARGB32_Premultiplied
         blend_src_generic<CallbackSpans>,   // RGB16
         blend_src_generic<CallbackSpans>,   // ARGB8565_Premultiplied
         blend_src_generic<CallbackSpans>,   // RGB666
@@ -6788,7 +6722,7 @@
         blend_src_generic<CallbackSpans>,   // Indexed8
         blend_src_generic<CallbackSpans>,   // RGB32
         blend_src_generic<CallbackSpans>,   // ARGB32
-        blend_transformed_bilinear_argb<CallbackSpans, BlendTransformedBilinearTiled>, // ARGB32_Premultiplied
+        blend_src_generic<CallbackSpans>, // ARGB32_Premultiplied
         blend_src_generic<CallbackSpans>,   // RGB16
         blend_src_generic<CallbackSpans>,   // ARGB8565_Premultiplied
         blend_src_generic<CallbackSpans>,   // RGB666
@@ -7775,6 +7709,7 @@
 #ifdef QT_HAVE_MMX
     if (features & MMX) {
         functionForModeAsm = qt_functionForMode_MMX;
+
         functionForModeSolidAsm = qt_functionForModeSolid_MMX;
         qDrawHelper[QImage::Format_ARGB32_Premultiplied].blendColor = qt_blend_color_argb_mmx;
 #ifdef QT_HAVE_3DNOW
@@ -7804,6 +7739,55 @@
 
 #ifdef QT_HAVE_SSE
     if (features & SSE) {
+        extern void qt_blend_rgb32_on_rgb32_sse(uchar *destPixels, int dbpl,
+                                                const uchar *srcPixels, int sbpl,
+                                                int w, int h,
+                                                int const_alpha);
+        extern void qt_blend_argb32_on_argb32_sse(uchar *destPixels, int dbpl,
+                                                  const uchar *srcPixels, int sbpl,
+                                                  int w, int h,
+                                                  int const_alpha);
+
+        qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse;
+        qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse;
+        qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse;
+        qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse;
+    }
+#endif // SSE
+
+#ifdef QT_HAVE_SSE2
+    if (features & SSE2) {
+        extern void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl,
+                                                 const uchar *srcPixels, int sbpl,
+                                                 int w, int h,
+                                                 int const_alpha);
+        extern void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
+                                                   const uchar *srcPixels, int sbpl,
+                                                   int w, int h,
+                                                   int const_alpha);
+
+        qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse2;
+        qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse2;
+        qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse2;
+        qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse2;
+    }
+
+#ifdef QT_HAVE_SSSE3
+    if (features & SSSE3) {
+        extern void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
+                                                    const uchar *srcPixels, int sbpl,
+                                                    int w, int h,
+                                                    int const_alpha);
+
+        qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_ssse3;
+        qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_ssse3;
+    }
+#endif // SSSE3
+
+#endif // SSE2
+
+#ifdef QT_HAVE_SSE
+    if (features & SSE) {
         functionForModeAsm = qt_functionForMode_SSE;
         functionForModeSolidAsm = qt_functionForModeSolid_SSE;
         qDrawHelper[QImage::Format_ARGB32_Premultiplied].blendColor = qt_blend_color_argb_sse;
@@ -7819,47 +7803,27 @@
 #ifdef QT_HAVE_SSE2
         if (features & SSE2) {
             extern void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels,
-                                                  const uint *srcPixels,
-                                                  int length,
-                                                  uint const_alpha);
+                                                              const uint *srcPixels,
+                                                              int length,
+                                                              uint const_alpha);
             extern void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha);
+            extern void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha);
+            extern void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha);
 
             functionForModeAsm[0] = comp_func_SourceOver_sse2;
+            functionForModeAsm[QPainter::CompositionMode_Source] = comp_func_Source_sse2;
+            functionForModeAsm[QPainter::CompositionMode_Plus] = comp_func_Plus_sse2;
             functionForModeSolidAsm[0] = comp_func_solid_SourceOver_sse2;
-
-            extern void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl,
-                                                     const uchar *srcPixels, int sbpl,
-                                                     int w, int h,
-                                                     int const_alpha);
-            extern void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
-                                                       const uchar *srcPixels, int sbpl,
-                                                       int w, int h,
-                                                       int const_alpha);
-
-            qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse2;
-            qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse2;
-            qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse2;
-            qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse2;
-        } else
+        }
 #endif
-        {
-            extern void qt_blend_rgb32_on_rgb32_sse(uchar *destPixels, int dbpl,
-                                                    const uchar *srcPixels, int sbpl,
-                                                    int w, int h,
-                                                    int const_alpha);
-            extern void qt_blend_argb32_on_argb32_sse(uchar *destPixels, int dbpl,
-                                                      const uchar *srcPixels, int sbpl,
-                                                      int w, int h,
-                                                      int const_alpha);
-
-
-            qBlendFunctions[QImage::Format_RGB32][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse;
-            qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB32] = qt_blend_rgb32_on_rgb32_sse;
-            qBlendFunctions[QImage::Format_RGB32][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse;
-            qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_sse;
-        }
-}
-#endif // SSE
+    }
+#elif defined(QT_HAVE_SSE2)
+    // This is the special case where SSE2 is usable but MMX/SSE is not (e.g. Windows x64 with Visual Studio).
+    if (features & SSE2) {
+        functionForModeAsm = qt_functionForMode_onlySSE2;
+        functionForModeSolidAsm = qt_functionForModeSolid_onlySSE2;
+    }
+#endif
 
 #ifdef QT_HAVE_IWMMXT
     if (features & IWMMXT) {
@@ -7889,6 +7853,7 @@
             qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_argb32_neon;
             qBlendFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_blend_argb32_on_rgb16_neon;
             qBlendFunctions[QImage::Format_ARGB32_Premultiplied][QImage::Format_RGB16] = qt_blend_rgb16_on_argb32_neon;
+            qBlendFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_blend_rgb16_on_rgb16_neon;
 
             qScaleFunctions[QImage::Format_RGB16][QImage::Format_ARGB32_Premultiplied] = qt_scale_image_argb32_on_rgb16_neon;
             qScaleFunctions[QImage::Format_RGB16][QImage::Format_RGB16] = qt_scale_image_rgb16_on_rgb16_neon;
@@ -7900,11 +7865,13 @@
 
             functionForMode_C[QPainter::CompositionMode_SourceOver] = qt_blend_argb32_on_argb32_scanline_neon;
             functionForModeSolid_C[QPainter::CompositionMode_SourceOver] = comp_func_solid_SourceOver_neon;
+            functionForMode_C[QPainter::CompositionMode_Plus] = comp_func_Plus_neon;
             destFetchProc[QImage::Format_RGB16] = qt_destFetchRGB16_neon;
             destStoreProc[QImage::Format_RGB16] = qt_destStoreRGB16_neon;
 
             qMemRotateFunctions[QImage::Format_RGB16][0] = qt_memrotate90_16_neon;
             qMemRotateFunctions[QImage::Format_RGB16][2] = qt_memrotate270_16_neon;
+            qt_memfill32 = qt_memfill32_neon;
         }
 #endif
 
@@ -7919,17 +7886,8 @@
 
         functionForModeSolid = functionForModeSolidAsm;
     }
-    if (functionForModeAsm) {
-        const int destinationMode = QPainter::CompositionMode_Destination;
-        functionForModeAsm[destinationMode] = functionForMode_C[destinationMode];
-
-        // use the default qdrawhelper implementation for the
-        // extended composition modes
-        for (int mode = 12; mode < numCompositionFunctions; ++mode)
-            functionForModeAsm[mode] = functionForMode_C[mode];
-
+    if (functionForModeAsm)
         functionForMode = functionForModeAsm;
-    }
 
     qt_build_pow_tables();
 }