src/gui/painting/qdrawhelper_ssse3.cpp
changeset 37 758a864f9613
equal deleted inserted replaced
36:ef0373b55136 37:758a864f9613
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2010 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the QtGui module of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include <private/qdrawhelper_x86_p.h>
       
    43 
       
    44 #ifdef QT_HAVE_SSSE3
       
    45 
       
    46 #include <private/qdrawingprimitive_sse2_p.h>
       
    47 
       
    48 QT_BEGIN_NAMESPACE
       
    49 
       
    50 inline static void blend_pixel(quint32 &dst, const quint32 src)
       
    51 {
       
    52     if (src >= 0xff000000)
       
    53         dst = src;
       
    54     else if (src != 0)
       
    55         dst = src + BYTE_MUL(dst, qAlpha(~src));
       
    56 }
       
    57 
       
    58 
       
    /* The palignr instruction takes its shift as an immediate operand, so the loop has to be
       generated separately for each possible shift (4, 8, 12). Checking the alignment inside
       the loop is unfortunately way too slow.

       The expansion relies on these names from the surrounding scope: x, src, dst,
       minusOffsetToAlignSrcOn16Bytes, srcVectorPrevLoaded (updated on every iteration),
       alphaShuffleMask, alphaMask, nullVector, one, colorMask and half.
     */
    #define BLENDING_LOOP(byteShift, rowLength)\
        for (; x < rowLength-3; x += 4) { \
            /* Rebuild the 4 pixels starting at src[x] out of the two aligned 16-byte loads around them. */ \
            const __m128i srcVectorNextLoaded = _mm_load_si128((__m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
            const __m128i srcQuad = _mm_alignr_epi8(srcVectorNextLoaded, srcVectorPrevLoaded, byteShift); \
            const __m128i srcQuadAlpha = _mm_and_si128(srcQuad, alphaMask); \
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcQuadAlpha, alphaMask)) != 0xffff) { \
                /* Not all four lanes are fully opaque. If the masked alpha equals nullVector in \
                   every lane there is nothing to do; otherwise result = s + d * (1-alpha). */ \
                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcQuadAlpha, nullVector)) != 0xffff) { \
                    __m128i invertedAlpha = _mm_sub_epi16(one, _mm_shuffle_epi8(srcQuad, alphaShuffleMask)); \
                    const __m128i dstQuad = _mm_load_si128((__m128i *)&dst[x]); \
                    __m128i scaledDst; \
                    BYTE_MUL_SSE2(scaledDst, dstQuad, invertedAlpha, colorMask, half); \
                    _mm_store_si128((__m128i *)&dst[x], _mm_add_epi8(srcQuad, scaledDst)); \
                } \
            } else { \
                /* The masked alpha equals alphaMask in all four lanes: plain copy. */ \
                _mm_store_si128((__m128i *)&dst[x], srcQuad); \
            } \
            /* The load just done becomes the low half of the next iteration's palignr. */ \
            srcVectorPrevLoaded = srcVectorNextLoaded;\
        }
       
    80 
       
    81 
       
    /* Source-over blend of `length` 32-bit pixels from src onto dst, meant for the first row of
       a blit: when the aligned load preceding src[x] would start before src, four more pixels
       are blended one at a time first so that the load starts inside the row.

       dst, src            row pointers (indexed as 32-bit pixels)
       length              number of pixels in the row
       nullVector, half, one, colorMask, alphaMask
                           __m128i constants supplied by the caller

       blend_pixel() handles the pixels before dst is 16-byte aligned and the pixels left over
       after the vector loops. */
    #define BLEND_SOURCE_OVER_ARGB32_FIRST_ROW_SSSE3(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \
        int x = 0; \
    \
        /* Blend pixel by pixel until dst[x] sits on a 16-byte boundary (or the row ends). */ \
        const int pixelsToAlignDst = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;\
        const int prologueEnd = qMin(length, pixelsToAlignDst);\
        while (x < prologueEnd) {\
            blend_pixel(dst[x], src[x]); \
            ++x; \
        } \
    \
        /* Distance, in pixels, from the previous 16-byte boundary to src[x]. */ \
        const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;\
    \
        if (minusOffsetToAlignSrcOn16Bytes == 0) {\
            /* src[x] is aligned too, so this is the usual algorithm with aligned loads and stores.\
               See the SSE2 version for more documentation on the algorithm itself. */\
            const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
            for (; x < length-3; x += 4) { \
                const __m128i srcQuad = _mm_load_si128((__m128i *)&src[x]); \
                const __m128i srcQuadAlpha = _mm_and_si128(srcQuad, alphaMask); \
                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcQuadAlpha, alphaMask)) != 0xffff) { \
                    /* Skip the block when the masked alpha equals nullVector in every lane. */ \
                    if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcQuadAlpha, nullVector)) != 0xffff) { \
                        __m128i invertedAlpha = _mm_sub_epi16(one, _mm_shuffle_epi8(srcQuad, alphaShuffleMask)); \
                        const __m128i dstQuad = _mm_load_si128((__m128i *)&dst[x]); \
                        __m128i scaledDst; \
                        BYTE_MUL_SSE2(scaledDst, dstQuad, invertedAlpha, colorMask, half); \
                        _mm_store_si128((__m128i *)&dst[x], _mm_add_epi8(srcQuad, scaledDst)); \
                    } \
                } else { \
                    /* The masked alpha equals alphaMask in all four lanes: plain copy. */ \
                    _mm_store_si128((__m128i *)&dst[x], srcQuad); \
                } \
            } /* end for() */\
        } else if ((length - x) >= 8) {\
            /* This is the first line, so "x - minusOffsetToAlignSrcOn16Bytes" could point before src\
               and generate an invalid access. In that case move 4 pixels forward first. */\
            if (minusOffsetToAlignSrcOn16Bytes > prologueEnd) {\
                const int scalarEnd = prologueEnd + 4;\
                while (x < scalarEnd) {\
                    blend_pixel(dst[x], src[x]); \
                    ++x; \
                } \
            }\
    \
            /* Two vectors are used to extract the src: prevLoaded holds the first pixels, the vector\
               loaded inside BLENDING_LOOP holds the current ones. */\
            __m128i srcVectorPrevLoaded = _mm_load_si128((__m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);\
    \
            const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
            /* The palignr shift is given in bytes: 4 * minusOffsetToAlignSrcOn16Bytes. */\
            switch (minusOffsetToAlignSrcOn16Bytes) {\
            case 1:\
                BLENDING_LOOP(4, length)\
                break;\
            case 2:\
                BLENDING_LOOP(8, length)\
                break;\
            case 3:\
                BLENDING_LOOP(12, length)\
                break;\
            }\
        }\
        /* Whatever the vector loops left over is blended one pixel at a time. */ \
        for (; x < length; ++x) \
            blend_pixel(dst[x], src[x]); \
    }
       
   144 
       
   // Blend `length` 32-bit pixels of src over dst.
   // nullVector, half, one, colorMask and alphaMask are constant across the whole image/texture,
   // and should be defined as:
   //const __m128i nullVector = _mm_set1_epi32(0);
   //const __m128i half = _mm_set1_epi16(0x80);
   //const __m128i one = _mm_set1_epi16(0xff);
   //const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
   //const __m128i alphaMask = _mm_set1_epi32(0xff000000);
   //
   // The computation being done is:
   // result = s + d * (1-alpha)
   // with shortcuts if fully opaque or fully transparent.
   //
   // When src is not 16-byte aligned, the aligned load preceding src[x] is issued without
   // checking that it starts inside the row.
   #define BLEND_SOURCE_OVER_ARGB32_MAIN_SSSE3(dst, src, length, nullVector, half, one, colorMask, alphaMask) { \
       int x = 0; \
   \
       /* Blend pixel by pixel until dst is aligned. */ \
       ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) { \
           blend_pixel(dst[x], src[x]); \
       } \
   \
       /* Distance, in pixels, from the previous 16-byte boundary to src[x]. */ \
       const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;\
   \
       if (minusOffsetToAlignSrcOn16Bytes == 0) {\
           /* src[x] is aligned too, so this is the usual algorithm with aligned loads and stores.\
              See the SSE2 version for more documentation on the algorithm itself. */\
           const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
           for (; x < length-3; x += 4) { \
               const __m128i srcQuad = _mm_load_si128((__m128i *)&src[x]); \
               const __m128i srcQuadAlpha = _mm_and_si128(srcQuad, alphaMask); \
               if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcQuadAlpha, alphaMask)) != 0xffff) { \
                   /* Skip the block when the masked alpha equals nullVector in every lane. */ \
                   if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcQuadAlpha, nullVector)) != 0xffff) { \
                       __m128i invertedAlpha = _mm_sub_epi16(one, _mm_shuffle_epi8(srcQuad, alphaShuffleMask)); \
                       const __m128i dstQuad = _mm_load_si128((__m128i *)&dst[x]); \
                       __m128i scaledDst; \
                       BYTE_MUL_SSE2(scaledDst, dstQuad, invertedAlpha, colorMask, half); \
                       _mm_store_si128((__m128i *)&dst[x], _mm_add_epi8(srcQuad, scaledDst)); \
                   } \
               } else { \
                   /* The masked alpha equals alphaMask in all four lanes: plain copy. */ \
                   _mm_store_si128((__m128i *)&dst[x], srcQuad); \
               } \
           } /* end for() */\
       } else if ((length - x) >= 8) {\
           /* Two vectors are used to extract the src: prevLoaded holds the first pixels, the vector\
              loaded inside BLENDING_LOOP holds the current ones. */\
           __m128i srcVectorPrevLoaded = _mm_load_si128((__m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);\
   \
           const __m128i alphaShuffleMask = _mm_set_epi8(0xff,15,0xff,15,0xff,11,0xff,11,0xff,7,0xff,7,0xff,3,0xff,3);\
           /* The palignr shift is given in bytes: 4 * minusOffsetToAlignSrcOn16Bytes. */\
           switch (minusOffsetToAlignSrcOn16Bytes) {\
           case 1:\
               BLENDING_LOOP(4, length)\
               break;\
           case 2:\
               BLENDING_LOOP(8, length)\
               break;\
           case 3:\
               BLENDING_LOOP(12, length)\
               break;\
           }\
       }\
       /* Whatever the vector loops left over is blended one pixel at a time. */ \
       for (; x < length; ++x) \
           blend_pixel(dst[x], src[x]); \
   }
       
   206 
       
   207 void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
       
   208                                      const uchar *srcPixels, int sbpl,
       
   209                                      int w, int h,
       
   210                                      int const_alpha)
       
   211 {
       
   212     const quint32 *src = (const quint32 *) srcPixels;
       
   213     quint32 *dst = (quint32 *) destPixels;
       
   214     if (const_alpha == 256) {
       
   215         const __m128i alphaMask = _mm_set1_epi32(0xff000000);
       
   216         const __m128i nullVector = _mm_setzero_si128();
       
   217         const __m128i half = _mm_set1_epi16(0x80);
       
   218         const __m128i one = _mm_set1_epi16(0xff);
       
   219         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
       
   220 
       
   221         // We have to unrol the first row in order to deal with the load on unaligned data
       
   222         // prior to the src pointer.
       
   223         BLEND_SOURCE_OVER_ARGB32_FIRST_ROW_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
       
   224         dst = (quint32 *)(((uchar *) dst) + dbpl);
       
   225         src = (const quint32 *)(((const uchar *) src) + sbpl);
       
   226 
       
   227         for (int y = 1; y < h; ++y) {
       
   228             BLEND_SOURCE_OVER_ARGB32_MAIN_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
       
   229             dst = (quint32 *)(((uchar *) dst) + dbpl);
       
   230             src = (const quint32 *)(((const uchar *) src) + sbpl);
       
   231         }
       
   232     } else if (const_alpha != 0) {
       
   233         // dest = (s + d * sia) * ca + d * cia
       
   234         //      = s * ca + d * (sia * ca + cia)
       
   235         //      = s * ca + d * (1 - sa*ca)
       
   236         const_alpha = (const_alpha * 255) >> 8;
       
   237         const __m128i nullVector = _mm_setzero_si128();
       
   238         const __m128i half = _mm_set1_epi16(0x80);
       
   239         const __m128i one = _mm_set1_epi16(0xff);
       
   240         const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
       
   241         const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
       
   242         for (int y = 0; y < h; ++y) {
       
   243             BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
       
   244             dst = (quint32 *)(((uchar *) dst) + dbpl);
       
   245             src = (const quint32 *)(((const uchar *) src) + sbpl);
       
   246         }
       
   247     }
       
   248 }
       
   249 
       
   250 QT_END_NAMESPACE
       
   251 
       
   252 #endif // QT_HAVE_SSSE3