src/gui/painting/qdrawhelper_neon.cpp
changeset 3 41300fa6a67c
child 4 3b1da2848fc7
child 7 f7bc934e204c
equal deleted inserted replaced
2:56cd8111b7f7 3:41300fa6a67c
       
     1 /****************************************************************************
       
     2 **
       
     3 ** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
       
     4 ** All rights reserved.
       
     5 ** Contact: Nokia Corporation (qt-info@nokia.com)
       
     6 **
       
     7 ** This file is part of the QtGui module of the Qt Toolkit.
       
     8 **
       
     9 ** $QT_BEGIN_LICENSE:LGPL$
       
    10 ** No Commercial Usage
       
    11 ** This file contains pre-release code and may not be distributed.
       
    12 ** You may use this file in accordance with the terms and conditions
       
    13 ** contained in the Technology Preview License Agreement accompanying
       
    14 ** this package.
       
    15 **
       
    16 ** GNU Lesser General Public License Usage
       
    17 ** Alternatively, this file may be used under the terms of the GNU Lesser
       
    18 ** General Public License version 2.1 as published by the Free Software
       
    19 ** Foundation and appearing in the file LICENSE.LGPL included in the
       
    20 ** packaging of this file.  Please review the following information to
       
    21 ** ensure the GNU Lesser General Public License version 2.1 requirements
       
    22 ** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
       
    23 **
       
    24 ** In addition, as a special exception, Nokia gives you certain additional
       
    25 ** rights.  These rights are described in the Nokia Qt LGPL Exception
       
    26 ** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
       
    27 **
       
    28 ** If you have questions regarding the use of this file, please contact
       
    29 ** Nokia at qt-info@nokia.com.
       
    30 **
       
    31 **
       
    32 **
       
    33 **
       
    34 **
       
    35 **
       
    36 **
       
    37 **
       
    38 ** $QT_END_LICENSE$
       
    39 **
       
    40 ****************************************************************************/
       
    41 
       
    42 #include <private/qdrawhelper_p.h>
       
    43 
       
    44 #ifdef QT_HAVE_NEON
       
    45 
       
    46 #include <private/qdrawhelper_neon_p.h>
       
    47 #include <arm_neon.h>
       
    48 
       
    49 QT_BEGIN_NAMESPACE
       
    50 
       
    51 static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half)
       
    52 {
       
    53     // result = (x + (x >> 8) + 0x80) >> 8
       
    54 
       
    55     const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8
       
    56     const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80
       
    57     const int16x8_t sum = vaddq_s16(temp, sum_part);
       
    58 
       
    59     return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8));
       
    60 }
       
    61 
       
    62 static inline int16x8_t qvbyte_mul_s16(int16x8_t x, int16x8_t alpha, int16x8_t half)
       
    63 {
       
    64     // t = qRound(x * alpha / 255.0)
       
    65 
       
    66     const int16x8_t t = vmulq_s16(x, alpha); // t
       
    67     return qvdiv_255_s16(t, half);
       
    68 }
       
    69 
       
    70 static inline int16x8_t qvinterpolate_pixel_255(int16x8_t x, int16x8_t a, int16x8_t y, int16x8_t b, int16x8_t half)
       
    71 {
       
    72     // t = x * a + y * b
       
    73 
       
    74     const int16x8_t ta = vmulq_s16(x, a);
       
    75     const int16x8_t tb = vmulq_s16(y, b);
       
    76 
       
    77     return qvdiv_255_s16(vaddq_s16(ta, tb), half);
       
    78 }
       
    79 
       
    80 static inline int16x8_t qvsource_over_s16(int16x8_t src16, int16x8_t dst16, int16x8_t half, int16x8_t full)
       
    81 {
       
    82     const int16x4_t alpha16_high = vdup_lane_s16(vget_high_s16(src16), 3);
       
    83     const int16x4_t alpha16_low = vdup_lane_s16(vget_low_s16(src16), 3);
       
    84 
       
    85     const int16x8_t alpha16 = vsubq_s16(full, vcombine_s16(alpha16_low, alpha16_high));
       
    86 
       
    87     return vaddq_s16(src16, qvbyte_mul_s16(dst16, alpha16, half));
       
    88 }
       
    89 
       
    90 void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
       
    91                                     const uchar *srcPixels, int sbpl,
       
    92                                     int w, int h,
       
    93                                     int const_alpha)
       
    94 {
       
    95     const uint *src = (const uint *) srcPixels;
       
    96     uint *dst = (uint *) destPixels;
       
    97     int16x8_t half = vdupq_n_s16(0x80);
       
    98     int16x8_t full = vdupq_n_s16(0xff);
       
    99     if (const_alpha == 256) {
       
   100         for (int y = 0; y < h; ++y) {
       
   101             int x = 0;
       
   102             for (; x < w-3; x += 4) {
       
   103                 int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
       
   104                 if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) {
       
   105                     // all opaque
       
   106                     vst1q_s32((int32_t *)&dst[x], src32);
       
   107                 } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
       
   108                     int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);
       
   109 
       
   110                     const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
       
   111                     const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);
       
   112 
       
   113                     const uint8x8_t src8_low = vget_low_u8(src8);
       
   114                     const uint8x8_t dst8_low = vget_low_u8(dst8);
       
   115 
       
   116                     const uint8x8_t src8_high = vget_high_u8(src8);
       
   117                     const uint8x8_t dst8_high = vget_high_u8(dst8);
       
   118 
       
   119                     const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
       
   120                     const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));
       
   121 
       
   122                     const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
       
   123                     const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));
       
   124 
       
   125                     const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full);
       
   126                     const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full);
       
   127 
       
   128                     const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
       
   129                     const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));
       
   130 
       
   131                     vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
       
   132                 }
       
   133             }
       
   134             for (; x<w; ++x) {
       
   135                 uint s = src[x];
       
   136                 if (s >= 0xff000000)
       
   137                     dst[x] = s;
       
   138                 else if (s != 0)
       
   139                     dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
       
   140             }
       
   141             dst = (quint32 *)(((uchar *) dst) + dbpl);
       
   142             src = (const quint32 *)(((const uchar *) src) + sbpl);
       
   143         }
       
   144     } else if (const_alpha != 0) {
       
   145         const_alpha = (const_alpha * 255) >> 8;
       
   146         int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
       
   147         for (int y = 0; y < h; ++y) {
       
   148             int x = 0;
       
   149             for (; x < w-3; x += 4) {
       
   150                 if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
       
   151                     int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
       
   152                     int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);
       
   153 
       
   154                     const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
       
   155                     const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);
       
   156 
       
   157                     const uint8x8_t src8_low = vget_low_u8(src8);
       
   158                     const uint8x8_t dst8_low = vget_low_u8(dst8);
       
   159 
       
   160                     const uint8x8_t src8_high = vget_high_u8(src8);
       
   161                     const uint8x8_t dst8_high = vget_high_u8(dst8);
       
   162 
       
   163                     const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
       
   164                     const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));
       
   165 
       
   166                     const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
       
   167                     const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));
       
   168 
       
   169                     const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half);
       
   170                     const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half);
       
   171 
       
   172                     const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full);
       
   173                     const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full);
       
   174 
       
   175                     const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
       
   176                     const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));
       
   177 
       
   178                     vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
       
   179                 }
       
   180             }
       
   181             for (; x<w; ++x) {
       
   182                 uint s = src[x];
       
   183                 if (s != 0) {
       
   184                     s = BYTE_MUL(s, const_alpha);
       
   185                     dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
       
   186                 }
       
   187             }
       
   188             dst = (quint32 *)(((uchar *) dst) + dbpl);
       
   189             src = (const quint32 *)(((const uchar *) src) + sbpl);
       
   190         }
       
   191     }
       
   192 }
       
   193 
       
   194 // qblendfunctions.cpp
       
   195 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
       
   196                              const uchar *srcPixels, int sbpl,
       
   197                              int w, int h,
       
   198                              int const_alpha);
       
   199 
       
   200 void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
       
   201                                   const uchar *srcPixels, int sbpl,
       
   202                                   int w, int h,
       
   203                                   int const_alpha)
       
   204 {
       
   205     if (const_alpha != 256) {
       
   206         if (const_alpha != 0) {
       
   207             const uint *src = (const uint *) srcPixels;
       
   208             uint *dst = (uint *) destPixels;
       
   209             int16x8_t half = vdupq_n_s16(0x80);
       
   210             const_alpha = (const_alpha * 255) >> 8;
       
   211             int one_minus_const_alpha = 255 - const_alpha;
       
   212             int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
       
   213             int16x8_t one_minus_const_alpha16 = vdupq_n_s16(255 - const_alpha);
       
   214             for (int y = 0; y < h; ++y) {
       
   215                 int x = 0;
       
   216                 for (; x < w-3; x += 4) {
       
   217                     int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
       
   218                     int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);
       
   219 
       
   220                     const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
       
   221                     const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);
       
   222 
       
   223                     const uint8x8_t src8_low = vget_low_u8(src8);
       
   224                     const uint8x8_t dst8_low = vget_low_u8(dst8);
       
   225 
       
   226                     const uint8x8_t src8_high = vget_high_u8(src8);
       
   227                     const uint8x8_t dst8_high = vget_high_u8(dst8);
       
   228 
       
   229                     const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
       
   230                     const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));
       
   231 
       
   232                     const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
       
   233                     const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));
       
   234 
       
   235                     const int16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
       
   236                     const int16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);
       
   237 
       
   238                     const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
       
   239                     const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));
       
   240 
       
   241                     vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
       
   242                 }
       
   243                 for (; x<w; ++x) {
       
   244                     uint s = src[x];
       
   245                     s = BYTE_MUL(s, const_alpha);
       
   246                     dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
       
   247                 }
       
   248                 dst = (quint32 *)(((uchar *) dst) + dbpl);
       
   249                 src = (const quint32 *)(((const uchar *) src) + sbpl);
       
   250             }
       
   251         }
       
   252     } else {
       
   253         qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
       
   254     }
       
   255 }
       
   256 
       
   257 QT_END_NAMESPACE
       
   258 
       
   259 #endif // QT_HAVE_NEON
       
   260