src/gui/painting/qdrawhelper_neon.cpp
changeset 37 758a864f9613
parent 33 3e2da88830cd
equal deleted inserted replaced
36:ef0373b55136 37:758a864f9613
    48 #include <private/qdrawhelper_neon_p.h>
    48 #include <private/qdrawhelper_neon_p.h>
    49 #include <private/qpaintengine_raster_p.h>
    49 #include <private/qpaintengine_raster_p.h>
    50 #include <arm_neon.h>
    50 #include <arm_neon.h>
    51 
    51 
    52 QT_BEGIN_NAMESPACE
    52 QT_BEGIN_NAMESPACE
       
    53 
       
    54 void qt_memfill32_neon(quint32 *dest, quint32 value, int count)
       
    55 {
       
    56     const int epilogueSize = count % 16;
       
    57     if (count >= 16) {
       
    58         quint32 *const neonEnd = dest + count - epilogueSize;
       
    59         register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
       
    60         register uint32x4_t valueVector2 asm ("q1") = valueVector1;
       
    61         while (dest != neonEnd) {
       
    62             asm volatile (
       
    63                 "vst2.32     { d0, d1, d2, d3 }, [%[DST]] !\n\t"
       
    64                 "vst2.32     { d0, d1, d2, d3 }, [%[DST]] !\n\t"
       
    65                 : [DST]"+r" (dest)
       
    66                 : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
       
    67                 : "memory"
       
    68             );
       
    69         }
       
    70     }
       
    71 
       
    72     switch (epilogueSize)
       
    73     {
       
    74     case 15:     *dest++ = value;
       
    75     case 14:     *dest++ = value;
       
    76     case 13:     *dest++ = value;
       
    77     case 12:     *dest++ = value;
       
    78     case 11:     *dest++ = value;
       
    79     case 10:     *dest++ = value;
       
    80     case 9:      *dest++ = value;
       
    81     case 8:      *dest++ = value;
       
    82     case 7:      *dest++ = value;
       
    83     case 6:      *dest++ = value;
       
    84     case 5:      *dest++ = value;
       
    85     case 4:      *dest++ = value;
       
    86     case 3:      *dest++ = value;
       
    87     case 2:      *dest++ = value;
       
    88     case 1:      *dest++ = value;
       
    89     }
       
    90 }
    53 
    91 
    54 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
    92 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
    55 {
    93 {
    56     // result = (x + (x >> 8) + 0x80) >> 8
    94     // result = (x + (x >> 8) + 0x80) >> 8
    57 
    95 
   127 extern "C" void
   165 extern "C" void
   128 pixman_composite_scanline_over_asm_neon (int32_t         w,
   166 pixman_composite_scanline_over_asm_neon (int32_t         w,
   129                                          const uint32_t *dst,
   167                                          const uint32_t *dst,
   130                                          const uint32_t *src);
   168                                          const uint32_t *src);
   131 
   169 
       
   170 extern "C" void
       
   171 pixman_composite_src_0565_0565_asm_neon (int32_t   w,
       
   172                                          int32_t   h,
       
   173                                          uint16_t *dst,
       
   174                                          int32_t   dst_stride,
       
   175                                          uint16_t *src,
       
   176                                          int32_t   src_stride);
       
   177 
   132 // qblendfunctions.cpp
   178 // qblendfunctions.cpp
   133 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
   179 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
   134                                           const uchar *srcPixels, int sbpl,
   180                                           const uchar *srcPixels, int sbpl,
   135                                           int w, int h,
   181                                           int w, int h,
   136                                           int const_alpha);
   182                                           int const_alpha);
   158         }
   204         }
   159         return;
   205         return;
   160     }
   206     }
   161 
   207 
   162     pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
   208     pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
       
   209 }
       
   210 
       
   211 // qblendfunctions.cpp
       
   212 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
       
   213                              const uchar *src, int sbpl,
       
   214                              int w, int h,
       
   215                              int const_alpha);
       
   216 
       
   217 template <int N>
       
   218 static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
       
   219 {
       
   220     if (N >= 2) {
       
   221         ((quint32 *)dst)[0] = ((quint32 *)src)[0];
       
   222         __builtin_prefetch(dst + dstride, 1, 0);
       
   223     }
       
   224     for (int i = 1; i < N/2; ++i)
       
   225         ((quint32 *)dst)[i] = ((quint32 *)src)[i];
       
   226     if (N & 1)
       
   227         dst[N-1] = src[N-1];
       
   228 }
       
   229 
       
   230 template <int Width>
       
   231 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
       
   232 {
       
   233     union {
       
   234         quintptr address;
       
   235         quint16 *pointer;
       
   236     } u;
       
   237 
       
   238     u.pointer = dst;
       
   239 
       
   240     if (u.address & 2) {
       
   241         while (h--) {
       
   242             // align dst
       
   243             dst[0] = src[0];
       
   244             if (Width > 1)
       
   245                 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
       
   246             dst += dstride;
       
   247             src += sstride;
       
   248         }
       
   249     } else {
       
   250         while (h--) {
       
   251             scanLineBlit16<Width>(dst, src, dstride);
       
   252 
       
   253             dst += dstride;
       
   254             src += sstride;
       
   255         }
       
   256     }
       
   257 }
       
   258 
       
   259 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
       
   260                                   const uchar *srcPixels, int sbpl,
       
   261                                   int w, int h,
       
   262                                   int const_alpha)
       
   263 {
       
   264     // testing show that the default memcpy is faster for widths 150 and up
       
   265     if (const_alpha != 256 || w >= 150) {
       
   266         qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
       
   267         return;
       
   268     }
       
   269 
       
   270     int dstride = dbpl / 2;
       
   271     int sstride = sbpl / 2;
       
   272 
       
   273     quint16 *dst = (quint16 *) destPixels;
       
   274     quint16 *src = (quint16 *) srcPixels;
       
   275 
       
   276     switch (w) {
       
   277 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
       
   278     BLOCKBLIT(1);
       
   279     BLOCKBLIT(2);
       
   280     BLOCKBLIT(3);
       
   281     BLOCKBLIT(4);
       
   282     BLOCKBLIT(5);
       
   283     BLOCKBLIT(6);
       
   284     BLOCKBLIT(7);
       
   285     BLOCKBLIT(8);
       
   286     BLOCKBLIT(9);
       
   287     BLOCKBLIT(10);
       
   288     BLOCKBLIT(11);
       
   289     BLOCKBLIT(12);
       
   290     BLOCKBLIT(13);
       
   291     BLOCKBLIT(14);
       
   292     BLOCKBLIT(15);
       
   293 #undef BLOCKBLIT
       
   294     default:
       
   295         break;
       
   296     }
       
   297 
       
   298     pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
   163 }
   299 }
   164 
   300 
   165 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
   301 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);
   166 
   302 
   167 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
   303 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
   620         for (;x < length; ++x)
   756         for (;x < length; ++x)
   621             destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
   757             destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
   622     }
   758     }
   623 }
   759 }
   624 
   760 
       
   761 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
       
   762 {
       
   763     if (const_alpha == 255) {
       
   764         uint *const end = dst + length;
       
   765         uint *const neonEnd = end - 3;
       
   766 
       
   767         while (dst < neonEnd) {
       
   768             asm volatile (
       
   769                 "vld2.8     { d0, d1 }, [%[SRC]] !\n\t"
       
   770                 "vld2.8     { d2, d3 }, [%[DST]]\n\t"
       
   771                 "vqadd.u8 q0, q0, q1\n\t"
       
   772                 "vst2.8     { d0, d1 }, [%[DST]] !\n\t"
       
   773                 : [DST]"+r" (dst), [SRC]"+r" (src)
       
   774                 :
       
   775                 : "memory", "d0", "d1", "d2", "d3", "q0", "q1"
       
   776             );
       
   777         }
       
   778 
       
   779         while (dst != end) {
       
   780             *dst = comp_func_Plus_one_pixel(*dst, *src);
       
   781             ++dst;
       
   782             ++src;
       
   783         }
       
   784     } else {
       
   785         int x = 0;
       
   786         const int one_minus_const_alpha = 255 - const_alpha;
       
   787         const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
       
   788         const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);
       
   789 
       
   790         const uint16x8_t half = vdupq_n_u16(0x80);
       
   791         for (; x < length - 3; x += 4) {
       
   792             const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
       
   793             const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
       
   794             uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
       
   795             uint8x16_t result = vqaddq_u8(dst8, src8);
       
   796 
       
   797             uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
       
   798             uint16x8_t result_high = vmovl_u8(vget_high_u8(result));
       
   799 
       
   800             uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
       
   801             uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));
       
   802 
       
   803             result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
       
   804             result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);
       
   805 
       
   806             const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
       
   807             const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
       
   808             vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
       
   809         }
       
   810 
       
   811         for (; x < length; ++x)
       
   812             dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
       
   813     }
       
   814 }
       
   815 
   625 static const int tileSize = 32;
   816 static const int tileSize = 32;
   626 
   817 
   627 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
   818 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);
   628 
   819 
   629 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
   820 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)