48 #include <private/qdrawhelper_neon_p.h> |
48 #include <private/qdrawhelper_neon_p.h> |
49 #include <private/qpaintengine_raster_p.h> |
49 #include <private/qpaintengine_raster_p.h> |
50 #include <arm_neon.h> |
50 #include <arm_neon.h> |
51 |
51 |
52 QT_BEGIN_NAMESPACE |
52 QT_BEGIN_NAMESPACE |
|
53 |
|
54 void qt_memfill32_neon(quint32 *dest, quint32 value, int count) |
|
55 { |
|
56 const int epilogueSize = count % 16; |
|
57 if (count >= 16) { |
|
58 quint32 *const neonEnd = dest + count - epilogueSize; |
|
59 register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value); |
|
60 register uint32x4_t valueVector2 asm ("q1") = valueVector1; |
|
61 while (dest != neonEnd) { |
|
62 asm volatile ( |
|
63 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t" |
|
64 "vst2.32 { d0, d1, d2, d3 }, [%[DST]] !\n\t" |
|
65 : [DST]"+r" (dest) |
|
66 : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2) |
|
67 : "memory" |
|
68 ); |
|
69 } |
|
70 } |
|
71 |
|
72 switch (epilogueSize) |
|
73 { |
|
74 case 15: *dest++ = value; |
|
75 case 14: *dest++ = value; |
|
76 case 13: *dest++ = value; |
|
77 case 12: *dest++ = value; |
|
78 case 11: *dest++ = value; |
|
79 case 10: *dest++ = value; |
|
80 case 9: *dest++ = value; |
|
81 case 8: *dest++ = value; |
|
82 case 7: *dest++ = value; |
|
83 case 6: *dest++ = value; |
|
84 case 5: *dest++ = value; |
|
85 case 4: *dest++ = value; |
|
86 case 3: *dest++ = value; |
|
87 case 2: *dest++ = value; |
|
88 case 1: *dest++ = value; |
|
89 } |
|
90 } |
53 |
91 |
54 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half) |
92 static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half) |
55 { |
93 { |
56 // result = (x + (x >> 8) + 0x80) >> 8 |
94 // result = (x + (x >> 8) + 0x80) >> 8 |
57 |
95 |
127 extern "C" void |
165 extern "C" void |
128 pixman_composite_scanline_over_asm_neon (int32_t w, |
166 pixman_composite_scanline_over_asm_neon (int32_t w, |
129 const uint32_t *dst, |
167 const uint32_t *dst, |
130 const uint32_t *src); |
168 const uint32_t *src); |
131 |
169 |
|
// 16-bit to 16-bit block copy routine with C linkage, defined outside this
// file (presumably in the pixman NEON assembly sources -- confirm).
extern "C" void pixman_composite_src_0565_0565_asm_neon(int32_t w, int32_t h,
                                                        uint16_t *dst, int32_t dst_stride,
                                                        uint16_t *src, int32_t src_stride);
|
177 |
132 // qblendfunctions.cpp |
178 // qblendfunctions.cpp |
133 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl, |
179 void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl, |
134 const uchar *srcPixels, int sbpl, |
180 const uchar *srcPixels, int sbpl, |
135 int w, int h, |
181 int w, int h, |
136 int const_alpha); |
182 int const_alpha); |
158 } |
204 } |
159 return; |
205 return; |
160 } |
206 } |
161 |
207 |
162 pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl); |
208 pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl); |
|
209 } |
|
210 |
|
211 // qblendfunctions.cpp |
|
212 void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl, |
|
213 const uchar *src, int sbpl, |
|
214 int w, int h, |
|
215 int const_alpha); |
|
216 |
|
217 template <int N> |
|
218 static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride) |
|
219 { |
|
220 if (N >= 2) { |
|
221 ((quint32 *)dst)[0] = ((quint32 *)src)[0]; |
|
222 __builtin_prefetch(dst + dstride, 1, 0); |
|
223 } |
|
224 for (int i = 1; i < N/2; ++i) |
|
225 ((quint32 *)dst)[i] = ((quint32 *)src)[i]; |
|
226 if (N & 1) |
|
227 dst[N-1] = src[N-1]; |
|
228 } |
|
229 |
|
230 template <int Width> |
|
231 static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h) |
|
232 { |
|
233 union { |
|
234 quintptr address; |
|
235 quint16 *pointer; |
|
236 } u; |
|
237 |
|
238 u.pointer = dst; |
|
239 |
|
240 if (u.address & 2) { |
|
241 while (h--) { |
|
242 // align dst |
|
243 dst[0] = src[0]; |
|
244 if (Width > 1) |
|
245 scanLineBlit16<Width-1>(dst + 1, src + 1, dstride); |
|
246 dst += dstride; |
|
247 src += sstride; |
|
248 } |
|
249 } else { |
|
250 while (h--) { |
|
251 scanLineBlit16<Width>(dst, src, dstride); |
|
252 |
|
253 dst += dstride; |
|
254 src += sstride; |
|
255 } |
|
256 } |
|
257 } |
|
258 |
|
259 void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl, |
|
260 const uchar *srcPixels, int sbpl, |
|
261 int w, int h, |
|
262 int const_alpha) |
|
263 { |
|
264 // testing show that the default memcpy is faster for widths 150 and up |
|
265 if (const_alpha != 256 || w >= 150) { |
|
266 qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha); |
|
267 return; |
|
268 } |
|
269 |
|
270 int dstride = dbpl / 2; |
|
271 int sstride = sbpl / 2; |
|
272 |
|
273 quint16 *dst = (quint16 *) destPixels; |
|
274 quint16 *src = (quint16 *) srcPixels; |
|
275 |
|
276 switch (w) { |
|
277 #define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return; |
|
278 BLOCKBLIT(1); |
|
279 BLOCKBLIT(2); |
|
280 BLOCKBLIT(3); |
|
281 BLOCKBLIT(4); |
|
282 BLOCKBLIT(5); |
|
283 BLOCKBLIT(6); |
|
284 BLOCKBLIT(7); |
|
285 BLOCKBLIT(8); |
|
286 BLOCKBLIT(9); |
|
287 BLOCKBLIT(10); |
|
288 BLOCKBLIT(11); |
|
289 BLOCKBLIT(12); |
|
290 BLOCKBLIT(13); |
|
291 BLOCKBLIT(14); |
|
292 BLOCKBLIT(15); |
|
293 #undef BLOCKBLIT |
|
294 default: |
|
295 break; |
|
296 } |
|
297 |
|
298 pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride); |
163 } |
299 } |
164 |
300 |
165 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha); |
301 extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha); |
166 |
302 |
167 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, |
303 void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl, |
620 for (;x < length; ++x) |
756 for (;x < length; ++x) |
621 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); |
757 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); |
622 } |
758 } |
623 } |
759 } |
624 |
760 |
|
761 void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha) |
|
762 { |
|
763 if (const_alpha == 255) { |
|
764 uint *const end = dst + length; |
|
765 uint *const neonEnd = end - 3; |
|
766 |
|
767 while (dst < neonEnd) { |
|
768 asm volatile ( |
|
769 "vld2.8 { d0, d1 }, [%[SRC]] !\n\t" |
|
770 "vld2.8 { d2, d3 }, [%[DST]]\n\t" |
|
771 "vqadd.u8 q0, q0, q1\n\t" |
|
772 "vst2.8 { d0, d1 }, [%[DST]] !\n\t" |
|
773 : [DST]"+r" (dst), [SRC]"+r" (src) |
|
774 : |
|
775 : "memory", "d0", "d1", "d2", "d3", "q0", "q1" |
|
776 ); |
|
777 } |
|
778 |
|
779 while (dst != end) { |
|
780 *dst = comp_func_Plus_one_pixel(*dst, *src); |
|
781 ++dst; |
|
782 ++src; |
|
783 } |
|
784 } else { |
|
785 int x = 0; |
|
786 const int one_minus_const_alpha = 255 - const_alpha; |
|
787 const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha); |
|
788 const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha); |
|
789 |
|
790 const uint16x8_t half = vdupq_n_u16(0x80); |
|
791 for (; x < length - 3; x += 4) { |
|
792 const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]); |
|
793 const uint8x16_t src8 = vreinterpretq_u8_u32(src32); |
|
794 uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]); |
|
795 uint8x16_t result = vqaddq_u8(dst8, src8); |
|
796 |
|
797 uint16x8_t result_low = vmovl_u8(vget_low_u8(result)); |
|
798 uint16x8_t result_high = vmovl_u8(vget_high_u8(result)); |
|
799 |
|
800 uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8)); |
|
801 uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8)); |
|
802 |
|
803 result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half); |
|
804 result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half); |
|
805 |
|
806 const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low)); |
|
807 const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high)); |
|
808 vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high)); |
|
809 } |
|
810 |
|
811 for (; x < length; ++x) |
|
812 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha); |
|
813 } |
|
814 } |
|
815 |
625 static const int tileSize = 32; |
816 static const int tileSize = 32; |
626 |
817 |
627 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count); |
818 extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count); |
628 |
819 |
629 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride) |
820 void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride) |