/****************************************************************************
**
** Copyright (C) 2009 Nokia Corporation and/or its subsidiary(-ies).
** All rights reserved.
** Contact: Nokia Corporation (qt-info@nokia.com)
**
** This file is part of the QtGui module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** No Commercial Usage
** This file contains pre-release code and may not be distributed.
** You may use this file in accordance with the terms and conditions
** contained in the Technology Preview License Agreement accompanying
** this package.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 2.1 requirements
** will be met: http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** In addition, as a special exception, Nokia gives you certain additional
** rights. These rights are described in the Nokia Qt LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** If you have questions regarding the use of this file, please contact
** Nokia at qt-info@nokia.com.
**
**
**
**
**
**
**
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include <private/qdrawhelper_p.h>

#ifdef QT_HAVE_NEON

#include <private/qdrawhelper_neon_p.h>
#include <arm_neon.h>

QT_BEGIN_NAMESPACE

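// Integer division of each 16-bit lane by 255 using the usual
// (x + (x >> 8) + 0x80) >> 8 approximation; 'half' must hold 0x80 in every
// lane. For example, 200 * 128 = 25600 gives (25600 + 100 + 0x80) >> 8 = 100,
// matching qRound(25600 / 255.0).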
static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half)
{
    // result = (x + (x >> 8) + 0x80) >> 8

    const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8
    const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80
    const int16x8_t sum = vaddq_s16(temp, sum_part);

    return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8));
}

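// NEON counterpart of BYTE_MUL: multiplies each 16-bit lane of x by the
// matching lane of alpha and divides the product by 255.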
static inline int16x8_t qvbyte_mul_s16(int16x8_t x, int16x8_t alpha, int16x8_t half)
{
    // t = qRound(x * alpha / 255.0)

    const int16x8_t t = vmulq_s16(x, alpha); // t
    return qvdiv_255_s16(t, half);
}

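// NEON counterpart of INTERPOLATE_PIXEL_255: blends the lanes of x and y
// with the weights a and b, i.e. (x * a + y * b) / 255 per 16-bit lane.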
static inline int16x8_t qvinterpolate_pixel_255(int16x8_t x, int16x8_t a, int16x8_t y, int16x8_t b, int16x8_t half)
{
    // t = x * a + y * b

    const int16x8_t ta = vmulq_s16(x, a);
    const int16x8_t tb = vmulq_s16(y, b);

    return qvdiv_255_s16(vaddq_s16(ta, tb), half);
}

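// Source-over composition of two premultiplied ARGB pixels held as 16-bit
// lanes: result = src + dst * (255 - src_alpha). The alpha of each pixel is
// broadcast from lane 3 of its half of the vector; 'full' must hold 0xff.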
static inline int16x8_t qvsource_over_s16(int16x8_t src16, int16x8_t dst16, int16x8_t half, int16x8_t full)
{
    const int16x4_t alpha16_high = vdup_lane_s16(vget_high_s16(src16), 3);
    const int16x4_t alpha16_low = vdup_lane_s16(vget_low_s16(src16), 3);

    const int16x8_t alpha16 = vsubq_s16(full, vcombine_s16(alpha16_low, alpha16_high));

    return vaddq_s16(src16, qvbyte_mul_s16(dst16, alpha16, half));
}

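// Source-over blend of a premultiplied ARGB32 source onto an ARGB32
// destination, four pixels per NEON iteration with a scalar loop finishing
// each scanline. const_alpha (0..256) additionally scales the source; the
// fully-opaque / fully-transparent shortcuts apply only when it is 256.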
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const uint *src = (const uint *) srcPixels;
    uint *dst = (uint *) destPixels;
    int16x8_t half = vdupq_n_s16(0x80);
    int16x8_t full = vdupq_n_s16(0xff);
    if (const_alpha == 256) {
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
                if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) {
                    // all opaque
                    vst1q_s32((int32_t *)&dst[x], src32);
                } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);

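                    // Unpack the four pixels to 16 bits per channel so the
                    // per-channel multiplies keep their full 16-bit results;
                    // they are narrowed back to 8 bits before the store below.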
                    const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
                    const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));

                    const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
                    const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));

                    const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full);
                    const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full);

                    const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
                    const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));

                    vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s >= 0xff000000)
                    dst[x] = s;
                else if (s != 0)
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
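        // const_alpha arrives in the 0..256 range; rescale it to a 0..255
        // byte factor, scale the source by it, then composite with
        // source-over exactly as in the branch above.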
        const_alpha = (const_alpha * 255) >> 8;
        int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
                    int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
                    const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));

                    const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
                    const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));

                    const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half);
                    const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half);

                    const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full);
                    const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full);

                    const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
                    const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));

                    vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s != 0) {
                    s = BYTE_MUL(s, const_alpha);
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
                }
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}

// defined in qblendfunctions.cpp
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

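// Blends an opaque RGB32 source onto an RGB32 destination. A partial
// const_alpha gives a straight interpolation between source and destination
// pixels; const_alpha == 256 falls through to qt_blend_rgb32_on_rgb32()
// declared above, and const_alpha == 0 leaves the destination untouched.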
void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h,
                                  int const_alpha)
{
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            const uint *src = (const uint *) srcPixels;
            uint *dst = (uint *) destPixels;
            int16x8_t half = vdupq_n_s16(0x80);
            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
            int16x8_t one_minus_const_alpha16 = vdupq_n_s16(255 - const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;
                for (; x < w-3; x += 4) {
                    int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
                    int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
                    const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));

                    const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
                    const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));

                    const int16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
                    const int16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);

                    const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
                    const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));

                    vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
                }
                for (; x<w; ++x) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
                }
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
    } else {
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}

QT_END_NAMESPACE

#endif // QT_HAVE_NEON