
#include <private/qdrawhelper_x86_p.h>

#ifdef QT_HAVE_SSE2

#include <private/qsimd_p.h>
#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qpaintengine_raster_p.h>

#ifdef QT_LINUXBASE
// this is an evil hack - the posix_memalign declaration in LSB
// is wrong - see http://bugs.linuxbase.org/show_bug.cgi?id=2431
# define posix_memalign _lsb_hack_posix_memalign
# include <emmintrin.h>
# undef posix_memalign
#else
# include <emmintrin.h>
#endif

QT_BEGIN_NAMESPACE

/*
 * Multiply the components of pixelVector by alphaChannel
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA
 * colorMask must have 0x00ff00ff in each 32-bit component
 * half must have the value 128 (0x80) in each 16-bit component
 */
#define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \
{ \
    /* 1. separate the colors in 2 vectors so each color is on 16 bits \
       (in order to be multiplied by the alpha) \
       each 32 bits of pixelVectorAG are in the form 0x00AA00GG \
       each 32 bits of pixelVectorRB are in the form 0x00RR00BB */\
    __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \
    __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \
    \
    /* 2. multiply the vectors by the alpha channel */\
    pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \
    pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \
    \
    /* 3. divide by 255, that's the tricky part. \
       we do it like for BYTE_MUL(), with bit shift: X/255 ~= (X + X/256 + rounding)/256 */ \
    /** so first (X + X/256 + rounding) */\
    pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \
    pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \
    pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \
    pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \
    \
    /** then divide by 256 */\
    pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \
    /** for AG, we could >> 8 to divide, followed by << 8 to put the \
        bytes back in the correct position. By masking instead, we execute \
        only one instruction */\
    pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \
    \
    /* 4. combine the 2 pairs of colors */ \
    result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \
}

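// For reference, a scalar sketch of the per-channel math BYTE_MUL_SSE2
// vectorizes (this helper is our own illustration, not part of the original
// file; it uses the same (X + X/256 + 0x80) / 256 approximation of X/255):
static inline uint qt_byte_mul_scalar_sketch(uint pixel, uint alpha)
{
    // R and B sit in the low byte of each 16-bit lane, so their products
    // with an 8-bit alpha fit in the lane without overflow
    uint rb = (pixel & 0x00ff00ff) * alpha;
    rb = ((rb + ((rb >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
    // A and G are shifted down first; the final mask also moves the result
    // back to the high bytes, mirroring the _mm_andnot_si128 trick above
    uint ag = ((pixel >> 8) & 0x00ff00ff) * alpha;
    ag = (ag + ((ag >> 8) & 0x00ff00ff) + 0x00800080) & 0xff00ff00;
    return ag | rb;
}
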
/*
 * Each 32-bit component of alphaChannel must be in the form 0x00AA00AA
 * oneMinusAlphaChannel must be 255 - alpha in each 16-bit component
 * colorMask must have 0x00ff00ff in each 32-bit component
 * half must have the value 128 (0x80) in each 16-bit component
 */
#define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \
    /* interpolate AG */\
    __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \
    __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \
    __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \
    __m128i dstVectorAGoneMinusAlpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \
    __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha); \
    finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \
    finalAG = _mm_add_epi16(finalAG, half); \
    finalAG = _mm_andnot_si128(colorMask, finalAG); \
    \
    /* interpolate RB */\
    __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \
    __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \
    __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \
    __m128i dstVectorRBoneMinusAlpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \
    __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha); \
    finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \
    finalRB = _mm_add_epi16(finalRB, half); \
    finalRB = _mm_srli_epi16(finalRB, 8); \
    \
    /* combine */\
    result = _mm_or_si128(finalAG, finalRB); \
}
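
// Likewise, a scalar sketch of what INTERPOLATE_PIXEL_255_SSE2 computes per
// 8-bit channel (our own illustration): (src * alpha + dst * (255 - alpha)) / 255,
// with the division by 255 approximated exactly as above. The lane sums stay
// below 2^16 only because alpha + oneMinusAlpha <= 255; the same bound keeps
// the 16-bit lanes of the SSE2 version from overflowing.
static inline uint qt_interpolate_pixel_255_scalar_sketch(uint src, uint alpha, uint dst, uint oneMinusAlpha)
{
    uint rb = (src & 0x00ff00ff) * alpha + (dst & 0x00ff00ff) * oneMinusAlpha;
    rb = ((rb + ((rb >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
    uint ag = ((src >> 8) & 0x00ff00ff) * alpha + ((dst >> 8) & 0x00ff00ff) * oneMinusAlpha;
    ag = (ag + ((ag >> 8) & 0x00ff00ff) + 0x00800080) & 0xff00ff00;
    return ag | rb;
}
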
void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        const __m128i nullVector = _mm_set1_epi32(0);
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
                const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                    // all opaque
                    _mm_storeu_si128((__m128i *)&dst[x], srcVector);
                } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                    // not fully transparent
                    // result = s + d * (1-alpha)

                    // extract the alpha channel on 2 x 16 bits
                    // so we have room for the multiplication
                    // each 32 bits will be in the form 0x00AA00AA
                    // with A being 255 - alpha
                    __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                    alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
                    alphaChannel = _mm_sub_epi16(one, alphaChannel);

                    const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
                    __m128i destMultipliedByOneMinusAlpha;
                    BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);

                    // result = s + d * (1-alpha)
                    const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                    _mm_storeu_si128((__m128i *)&dst[x], result);
                }
            }
            for (; x < w; ++x) {
                uint s = src[x];
                if (s >= 0xff000000)
                    dst[x] = s;
                else if (s != 0)
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
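        // In shorthand (dropping the /255 normalisations, as the line above
        // does), with sia = 255 - sa and cia = 255 - ca:
        //   (s + d*sia)*ca + d*cia = s*ca + d*(sia*ca + 255*cia)/255
        //                          = s*ca + d*(255*255 - sa*ca)/255
        //                          = s' + d*(255 - alpha(s'))  with s' = s*ca
        // so the loop below premultiplies the source by const_alpha once and
        // then performs a plain source-over blend with s'.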
        const __m128i nullVector = _mm_set1_epi32(0);
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
                    BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half);

                    __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                    alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));
                    alphaChannel = _mm_sub_epi16(one, alphaChannel);

                    const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
                    __m128i destMultipliedByOneMinusAlpha;
                    BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);

                    const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                    _mm_storeu_si128((__m128i *)&dst[x], result);
                }
            }
            for (; x < w; ++x) {
                quint32 s = src[x];
                if (s != 0) {
                    s = BYTE_MUL(s, const_alpha);
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
                }
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}
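
// Usage sketch (our own illustration, not part of the original file):
// dbpl and sbpl are byte strides per scanline, so padded images work too;
// const_alpha == 256 selects the plain source-over path, while 0..255
// applies an extra constant opacity on top of the per-pixel alpha.
static inline void qt_blend_argb32_usage_sketch(uchar *dstBits, int dstStride,
                                                const uchar *srcBits, int srcStride,
                                                int width, int height)
{
    qt_blend_argb32_on_argb32_sse2(dstBits, dstStride, srcBits, srcStride, width, height, 256);
}
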
        int one_minus_const_alpha = 255 - const_alpha;
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;

            // First, align dest to 16 bytes:
            // each pixel is 4 bytes, so a 16-byte boundary is at most
            // 3 pixels away; the expression below counts how many.
            const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3;
            const int prologLength = qMin(w, offsetToAlignOn16Bytes);
            for (; x < prologLength; ++x)
                dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);

            for (; x < w-3; x += 4) {
                __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]);
                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) {
                    // dst is 16-byte aligned thanks to the prolog above
                    const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                    __m128i result;
                    INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half);
                    _mm_store_si128((__m128i *)&dst[x], result);
                }
            }
            for (; x < w; ++x) {
                quint32 s = src[x];
                s = BYTE_MUL(s, const_alpha);
                dst[x] = s + BYTE_MUL(dst[x], one_minus_const_alpha);
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else {
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}

void QT_FASTCALL comp_func_SourceOver_sse2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
{
    Q_ASSERT(const_alpha < 256);

    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;

    const __m128i nullVector = _mm_set1_epi32(0);
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i one = _mm_set1_epi16(0xff);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
    if (const_alpha == 255) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask);
    } else {
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector);
    }
}
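
// Usage sketch (our own illustration): blend a span of premultiplied ARGB32
// pixels over the destination at 50% constant opacity; const_alpha is in
// 0..255 here, with 255 meaning no extra opacity.
static inline void qt_comp_func_source_over_usage_sketch(uint *dstSpan, const uint *srcSpan, int length)
{
    comp_func_SourceOver_sse2(dstSpan, srcSpan, length, 128);
}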

void qt_memfill32_sse2(quint32 *dest, quint32 value, int count)
{
    if (count < 7) {
        switch (count) {
        case 6: *dest++ = value;
        switch (rest) {
        case 3: dest[count - 3] = value;
        case 2: dest[count - 2] = value;
        case 1: dest[count - 1] = value;
        }
    }
}

void QT_FASTCALL comp_func_solid_SourceOver_sse2(uint *destPixels, int length, uint color, uint const_alpha)
{
    // qAlpha(color) and const_alpha are both at most 255, so their bitwise
    // AND equals 255 only when both are exactly 255, i.e. fully opaque.
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32_sse2(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        quint32 *dst = (quint32 *) destPixels;
        const __m128i colorVector = _mm_set1_epi32(color);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i minusAlphaOfColorVector = _mm_set1_epi16(minusAlphaOfColor);

        for (; x < length-3; x += 4) {
            __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]);
            BYTE_MUL_SSE2(dstVector, dstVector, minusAlphaOfColorVector, colorMask, half);
            dstVector = _mm_add_epi8(colorVector, dstVector);
            _mm_storeu_si128((__m128i *)&dst[x], dstVector);
        }
        for (; x < length; ++x)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
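
// Usage sketch (our own illustration): fill a span with premultiplied
// semi-transparent red (a = 0x80, r premultiplied to 0x80); alpha < 255,
// so this takes the blending loop rather than the memfill fast path.
static inline void qt_comp_func_solid_usage_sketch(uint *span, int length)
{
    comp_func_solid_SourceOver_sse2(span, length, 0x80800000, 255);
}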

void qt_memfill16_sse2(quint16 *dest, quint16 value, int count)
{