54 #else |
54 #else |
55 # include <emmintrin.h> |
55 # include <emmintrin.h> |
56 #endif |
56 #endif |
57 |
57 |
58 QT_BEGIN_NAMESPACE |
58 QT_BEGIN_NAMESPACE |
|
59 |
|
/*
 * Multiply each 8-bit component of pixelVector by the alpha held in
 * alphaChannel and divide by 255 (see step 3 for the approximation used).
 * The four resulting pixels are assigned to result.
 *
 * Each 32-bit element of alphaChannel must be in the form 0x00AA00AA.
 * colorMask must hold 0x00ff00ff in each 32-bit element.
 * half must hold 128 (0x0080) in each 16-bit lane, i.e. 0x00800080 per
 * 32-bit element: it is added lane-wise as the rounding term.
 *
 * result may name the same variable as pixelVector; the inputs are copied
 * into temporaries before result is written. The input arguments are
 * expanded more than once, so pass side-effect-free expressions only.
 */
#define BYTE_MUL_SSE2(result, pixelVector, alphaChannel, colorMask, half) \
{ \
    /* 1. Separate the colors into 2 vectors so that each color sits on \
          16 bits and has room to be multiplied by the alpha: \
          each 32 bits of pixelVectorAG are in the form 0x00AA00GG, \
          each 32 bits of pixelVectorRB are in the form 0x00RR00BB. */ \
    __m128i pixelVectorAG = _mm_srli_epi16(pixelVector, 8); \
    __m128i pixelVectorRB = _mm_and_si128(pixelVector, colorMask); \
    \
    /* 2. Multiply the vectors by the alpha channel. */ \
    pixelVectorAG = _mm_mullo_epi16(pixelVectorAG, alphaChannel); \
    pixelVectorRB = _mm_mullo_epi16(pixelVectorRB, alphaChannel); \
    \
    /* 3. Divide by 255; that's the tricky part. \
          As for BYTE_MUL(), it is done with bit shifts: \
          X/255 ~= (X + X/256 + rounding)/256. */ \
    /* First compute (X + X/256 + rounding). */ \
    pixelVectorRB = _mm_add_epi16(pixelVectorRB, _mm_srli_epi16(pixelVectorRB, 8)); \
    pixelVectorRB = _mm_add_epi16(pixelVectorRB, half); \
    pixelVectorAG = _mm_add_epi16(pixelVectorAG, _mm_srli_epi16(pixelVectorAG, 8)); \
    pixelVectorAG = _mm_add_epi16(pixelVectorAG, half); \
    \
    /* Then divide by 256. */ \
    pixelVectorRB = _mm_srli_epi16(pixelVectorRB, 8); \
    /* For AG we could >> 8 to divide, followed by << 8 to put the bytes \
       back in the correct position. Masking out the low bytes instead \
       gives the same bits with a single instruction. */ \
    pixelVectorAG = _mm_andnot_si128(colorMask, pixelVectorAG); \
    \
    /* 4. Combine the 2 pairs of colors. */ \
    result = _mm_or_si128(pixelVectorAG, pixelVectorRB); \
}
|
97 |
|
/*
 * Per 8-bit component, compute
 *     result = (src * alpha + dst * oneMinusAlpha) / 255
 * for the four pixels of srcVector and dstVector, using the same
 * (X + X/256 + rounding)/256 approximation of X/255 as BYTE_MUL_SSE2.
 *
 * Each 32-bit element of alphaChannel must be in the form 0x00AA00AA.
 * oneMinusAlphaChannel must hold 255 - alpha in each 16-bit lane
 * (same 0x00XX00XX layout as alphaChannel).
 * colorMask must hold 0x00ff00ff in each 32-bit element.
 * half must hold 128 (0x0080) in each 16-bit lane, i.e. 0x00800080 per
 * 32-bit element: it is added lane-wise as the rounding term.
 *
 * The input arguments are expanded more than once, so pass
 * side-effect-free expressions only.
 */
#define INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, alphaChannel, oneMinusAlphaChannel, colorMask, half) { \
    /* Interpolate AG: shift alpha/green down into the low byte of each \
       16-bit lane, blend, and keep the quotient in the high byte. */ \
    __m128i srcVectorAG = _mm_srli_epi16(srcVector, 8); \
    __m128i dstVectorAG = _mm_srli_epi16(dstVector, 8); \
    __m128i srcVectorAGalpha = _mm_mullo_epi16(srcVectorAG, alphaChannel); \
    __m128i dstVectorAGoneMinusAlpha = _mm_mullo_epi16(dstVectorAG, oneMinusAlphaChannel); \
    __m128i finalAG = _mm_add_epi16(srcVectorAGalpha, dstVectorAGoneMinusAlpha); \
    finalAG = _mm_add_epi16(finalAG, _mm_srli_epi16(finalAG, 8)); \
    finalAG = _mm_add_epi16(finalAG, half); \
    /* Masking replaces ">> 8 then << 8": the high byte already is the \
       quotient, sitting in its final position. */ \
    finalAG = _mm_andnot_si128(colorMask, finalAG); \
    \
    /* Interpolate RB: red/blue already sit in the low byte of each lane. */ \
    __m128i srcVectorRB = _mm_and_si128(srcVector, colorMask); \
    __m128i dstVectorRB = _mm_and_si128(dstVector, colorMask); \
    __m128i srcVectorRBalpha = _mm_mullo_epi16(srcVectorRB, alphaChannel); \
    __m128i dstVectorRBoneMinusAlpha = _mm_mullo_epi16(dstVectorRB, oneMinusAlphaChannel); \
    __m128i finalRB = _mm_add_epi16(srcVectorRBalpha, dstVectorRBoneMinusAlpha); \
    finalRB = _mm_add_epi16(finalRB, _mm_srli_epi16(finalRB, 8)); \
    finalRB = _mm_add_epi16(finalRB, half); \
    finalRB = _mm_srli_epi16(finalRB, 8); \
    \
    /* Combine the two halves back into packed 32-bit pixels. */ \
    result = _mm_or_si128(finalAG, finalRB); \
}
|
128 |
|
129 void qt_blend_argb32_on_argb32_sse2(uchar *destPixels, int dbpl, |
|
130 const uchar *srcPixels, int sbpl, |
|
131 int w, int h, |
|
132 int const_alpha) |
|
133 { |
|
134 const quint32 *src = (const quint32 *) srcPixels; |
|
135 quint32 *dst = (uint *) destPixels; |
|
136 if (const_alpha == 256) { |
|
137 const __m128i alphaMask = _mm_set1_epi32(0xff000000); |
|
138 const __m128i nullVector = _mm_set1_epi32(0); |
|
139 const __m128i half = _mm_set1_epi16(0x80); |
|
140 const __m128i one = _mm_set1_epi16(0xff); |
|
141 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
|
142 for (int y = 0; y < h; ++y) { |
|
143 int x = 0; |
|
144 for (; x < w-3; x += 4) { |
|
145 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); |
|
146 const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); |
|
147 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { |
|
148 // all opaque |
|
149 _mm_storeu_si128((__m128i *)&dst[x], srcVector); |
|
150 } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { |
|
151 // not fully transparent |
|
152 // result = s + d * (1-alpha) |
|
153 |
|
154 // extract the alpha channel on 2 x 16 bits |
|
155 // so we have room for the multiplication |
|
156 // each 32 bits will be in the form 0x00AA00AA |
|
157 // with A being the 1 - alpha |
|
158 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); |
|
159 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); |
|
160 alphaChannel = _mm_sub_epi16(one, alphaChannel); |
|
161 |
|
162 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); |
|
163 __m128i destMultipliedByOneMinusAlpha; |
|
164 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); |
|
165 |
|
166 // result = s + d * (1-alpha) |
|
167 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); |
|
168 _mm_storeu_si128((__m128i *)&dst[x], result); |
|
169 } |
|
170 } |
|
171 for (; x<w; ++x) { |
|
172 uint s = src[x]; |
|
173 if (s >= 0xff000000) |
|
174 dst[x] = s; |
|
175 else if (s != 0) |
|
176 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); |
|
177 } |
|
178 dst = (quint32 *)(((uchar *) dst) + dbpl); |
|
179 src = (const quint32 *)(((const uchar *) src) + sbpl); |
|
180 } |
|
181 } else if (const_alpha != 0) { |
|
182 // dest = (s + d * sia) * ca + d * cia |
|
183 // = s * ca + d * (sia * ca + cia) |
|
184 // = s * ca + d * (1 - sa*ca) |
|
185 const_alpha = (const_alpha * 255) >> 8; |
|
186 const __m128i nullVector = _mm_set1_epi32(0); |
|
187 const __m128i half = _mm_set1_epi16(0x80); |
|
188 const __m128i one = _mm_set1_epi16(0xff); |
|
189 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
|
190 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
|
191 for (int y = 0; y < h; ++y) { |
|
192 int x = 0; |
|
193 for (; x < w-3; x += 4) { |
|
194 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); |
|
195 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { |
|
196 BYTE_MUL_SSE2(srcVector, srcVector, constAlphaVector, colorMask, half); |
|
197 |
|
198 __m128i alphaChannel = _mm_srli_epi32(srcVector, 24); |
|
199 alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16)); |
|
200 alphaChannel = _mm_sub_epi16(one, alphaChannel); |
|
201 |
|
202 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); |
|
203 __m128i destMultipliedByOneMinusAlpha; |
|
204 BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); |
|
205 |
|
206 const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); |
|
207 _mm_storeu_si128((__m128i *)&dst[x], result); |
|
208 } |
|
209 } |
|
210 for (; x<w; ++x) { |
|
211 quint32 s = src[x]; |
|
212 if (s != 0) { |
|
213 s = BYTE_MUL(s, const_alpha); |
|
214 dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); |
|
215 } |
|
216 } |
|
217 dst = (quint32 *)(((uchar *) dst) + dbpl); |
|
218 src = (const quint32 *)(((const uchar *) src) + sbpl); |
|
219 } |
|
220 } |
|
221 } |
|
222 |
|
223 // qblendfunctions.cpp |
|
224 void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl, |
|
225 const uchar *srcPixels, int sbpl, |
|
226 int w, int h, |
|
227 int const_alpha); |
|
228 |
|
229 void qt_blend_rgb32_on_rgb32_sse2(uchar *destPixels, int dbpl, |
|
230 const uchar *srcPixels, int sbpl, |
|
231 int w, int h, |
|
232 int const_alpha) |
|
233 { |
|
234 const quint32 *src = (const quint32 *) srcPixels; |
|
235 quint32 *dst = (uint *) destPixels; |
|
236 if (const_alpha != 256) { |
|
237 if (const_alpha != 0) { |
|
238 const __m128i nullVector = _mm_set1_epi32(0); |
|
239 const __m128i half = _mm_set1_epi16(0x80); |
|
240 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
|
241 |
|
242 const_alpha = (const_alpha * 255) >> 8; |
|
243 int one_minus_const_alpha = 255 - const_alpha; |
|
244 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
|
245 const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha); |
|
246 for (int y = 0; y < h; ++y) { |
|
247 int x = 0; |
|
248 for (; x < w-3; x += 4) { |
|
249 __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); |
|
250 if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVector, nullVector)) != 0xffff) { |
|
251 const __m128i dstVector = _mm_loadu_si128((__m128i *)&dst[x]); |
|
252 __m128i result; |
|
253 INTERPOLATE_PIXEL_255_SSE2(result, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half); |
|
254 _mm_storeu_si128((__m128i *)&dst[x], result); |
|
255 } |
|
256 } |
|
257 for (; x<w; ++x) { |
|
258 quint32 s = src[x]; |
|
259 s = BYTE_MUL(s, const_alpha); |
|
260 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha); |
|
261 } |
|
262 dst = (quint32 *)(((uchar *) dst) + dbpl); |
|
263 src = (const quint32 *)(((const uchar *) src) + sbpl); |
|
264 } |
|
265 } |
|
266 } else { |
|
267 qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha); |
|
268 } |
|
269 } |
59 |
270 |
60 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count) |
271 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count) |
61 { |
272 { |
62 if (count < 7) { |
273 if (count < 7) { |
63 switch (count) { |
274 switch (count) { |