110 const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha); |
109 const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha); |
111 for (int y = 0; y < h; ++y) { |
110 for (int y = 0; y < h; ++y) { |
112 int x = 0; |
111 int x = 0; |
113 |
112 |
114 // First, align dest to 16 bytes: |
113 // First, align dest to 16 bytes: |
115 const int offsetToAlignOn16Bytes = (4 - ((reinterpret_cast<quintptr>(dst) >> 2) & 0x3)) & 0x3; |
114 ALIGNMENT_PROLOGUE_16BYTES(dst, x, w) { |
116 const int prologLength = qMin(w, offsetToAlignOn16Bytes); |
|
117 for (; x < prologLength; ++x) { |
|
118 quint32 s = src[x]; |
115 quint32 s = src[x]; |
119 s = BYTE_MUL(s, const_alpha); |
116 s = BYTE_MUL(s, const_alpha); |
120 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha); |
117 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha); |
121 } |
118 } |
122 |
119 |
159 const __m128i alphaMask = _mm_set1_epi32(0xff000000); |
155 const __m128i alphaMask = _mm_set1_epi32(0xff000000); |
160 BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask); |
156 BLEND_SOURCE_OVER_ARGB32_SSE2(dst, src, length, nullVector, half, one, colorMask, alphaMask); |
161 } else { |
157 } else { |
162 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
158 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
163 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector); |
159 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, length, nullVector, half, one, colorMask, constAlphaVector); |
|
160 } |
|
161 } |
|
162 |
|
163 void QT_FASTCALL comp_func_Plus_sse2(uint *dst, const uint *src, int length, uint const_alpha) |
|
164 { |
|
165 int x = 0; |
|
166 |
|
167 if (const_alpha == 255) { |
|
168 // 1) Prologue: align destination on 16 bytes |
|
169 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
|
170 dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]); |
|
171 |
|
172 // 2) composition with SSE2 |
|
173 for (; x < length - 3; x += 4) { |
|
174 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); |
|
175 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); |
|
176 |
|
177 const __m128i result = _mm_adds_epu8(srcVector, dstVector); |
|
178 _mm_store_si128((__m128i *)&dst[x], result); |
|
179 } |
|
180 |
|
181 // 3) Epilogue: |
|
182 for (; x < length; ++x) |
|
183 dst[x] = comp_func_Plus_one_pixel(dst[x], src[x]); |
|
184 } else { |
|
185 const int one_minus_const_alpha = 255 - const_alpha; |
|
186 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
|
187 const __m128i oneMinusConstAlpha = _mm_set1_epi16(one_minus_const_alpha); |
|
188 |
|
189 // 1) Prologue: align destination on 16 bytes |
|
190 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
|
191 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha); |
|
192 |
|
193 const __m128i half = _mm_set1_epi16(0x80); |
|
194 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
|
195 // 2) composition with SSE2 |
|
196 for (; x < length - 3; x += 4) { |
|
197 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); |
|
198 const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); |
|
199 |
|
200 __m128i result = _mm_adds_epu8(srcVector, dstVector); |
|
201 INTERPOLATE_PIXEL_255_SSE2(result, result, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half) |
|
202 _mm_store_si128((__m128i *)&dst[x], result); |
|
203 } |
|
204 |
|
205 // 3) Epilogue: |
|
206 for (; x < length; ++x) |
|
207 dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha); |
|
208 } |
|
209 } |
|
210 |
|
211 void QT_FASTCALL comp_func_Source_sse2(uint *dst, const uint *src, int length, uint const_alpha) |
|
212 { |
|
213 if (const_alpha == 255) { |
|
214 ::memcpy(dst, src, length * sizeof(uint)); |
|
215 } else { |
|
216 const int ialpha = 255 - const_alpha; |
|
217 |
|
218 int x = 0; |
|
219 |
|
220 // 1) prologue, align on 16 bytes |
|
221 ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) |
|
222 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); |
|
223 |
|
224 // 2) interpolate pixels with SSE2 |
|
225 const __m128i half = _mm_set1_epi16(0x80); |
|
226 const __m128i colorMask = _mm_set1_epi32(0x00ff00ff); |
|
227 const __m128i constAlphaVector = _mm_set1_epi16(const_alpha); |
|
228 const __m128i oneMinusConstAlpha = _mm_set1_epi16(ialpha); |
|
229 for (; x < length - 3; x += 4) { |
|
230 const __m128i srcVector = _mm_loadu_si128((__m128i *)&src[x]); |
|
231 __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); |
|
232 INTERPOLATE_PIXEL_255_SSE2(dstVector, srcVector, dstVector, constAlphaVector, oneMinusConstAlpha, colorMask, half) |
|
233 _mm_store_si128((__m128i *)&dst[x], dstVector); |
|
234 } |
|
235 |
|
236 // 3) Epilogue |
|
237 for (; x < length; ++x) |
|
238 dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], ialpha); |
164 } |
239 } |
165 } |
240 } |
166 |
241 |
167 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count) |
242 void qt_memfill32_sse2(quint32 *dest, quint32 value, int count) |
168 { |
243 { |
233 } |
308 } |
234 for (;x < length; ++x) |
309 for (;x < length; ++x) |
235 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); |
310 destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor); |
236 } |
311 } |
237 } |
312 } |
|
313 |
|
314 CompositionFunctionSolid qt_functionForModeSolid_onlySSE2[numCompositionFunctions] = { |
|
315 comp_func_solid_SourceOver_sse2, |
|
316 comp_func_solid_DestinationOver, |
|
317 comp_func_solid_Clear, |
|
318 comp_func_solid_Source, |
|
319 comp_func_solid_Destination, |
|
320 comp_func_solid_SourceIn, |
|
321 comp_func_solid_DestinationIn, |
|
322 comp_func_solid_SourceOut, |
|
323 comp_func_solid_DestinationOut, |
|
324 comp_func_solid_SourceAtop, |
|
325 comp_func_solid_DestinationAtop, |
|
326 comp_func_solid_XOR, |
|
327 comp_func_solid_Plus, |
|
328 comp_func_solid_Multiply, |
|
329 comp_func_solid_Screen, |
|
330 comp_func_solid_Overlay, |
|
331 comp_func_solid_Darken, |
|
332 comp_func_solid_Lighten, |
|
333 comp_func_solid_ColorDodge, |
|
334 comp_func_solid_ColorBurn, |
|
335 comp_func_solid_HardLight, |
|
336 comp_func_solid_SoftLight, |
|
337 comp_func_solid_Difference, |
|
338 comp_func_solid_Exclusion, |
|
339 rasterop_solid_SourceOrDestination, |
|
340 rasterop_solid_SourceAndDestination, |
|
341 rasterop_solid_SourceXorDestination, |
|
342 rasterop_solid_NotSourceAndNotDestination, |
|
343 rasterop_solid_NotSourceOrNotDestination, |
|
344 rasterop_solid_NotSourceXorDestination, |
|
345 rasterop_solid_NotSource, |
|
346 rasterop_solid_NotSourceAndDestination, |
|
347 rasterop_solid_SourceAndNotDestination |
|
348 }; |
|
349 |
|
350 CompositionFunction qt_functionForMode_onlySSE2[numCompositionFunctions] = { |
|
351 comp_func_SourceOver_sse2, |
|
352 comp_func_DestinationOver, |
|
353 comp_func_Clear, |
|
354 comp_func_Source_sse2, |
|
355 comp_func_Destination, |
|
356 comp_func_SourceIn, |
|
357 comp_func_DestinationIn, |
|
358 comp_func_SourceOut, |
|
359 comp_func_DestinationOut, |
|
360 comp_func_SourceAtop, |
|
361 comp_func_DestinationAtop, |
|
362 comp_func_XOR, |
|
363 comp_func_Plus_sse2, |
|
364 comp_func_Multiply, |
|
365 comp_func_Screen, |
|
366 comp_func_Overlay, |
|
367 comp_func_Darken, |
|
368 comp_func_Lighten, |
|
369 comp_func_ColorDodge, |
|
370 comp_func_ColorBurn, |
|
371 comp_func_HardLight, |
|
372 comp_func_SoftLight, |
|
373 comp_func_Difference, |
|
374 comp_func_Exclusion, |
|
375 rasterop_SourceOrDestination, |
|
376 rasterop_SourceAndDestination, |
|
377 rasterop_SourceXorDestination, |
|
378 rasterop_NotSourceAndNotDestination, |
|
379 rasterop_NotSourceOrNotDestination, |
|
380 rasterop_NotSourceXorDestination, |
|
381 rasterop_NotSource, |
|
382 rasterop_NotSourceAndDestination, |
|
383 rasterop_SourceAndNotDestination |
|
384 }; |
238 |
385 |
239 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count) |
386 void qt_memfill16_sse2(quint16 *dest, quint16 value, int count) |
240 { |
387 { |
241 if (count < 3) { |
388 if (count < 3) { |
242 switch (count) { |
389 switch (count) { |