/*
 * Copyright (c) 2005
 *	Eric Anholt.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <emmintrin.h>
#include "liboil/liboilcolorspace.h"

35 #define SSE_FUNCTION __attribute__((force_align_arg_pointer)) |
|
36 |
|
37 /* non-SSE2 compositing support */ |
|
38 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m))) |
|
39 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s)) |
|
40 #define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m)) |
|
41 |
|
42 /* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with |
|
43 * the channel value in the low byte. This means 2 pixels per pass. |
|
44 */ |
|
45 |
|
46 #ifdef ENABLE_BROKEN_IMPLS |
|
47 |
|
/* Overlay that lets a 128-bit SSE constant be written as two 64-bit integer
 * literals in a static initializer.
 */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

/* Constant vectors used by the helpers below, each holding eight identical
 * 16-bit words:
 *   sse_8x00ff - 0x00ff in every word (XOR mask used by negate_argb_sse2)
 *   sse_8x0080 - 0x0080 in every word (rounding term used by muldiv_255_sse2)
 */
static const struct _SSEData {
  union m128_int sse_8x00ff;
  union m128_int sse_8x0080;
} c = {
  .sse_8x00ff.ull = {0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
  .sse_8x0080.ull = {0x0080008000800080ULL, 0x0080008000800080ULL},
};

/* MC(8x0080) yields the __m128i view of the constant named sse_8x0080. */
#define MC(x) (c.sse_##x.m128)

63 /* Shuffles the given value such that the alpha for each pixel appears in each |
|
64 * channel of the pixel. |
|
65 */ |
|
66 SSE_FUNCTION static inline __m128i |
|
67 argb_A_sse2(__m128i a) |
|
68 { |
|
69 a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3)); |
|
70 a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3)); |
|
71 return a; |
|
72 } |
|
73 |
|
74 /* Multiplies the pixel data in a channel-by-channel by b, and divides the |
|
75 * result by 255, with rounding. |
|
76 */ |
|
77 SSE_FUNCTION static inline __m128i |
|
78 muldiv_255_sse2(__m128i a, __m128i b) |
|
79 { |
|
80 __m128i ret; |
|
81 __m128i roundconst = MC(8x0080); |
|
82 |
|
83 ret = _mm_mullo_epi16(a, b); |
|
84 ret = _mm_adds_epu16(ret, roundconst); |
|
85 ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8)); |
|
86 ret = _mm_srli_epi16(ret, 8); |
|
87 |
|
88 return ret; |
|
89 } |
|
90 |
|
91 SSE_FUNCTION static inline __m128i |
|
92 negate_argb_sse2(__m128i a) |
|
93 { |
|
94 return _mm_xor_si128(a, MC(8x00ff)); |
|
95 } |
|
96 |
|
97 /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */ |
|
98 SSE_FUNCTION static inline __m128i |
|
99 load_argb_sse2(const uint32_t *src) |
|
100 { |
|
101 __m128i pix; |
|
102 |
|
103 pix = _mm_loadl_epi64((__m128i *)src); |
|
104 pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128()); |
|
105 return pix; |
|
106 } |
|
107 |
|
108 SSE_FUNCTION static inline __m128i |
|
109 set1_argb_sse2(uint32_t src) |
|
110 { |
|
111 __m128i pix; |
|
112 |
|
113 pix = _mm_set1_epi32(src); |
|
114 pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128()); |
|
115 return pix; |
|
116 } |
|
117 |
|
118 SSE_FUNCTION static inline __m128i |
|
119 load_u8_mask(const uint8_t *m) |
|
120 { |
|
121 return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1])); |
|
122 } |
|
123 |
|
124 SSE_FUNCTION static inline __m128i |
|
125 set1_u8_mask(uint8_t m) |
|
126 { |
|
127 return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128()); |
|
128 } |
|
129 |
|
130 /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */ |
|
131 SSE_FUNCTION static void |
|
132 store_argb_sse2(uint32_t *dest, __m128i pix) |
|
133 { |
|
134 pix = _mm_packus_epi16(pix, pix); |
|
135 _mm_storel_epi64((__m128i *)dest, pix); |
|
136 } |
|
137 |
|
138 SSE_FUNCTION static __m128i |
|
139 over_argb_sse2(__m128i dest, __m128i src, __m128i srca) |
|
140 { |
|
141 return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca))); |
|
142 } |
|
143 |
|
144 SSE_FUNCTION static void |
|
145 composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src, |
|
146 const uint8_t *mask, int n) |
|
147 { |
|
148 for (; n >= 2; n -= 2) { |
|
149 __m128i s, m; |
|
150 s = load_argb_sse2(src); |
|
151 m = load_u8_mask(mask); |
|
152 store_argb_sse2(dest, muldiv_255_sse2(s, m)); |
|
153 src += 2; |
|
154 mask += 2; |
|
155 dest += 2; |
|
156 } |
|
157 for (; n > 0; n--) { |
|
158 uint32_t s = *src++; |
|
159 uint8_t m = *mask++; |
|
160 |
|
161 *dest++ = oil_argb( |
|
162 COMPOSITE_IN(oil_argb_A(s), m), |
|
163 COMPOSITE_IN(oil_argb_R(s), m), |
|
164 COMPOSITE_IN(oil_argb_G(s), m), |
|
165 COMPOSITE_IN(oil_argb_B(s), m)); |
|
166 } |
|
167 } |
|
168 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb, |
|
169 OIL_IMPL_FLAG_SSE2); |
|
170 |
|
171 SSE_FUNCTION static void |
|
172 composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, |
|
173 const uint8_t *mask, int n) |
|
174 { |
|
175 __m128i s; |
|
176 |
|
177 s = set1_argb_sse2(*src); |
|
178 |
|
179 for (; n >= 2; n -= 2) { |
|
180 __m128i m; |
|
181 m = load_u8_mask(mask); |
|
182 store_argb_sse2(dest, muldiv_255_sse2(s, m)); |
|
183 mask += 2; |
|
184 dest += 2; |
|
185 } |
|
186 for (; n > 0; n--) { |
|
187 uint8_t m = *mask++; |
|
188 |
|
189 *dest++ = oil_argb( |
|
190 COMPOSITE_IN(oil_argb_A(*src), m), |
|
191 COMPOSITE_IN(oil_argb_R(*src), m), |
|
192 COMPOSITE_IN(oil_argb_G(*src), m), |
|
193 COMPOSITE_IN(oil_argb_B(*src), m)); |
|
194 } |
|
195 } |
|
196 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix, |
|
197 composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2); |
|
198 |
|
/* NOTE(review): SSE_ALIGN is not defined anywhere in this file, so this
 * implementation is only compiled if the build defines it - confirm whether
 * the guard is intentional.
 */
#ifdef SSE_ALIGN
/* IN operator with a constant mask: dest[i] = src[i] IN (*mask) for each of
 * the n pixels.
 *
 * dest: n output pixels.
 * src:  n input pixels.
 * mask: pointer to the single 8-bit mask value; it is read once, before any
 *       output is written, and is read even when n <= 0.
 * n:    number of pixels; values <= 0 write nothing.
 */
SSE_FUNCTION static void
composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  /* Read the constant once so the vector and scalar paths agree on it. */
  const uint8_t mask_val = *mask;
  __m128i m;

  m = set1_u8_mask(mask_val);

  /* Vector loop: two pixels per pass. */
  for (; n >= 2; n -= 2) {
    __m128i s;
    s = load_argb_sse2(src);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    src += 2;
    dest += 2;
  }
  /* Scalar path for the remaining pixel, if n was odd. */
  for (; n > 0; n--) {
    uint32_t s = *src++;

    *dest++ = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), mask_val),
        COMPOSITE_IN(oil_argb_R(s), mask_val),
        COMPOSITE_IN(oil_argb_G(s), mask_val),
        COMPOSITE_IN(oil_argb_B(s), mask_val));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
#endif

229 SSE_FUNCTION static void |
|
230 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n) |
|
231 { |
|
232 for (; n >= 2; n -= 2) { |
|
233 __m128i d, s; |
|
234 s = load_argb_sse2(src); |
|
235 d = load_argb_sse2(dest); |
|
236 d = over_argb_sse2(d, s, argb_A_sse2(s)); |
|
237 store_argb_sse2(dest, d); |
|
238 src += 2; |
|
239 dest += 2; |
|
240 } |
|
241 for (; n > 0; n--) { |
|
242 uint32_t d = *dest, s = *src++; |
|
243 uint8_t srca = oil_argb_A(s); |
|
244 d = oil_argb( |
|
245 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca), |
|
246 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca), |
|
247 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca), |
|
248 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca)); |
|
249 *dest++ = d; |
|
250 } |
|
251 } |
|
252 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb, |
|
253 OIL_IMPL_FLAG_SSE2); |
|
254 |
|
255 SSE_FUNCTION static void |
|
256 composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, |
|
257 int n) |
|
258 { |
|
259 __m128i s, sa; |
|
260 uint32_t srca; |
|
261 |
|
262 srca = oil_argb_A(*src); |
|
263 s = set1_argb_sse2(*src); |
|
264 sa = negate_argb_sse2(argb_A_sse2(s)); |
|
265 for (; n >= 2; n -= 2) { |
|
266 __m128i d; |
|
267 d = load_argb_sse2(dest); |
|
268 d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa)); |
|
269 store_argb_sse2(dest, d); |
|
270 dest += 2; |
|
271 } |
|
272 for (; n > 0; n--) { |
|
273 uint32_t d = *dest; |
|
274 d = oil_argb( |
|
275 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca), |
|
276 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca), |
|
277 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca), |
|
278 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca)); |
|
279 *dest++ = d; |
|
280 } |
|
281 } |
|
282 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix, |
|
283 composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2); |
|
284 |
|
285 SSE_FUNCTION static void |
|
286 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, |
|
287 const uint8_t *mask, int n) |
|
288 { |
|
289 for (; n >= 2; n -= 2) { |
|
290 __m128i d, s, m; |
|
291 s = load_argb_sse2(src); |
|
292 m = load_u8_mask(mask); |
|
293 d = load_argb_sse2(dest); |
|
294 s = muldiv_255_sse2(s, m); |
|
295 d = over_argb_sse2(d, s, argb_A_sse2(s)); |
|
296 store_argb_sse2(dest, d); |
|
297 src += 2; |
|
298 mask += 2; |
|
299 dest += 2; |
|
300 } |
|
301 for (; n > 0; n--) { |
|
302 uint32_t d = *dest, s = *src++, m = *mask++, color; |
|
303 uint8_t srca; |
|
304 |
|
305 color = oil_argb( |
|
306 COMPOSITE_IN(oil_argb_A(s), m), |
|
307 COMPOSITE_IN(oil_argb_R(s), m), |
|
308 COMPOSITE_IN(oil_argb_G(s), m), |
|
309 COMPOSITE_IN(oil_argb_B(s), m)); |
|
310 srca = oil_argb_A(color); |
|
311 d = oil_argb( |
|
312 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), |
|
313 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), |
|
314 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), |
|
315 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); |
|
316 *dest++ = d; |
|
317 } |
|
318 } |
|
319 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb, |
|
320 OIL_IMPL_FLAG_SSE2); |
|
321 |
|
322 SSE_FUNCTION static void |
|
323 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src, |
|
324 const uint8_t *mask, int n) |
|
325 { |
|
326 __m128i s; |
|
327 |
|
328 s = set1_argb_sse2(*src); |
|
329 |
|
330 for (; n >= 2; n -= 2) { |
|
331 __m128i d, color, m; |
|
332 m = load_u8_mask(mask); |
|
333 d = load_argb_sse2(dest); |
|
334 color = muldiv_255_sse2(s, m); |
|
335 d = over_argb_sse2(d, color, argb_A_sse2(color)); |
|
336 store_argb_sse2(dest, d); |
|
337 mask += 2; |
|
338 dest += 2; |
|
339 } |
|
340 for (; n > 0; n--) { |
|
341 uint32_t d = *dest, m = *mask++, color; |
|
342 uint8_t srca; |
|
343 |
|
344 color = oil_argb( |
|
345 COMPOSITE_IN(oil_argb_A(*src), m), |
|
346 COMPOSITE_IN(oil_argb_R(*src), m), |
|
347 COMPOSITE_IN(oil_argb_G(*src), m), |
|
348 COMPOSITE_IN(oil_argb_B(*src), m)); |
|
349 srca = oil_argb_A(color); |
|
350 d = oil_argb( |
|
351 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), |
|
352 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), |
|
353 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), |
|
354 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); |
|
355 *dest++ = d; |
|
356 } |
|
357 } |
|
358 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix, |
|
359 composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2); |
|
360 |
|
361 SSE_FUNCTION static void |
|
362 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src, |
|
363 const uint8_t *mask, int n) |
|
364 { |
|
365 __m128i m; |
|
366 |
|
367 m = set1_u8_mask(*mask); |
|
368 |
|
369 for (; n >= 2; n -= 2) { |
|
370 __m128i d, s; |
|
371 s = load_argb_sse2(src); |
|
372 d = load_argb_sse2(dest); |
|
373 s = muldiv_255_sse2(s, m); |
|
374 d = over_argb_sse2(d, s, argb_A_sse2(s)); |
|
375 store_argb_sse2(dest, d); |
|
376 src += 2; |
|
377 dest += 2; |
|
378 } |
|
379 for (; n > 0; n--) { |
|
380 uint32_t d = *dest, s = *src++, color; |
|
381 uint8_t srca; |
|
382 |
|
383 color = oil_argb( |
|
384 COMPOSITE_IN(oil_argb_A(s), *mask), |
|
385 COMPOSITE_IN(oil_argb_R(s), *mask), |
|
386 COMPOSITE_IN(oil_argb_G(s), *mask), |
|
387 COMPOSITE_IN(oil_argb_B(s), *mask)); |
|
388 srca = oil_argb_A(color); |
|
389 d = oil_argb( |
|
390 COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca), |
|
391 COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca), |
|
392 COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca), |
|
393 COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca)); |
|
394 *dest++ = d; |
|
395 } |
|
396 } |
|
397 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix, |
|
398 composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2); |
|
399 |
|
400 SSE_FUNCTION static void |
|
401 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n) |
|
402 { |
|
403 /* Initial operations to align the destination pointer */ |
|
404 for (; ((long)dest & 15) && (n > 0); n--) { |
|
405 *dest = COMPOSITE_OVER(*dest, *src, *src); |
|
406 src++; |
|
407 dest++; |
|
408 } |
|
409 /* over_u8 can be dealt with using our argb code, with srca = s */ |
|
410 for (; n >= 8; n -= 8) { |
|
411 __m128i d, s; |
|
412 d = load_argb_sse2((uint32_t *)dest); |
|
413 s = load_argb_sse2((uint32_t *)src); |
|
414 store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s)); |
|
415 src += 8; |
|
416 dest += 8; |
|
417 } |
|
418 for (; n > 0; n--) { |
|
419 *dest = COMPOSITE_OVER(*dest, *src, *src); |
|
420 src++; |
|
421 dest++; |
|
422 } |
|
423 } |
|
424 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8, |
|
425 OIL_IMPL_FLAG_SSE2); |
|
426 #endif |
|
427 |