|
1 /* |
|
2 * Copyright (c) 2005 |
|
3 * Eric Anholt. All rights reserved. |
|
4 * |
|
5 * Redistribution and use in source and binary forms, with or without |
|
6 * modification, are permitted provided that the following conditions |
|
7 * are met: |
|
8 * 1. Redistributions of source code must retain the above copyright |
|
9 * notice, this list of conditions and the following disclaimer. |
|
10 * 2. Redistributions in binary form must reproduce the above copyright |
|
11 * notice, this list of conditions and the following disclaimer in the |
|
12 * documentation and/or other materials provided with the distribution. |
|
13 * |
|
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND |
|
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE |
|
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
|
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
|
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
24 * SUCH DAMAGE. |
|
25 */ |
|
26 //Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. |
|
27 |
|
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include "liboil/liboilclasses.h"
#include "liboil/liboilfunction.h"
#include <emmintrin.h>
#include <xmmintrin.h>
|
35 |
|
/*
 * SSE_FUNCTION is placed in front of every function in this file that uses
 * SSE intrinsics.  With GCC 4.2 or later (and compilers that report
 * themselves as such) it expands to the force_align_arg_pointer attribute.
 * Elsewhere it expands to nothing so the file still parses.
 *
 * TODO: when the attribute is unavailable, SSE use should be disabled
 * altogether rather than merely compiled without it.
 */
#if defined(__GNUC__) && \
    (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2))
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
#else
#define SSE_FUNCTION
#endif
|
38 |
|
/*
 * clamp_u8_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 * @src3_1: pointer to the upper bound
 *
 * Stores each element of src1, limited to [*src2_1, *src3_1], into dest.
 * The lower bound is applied first, then the upper bound.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 16 bytes per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamp_u8_sse (uint8_t *dest, uint8_t *src1, int n, uint8_t *src2_1,
    uint8_t *src3_1)
{
  __m128i lo_vec, hi_vec;
  const uint8_t min = *src2_1;
  const uint8_t max = *src3_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set1_epi8 ((char)min);
  hi_vec = _mm_set1_epi8 ((char)max);
  while (n >= 16) {
    __m128i v = _mm_loadu_si128 ((const __m128i *)src1);
    v = _mm_max_epu8 (v, lo_vec);
    v = _mm_min_epu8 (v, hi_vec);
    _mm_store_si128 ((__m128i *)dest, v);
    dest += 16;
    src1 += 16;
    n -= 16;
  }

  /* Scalar tail for the last n < 16 elements. */
  while (n > 0) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamp_u8_sse to liboil as an implementation of clamp_u8, tagged
 * with OIL_IMPL_FLAG_SSE2 (the body uses SSE2 integer intrinsics). */
OIL_DEFINE_IMPL_FULL (clamp_u8_sse, clamp_u8, OIL_IMPL_FLAG_SSE2);
|
77 |
|
/*
 * clamp_s16_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 * @src3_1: pointer to the upper bound
 *
 * Stores each signed 16-bit element of src1, limited to
 * [*src2_1, *src3_1], into dest.  The lower bound is applied first.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 8 elements per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamp_s16_sse (int16_t *dest, int16_t *src1, int n, int16_t *src2_1,
    int16_t *src3_1)
{
  __m128i lo_vec, hi_vec;
  const int16_t min = *src2_1;
  const int16_t max = *src3_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set1_epi16 (min);
  hi_vec = _mm_set1_epi16 (max);
  while (n >= 8) {
    __m128i v = _mm_loadu_si128 ((const __m128i *)src1);
    v = _mm_max_epi16 (v, lo_vec);
    v = _mm_min_epi16 (v, hi_vec);
    _mm_store_si128 ((__m128i *)dest, v);
    dest += 8;
    src1 += 8;
    n -= 8;
  }

  /* Scalar tail for the last n < 8 elements. */
  while (n > 0) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamp_s16_sse to liboil as an implementation of clamp_s16, tagged
 * with OIL_IMPL_FLAG_SSE2 (the body uses SSE2 integer intrinsics). */
OIL_DEFINE_IMPL_FULL (clamp_s16_sse, clamp_s16, OIL_IMPL_FLAG_SSE2);
|
116 |
|
/*
 * clamp_f32_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 * @src3_1: pointer to the upper bound
 *
 * Stores each float of src1, limited to [*src2_1, *src3_1], into dest.
 * The lower bound is applied first.  An element that compares neither
 * below the lower bound nor above the upper bound (NaN included) is
 * copied through unchanged on every path.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE loop then
 * handles 4 floats per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1,
    const float *src3_1)
{
  __m128 lo_vec, hi_vec;
  const float min = *src2_1;
  const float max = *src3_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    float x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set_ps1 (min);
  hi_vec = _mm_set_ps1 (max);
  while (n >= 4) {
    __m128 v = _mm_loadu_ps (src1);
    /* Bound first, data second: MAXPS/MINPS return their second operand
     * when an operand is NaN or both are zero, so this order leaves such
     * elements untouched exactly as the scalar comparisons do. */
    v = _mm_max_ps (lo_vec, v);
    v = _mm_min_ps (hi_vec, v);
    _mm_store_ps (dest, v);
    dest += 4;
    src1 += 4;
    n -= 4;
  }

  /* Scalar tail for the last n < 4 elements. */
  while (n > 0) {
    float x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamp_f32_sse to liboil as an implementation of clamp_f32, tagged
 * with OIL_IMPL_FLAG_SSE (the body uses single-precision SSE intrinsics). */
OIL_DEFINE_IMPL_FULL (clamp_f32_sse, clamp_f32, OIL_IMPL_FLAG_SSE);
|
155 |
|
/*
 * clamp_f64_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 * @src3_1: pointer to the upper bound
 *
 * Stores each double of src1, limited to [*src2_1, *src3_1], into dest.
 * The lower bound is applied first.  An element that compares neither
 * below the lower bound nor above the upper bound (NaN included) is
 * copied through unchanged on every path.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 2 doubles per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining element, if any.
 */
SSE_FUNCTION static void
clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1,
    const double *src3_1)
{
  __m128d lo_vec, hi_vec;
  const double min = *src2_1;
  const double max = *src3_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    double x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set1_pd (min);
  hi_vec = _mm_set1_pd (max);
  while (n >= 2) {
    __m128d v = _mm_loadu_pd (src1);
    /* Bound first, data second: MAXPD/MINPD return their second operand
     * when an operand is NaN or both are zero, so this order leaves such
     * elements untouched exactly as the scalar comparisons do. */
    v = _mm_max_pd (lo_vec, v);
    v = _mm_min_pd (hi_vec, v);
    _mm_store_pd (dest, v);
    dest += 2;
    src1 += 2;
    n -= 2;
  }

  /* Scalar tail for a possible last element. */
  while (n > 0) {
    double x = *src1++;
    if (x < min)
      x = min;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamp_f64_sse to liboil as an implementation of clamp_f64, tagged
 * with both OIL_IMPL_FLAG_SSE and OIL_IMPL_FLAG_SSE2 (the body uses
 * double-precision SSE2 intrinsics). */
OIL_DEFINE_IMPL_FULL (clamp_f64_sse, clamp_f64,
    OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
|
195 |
|
/*
 * clamplow_u8_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 *
 * Stores each byte of src1, raised to at least *src2_1, into dest.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 16 bytes per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
    const uint8_t *src2_1)
{
  __m128i lo_vec;
  const uint8_t min = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set1_epi8 ((char)min);
  while (n >= 16) {
    /* Load through a const pointer; src1 is read-only. */
    __m128i v = _mm_loadu_si128 ((const __m128i *)src1);
    v = _mm_max_epu8 (v, lo_vec);
    _mm_store_si128 ((__m128i *)dest, v);
    dest += 16;
    src1 += 16;
    n -= 16;
  }

  /* Scalar tail for the last n < 16 elements. */
  while (n > 0) {
    uint8_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamplow_u8_sse to liboil as an implementation of clamplow_u8,
 * tagged with OIL_IMPL_FLAG_SSE2 (the body uses SSE2 integer intrinsics). */
OIL_DEFINE_IMPL_FULL (clamplow_u8_sse, clamplow_u8, OIL_IMPL_FLAG_SSE2);
|
227 |
|
/*
 * clamplow_s16_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 *
 * Stores each signed 16-bit element of src1, raised to at least *src2_1,
 * into dest.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 8 elements per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n,
    const int16_t *src2_1)
{
  __m128i lo_vec;
  const int16_t min = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set1_epi16 (min);
  while (n >= 8) {
    /* Load through a const pointer; src1 is read-only. */
    __m128i v = _mm_loadu_si128 ((const __m128i *)src1);
    v = _mm_max_epi16 (v, lo_vec);
    _mm_store_si128 ((__m128i *)dest, v);
    dest += 8;
    src1 += 8;
    n -= 8;
  }

  /* Scalar tail for the last n < 8 elements. */
  while (n > 0) {
    int16_t x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamplow_s16_sse to liboil as an implementation of clamplow_s16,
 * tagged with OIL_IMPL_FLAG_SSE2 (the body uses SSE2 integer intrinsics). */
OIL_DEFINE_IMPL_FULL (clamplow_s16_sse, clamplow_s16, OIL_IMPL_FLAG_SSE2);
|
259 |
|
/*
 * clamplow_f32_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 *
 * Stores each float of src1, raised to at least *src2_1, into dest.  An
 * element that does not compare below the bound (NaN included) is copied
 * through unchanged on every path.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE loop then
 * handles 4 floats per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  __m128 lo_vec;
  const float min = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    float x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set_ps1 (min);
  while (n >= 4) {
    __m128 v = _mm_loadu_ps (src1);
    /* Bound first, data second: MAXPS returns its second operand when an
     * operand is NaN or both are zero, matching the scalar comparison. */
    v = _mm_max_ps (lo_vec, v);
    _mm_store_ps (dest, v);
    dest += 4;
    src1 += 4;
    n -= 4;
  }

  /* Scalar tail for the last n < 4 elements. */
  while (n > 0) {
    float x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamplow_f32_sse to liboil as an implementation of clamplow_f32,
 * tagged with OIL_IMPL_FLAG_SSE (single-precision SSE intrinsics only). */
OIL_DEFINE_IMPL_FULL (clamplow_f32_sse, clamplow_f32, OIL_IMPL_FLAG_SSE);
|
290 |
|
/*
 * clamplow_f64_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the lower bound
 *
 * Stores each double of src1, raised to at least *src2_1, into dest.  An
 * element that does not compare below the bound (NaN included) is copied
 * through unchanged on every path.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 2 doubles per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining element, if any.
 */
SSE_FUNCTION static void
clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d lo_vec;
  const double min = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    double x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }

  lo_vec = _mm_set1_pd (min);
  while (n >= 2) {
    __m128d v = _mm_loadu_pd (src1);
    /* Bound first, data second: MAXPD returns its second operand when an
     * operand is NaN or both are zero, matching the scalar comparison. */
    v = _mm_max_pd (lo_vec, v);
    _mm_store_pd (dest, v);
    dest += 2;
    src1 += 2;
    n -= 2;
  }

  /* Scalar tail for a possible last element. */
  while (n > 0) {
    double x = *src1++;
    if (x < min)
      x = min;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamplow_f64_sse to liboil as an implementation of clamplow_f64,
 * tagged with both OIL_IMPL_FLAG_SSE and OIL_IMPL_FLAG_SSE2 (the body uses
 * double-precision SSE2 intrinsics). */
OIL_DEFINE_IMPL_FULL (clamplow_f64_sse, clamplow_f64,
    OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
|
322 |
|
/*
 * clamphigh_u8_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the upper bound
 *
 * Stores each byte of src1, lowered to at most *src2_1, into dest.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 16 bytes per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
    const uint8_t *src2_1)
{
  __m128i hi_vec;
  const uint8_t max = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  hi_vec = _mm_set1_epi8 ((char)max);
  while (n >= 16) {
    /* Load through a const pointer; src1 is read-only. */
    __m128i v = _mm_loadu_si128 ((const __m128i *)src1);
    v = _mm_min_epu8 (v, hi_vec);
    _mm_store_si128 ((__m128i *)dest, v);
    dest += 16;
    src1 += 16;
    n -= 16;
  }

  /* Scalar tail for the last n < 16 elements. */
  while (n > 0) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamphigh_u8_sse to liboil as an implementation of clamphigh_u8,
 * tagged with OIL_IMPL_FLAG_SSE2 (the body uses SSE2 integer intrinsics). */
OIL_DEFINE_IMPL_FULL (clamphigh_u8_sse, clamphigh_u8, OIL_IMPL_FLAG_SSE2);
|
354 |
|
/*
 * clamphigh_s16_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the upper bound
 *
 * Stores each signed 16-bit element of src1, lowered to at most *src2_1,
 * into dest.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 8 elements per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n,
    const int16_t *src2_1)
{
  __m128i hi_vec;
  const int16_t max = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    int16_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  hi_vec = _mm_set1_epi16 (max);
  while (n >= 8) {
    /* Load through a const pointer; src1 is read-only. */
    __m128i v = _mm_loadu_si128 ((const __m128i *)src1);
    v = _mm_min_epi16 (v, hi_vec);
    _mm_store_si128 ((__m128i *)dest, v);
    dest += 8;
    src1 += 8;
    n -= 8;
  }

  /* Scalar tail for the last n < 8 elements. */
  while (n > 0) {
    int16_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamphigh_s16_sse to liboil as an implementation of clamphigh_s16,
 * tagged with OIL_IMPL_FLAG_SSE2 (the body uses SSE2 integer intrinsics). */
OIL_DEFINE_IMPL_FULL (clamphigh_s16_sse, clamphigh_s16, OIL_IMPL_FLAG_SSE2);
|
386 |
|
/*
 * clamphigh_f32_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the upper bound
 *
 * Stores each float of src1, lowered to at most *src2_1, into dest.  An
 * element that does not compare above the bound (NaN included) is copied
 * through unchanged on every path.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE loop then
 * handles 4 floats per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining elements.
 */
SSE_FUNCTION static void
clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  __m128 hi_vec;
  const float max = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  hi_vec = _mm_set_ps1 (max);
  while (n >= 4) {
    __m128 v = _mm_loadu_ps (src1);
    /* Bound first, data second: MINPS returns its second operand when an
     * operand is NaN or both are zero, matching the scalar comparison. */
    v = _mm_min_ps (hi_vec, v);
    _mm_store_ps (dest, v);
    dest += 4;
    src1 += 4;
    n -= 4;
  }

  /* Scalar tail for the last n < 4 elements. */
  while (n > 0) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamphigh_f32_sse to liboil as an implementation of clamphigh_f32,
 * tagged with OIL_IMPL_FLAG_SSE (single-precision SSE intrinsics only). */
OIL_DEFINE_IMPL_FULL (clamphigh_f32_sse, clamphigh_f32, OIL_IMPL_FLAG_SSE);
|
417 |
|
/*
 * clamphigh_f64_sse:
 * @dest: destination array, n elements
 * @src1: source array, n elements
 * @n: number of elements to process
 * @src2_1: pointer to the upper bound
 *
 * Stores each double of src1, lowered to at most *src2_1, into dest.  An
 * element that does not compare above the bound (NaN included) is copied
 * through unchanged on every path.
 *
 * A scalar loop runs until dest is 16-byte aligned, the SSE2 loop then
 * handles 2 doubles per iteration (unaligned load, aligned store), and a
 * final scalar loop finishes the remaining element, if any.
 */
SSE_FUNCTION static void
clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
{
  __m128d hi_vec;
  const double max = *src2_1;

  /* Scalar head.  uintptr_t (not long) is used for the alignment test
   * because long can be narrower than a pointer. */
  while (((uintptr_t)dest & 15) != 0 && n > 0) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }

  hi_vec = _mm_set1_pd (max);
  while (n >= 2) {
    __m128d v = _mm_loadu_pd (src1);
    /* Bound first, data second: MINPD returns its second operand when an
     * operand is NaN or both are zero, matching the scalar comparison. */
    v = _mm_min_pd (hi_vec, v);
    _mm_store_pd (dest, v);
    dest += 2;
    src1 += 2;
    n -= 2;
  }

  /* Scalar tail for a possible last element. */
  while (n > 0) {
    double x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    n--;
  }
}
|
/* Hand clamphigh_f64_sse to liboil as an implementation of clamphigh_f64,
 * tagged with both OIL_IMPL_FLAG_SSE and OIL_IMPL_FLAG_SSE2 (the body uses
 * double-precision SSE2 intrinsics). */
OIL_DEFINE_IMPL_FULL (clamphigh_f64_sse, clamphigh_f64,
    OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
|
449 |
|
450 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamp_u8_sse record (presumably the object created by
 * the OIL_DEFINE_IMPL_FULL registration of clamp_u8_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamp_u8_sse(void) {
  return &_oil_function_impl_clamp_u8_sse;
}
#endif
|
457 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamp_s16_sse record (presumably the object created by
 * the OIL_DEFINE_IMPL_FULL registration of clamp_s16_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamp_s16_sse(void) {
  return &_oil_function_impl_clamp_s16_sse;
}
#endif
|
464 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamp_f32_sse record (presumably the object created by
 * the OIL_DEFINE_IMPL_FULL registration of clamp_f32_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamp_f32_sse(void) {
  return &_oil_function_impl_clamp_f32_sse;
}
#endif
|
471 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamp_f64_sse record (presumably the object created by
 * the OIL_DEFINE_IMPL_FULL registration of clamp_f64_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamp_f64_sse(void) {
  return &_oil_function_impl_clamp_f64_sse;
}
#endif
|
478 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamplow_u8_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamplow_u8_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamplow_u8_sse(void) {
  return &_oil_function_impl_clamplow_u8_sse;
}
#endif
|
485 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamplow_s16_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamplow_s16_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamplow_s16_sse(void) {
  return &_oil_function_impl_clamplow_s16_sse;
}
#endif
|
492 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamplow_f32_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamplow_f32_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamplow_f32_sse(void) {
  return &_oil_function_impl_clamplow_f32_sse;
}
#endif
|
499 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamplow_f64_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamplow_f64_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamplow_f64_sse(void) {
  return &_oil_function_impl_clamplow_f64_sse;
}
#endif
|
506 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamphigh_u8_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamphigh_u8_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamphigh_u8_sse(void) {
  return &_oil_function_impl_clamphigh_u8_sse;
}
#endif
|
513 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamphigh_s16_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamphigh_s16_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamphigh_s16_sse(void) {
  return &_oil_function_impl_clamphigh_s16_sse;
}
#endif
|
520 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamphigh_f32_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamphigh_f32_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamphigh_f32_sse(void) {
  return &_oil_function_impl_clamphigh_f32_sse;
}
#endif
|
527 |
|
#ifdef __SYMBIAN32__

/* Symbian-only accessor: returns the address of the
 * _oil_function_impl_clamphigh_f64_sse record (presumably the object created
 * by the OIL_DEFINE_IMPL_FULL registration of clamphigh_f64_sse).  Only the
 * implementation name belongs in the identifier; the class name is not
 * part of it. */
OilFunctionImpl* __oil_function_impl_clamphigh_f64_sse(void) {
  return &_oil_function_impl_clamphigh_f64_sse;
}
#endif
|
534 |