|
1 /* |
|
2 * Copyright (c) 2005 |
|
3 * Eric Anholt. All rights reserved. |
|
4 * |
|
5 * Redistribution and use in source and binary forms, with or without |
|
6 * modification, are permitted provided that the following conditions |
|
7 * are met: |
|
8 * 1. Redistributions of source code must retain the above copyright |
|
9 * notice, this list of conditions and the following disclaimer. |
|
10 * 2. Redistributions in binary form must reproduce the above copyright |
|
11 * notice, this list of conditions and the following disclaimer in the |
|
12 * documentation and/or other materials provided with the distribution. |
|
13 * |
|
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND |
|
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE |
|
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
|
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
|
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
24 * SUCH DAMAGE. |
|
25 */ |
|
26 //Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. |
|
27 |
|
28 #ifdef HAVE_CONFIG_H |
|
29 #include "config.h" |
|
30 #endif |
|
31 #include <liboilclasses.h> |
|
32 #include <liboilfunction.h> |
|
33 #include <emmintrin.h> |
|
34 #include <xmmintrin.h> |
|
35 |
|
/* Mark every SSE implementation so GCC re-aligns the stack on entry
 * (force_align_arg_pointer): callers may not maintain the 16-byte stack
 * alignment that SSE spills/aligned locals require. */
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
|
37 |
|
38 SSE_FUNCTION static void |
|
39 add_f32_sse (float *dest, float *src1, float *src2, int n) |
|
40 { |
|
41 /* Initial operations to align the destination pointer */ |
|
42 for (; ((long)dest & 15) && (n > 0); n--) { |
|
43 *dest++ = *src1++ + *src2++; |
|
44 } |
|
45 for (; n >= 4; n -= 4) { |
|
46 __m128 xmm0, xmm1; |
|
47 xmm0 = _mm_loadu_ps(src1); |
|
48 xmm1 = _mm_loadu_ps(src2); |
|
49 xmm0 = _mm_add_ps(xmm0, xmm1); |
|
50 _mm_store_ps(dest, xmm0); |
|
51 dest += 4; |
|
52 src1 += 4; |
|
53 src2 += 4; |
|
54 } |
|
55 for (; n > 0; n--) { |
|
56 *dest++ = *src1++ + *src2++; |
|
57 } |
|
58 } |
|
59 OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE); |
|
60 |
|
61 SSE_FUNCTION static void |
|
62 add_f64_sse2 (double *dest, double *src1, double *src2, int n) |
|
63 { |
|
64 __m128d xmm0, xmm1; |
|
65 while (((long)dest & 15) && (0 < n)) { |
|
66 *dest++ = *src1++ + *src2++; |
|
67 n--; |
|
68 } |
|
69 while (1 < n) { |
|
70 xmm0 = _mm_loadu_pd(src1); |
|
71 xmm1 = _mm_loadu_pd(src2); |
|
72 xmm0 = _mm_add_pd(xmm0, xmm1); |
|
73 _mm_store_pd(dest, xmm0); |
|
74 dest += 2; |
|
75 src1 += 2; |
|
76 src2 += 2; |
|
77 n -= 2; |
|
78 } |
|
79 while (0 < n) { |
|
80 *dest++ = *src1++ + *src2++; |
|
81 n--; |
|
82 } |
|
83 } |
|
84 OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2); |
|
85 |
|
86 SSE_FUNCTION static void |
|
87 add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n) |
|
88 { |
|
89 __m128d xmm0, xmm1; |
|
90 while (((long)dest & 15) && (0 < n)) { |
|
91 *dest++ = *src1++ + *src2++; |
|
92 n--; |
|
93 } |
|
94 while (3 < n) { |
|
95 xmm0 = _mm_loadu_pd(src1); |
|
96 xmm1 = _mm_loadu_pd(src2); |
|
97 xmm0 = _mm_add_pd(xmm0, xmm1); |
|
98 _mm_store_pd(dest, xmm0); |
|
99 |
|
100 xmm0 = _mm_loadu_pd(src1+2); |
|
101 xmm1 = _mm_loadu_pd(src2+2); |
|
102 xmm0 = _mm_add_pd(xmm0, xmm1); |
|
103 _mm_store_pd(dest+2, xmm0); |
|
104 dest += 4; |
|
105 src1 += 4; |
|
106 src2 += 4; |
|
107 n -= 4; |
|
108 } |
|
109 while (1 < n) { |
|
110 xmm0 = _mm_loadu_pd(src1); |
|
111 xmm1 = _mm_loadu_pd(src2); |
|
112 xmm0 = _mm_add_pd(xmm0, xmm1); |
|
113 _mm_store_pd(dest, xmm0); |
|
114 dest += 2; |
|
115 src1 += 2; |
|
116 src2 += 2; |
|
117 n -= 2; |
|
118 } |
|
119 while (0 < n) { |
|
120 *dest++ = *src1++ + *src2++; |
|
121 n--; |
|
122 } |
|
123 } |
|
124 OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2); |
|
125 |
|
126 SSE_FUNCTION static void |
|
127 subtract_f32_sse (float *dest, float *src1, float *src2, int n) |
|
128 { |
|
129 /* Initial operations to align the destination pointer */ |
|
130 for (; ((long)dest & 15) && (n > 0); n--) { |
|
131 *dest++ = *src1++ - *src2++; |
|
132 } |
|
133 for (; n >= 4; n -= 4) { |
|
134 __m128 xmm0, xmm1; |
|
135 xmm0 = _mm_loadu_ps(src1); |
|
136 xmm1 = _mm_loadu_ps(src2); |
|
137 xmm0 = _mm_sub_ps(xmm0, xmm1); |
|
138 _mm_store_ps(dest, xmm0); |
|
139 dest += 4; |
|
140 src1 += 4; |
|
141 src2 += 4; |
|
142 } |
|
143 for (; n > 0; n--) { |
|
144 *dest++ = *src1++ - *src2++; |
|
145 } |
|
146 } |
|
147 OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE); |
|
148 |
|
149 SSE_FUNCTION static void |
|
150 multiply_f32_sse (float *dest, float *src1, float *src2, int n) |
|
151 { |
|
152 /* Initial operations to align the destination pointer */ |
|
153 for (; ((long)dest & 15) && (n > 0); n--) { |
|
154 *dest++ = *src1++ * *src2++; |
|
155 } |
|
156 for (; n >= 4; n -= 4) { |
|
157 __m128 xmm0, xmm1; |
|
158 xmm0 = _mm_loadu_ps(src1); |
|
159 xmm1 = _mm_loadu_ps(src2); |
|
160 xmm0 = _mm_mul_ps(xmm0, xmm1); |
|
161 _mm_store_ps(dest, xmm0); |
|
162 dest += 4; |
|
163 src1 += 4; |
|
164 src2 += 4; |
|
165 } |
|
166 for (; n > 0; n--) { |
|
167 *dest++ = *src1++ * *src2++; |
|
168 } |
|
169 } |
|
170 OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE); |
|
171 |
|
172 SSE_FUNCTION static void |
|
173 divide_f32_sse (float *dest, float *src1, float *src2, int n) |
|
174 { |
|
175 /* Initial operations to align the destination pointer */ |
|
176 for (; ((long)dest & 15) && (n > 0); n--) { |
|
177 *dest++ = *src1++ / *src2++; |
|
178 } |
|
179 for (; n >= 4; n -= 4) { |
|
180 __m128 xmm0, xmm1; |
|
181 xmm0 = _mm_loadu_ps(src1); |
|
182 xmm1 = _mm_loadu_ps(src2); |
|
183 xmm0 = _mm_div_ps(xmm0, xmm1); |
|
184 _mm_store_ps(dest, xmm0); |
|
185 dest += 4; |
|
186 src1 += 4; |
|
187 src2 += 4; |
|
188 } |
|
189 for (; n > 0; n--) { |
|
190 *dest++ = *src1++ / *src2++; |
|
191 } |
|
192 } |
|
193 OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE); |
|
194 |
|
195 SSE_FUNCTION static void |
|
196 minimum_f32_sse (float *dest, float *src1, float *src2, int n) |
|
197 { |
|
198 /* Initial operations to align the destination pointer */ |
|
199 for (; ((long)dest & 15) && (n > 0); n--) { |
|
200 *dest++ = *src1 < *src2 ? *src1 : *src2; |
|
201 src1++; |
|
202 src2++; |
|
203 } |
|
204 for (; n >= 4; n -= 4) { |
|
205 __m128 xmm0, xmm1; |
|
206 xmm0 = _mm_loadu_ps(src1); |
|
207 xmm1 = _mm_loadu_ps(src2); |
|
208 xmm0 = _mm_min_ps(xmm0, xmm1); |
|
209 _mm_store_ps(dest, xmm0); |
|
210 dest += 4; |
|
211 src1 += 4; |
|
212 src2 += 4; |
|
213 } |
|
214 for (; n > 0; n--) { |
|
215 *dest++ = *src1 < *src2 ? *src1 : *src2; |
|
216 src1++; |
|
217 src2++; |
|
218 } |
|
219 } |
|
220 OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE); |
|
221 |
|
222 SSE_FUNCTION static void |
|
223 maximum_f32_sse (float *dest, float *src1, float *src2, int n) |
|
224 { |
|
225 /* Initial operations to align the destination pointer */ |
|
226 for (; ((long)dest & 15) && (n > 0); n--) { |
|
227 *dest++ = *src1 > *src2 ? *src1 : *src2; |
|
228 src1++; |
|
229 src2++; |
|
230 } |
|
231 for (; n >= 4; n -= 4) { |
|
232 __m128 xmm0, xmm1; |
|
233 xmm0 = _mm_loadu_ps(src1); |
|
234 xmm1 = _mm_loadu_ps(src2); |
|
235 xmm0 = _mm_max_ps(xmm0, xmm1); |
|
236 _mm_store_ps(dest, xmm0); |
|
237 dest += 4; |
|
238 src1 += 4; |
|
239 src2 += 4; |
|
240 } |
|
241 for (; n > 0; n--) { |
|
242 *dest++ = *src1 > *src2 ? *src1 : *src2; |
|
243 src1++; |
|
244 src2++; |
|
245 } |
|
246 } |
|
247 OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE); |
|
248 |
|
249 SSE_FUNCTION static void |
|
250 inverse_f32_sse (float *dest, float *src1, int n) |
|
251 { |
|
252 /* Initial operations to align the destination pointer */ |
|
253 for (; ((long)dest & 15) && (n > 0); n--) { |
|
254 *dest++ = 1.0 / *src1++; |
|
255 } |
|
256 for (; n >= 4; n -= 4) { |
|
257 __m128 xmm0, xmm1; |
|
258 /* While _mm_rcp_ps sounds promising, the results it gives are rather |
|
259 * different from the 1.0 / src1 reference implementation, so do that. |
|
260 */ |
|
261 xmm0 = _mm_set_ps1(1.0); |
|
262 xmm1 = _mm_loadu_ps(src1); |
|
263 xmm0 = _mm_div_ps(xmm0, xmm1); |
|
264 _mm_store_ps(dest, xmm0); |
|
265 dest += 4; |
|
266 src1 += 4; |
|
267 } |
|
268 for (; n > 0; n--) { |
|
269 *dest++ = 1.0 / *src1++; |
|
270 } |
|
271 } |
|
272 OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE); |
|
273 |
|
274 SSE_FUNCTION static void |
|
275 negative_f32_sse (float *dest, float *src1, int n) |
|
276 { |
|
277 /* Initial operations to align the destination pointer */ |
|
278 for (; ((long)dest & 15) && (n > 0); n--) { |
|
279 *dest++ = -(*src1++); |
|
280 } |
|
281 for (; n >= 4; n -= 4) { |
|
282 __m128 xmm0, xmm1; |
|
283 xmm0 = _mm_setzero_ps(); |
|
284 xmm1 = _mm_loadu_ps(src1); |
|
285 xmm0 = _mm_sub_ps(xmm0, xmm1); |
|
286 _mm_store_ps(dest, xmm0); |
|
287 dest += 4; |
|
288 src1 += 4; |
|
289 } |
|
290 for (; n > 0; n--) { |
|
291 *dest++ = -(*src1++); |
|
292 } |
|
293 } |
|
294 OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE); |
|
295 |
|
296 SSE_FUNCTION static void |
|
297 scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n) |
|
298 { |
|
299 __m128 xmm1; |
|
300 |
|
301 /* Initial operations to align the destination pointer */ |
|
302 for (; ((long)dest & 15) && (n > 0); n--) { |
|
303 *dest++ = *src1++ + *val; |
|
304 } |
|
305 xmm1 = _mm_load_ps1(val); |
|
306 for (; n >= 4; n -= 4) { |
|
307 __m128 xmm0; |
|
308 xmm0 = _mm_loadu_ps(src1); |
|
309 xmm0 = _mm_add_ps(xmm0, xmm1); |
|
310 _mm_store_ps(dest, xmm0); |
|
311 dest += 4; |
|
312 src1 += 4; |
|
313 } |
|
314 for (; n > 0; n--) { |
|
315 *dest++ = *src1++ + *val; |
|
316 } |
|
317 } |
|
318 OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE); |
|
319 |
|
320 SSE_FUNCTION static void |
|
321 scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n) |
|
322 { |
|
323 __m128 xmm1; |
|
324 |
|
325 /* Initial operations to align the destination pointer */ |
|
326 for (; ((long)dest & 15) && (n > 0); n--) { |
|
327 *dest++ = *src1++ * *val; |
|
328 } |
|
329 xmm1 = _mm_load_ps1(val); |
|
330 for (; n >= 4; n -= 4) { |
|
331 __m128 xmm0; |
|
332 xmm0 = _mm_loadu_ps(src1); |
|
333 xmm0 = _mm_mul_ps(xmm0, xmm1); |
|
334 _mm_store_ps(dest, xmm0); |
|
335 dest += 4; |
|
336 src1 += 4; |
|
337 } |
|
338 for (; n > 0; n--) { |
|
339 *dest++ = *src1++ * *val; |
|
340 } |
|
341 } |
|
342 OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE); |
|
343 |
|
344 SSE_FUNCTION static void |
|
345 scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n) |
|
346 { |
|
347 __m128d xmm1; |
|
348 |
|
349 /* Initial operations to align the destination pointer */ |
|
350 for (; ((long)dest & 15) && (n > 0); n--) { |
|
351 *dest++ = *src1++ * *val; |
|
352 } |
|
353 xmm1 = _mm_load_pd1(val); |
|
354 for (; n >= 2; n -= 2) { |
|
355 __m128d xmm0; |
|
356 xmm0 = _mm_loadu_pd(src1); |
|
357 xmm0 = _mm_mul_pd(xmm0, xmm1); |
|
358 _mm_store_pd(dest, xmm0); |
|
359 dest += 2; |
|
360 src1 += 2; |
|
361 } |
|
362 for (; n > 0; n--) { |
|
363 *dest++ = *src1++ * *val; |
|
364 } |
|
365 } |
|
366 OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2); |
|
367 |
|
368 |
|
369 |
|
/* Symbian export stubs, one per implementation registered above.
 *
 * NOTE(review): these look like broken generated code.  Each line of the
 * form "OilFunctionImpl* __oil_function_impl_X, Y() { return &_oil..., Y; }"
 * declares a pointer variable AND a function named after the function
 * class, and the return statement uses the comma operator (so it returns
 * "Y", not the impl address).  Presumably the generator intended a single
 * getter returning &_oil_function_impl_X — verify against the Symbian
 * liboil build/export tooling before changing anything here. */
#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_f32_sse, add_f32() {
    return &_oil_function_impl_add_f32_sse, add_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_f64_sse2, add_f64() {
    return &_oil_function_impl_add_f64_sse2, add_f64;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_add_f64_sse2_unroll, add_f64() {
    return &_oil_function_impl_add_f64_sse2_unroll, add_f64;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_subtract_f32_sse, subtract_f32() {
    return &_oil_function_impl_subtract_f32_sse, subtract_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_multiply_f32_sse, multiply_f32() {
    return &_oil_function_impl_multiply_f32_sse, multiply_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_divide_f32_sse, divide_f32() {
    return &_oil_function_impl_divide_f32_sse, divide_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_minimum_f32_sse, minimum_f32() {
    return &_oil_function_impl_minimum_f32_sse, minimum_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_maximum_f32_sse, maximum_f32() {
    return &_oil_function_impl_maximum_f32_sse, maximum_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_inverse_f32_sse, inverse_f32() {
    return &_oil_function_impl_inverse_f32_sse, inverse_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_negative_f32_sse, negative_f32() {
    return &_oil_function_impl_negative_f32_sse, negative_f32;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns() {
    return &_oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns() {
    return &_oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns() {
    return &_oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns;
}
#endif
|
460 |