|
/*
 * Copyright (c) 2005
 * Eric Anholt. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
// Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
|
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <emmintrin.h>
#include <xmmintrin.h>

/* Realign the stack on entry: these functions spill __m128 values, which
 * require 16-byte stack alignment even when the caller (e.g. 32-bit code
 * with only 4-byte stack alignment) does not guarantee it. */
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))

/*
 * Each implementation below follows the same structure: a scalar prologue
 * that advances dest to a 16-byte boundary, a main loop unrolled twice
 * (two SSE vectors per iteration) using unaligned source loads and aligned
 * destination stores, and a scalar epilogue for the leftover elements.
 */
|
SSE_FUNCTION static void
add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ - *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ - *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ / *src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ / *src2++;
  }
}
OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_max_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 > *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = 1.0 / *src1++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    /* While _mm_rcp_ps sounds promising, the results it gives are rather
     * different from the 1.0 / src1 reference implementation, so do that.
     */
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_set_ps1(1.0);
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_div_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0 / *src1++;
  }
}
OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE);
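
/*
 * For illustration only (not compiled): a reciprocal-approximation variant
 * of the loop above would use _mm_rcp_ps plus at least one Newton-Raphson
 * refinement step.  Even refined, the result is not bit-exact with
 * 1.0 / src1, which is why the plain division is used instead.  The
 * function name below is hypothetical.
 */
#if 0
SSE_FUNCTION static void
inverse_f32_sse_rcp_sketch (float *dest, float *src1, int n)
{
  for (; n >= 4; n -= 4) {
    __m128 x = _mm_loadu_ps(src1);
    __m128 r = _mm_rcp_ps(x);        /* ~12-bit reciprocal approximation */
    /* One Newton-Raphson step: r = r * (2 - x * r) */
    r = _mm_mul_ps(r, _mm_sub_ps(_mm_set_ps1(2.0f), _mm_mul_ps(x, r)));
    _mm_storeu_ps(dest, r);          /* unaligned store: no dest prologue here */
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = 1.0f / *src1++;
  }
}
#endif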
|
SSE_FUNCTION static void
negative_f32_sse_unroll2 (float *dest, float *src1, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = -(*src1++);
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_setzero_ps();
    xmm1 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_sub_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = -(*src1++);
  }
}
OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ + *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 8; n -= 8) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_add_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ + *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
{
  __m128 xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_ps1(val);
  for (; n >= 8; n -= 8) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm0 = _mm_mul_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
|
SSE_FUNCTION static void
scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
{
  __m128d xmm1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1++ * *val;
  }
  xmm1 = _mm_load_pd1(val);
  for (; n >= 4; n -= 4) {
    __m128d xmm0;
    xmm0 = _mm_loadu_pd(src1);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest, xmm0);
    xmm0 = _mm_loadu_pd(src1 + 2);
    xmm0 = _mm_mul_pd(xmm0, xmm1);
    _mm_store_pd(dest + 2, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    *dest++ = *src1++ * *val;
  }
}
OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
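
/*
 * Usage sketch (not compiled): callers never invoke these implementations
 * directly.  They call liboil's public entry points, and the dispatcher
 * selects an implementation such as the ones registered above at runtime.
 * Prototypes are assumed from <liboil/liboil.h>; the helper name "example"
 * is hypothetical.
 */
#if 0
#include <liboil/liboil.h>

static void
example (float *dest, float *tmp, float *a, float *b, int n)
{
  float gain = 0.5f;

  oil_init ();                                      /* one-time impl selection */
  oil_add_f32 (tmp, a, b, n);                       /* tmp[i]  = a[i] + b[i]   */
  oil_scalarmultiply_f32_ns (dest, tmp, &gain, n);  /* dest[i] = tmp[i] * gain */
}
#endif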
|
/* Symbian builds export one accessor per implementation so that the
 * dispatcher can retrieve the impl descriptors defined above. */

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_add_f32_sse_unroll2() {
  return &_oil_function_impl_add_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_subtract_f32_sse_unroll2() {
  return &_oil_function_impl_subtract_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_multiply_f32_sse_unroll2() {
  return &_oil_function_impl_multiply_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_divide_f32_sse_unroll2() {
  return &_oil_function_impl_divide_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_minimum_f32_sse_unroll2() {
  return &_oil_function_impl_minimum_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_maximum_f32_sse_unroll2() {
  return &_oil_function_impl_maximum_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_inverse_f32_sse_unroll2() {
  return &_oil_function_impl_inverse_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_negative_f32_sse_unroll2() {
  return &_oil_function_impl_negative_f32_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse_unroll2() {
  return &_oil_function_impl_scalaradd_f32_ns_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse_unroll2() {
  return &_oil_function_impl_scalarmultiply_f32_ns_sse_unroll2;
}
#endif

#ifdef __SYMBIAN32__
OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2() {
  return &_oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2;
}
#endif