|
/*
 * LIBOIL - Library of Optimized Inner Loops
 * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
// Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
|
28 |
|
29 #ifdef HAVE_CONFIG_H |
|
30 #include "config.h" |
|
31 #endif |
|
32 |
|
33 #include <liboil/liboilfunction.h> |
|
34 |
|
35 OIL_DECLARE_CLASS (err_intra8x8_u8); |
|
36 OIL_DECLARE_CLASS (err_inter8x8_u8); |
|
37 OIL_DECLARE_CLASS (err_inter8x8_u8_avg); |
|
38 |
|
39 static void |
|
40 err_intra8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1) |
|
41 { |
|
42 #if !defined(__WINSCW__) && !defined(__WINS__) |
|
43 uint32_t xsum; |
|
44 uint32_t xxsum; |
|
45 |
|
46 __asm__ __volatile__ ( |
|
47 " pxor %%mm5, %%mm5 \n\t" |
|
48 " pxor %%mm6, %%mm6 \n\t" |
|
49 " pxor %%mm7, %%mm7 \n\t" |
|
50 " mov $8, %%edi \n\t" |
|
51 "1: \n\t" |
|
52 " movq (%2), %%mm0 \n\t" /* take 8 bytes */ |
|
53 " movq %%mm0, %%mm2 \n\t" |
|
54 |
|
55 " punpcklbw %%mm6, %%mm0 \n\t" |
|
56 " punpckhbw %%mm6, %%mm2 \n\t" |
|
57 |
|
58 " paddw %%mm0, %%mm5 \n\t" |
|
59 " paddw %%mm2, %%mm5 \n\t" |
|
60 |
|
61 " pmaddwd %%mm0, %%mm0 \n\t" |
|
62 " pmaddwd %%mm2, %%mm2 \n\t" |
|
63 |
|
64 " paddd %%mm0, %%mm7 \n\t" |
|
65 " paddd %%mm2, %%mm7 \n\t" |
|
66 |
|
67 " add %3, %2 \n\t" /* Inc pointer into src data */ |
|
68 |
|
69 " dec %%edi \n\t" |
|
70 " jnz 1b \n\t" |
|
71 |
|
72 " movq %%mm5, %%mm0 \n\t" |
|
73 " psrlq $32, %%mm5 \n\t" |
|
74 " paddw %%mm0, %%mm5 \n\t" |
|
75 " movq %%mm5, %%mm0 \n\t" |
|
76 " psrlq $16, %%mm5 \n\t" |
|
77 " paddw %%mm0, %%mm5 \n\t" |
|
78 " movd %%mm5, %%edi \n\t" |
|
79 " movswl %%di, %%edi \n\t" |
|
80 " movl %%edi, %0 \n\t" |
|
81 |
|
82 " movq %%mm7, %%mm0 \n\t" |
|
83 " psrlq $32, %%mm7 \n\t" |
|
84 " paddd %%mm0, %%mm7 \n\t" |
|
85 " movd %%mm7, %1 \n\t" |
|
86 " emms \n\t" |
|
87 |
|
88 : "=r" (xsum), |
|
89 "=r" (xxsum), |
|
90 "+r" (src1) |
|
91 : "r" (ss1) |
|
92 : "edi", "memory" |
|
93 ); |
|
94 |
|
95 /* Compute population variance as mis-match metric. */ |
|
96 *dest = (((xxsum<<6) - xsum*xsum)); |
|
97 #endif |
|
98 } |
|
99 OIL_DEFINE_IMPL_FULL (err_intra8x8_u8_mmx, err_intra8x8_u8, OIL_IMPL_FLAG_MMX); |
|
100 |
|
101 static void |
|
102 err_inter8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2) |
|
103 { |
|
104 uint32_t xsum; |
|
105 uint32_t xxsum; |
|
106 #if !defined(__WINSCW__) && !defined(__WINS__) |
|
107 __asm__ __volatile__ ( |
|
108 " pxor %%mm5, %%mm5 \n\t" |
|
109 " pxor %%mm6, %%mm6 \n\t" |
|
110 " pxor %%mm7, %%mm7 \n\t" |
|
111 " mov $8, %%edi \n\t" |
|
112 "1: \n\t" |
|
113 " movq (%2), %%mm0 \n\t" /* take 8 bytes */ |
|
114 " movq (%3), %%mm1 \n\t" |
|
115 " movq %%mm0, %%mm2 \n\t" |
|
116 " movq %%mm1, %%mm3 \n\t" |
|
117 |
|
118 " punpcklbw %%mm6, %%mm0 \n\t" |
|
119 " punpcklbw %%mm6, %%mm1 \n\t" |
|
120 " punpckhbw %%mm6, %%mm2 \n\t" |
|
121 " punpckhbw %%mm6, %%mm3 \n\t" |
|
122 |
|
123 " psubsw %%mm1, %%mm0 \n\t" |
|
124 " psubsw %%mm3, %%mm2 \n\t" |
|
125 |
|
126 " paddw %%mm0, %%mm5 \n\t" |
|
127 " paddw %%mm2, %%mm5 \n\t" |
|
128 |
|
129 " pmaddwd %%mm0, %%mm0 \n\t" |
|
130 " pmaddwd %%mm2, %%mm2 \n\t" |
|
131 |
|
132 " paddd %%mm0, %%mm7 \n\t" |
|
133 " paddd %%mm2, %%mm7 \n\t" |
|
134 |
|
135 " add %4, %2 \n\t" /* Inc pointer into src data */ |
|
136 " add %5, %3 \n\t" /* Inc pointer into ref data */ |
|
137 |
|
138 " dec %%edi \n\t" |
|
139 " jnz 1b \n\t" |
|
140 |
|
141 " movq %%mm5, %%mm0 \n\t" |
|
142 " psrlq $32, %%mm5 \n\t" |
|
143 " paddw %%mm0, %%mm5 \n\t" |
|
144 " movq %%mm5, %%mm0 \n\t" |
|
145 " psrlq $16, %%mm5 \n\t" |
|
146 " paddw %%mm0, %%mm5 \n\t" |
|
147 " movd %%mm5, %%edi \n\t" |
|
148 " movswl %%di, %%edi \n\t" |
|
149 " movl %%edi, %0 \n\t" |
|
150 |
|
151 " movq %%mm7, %%mm0 \n\t" |
|
152 " psrlq $32, %%mm7 \n\t" |
|
153 " paddd %%mm0, %%mm7 \n\t" |
|
154 " movd %%mm7, %1 \n\t" |
|
155 " emms \n\t" |
|
156 |
|
157 : "=m" (xsum), |
|
158 "=m" (xxsum), |
|
159 "+r" (src1), |
|
160 "+r" (src2) |
|
161 : "m" (ss1), |
|
162 "m" (ss2) |
|
163 : "edi", "memory" |
|
164 ); |
|
165 |
|
166 /* Compute and return population variance as mis-match metric. */ |
|
167 *dest = (((xxsum<<6) - xsum*xsum)); |
|
168 #endif |
|
169 } |
|
170 OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_mmx, err_inter8x8_u8, OIL_IMPL_FLAG_MMX); |
|
171 |
|
172 static void |
|
173 err_inter8x8_u8_avg_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2) |
|
174 { |
|
175 #if !defined(__WINSCW__) && !defined(__WINS__) |
|
176 uint32_t xsum; |
|
177 uint32_t xxsum; |
|
178 |
|
179 __asm__ __volatile__ ( |
|
180 " pcmpeqd %%mm4, %%mm4 \n\t" /* fefefefefefefefe in mm4 */ |
|
181 " paddb %%mm4, %%mm4 \n\t" |
|
182 " pxor %%mm5, %%mm5 \n\t" |
|
183 " pxor %%mm6, %%mm6 \n\t" |
|
184 " pxor %%mm7, %%mm7 \n\t" |
|
185 " mov $8, %%edi \n\t" |
|
186 "1: \n\t" |
|
187 " movq (%2), %%mm0 \n\t" /* take 8 bytes */ |
|
188 |
|
189 " movq (%3), %%mm2 \n\t" |
|
190 " movq (%4), %%mm3 \n\t" /* take average of mm2 and mm3 */ |
|
191 " movq %%mm2, %%mm1 \n\t" |
|
192 " pand %%mm3, %%mm1 \n\t" |
|
193 " pxor %%mm2, %%mm3 \n\t" |
|
194 " pand %%mm4, %%mm3 \n\t" |
|
195 " psrlq $1, %%mm3 \n\t" |
|
196 " paddb %%mm3, %%mm1 \n\t" |
|
197 |
|
198 " movq %%mm0, %%mm2 \n\t" |
|
199 " movq %%mm1, %%mm3 \n\t" |
|
200 |
|
201 " punpcklbw %%mm6, %%mm0 \n\t" |
|
202 " punpcklbw %%mm6, %%mm1 \n\t" |
|
203 " punpckhbw %%mm6, %%mm2 \n\t" |
|
204 " punpckhbw %%mm6, %%mm3 \n\t" |
|
205 |
|
206 " psubsw %%mm1, %%mm0 \n\t" |
|
207 " psubsw %%mm3, %%mm2 \n\t" |
|
208 |
|
209 " paddw %%mm0, %%mm5 \n\t" |
|
210 " paddw %%mm2, %%mm5 \n\t" |
|
211 |
|
212 " pmaddwd %%mm0, %%mm0 \n\t" |
|
213 " pmaddwd %%mm2, %%mm2 \n\t" |
|
214 |
|
215 " paddd %%mm0, %%mm7 \n\t" |
|
216 " paddd %%mm2, %%mm7 \n\t" |
|
217 |
|
218 " add %5, %2 \n\t" /* Inc pointer into src data */ |
|
219 " add %6, %3 \n\t" /* Inc pointer into ref data */ |
|
220 " add %6, %4 \n\t" /* Inc pointer into ref data */ |
|
221 |
|
222 " dec %%edi \n\t" |
|
223 " jnz 1b \n\t" |
|
224 |
|
225 " movq %%mm5, %%mm0 \n\t" |
|
226 " psrlq $32, %%mm5 \n\t" |
|
227 " paddw %%mm0, %%mm5 \n\t" |
|
228 " movq %%mm5, %%mm0 \n\t" |
|
229 " psrlq $16, %%mm5 \n\t" |
|
230 " paddw %%mm0, %%mm5 \n\t" |
|
231 " movd %%mm5, %%edi \n\t" |
|
232 " movswl %%di, %%edi \n\t" |
|
233 " movl %%edi, %0 \n\t" |
|
234 |
|
235 " movq %%mm7, %%mm0 \n\t" |
|
236 " psrlq $32, %%mm7 \n\t" |
|
237 " paddd %%mm0, %%mm7 \n\t" |
|
238 " movd %%mm7, %1 \n\t" |
|
239 " emms \n\t" |
|
240 |
|
241 : "=m" (xsum), |
|
242 "=m" (xxsum), |
|
243 "+r" (src1), |
|
244 "+r" (src2), |
|
245 "+r" (src3) |
|
246 : "m" (ss1), |
|
247 "m" (ss2) |
|
248 : "edi", "memory" |
|
249 ); |
|
250 |
|
251 /* Compute and return population variance as mis-match metric. */ |
|
252 *dest = (((xxsum<<6) - xsum*xsum)); |
|
253 #endif |
|
254 } |
|
255 |
|
256 OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmx, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX); |
|
257 |
|
258 #ifdef ENABLE_BROKEN_IMPLS |
|
259 static void |
|
260 err_inter8x8_u8_avg_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, uint8_t *src3, int ss2) |
|
261 { |
|
262 uint32_t xsum; |
|
263 uint32_t xxsum; |
|
264 |
|
265 __asm__ __volatile__ ( |
|
266 " pxor %%mm4, %%mm4 \n\t" |
|
267 " pxor %%mm5, %%mm5 \n\t" |
|
268 " mov $0x01010101, %%edi \n\t" |
|
269 " movd %%edi, %%mm6 \n\t" |
|
270 " punpcklbw %%mm6, %%mm6 \n\t" |
|
271 " pxor %%mm7, %%mm7 \n\t" |
|
272 " mov $8, %%edi \n\t" |
|
273 "1: \n\t" |
|
274 " movq (%2), %%mm0 \n\t" /* take 8 bytes */ |
|
275 |
|
276 " movq (%3), %%mm2 \n\t" |
|
277 " movq (%4), %%mm1 \n\t" /* take average of mm2 and mm1 */ |
|
278 " movq %%mm1, %%mm3 \n\t" |
|
279 " pavgb %%mm2, %%mm1 \n\t" |
|
280 " pxor %%mm2, %%mm3 \n\t" |
|
281 " pand %%mm6, %%mm3 \n\t" |
|
282 " psubb %%mm3, %%mm1 \n\t" |
|
283 |
|
284 " movq %%mm0, %%mm2 \n\t" |
|
285 " movq %%mm1, %%mm3 \n\t" |
|
286 |
|
287 " punpcklbw %%mm4, %%mm0 \n\t" |
|
288 " punpcklbw %%mm4, %%mm1 \n\t" |
|
289 " punpckhbw %%mm4, %%mm2 \n\t" |
|
290 " punpckhbw %%mm4, %%mm3 \n\t" |
|
291 |
|
292 " psubsw %%mm1, %%mm0 \n\t" |
|
293 " psubsw %%mm3, %%mm2 \n\t" |
|
294 |
|
295 " paddw %%mm0, %%mm5 \n\t" |
|
296 " paddw %%mm2, %%mm5 \n\t" |
|
297 |
|
298 " pmaddwd %%mm0, %%mm0 \n\t" |
|
299 " pmaddwd %%mm2, %%mm2 \n\t" |
|
300 |
|
301 " paddd %%mm0, %%mm7 \n\t" |
|
302 " paddd %%mm2, %%mm7 \n\t" |
|
303 |
|
304 " add %5, %2 \n\t" /* Inc pointer into src data */ |
|
305 " add %6, %3 \n\t" /* Inc pointer into ref data */ |
|
306 " add %6, %4 \n\t" /* Inc pointer into ref data */ |
|
307 |
|
308 " dec %%edi \n\t" |
|
309 " jnz 1b \n\t" |
|
310 |
|
311 " movq %%mm5, %%mm0 \n\t" |
|
312 " psrlq $32, %%mm5 \n\t" |
|
313 " paddw %%mm0, %%mm5 \n\t" |
|
314 " movq %%mm5, %%mm0 \n\t" |
|
315 " psrlq $16, %%mm5 \n\t" |
|
316 " paddw %%mm0, %%mm5 \n\t" |
|
317 " movd %%mm5, %%edi \n\t" |
|
318 " movswl %%di, %%edi \n\t" |
|
319 " movl %%edi, %0 \n\t" |
|
320 |
|
321 " movq %%mm7, %%mm0 \n\t" |
|
322 " psrlq $32, %%mm7 \n\t" |
|
323 " paddd %%mm0, %%mm7 \n\t" |
|
324 " movd %%mm7, %1 \n\t" |
|
325 " emms \n\t" |
|
326 |
|
327 : "=m" (xsum), |
|
328 "=m" (xxsum), |
|
329 "+r" (src1), |
|
330 "+r" (src2), |
|
331 "+r" (src3) |
|
332 : "m" (ss1), |
|
333 "m" (ss2) |
|
334 : "edi", "memory" |
|
335 ); |
|
336 |
|
337 /* Compute and return population variance as mis-match metric. */ |
|
338 *dest = (((xxsum<<6) - xsum*xsum)); |
|
339 } |
|
340 |
|
341 OIL_DEFINE_IMPL_FULL (err_inter8x8_u8_avg_mmxext, err_inter8x8_u8_avg, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); |
|
342 #endif |
|
343 |
|
344 |
|
345 |
|
#ifdef __SYMBIAN32__
/* Symbian export getter: returns the implementation descriptor created
 * by OIL_DEFINE_IMPL_FULL above.  The previous code combined a function
 * definition with a second declarator (`..., err_intra8x8_u8()`), which
 * is not valid C; the stray declarator and comma expression are removed. */
OilFunctionImpl *
__oil_function_impl_err_intra8x8_u8_mmx (void)
{
  return &_oil_function_impl_err_intra8x8_u8_mmx;
}
#endif
|
352 |
|
#ifdef __SYMBIAN32__
/* Symbian export getter for the err_inter8x8_u8 MMX implementation.
 * Fixed: the original multi-declarator function definition and comma
 * return expression were not valid C. */
OilFunctionImpl *
__oil_function_impl_err_inter8x8_u8_mmx (void)
{
  return &_oil_function_impl_err_inter8x8_u8_mmx;
}
#endif
|
359 |
|
#ifdef __SYMBIAN32__
/* Symbian export getter for the err_inter8x8_u8_avg MMX implementation.
 * Fixed: the original multi-declarator function definition and comma
 * return expression were not valid C. */
OilFunctionImpl *
__oil_function_impl_err_inter8x8_u8_avg_mmx (void)
{
  return &_oil_function_impl_err_inter8x8_u8_avg_mmx;
}
#endif
|
366 |
|
/* Guarded on ENABLE_BROKEN_IMPLS as well: the descriptor
 * _oil_function_impl_err_inter8x8_u8_avg_mmxext is only defined when
 * that flag is set, so the unguarded getter failed to link otherwise. */
#if defined(__SYMBIAN32__) && defined(ENABLE_BROKEN_IMPLS)
/* Symbian export getter for the err_inter8x8_u8_avg MMXEXT implementation.
 * Fixed: the original multi-declarator function definition and comma
 * return expression were not valid C. */
OilFunctionImpl *
__oil_function_impl_err_inter8x8_u8_avg_mmxext (void)
{
  return &_oil_function_impl_err_inter8x8_u8_avg_mmxext;
}
#endif
|
373 |