/*
 * LIBOIL - Library of Optimized Inner Loops
 * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */
//Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
|
28 |
|
29 #ifdef HAVE_CONFIG_H |
|
30 #include "config.h" |
|
31 #endif |
|
32 |
|
33 #include <liboil/liboilfunction.h> |
|
34 |
|
35 OIL_DECLARE_CLASS (rowsad8x8_u8); |
|
36 OIL_DECLARE_CLASS (colsad8x8_u8); |
|
37 |
|
/*
 * rowsad8x8_u8_mmx:
 * @dest: receives the result
 * @src1: first run of 8 bytes
 * @src2: second run of 8 bytes
 *
 * Computes the sum of absolute byte differences for bytes 0-3 and for
 * bytes 4-7 of @src1/@src2 and writes the larger of the two sums to
 * *@dest.
 *
 * The MMX inline assembly is used on x86 targets; it is compiled out for
 * the __WINSCW__/__WINS__ compilers.  Whenever the assembly is not used an
 * equivalent C loop runs instead, so *@dest is always written.
 */
static void
rowsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, uint8_t *src2)
{
#if !defined(__WINSCW__) && !defined(__WINS__) && \
    (defined(__i386__) || defined(__x86_64__))
  uint32_t MaxSad;

  __asm__ __volatile__ (
    "  pxor        %%mm6, %%mm6     \n\t"   /* zero out mm6 for unpack */
    "  pxor        %%mm7, %%mm7     \n\t"   /* zero out mm7 for unpack */
    "  movq        (%1), %%mm0      \n\t"   /* take 8 bytes of src1 */
    "  movq        (%2), %%mm1      \n\t"   /* take 8 bytes of src2 */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t"   /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"   /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"   /* OR of both = |A - B| per byte */

    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm6, %%mm0     \n\t"   /* low four bytes -> 16-bit words */
    "  punpckhbw   %%mm7, %%mm1     \n\t"   /* high four bytes -> 16-bit words */

    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"
    "  psrlq       $32, %%mm2       \n\t"   /* fold and add: words 2,3 onto 0,1 */
    "  psrlq       $32, %%mm3       \n\t"
    "  paddw       %%mm2, %%mm0     \n\t"
    "  paddw       %%mm3, %%mm1     \n\t"
    "  movq        %%mm0, %%mm2     \n\t"
    "  movq        %%mm1, %%mm3     \n\t"
    "  psrlq       $16, %%mm2       \n\t"   /* fold again: word 1 onto word 0 */
    "  psrlq       $16, %%mm3       \n\t"
    "  paddw       %%mm2, %%mm0     \n\t"   /* word 0 of mm0 = SAD of bytes 0-3 */
    "  paddw       %%mm3, %%mm1     \n\t"   /* word 0 of mm1 = SAD of bytes 4-7 */

    "  psubusw     %%mm0, %%mm1     \n\t"
    "  paddw       %%mm0, %%mm1     \n\t"   /* mm1 = max(mm1, mm0) */
    "  movd        %%mm1, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"   /* only the low word is the result */
    "  emms                         \n\t"

     : "=m" (MaxSad)
     : "r" (src1),              /* read-only: the asm never changes them */
       "r" (src2)
     : "memory"
  );
  *dest = MaxSad;
#else
  /* Portable equivalent used when the MMX assembly is not compiled in. */
  uint32_t sad_lo = 0;          /* bytes 0-3 */
  uint32_t sad_hi = 0;          /* bytes 4-7 */
  int i;

  for (i = 0; i < 4; i++) {
    int d_lo = src1[i] - src2[i];
    int d_hi = src1[i + 4] - src2[i + 4];

    sad_lo += (uint32_t) (d_lo < 0 ? -d_lo : d_lo);
    sad_hi += (uint32_t) (d_hi < 0 ? -d_hi : d_hi);
  }
  *dest = (sad_lo > sad_hi) ? sad_lo : sad_hi;
#endif
}
|
/* Register rowsad8x8_u8_mmx as an implementation of the rowsad8x8_u8 class,
 * flagged OIL_IMPL_FLAG_MMX. */
OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmx, rowsad8x8_u8, OIL_IMPL_FLAG_MMX);
|
88 |
|
/*
 * rowsad8x8_u8_mmxext:
 * @dest: receives the result
 * @src1: first run of 8 bytes
 * @src2: second run of 8 bytes
 *
 * Computes the sum of absolute byte differences for bytes 0-3 and for
 * bytes 4-7 of @src1/@src2 and writes the larger of the two sums to
 * *@dest.  Uses psadbw/pmaxsw instead of the manual unpack-and-fold of
 * the plain MMX variant.
 *
 * The inline assembly is used on x86 targets; it is compiled out for the
 * __WINSCW__/__WINS__ compilers.  Whenever the assembly is not used an
 * equivalent C loop runs instead, so *@dest is always written.
 */
static void
rowsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, uint8_t *src2)
{
#if !defined(__WINSCW__) && !defined(__WINS__) && \
    (defined(__i386__) || defined(__x86_64__))
  uint32_t MaxSad;

  __asm__ __volatile__ (
    "  movd        (%1), %%mm0      \n\t"   /* bytes 0-3 of src1 (upper half zero) */
    "  movd        (%2), %%mm1      \n\t"   /* bytes 0-3 of src2 */
    "  psadbw      %%mm0, %%mm1     \n\t"   /* mm1 = SAD of bytes 0-3 */
    "  movd        4(%1), %%mm2     \n\t"   /* bytes 4-7 of src1 */
    "  movd        4(%2), %%mm3     \n\t"   /* bytes 4-7 of src2 */
    "  psadbw      %%mm2, %%mm3     \n\t"   /* mm3 = SAD of bytes 4-7 */

    "  pmaxsw      %%mm1, %%mm3     \n\t"   /* mm3 = max(mm3, mm1) */
    "  movd        %%mm3, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"   /* only the low word is the result */
    "  emms                         \n\t"

     : "=m" (MaxSad)
     : "r" (src1),              /* read-only: the asm never changes them */
       "r" (src2)
     : "memory"
  );
  *dest = MaxSad;
#else
  /* Portable equivalent used when the assembly is not compiled in. */
  uint32_t sad_lo = 0;          /* bytes 0-3 */
  uint32_t sad_hi = 0;          /* bytes 4-7 */
  int i;

  for (i = 0; i < 4; i++) {
    int d_lo = src1[i] - src2[i];
    int d_hi = src1[i + 4] - src2[i + 4];

    sad_lo += (uint32_t) (d_lo < 0 ? -d_lo : d_lo);
    sad_hi += (uint32_t) (d_hi < 0 ? -d_hi : d_hi);
  }
  *dest = (sad_lo > sad_hi) ? sad_lo : sad_hi;
#endif
}
|
117 OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmxext, rowsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); |
|
118 |
|
/*
 * colsad8x8_u8_mmx:
 * @dest: receives the result
 * @src1: first 8x8 byte block
 * @ss1: distance in bytes between consecutive rows of @src1
 * @src2: second 8x8 byte block
 * @ss2: distance in bytes between consecutive rows of @src2
 *
 * For each of the 8 columns, sums the absolute byte differences over
 * rows 0-3 and, separately, over rows 4-7.  The largest of those 16
 * column sums is written to *@dest.
 *
 * The MMX inline assembly is used on x86 targets; it is compiled out for
 * the __WINSCW__/__WINS__ compilers.  Whenever the assembly is not used an
 * equivalent C loop runs instead, so *@dest is always written.
 */
static void
colsad8x8_u8_mmx (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__) && \
    (defined(__i386__) || defined(__x86_64__))
  uint32_t MaxSad;

  __asm__ __volatile__ (
    "  pxor        %%mm3, %%mm3     \n\t"   /* zero out mm3 for unpack */
    "  pxor        %%mm4, %%mm4     \n\t"   /* mm4: rows 0-3, columns 0-3 */
    "  pxor        %%mm5, %%mm5     \n\t"   /* mm5: rows 0-3, columns 4-7 */
    "  pxor        %%mm6, %%mm6     \n\t"   /* mm6: rows 4-7, columns 0-3 */
    "  pxor        %%mm7, %%mm7     \n\t"   /* mm7: rows 4-7, columns 4-7 */
    "  mov         $4, %%edi        \n\t"   /* 4 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t"   /* take 8 bytes of src1 */
    "  movq        (%2), %%mm1      \n\t"   /* take 8 bytes of src2 */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t"   /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"   /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"   /* OR of both = |A - B| per byte */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t"   /* low four bytes -> 16-bit words */
    "  paddw       %%mm0, %%mm4     \n\t"   /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1     \n\t"   /* high four bytes -> 16-bit words */
    "  paddw       %%mm1, %%mm5     \n\t"   /* accumulate columns 4-7 */
    "  add         %3, %1           \n\t"   /* next row of src1 (+ss1) */
    "  add         %4, %2           \n\t"   /* next row of src2 (+ss2) */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    "  mov         $4, %%edi        \n\t"   /* 4 more rows */
    "2:                             \n\t"
    "  movq        (%1), %%mm0      \n\t"   /* take 8 bytes of src1 */
    "  movq        (%2), %%mm1      \n\t"   /* take 8 bytes of src2 */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t"   /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"   /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"   /* OR of both = |A - B| per byte */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t"   /* low four bytes -> 16-bit words */
    "  paddw       %%mm0, %%mm6     \n\t"   /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1     \n\t"   /* high four bytes -> 16-bit words */
    "  paddw       %%mm1, %%mm7     \n\t"   /* accumulate columns 4-7 */
    "  add         %3, %1           \n\t"   /* next row of src1 (+ss1) */
    "  add         %4, %2           \n\t"   /* next row of src2 (+ss2) */

    "  dec         %%edi            \n\t"
    "  jnz 2b                       \n\t"

    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t"   /* mm7 = max(mm7, mm6) */
    "  psubusw     %%mm4, %%mm5     \n\t"
    "  paddw       %%mm4, %%mm5     \n\t"   /* mm5 = max(mm5, mm4) */
    "  psubusw     %%mm5, %%mm7     \n\t"
    "  paddw       %%mm5, %%mm7     \n\t"   /* mm7 = max(mm7, mm5) */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $32, %%mm6       \n\t"
    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t"   /* words 0,1 = max with words 2,3 */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $16, %%mm6       \n\t"
    "  psubusw     %%mm6, %%mm7     \n\t"
    "  paddw       %%mm6, %%mm7     \n\t"   /* word 0 = max of all four words */
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"   /* only the low word is the result */
    "  emms                         \n\t"

     : "=r" (MaxSad),
       "+r" (src1),
       "+r" (src2)
     /* Strides are widened to long so the register width matches the
      * pointer registers they are added to (ILP32 and LP64 targets). */
     : "r" ((long) ss1),
       "r" ((long) ss2)
     : "memory", "edi"
  );
  *dest = MaxSad;
#else
  /* Portable equivalent used when the MMX assembly is not compiled in. */
  uint32_t col_sad[2][8] = { { 0 } };   /* [0]: rows 0-3, [1]: rows 4-7 */
  uint32_t max_sad = 0;
  int row;
  int col;

  for (row = 0; row < 8; row++) {
    const uint8_t *a = src1 + row * ss1;
    const uint8_t *b = src2 + row * ss2;

    for (col = 0; col < 8; col++) {
      int d = a[col] - b[col];

      col_sad[row / 4][col] += (uint32_t) (d < 0 ? -d : d);
    }
  }
  for (col = 0; col < 8; col++) {
    if (col_sad[0][col] > max_sad)
      max_sad = col_sad[0][col];
    if (col_sad[1][col] > max_sad)
      max_sad = col_sad[1][col];
  }
  *dest = max_sad;
#endif
}
|
/* Register colsad8x8_u8_mmx as an implementation of the colsad8x8_u8 class,
 * flagged OIL_IMPL_FLAG_MMX. */
OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmx, colsad8x8_u8, OIL_IMPL_FLAG_MMX);
|
201 |
|
/*
 * colsad8x8_u8_mmxext:
 * @dest: receives the result
 * @src1: first 8x8 byte block
 * @ss1: distance in bytes between consecutive rows of @src1
 * @src2: second 8x8 byte block
 * @ss2: distance in bytes between consecutive rows of @src2
 *
 * For each of the 8 columns, sums the absolute byte differences over
 * rows 0-3 and, separately, over rows 4-7.  The largest of those 16
 * column sums is written to *@dest.  The final reduction uses pmaxsw.
 *
 * The inline assembly is used on x86 targets; it is compiled out for the
 * __WINSCW__/__WINS__ compilers.  Whenever the assembly is not used an
 * equivalent C loop runs instead, so *@dest is always written.
 */
static void
colsad8x8_u8_mmxext (uint32_t *dest, uint8_t *src1, int ss1, uint8_t *src2, int ss2)
{
#if !defined(__WINSCW__) && !defined(__WINS__) && \
    (defined(__i386__) || defined(__x86_64__))
  uint32_t MaxSad;

  __asm__ __volatile__ (
    "  pxor        %%mm3, %%mm3     \n\t"   /* zero out mm3 for unpack */
    "  pxor        %%mm4, %%mm4     \n\t"   /* mm4: rows 0-3, columns 0-3 */
    "  pxor        %%mm5, %%mm5     \n\t"   /* mm5: rows 0-3, columns 4-7 */
    "  pxor        %%mm6, %%mm6     \n\t"   /* mm6: rows 4-7, columns 0-3 */
    "  pxor        %%mm7, %%mm7     \n\t"   /* mm7: rows 4-7, columns 4-7 */
    "  mov         $4, %%edi        \n\t"   /* 4 rows */
    "1:                             \n\t"
    "  movq        (%1), %%mm0      \n\t"   /* take 8 bytes of src1 */
    "  movq        (%2), %%mm1      \n\t"   /* take 8 bytes of src2 */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t"   /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"   /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"   /* OR of both = |A - B| per byte */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t"   /* low four bytes -> 16-bit words */
    "  paddw       %%mm0, %%mm4     \n\t"   /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1     \n\t"   /* high four bytes -> 16-bit words */
    "  paddw       %%mm1, %%mm5     \n\t"   /* accumulate columns 4-7 */
    "  add         %3, %1           \n\t"   /* next row of src1 (+ss1) */
    "  add         %4, %2           \n\t"   /* next row of src2 (+ss2) */

    "  dec         %%edi            \n\t"
    "  jnz 1b                       \n\t"

    "  mov         $4, %%edi        \n\t"   /* 4 more rows */
    "2:                             \n\t"
    "  movq        (%1), %%mm0      \n\t"   /* take 8 bytes of src1 */
    "  movq        (%2), %%mm1      \n\t"   /* take 8 bytes of src2 */

    "  movq        %%mm0, %%mm2     \n\t"
    "  psubusb     %%mm1, %%mm0     \n\t"   /* saturating A - B */
    "  psubusb     %%mm2, %%mm1     \n\t"   /* saturating B - A */
    "  por         %%mm1, %%mm0     \n\t"   /* OR of both = |A - B| per byte */
    "  movq        %%mm0, %%mm1     \n\t"

    "  punpcklbw   %%mm3, %%mm0     \n\t"   /* low four bytes -> 16-bit words */
    "  paddw       %%mm0, %%mm6     \n\t"   /* accumulate columns 0-3 */
    "  punpckhbw   %%mm3, %%mm1     \n\t"   /* high four bytes -> 16-bit words */
    "  paddw       %%mm1, %%mm7     \n\t"   /* accumulate columns 4-7 */
    "  add         %3, %1           \n\t"   /* next row of src1 (+ss1) */
    "  add         %4, %2           \n\t"   /* next row of src2 (+ss2) */

    "  dec         %%edi            \n\t"
    "  jnz 2b                       \n\t"

    "  pmaxsw      %%mm6, %%mm7     \n\t"   /* mm7 = max(mm7, mm6) */
    "  pmaxsw      %%mm4, %%mm5     \n\t"   /* mm5 = max(mm5, mm4) */
    "  pmaxsw      %%mm5, %%mm7     \n\t"   /* mm7 = max(mm7, mm5) */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $32, %%mm6       \n\t"
    "  pmaxsw      %%mm6, %%mm7     \n\t"   /* words 0,1 = max with words 2,3 */
    "  movq        %%mm7, %%mm6     \n\t"
    "  psrlq       $16, %%mm6       \n\t"
    "  pmaxsw      %%mm6, %%mm7     \n\t"   /* word 0 = max of all four words */
    "  movd        %%mm7, %0        \n\t"
    "  andl        $0xffff, %0      \n\t"   /* only the low word is the result */
    "  emms                         \n\t"

     : "=r" (MaxSad),
       "+r" (src1),
       "+r" (src2)
     /* Strides are widened to long so the register width matches the
      * pointer registers they are added to (ILP32 and LP64 targets). */
     : "r" ((long) ss1),
       "r" ((long) ss2)
     : "memory", "edi"
  );

  *dest = MaxSad;
#else
  /* Portable equivalent used when the assembly is not compiled in. */
  uint32_t col_sad[2][8] = { { 0 } };   /* [0]: rows 0-3, [1]: rows 4-7 */
  uint32_t max_sad = 0;
  int row;
  int col;

  for (row = 0; row < 8; row++) {
    const uint8_t *a = src1 + row * ss1;
    const uint8_t *b = src2 + row * ss2;

    for (col = 0; col < 8; col++) {
      int d = a[col] - b[col];

      col_sad[row / 4][col] += (uint32_t) (d < 0 ? -d : d);
    }
  }
  for (col = 0; col < 8; col++) {
    if (col_sad[0][col] > max_sad)
      max_sad = col_sad[0][col];
    if (col_sad[1][col] > max_sad)
      max_sad = col_sad[1][col];
  }
  *dest = max_sad;
#endif
}
|
279 OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmxext, colsad8x8_u8, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT); |
|
280 |
|
281 |
|
282 |
|
#ifdef __SYMBIAN32__

/*
 * Symbian accessor returning the address of the rowsad8x8_u8_mmx
 * implementation record.
 * NOTE(review): assumes OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmx, ...) defines
 * the object _oil_function_impl_rowsad8x8_u8_mmx -- confirm against the
 * macro definition.  The class name must not be part of either identifier.
 */
OilFunctionImpl* __oil_function_impl_rowsad8x8_u8_mmx(void) {
    return &_oil_function_impl_rowsad8x8_u8_mmx;
}
#endif
|
289 |
|
#ifdef __SYMBIAN32__

/*
 * Symbian accessor returning the address of the rowsad8x8_u8_mmxext
 * implementation record.
 * NOTE(review): assumes OIL_DEFINE_IMPL_FULL (rowsad8x8_u8_mmxext, ...)
 * defines the object _oil_function_impl_rowsad8x8_u8_mmxext -- confirm
 * against the macro definition.  The class name must not be part of either
 * identifier.
 */
OilFunctionImpl* __oil_function_impl_rowsad8x8_u8_mmxext(void) {
    return &_oil_function_impl_rowsad8x8_u8_mmxext;
}
#endif
|
296 |
|
#ifdef __SYMBIAN32__

/*
 * Symbian accessor returning the address of the colsad8x8_u8_mmx
 * implementation record.
 * NOTE(review): assumes OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmx, ...) defines
 * the object _oil_function_impl_colsad8x8_u8_mmx -- confirm against the
 * macro definition.  The class name must not be part of either identifier.
 */
OilFunctionImpl* __oil_function_impl_colsad8x8_u8_mmx(void) {
    return &_oil_function_impl_colsad8x8_u8_mmx;
}
#endif
|
303 |
|
#ifdef __SYMBIAN32__

/*
 * Symbian accessor returning the address of the colsad8x8_u8_mmxext
 * implementation record.
 * NOTE(review): assumes OIL_DEFINE_IMPL_FULL (colsad8x8_u8_mmxext, ...)
 * defines the object _oil_function_impl_colsad8x8_u8_mmxext -- confirm
 * against the macro definition.  The class name must not be part of either
 * identifier.
 */
OilFunctionImpl* __oil_function_impl_colsad8x8_u8_mmxext(void) {
    return &_oil_function_impl_colsad8x8_u8_mmxext;
}
#endif
|
310 |