|
/*
 * Copyright © 2004 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author: Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */
// Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <liboil/liboil.h>
#include <liboil/liboilfunction.h>

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */

typedef uint32_t CARD32;
typedef uint16_t CARD16;
typedef int16_t INT16;
typedef uint8_t CARD8;
typedef uint64_t ullong;
typedef CARD32* PicturePtr;
typedef CARD32* FbBits;
typedef int FbStride;

#include "fbmmx.h"
#include "fbpict.h"

#define CHECKPOINT()

OIL_DECLARE_CLASS (composite_in_argb);
OIL_DECLARE_CLASS (composite_in_argb_const_src);
OIL_DECLARE_CLASS (composite_in_argb_const_mask);
OIL_DECLARE_CLASS (composite_over_argb);
OIL_DECLARE_CLASS (composite_over_argb_const_src);
OIL_DECLARE_CLASS (composite_add_argb);
OIL_DECLARE_CLASS (composite_add_argb_const_src);
OIL_DECLARE_CLASS (composite_in_over_argb);
OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
OIL_DECLARE_CLASS (composite_over_u8);
OIL_DECLARE_CLASS (composite_add_u8);
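/* These function classes are defined elsewhere in liboil; declaring them
 * here lets the OIL_DEFINE_IMPL_FULL() calls below register the MMX/SSE
 * routines in this file as candidate implementations for them. */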
|
/* --------------- MMX code paths for fbcompose.c --------------------- */

#if 0
static void
mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const uint8_t *end = mask + width;
    while (mask < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        a = MmxAlpha(a);
        MmxMul(s, a);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}
#endif

#ifdef ENABLE_BROKEN_IMPLS
static void
mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const uint32_t *end = dest + width;

    while (dest < end) {
        __m64 x, y, a;
        x = MmxTo(*src);
        y = MmxTo(*dest);
        a = MmxAlpha(x);
        a = MmxNegate(a);
        MmxMulAdd(y, a, x);
        *dest = MmxFrom(y);
        ++dest;
        ++src;
    }
    _mm_empty();
}
OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX);
#endif

#if 0
static FASTCALL void
mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, y, a;
        x = MmxTo(*dest);
        y = MmxTo(*src);
        a = MmxAlpha(x);
        a = MmxNegate(a);
        MmxMulAdd(y, a, x);
        *dest = MmxFrom(y);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
static void
mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*src);
        a = MmxTo(*dest);
        a = MmxAlpha(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
static FASTCALL void
mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*dest);
        a = MmxTo(*src);
        a = MmxAlpha(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
static FASTCALL void
mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*src);
        a = MmxTo(*dest);
        a = MmxAlpha(a);
        a = MmxNegate(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

#if 0
static FASTCALL void
mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 x, a;
        x = MmxTo(*dest);
        a = MmxTo(*src);
        a = MmxAlpha(a);
        a = MmxNegate(a);
        MmxMul(x, a);
        *dest = MmxFrom(x);
        ++dest;
        ++src;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 s, da, d, sia;
        s = MmxTo(*src);
        d = MmxTo(*dest);
        sia = MmxAlpha(s);
        sia = MmxNegate(sia);
        da = MmxAlpha(d);
        MmxAddMul(s, da, d, sia);
        *dest = MmxFrom(s);
        ++dest;
        ++src;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end;

    end = dest + width;

    while (dest < end) {
        __m64 s, dia, d, sa;
        s = MmxTo(*src);
        d = MmxTo(*dest);
        sa = MmxAlpha(s);
        dia = MmxAlpha(d);
        dia = MmxNegate(dia);
        MmxAddMul(s, dia, d, sa);
        *dest = MmxFrom(s);
        ++dest;
        ++src;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = dest + width;

    while (dest < end) {
        __m64 s, dia, d, sia;
        s = MmxTo(*src);
        d = MmxTo(*dest);
        sia = MmxAlpha(s);
        dia = MmxAlpha(d);
        sia = MmxNegate(sia);
        dia = MmxNegate(dia);
        MmxAddMul(s, dia, d, sia);
        *dest = MmxFrom(s);
        ++dest;
        ++src;
    }
    _mm_empty();
}
#endif

static void
mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();

    const uint32_t *end = dest + width;
    while (dest < end) {
        __m64 s, d;
        s = MmxTo(*src);
        d = MmxTo(*dest);
        s = MmxAdd(s, d);
        *dest = MmxFrom(s);
        ++dest;
        ++src;
    }
    _mm_empty();
}
OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);

#if 0
static FASTCALL void
mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = dest + width;
    while (dest < end) {
        CARD32 s = *src;
        CARD32 d = *dest;
        __m64 ms = MmxTo(s);
        __m64 md = MmxTo(d);
        CARD32 sa = s >> 24;
        CARD32 da = ~d >> 24;

        if (sa > da) {
            __m64 msa = MmxTo(FbIntDiv(da, sa));
            msa = MmxAlpha(msa);
            MmxMul(ms, msa);
        }
        MmxAdd(md, ms);
        *dest = MmxFrom(md);
        ++src;
        ++dest;
    }
    _mm_empty();
}


static FASTCALL void
mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        MmxMul(s, a);
        *dest = MmxFrom(s);
        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 sa = MmxAlpha(s);
        MmxMul(s, a);
        MmxMul(a, sa);
        a = MmxNegate(a);
        MmxMulAdd(d, a, s);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        da = MmxNegate(da);
        MmxMul(s, a);
        MmxMulAdd(s, da, d);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}


static FASTCALL void
mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        MmxMul(s, a);
        MmxMul(s, da);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 sa = MmxAlpha(s);
        MmxMul(a, sa);
        MmxMul(d, a);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        da = MmxNegate(da);
        MmxMul(s, a);
        MmxMul(s, da);
        *dest = MmxFrom(s);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 sa = MmxAlpha(s);
        MmxMul(a, sa);
        a = MmxNegate(a);
        MmxMul(d, a);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        __m64 sa = MmxAlpha(s);
        MmxMul(s, a);
        MmxMul(a, sa);
        a = MmxNegate(a);
        MmxAddMul(d, a, s, da);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        __m64 sa = MmxAlpha(s);
        MmxMul(s, a);
        MmxMul(a, sa);
        da = MmxNegate(da);
        MmxAddMul(d, a, s, da);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
    const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        __m64 da = MmxAlpha(d);
        __m64 sa = MmxAlpha(s);
        MmxMul(s, a);
        MmxMul(a, sa);
        da = MmxNegate(da);
        a = MmxNegate(a);
        MmxAddMul(d, a, s, da);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

static FASTCALL void
mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
{
    const __m64 mmx_0 = _mm_setzero_si64();
    const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;

    const CARD32 *end = src + width;
    while (src < end) {
        __m64 a = MmxTo(*mask);
        __m64 s = MmxTo(*src);
        __m64 d = MmxTo(*dest);
        MmxMul(s, a);
        d = MmxAdd(s, d);
        *dest = MmxFrom(d);
        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty();
}

extern FbComposeFunctions composeFunctions;

void fbComposeSetupMMX(void)
{
    /* check if we have MMX support and initialize accordingly */
    if (fbHaveMMX()) {
        composeFunctions.combineU[PictOpOver] = mmxCombineOverU;
        composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
        composeFunctions.combineU[PictOpIn] = mmxCombineInU;
        composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
        composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
        composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
        composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
        composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
        composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
        composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
        composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;

        composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC;
        composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
        composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
        composeFunctions.combineC[PictOpIn] = mmxCombineInC;
        composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
        composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
        composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
        composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
        composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
        composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
        composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;

        composeFunctions.combineMaskU = mmxCombineMaskU;
    }
}
#endif


/* ------------------ MMX code paths called from fbpict.c ----------------------- */

typedef union {
    __m64 m64;
    uint64_t ull;
} m64_ull;

typedef struct
{
    m64_ull mmx_4x00ff;
    m64_ull mmx_4x0080;
    m64_ull mmx_565_rgb;
    m64_ull mmx_565_unpack_multiplier;
    m64_ull mmx_565_r;
    m64_ull mmx_565_g;
    m64_ull mmx_565_b;
    m64_ull mmx_mask_0;
    m64_ull mmx_mask_1;
    m64_ull mmx_mask_2;
    m64_ull mmx_mask_3;
    m64_ull mmx_full_alpha;
    m64_ull mmx_ffff0000ffff0000;
    m64_ull mmx_0000ffff00000000;
    m64_ull mmx_000000000000ffff;
} MMXData;

static const MMXData c =
{
    .mmx_4x00ff.ull = 0x00ff00ff00ff00ffULL,
    .mmx_4x0080.ull = 0x0080008000800080ULL,
    .mmx_565_rgb.ull = 0x000001f0003f001fULL,
    .mmx_565_r.ull = 0x000000f800000000ULL,
    .mmx_565_g.ull = 0x0000000000fc0000ULL,
    .mmx_565_b.ull = 0x00000000000000f8ULL,
    .mmx_mask_0.ull = 0xffffffffffff0000ULL,
    .mmx_mask_1.ull = 0xffffffff0000ffffULL,
    .mmx_mask_2.ull = 0xffff0000ffffffffULL,
    .mmx_mask_3.ull = 0x0000ffffffffffffULL,
    .mmx_full_alpha.ull = 0x00ff000000000000ULL,
    .mmx_565_unpack_multiplier.ull = 0x0000008404100840ULL,
    .mmx_ffff0000ffff0000.ull = 0xffff0000ffff0000ULL,
    .mmx_0000ffff00000000.ull = 0x0000ffff00000000ULL,
    .mmx_000000000000ffff.ull = 0x000000000000ffffULL,
};

#define MC(x) ((__m64) c.mmx_##x.m64)
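/* For example, MC(4x00ff) is the constant 0x00ff00ff00ff00ff as an __m64.
 * Routing the constants through the m64_ull union gives them a well-defined
 * __m64 representation without a direct integer-to-__m64 cast. */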
|
static __inline__ __m64
shift (__m64 v, int s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static __inline__ __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC(4x00ff));
}

static __inline__ __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC(4x0080));
    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
    res = _mm_srli_pi16 (res, 8);

    return res;
}
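/* Note: per 16-bit lane, pix_multiply computes the exactly rounded
 * (a * b) / 255 for 8-bit inputs: with t = a*b + 0x80, the result is
 * (t + (t >> 8)) >> 8.  E.g. a = b = 0xff gives t = 0xfe81,
 * t + (t >> 8) = 0xff7f, and 0xff7f >> 8 = 0xff. */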
|
static __inline__ __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
}
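/* E.g. for an unpacked pixel 0x00aa00rr00gg00bb this returns
 * 0x00aa00aa00aa00aa, ready to be multiplied into all four channels. */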
|
static __inline__ __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
}

static __inline__ __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
}

/* Notes about writing mmx code
 *
 * Give memory operands as the second operand.  If you give it as the
 * first, gcc will first load it into a register, then use that
 * register.
 *
 * i.e. use
 *
 *     _mm_mullo_pi16 (x, mmx_constant);
 *
 * not
 *
 *     _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies, i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
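/* Porter-Duff OVER for premultiplied pixels: dest = src + dest * (1 - srca),
 * where srca is the source alpha broadcast to all four lanes (see
 * expand_alpha) and the multiply is pix_multiply's rounded division
 * by 255. */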
|
static __inline__ __m64
over (__m64 src, __m64 srca, __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
}

static __inline__ __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));

    return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
}

static __inline__ __m64
in (__m64 src,
    __m64 mask)
{
    return pix_multiply (src, mask);
}

static __inline__ __m64
in_over (__m64 src,
         __m64 srca,
         __m64 mask,
         __m64 dest)
{
    return over(in(src, mask), pix_multiply(srca, mask), dest);
}

static __inline__ __m64
load8888 (CARD32 v)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
}

static __inline__ __m64
pack8888 (__m64 lo, __m64 hi)
{
    __m64 r;
    r = _mm_packs_pu16 (lo, hi);
    return r;
}

static __inline__ CARD32
store8888 (__m64 v)
{
    return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
}
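/* Illustrative sketch (not one of the registered code paths): compositing a
 * single premultiplied ARGB pixel with OVER using the helpers above.  The
 * real loops below process two pixels per __m64 where possible and issue
 * _mm_empty() once at the end of a run; a caller of this sketch would have
 * to do that itself before touching the FPU. */
static __inline__ CARD32
example_over_8888 (CARD32 s, CARD32 d)
{
    __m64 vsrc = load8888 (s);  /* unpack 8888 to 00aa00rr00gg00bb */
    return store8888 (over (vsrc, expand_alpha (vsrc), load8888 (d)));
}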
|
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static __inline__ __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
    p = shift (shift (p, (3 - pos) * 16), -48);

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC(565_rgb));

    pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
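/* Worked example: expanding the 565 pixel 0xf800 (pure red, r = 31) from
 * position 0 leaves 0x01f0 in the red lane after masking, and
 * 0x01f0 * 0x0084 = 0xffc0, so the final >> 8 yields 0xff -- the usual
 * (r << 3) | (r >> 2) 5-to-8-bit expansion.  The alpha lane comes out
 * zero, which is harmless because pack565 discards it. */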
|
static __inline__ __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
}

static __inline__ __m64
pack565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC(565_r));
    g = _mm_and_si64 (p, MC(565_g));
    b = _mm_and_si64 (p, MC(565_b));

    r = shift (r, - (32 - 8) + pos * 16);
    g = shift (g, - (16 - 3) + pos * 16);
    b = shift (b, - (0 + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC(mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC(mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC(mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC(mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
}

#ifdef ENABLE_BROKEN_IMPLS
/* broken. See Debian bug #340932 */
static void
fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
{
    __m64 vsrc, vsrca;

    vsrc = load8888 (*src);
    vsrca = expand_alpha (vsrc);

    while (w && (unsigned long)dst & 7)
    {
        *dst = store8888(over(vsrc, vsrca, load8888(*dst)));

        w--;
        dst++;
    }

    while (w >= 2)
    {
        __m64 vdest;
        __m64 dest0, dest1;

        vdest = *(__m64 *)dst;

        dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
        dest1 = over(vsrc, vsrca, expand8888(vdest, 1));

        *(__m64 *)dst = pack8888(dest0, dest1);

        dst += 2;
        w -= 2;
    }

    while (w)
    {
        *dst = store8888(over(vsrc, vsrca, load8888(*dst)));

        w--;
        dst++;
    }

    _mm_empty();
}
OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
    OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_MMXEXT);
#endif

#if 0
void
fbCompositeSolid_nx0565mmx (CARD8 op,
                            PicturePtr pSrc,
                            PicturePtr pMask,
                            PicturePtr pDst,
                            INT16 xSrc,
                            INT16 ySrc,
                            INT16 xMask,
                            INT16 yMask,
                            INT16 xDst,
                            INT16 yDst,
                            CARD16 width,
                            CARD16 height)
{
    CARD32 src;
    CARD16 *dstLine, *dst;
    CARD16 w;
    FbStride dstStride;
    __m64 vsrc, vsrca;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    if (src >> 24 == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        w = width;

        CHECKPOINT();

        while (w && (unsigned long)dst & 7)
        {
            ullong d = *dst;
            __m64 vdest = expand565 ((__m64)d, 0);
            vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
            *dst = (ullong)vdest;

            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m64 vdest;

            vdest = *(__m64 *)dst;

            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
            vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);

            *(__m64 *)dst = vdest;

            dst += 4;
            w -= 4;
        }

        CHECKPOINT();

        while (w)
        {
            ullong d = *dst;
            __m64 vdest = expand565 ((__m64)d, 0);
            vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
            *dst = (ullong)vdest;

            w--;
            dst++;
        }
    }

    _mm_empty();
}
#endif

#if 0
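/* The fragment below is an incomplete port: it still uses the twidth/p/q
 * locals and the vsrc/vsrca setup of the original fbpict.c version, which
 * were dropped when the signature was reduced to (dst, src, mask, w).  It
 * stays disabled here for reference. */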
|
static void
fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
{
    CARD32 src, srca;
    CARD32 *dstLine;
    CARD32 *maskLine;
    FbStride dstStride, maskStride;
    __m64 vsrc, vsrca;


    while (twidth && (unsigned long)q & 7)
    {
        CARD32 m = *(CARD32 *)p;

        if (m)
        {
            __m64 vdest = load8888(*q);
            vdest = in_over(vsrc, vsrca, load8888(m), vdest);
            *q = (ullong)pack8888(vdest, _mm_setzero_si64());
        }

        twidth--;
        p++;
        q++;
    }

    while (twidth >= 2)
    {
        CARD32 m0, m1;
        m0 = *p;
        m1 = *(p + 1);

        if (m0 | m1)
        {
            __m64 dest0, dest1;
            __m64 vdest = *(__m64 *)q;

            dest0 = in_over(vsrc, vsrca, load8888(m0),
                            expand8888 (vdest, 0));
            dest1 = in_over(vsrc, vsrca, load8888(m1),
                            expand8888 (vdest, 1));

            *(__m64 *)q = pack8888(dest0, dest1);
        }

        p += 2;
        q += 2;
        twidth -= 2;
    }

    while (twidth)
    {
        CARD32 m = *(CARD32 *)p;

        if (m)
        {
            __m64 vdest = load8888(*q);
            vdest = in_over(vsrc, vsrca, load8888(m), vdest);
            *q = (ullong)pack8888(vdest, _mm_setzero_si64());
        }

        twidth--;
        p++;
        q++;
    }

    _mm_empty();
}
#endif

#if 0
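/* fbCompositeSrc_8888x8x8888mmx below is likewise an incomplete port (the
 * maskLine/srcLine/dstLine/height setup is missing), while the functions
 * after it still carry their full X server signatures.  All of it is
 * disabled. */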
|
static void
fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
                               int width)
{

    mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
    vmask = load8888 (mask);
    srca = MC(4x00ff);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());

            w--;
            dst++;
            src++;
        }

        while (w >= 16)
        {
            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = *(__m64 *)(src + 0);
            __m64 vs1 = *(__m64 *)(src + 2);
            __m64 vs2 = *(__m64 *)(src + 4);
            __m64 vs3 = *(__m64 *)(src + 6);
            __m64 vs4 = *(__m64 *)(src + 8);
            __m64 vs5 = *(__m64 *)(src + 10);
            __m64 vs6 = *(__m64 *)(src + 12);
            __m64 vs7 = *(__m64 *)(src + 14);

            vd0 = (__m64)pack8888 (
                in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

            vd1 = (__m64)pack8888 (
                in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

            vd2 = (__m64)pack8888 (
                in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

            vd3 = (__m64)pack8888 (
                in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

            vd4 = (__m64)pack8888 (
                in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

            vd5 = (__m64)pack8888 (
                in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

            vd6 = (__m64)pack8888 (
                in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

            vd7 = (__m64)pack8888 (
                in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            w -= 16;
            dst += 16;
            src += 16;
        }

        while (w)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty();
}

void
fbCompositeSrc_8888x8888mmx (CARD8 op,
                             PicturePtr pSrc,
                             PicturePtr pMask,
                             PicturePtr pDst,
                             INT16 xSrc,
                             INT16 ySrc,
                             INT16 xMask,
                             INT16 yMask,
                             INT16 xDst,
                             INT16 yDst,
                             CARD16 width,
                             CARD16 height)
{
    CARD32 *dstLine, *dst;
    CARD32 *srcLine, *src;
    FbStride dstStride, srcStride;
    CARD16 w;
    __m64 srca;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);

    srca = MC (4x00ff);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            __m64 vd = *(__m64 *)(dst + 0);
            __m64 vs = *(__m64 *)(src + 0);
            __m64 vs0 = expand8888 (vs, 0);
            __m64 vs1 = expand8888 (vs, 1);

            *(__m64 *)dst = (__m64)pack8888 (
                over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
                over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));

            w -= 2;
            dst += 2;
            src += 2;
        }

        while (w)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
                                     (__m64)_mm_setzero_si64());

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty();
}

void
fbCompositeSolidMask_nx8x8888mmx (CARD8 op,
                                  PicturePtr pSrc,
                                  PicturePtr pMask,
                                  PicturePtr pDst,
                                  INT16 xSrc,
                                  INT16 ySrc,
                                  INT16 xMask,
                                  INT16 yMask,
                                  INT16 xDst,
                                  INT16 yDst,
                                  CARD16 width,
                                  CARD16 height)
{
    CARD32 src, srca;
    CARD32 *dstLine, *dst;
    CARD8 *maskLine, *mask;
    FbStride dstStride, maskStride;
    CARD16 w;
    __m64 vsrc, vsrca;
    ullong srcsrc;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    srca = src >> 24;
    if (srca == 0)
        return;

    srcsrc = (unsigned long long)src << 32 | src;

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        CHECKPOINT();

        while (w && (unsigned long)dst & 7)
        {
            ullong m = *mask;

            if (m)
            {
                __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
                *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT();

        while (w >= 2)
        {
            ullong m0, m1;
            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(unsigned long long *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 vdest;
                __m64 dest0, dest1;

                vdest = *(__m64 *)dst;

                dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
                dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));

                *(__m64 *)dst = pack8888(dest0, dest1);
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT();

        while (w)
        {
            ullong m = *mask;

            if (m)
            {
                __m64 vdest = load8888(*dst);
                vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
                *dst = (ullong)pack8888(vdest, _mm_setzero_si64());
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty();
}


void
fbCompositeSolidMask_nx8x0565mmx (CARD8 op,
                                  PicturePtr pSrc,
                                  PicturePtr pMask,
                                  PicturePtr pDst,
                                  INT16 xSrc,
                                  INT16 ySrc,
                                  INT16 xMask,
                                  INT16 yMask,
                                  INT16 xDst,
                                  INT16 yDst,
                                  CARD16 width,
                                  CARD16 height)
{
    CARD32 src, srca;
    CARD16 *dstLine, *dst;
    CARD8 *maskLine, *mask;
    FbStride dstStride, maskStride;
    CARD16 w;
    __m64 vsrc, vsrca;
    unsigned long long srcsrcsrcsrc, src16;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    srca = src >> 24;
    if (srca == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);

    srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
                   (ullong)src16 << 16 | (ullong)src16;

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        mask = maskLine;
        maskLine += maskStride;
        w = width;

        CHECKPOINT();

        while (w && (unsigned long)dst & 7)
        {
            ullong m = *mask;

            if (m)
            {
                ullong d = *dst;
                __m64 vd = (__m64)d;
                __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
                *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT();

        while (w >= 4)
        {
            ullong m0, m1, m2, m3;
            m0 = *mask;
            m1 = *(mask + 1);
            m2 = *(mask + 2);
            m3 = *(mask + 3);

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                *(unsigned long long *)dst = srcsrcsrcsrc;
            }
            else if (m0 | m1 | m2 | m3)
            {
                __m64 vdest;
                __m64 vm0, vm1, vm2, vm3;

                vdest = *(__m64 *)dst;

                vm0 = (__m64)m0;
                vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
                vm1 = (__m64)m1;
                vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
                vm2 = (__m64)m2;
                vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
                vm3 = (__m64)m3;
                vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }

            w -= 4;
            mask += 4;
            dst += 4;
        }

        CHECKPOINT();

        while (w)
        {
            ullong m = *mask;

            if (m)
            {
                ullong d = *dst;
                __m64 vd = (__m64)d;
                __m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
                *dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty();
}

void
fbCompositeSrc_8888RevNPx0565mmx (CARD8 op,
                                  PicturePtr pSrc,
                                  PicturePtr pMask,
                                  PicturePtr pDst,
                                  INT16 xSrc,
                                  INT16 ySrc,
                                  INT16 xMask,
                                  INT16 yMask,
                                  INT16 xDst,
                                  INT16 yDst,
                                  CARD16 width,
                                  CARD16 height)
{
    CARD16 *dstLine, *dst;
    CARD32 *srcLine, *src;
    FbStride dstStride, srcStride;
    CARD16 w;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);

    assert (pSrc->pDrawable == pMask->pDrawable);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        CHECKPOINT();

        while (w && (unsigned long)dst & 7)
        {
            __m64 vsrc = load8888 (*src);
            ullong d = *dst;
            __m64 vdest = expand565 ((__m64)d, 0);

            vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

            *dst = (ullong)vdest;

            w--;
            dst++;
            src++;
        }

        CHECKPOINT();

        while (w >= 4)
        {
            CARD32 s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            s0 = *src;
            s1 = *(src + 1);
            s2 = *(src + 2);
            s3 = *(src + 3);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);
            a2 = (s2 >> 24);
            a3 = (s3 >> 24);

            if ((a0 & a1 & a2 & a3) == 0xFF)
            {
                __m64 vdest;
                vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
                vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
                vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
                vdest = pack565(invert_colors(load8888(s3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }
            else if (a0 | a1 | a2 | a3)
            {
                __m64 vdest = *(__m64 *)dst;

                vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
                vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
                vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
                vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT();

        while (w)
        {
            __m64 vsrc = load8888 (*src);
            ullong d = *dst;
            __m64 vdest = expand565 ((__m64)d, 0);

            vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);

            *dst = (ullong)vdest;

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty();
}

/* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */

void
fbCompositeSrc_8888RevNPx8888mmx (CARD8 op,
                                  PicturePtr pSrc,
                                  PicturePtr pMask,
                                  PicturePtr pDst,
                                  INT16 xSrc,
                                  INT16 ySrc,
                                  INT16 xMask,
                                  INT16 yMask,
                                  INT16 xDst,
                                  INT16 yDst,
                                  CARD16 width,
                                  CARD16 height)
{
    CARD32 *dstLine, *dst;
    CARD32 *srcLine, *src;
    FbStride dstStride, srcStride;
    CARD16 w;

    CHECKPOINT();

    fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
    fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);

    assert (pSrc->pDrawable == pMask->pDrawable);

    while (height--)
    {
        dst = dstLine;
        dstLine += dstStride;
        src = srcLine;
        srcLine += srcStride;
        w = width;

        while (w && (unsigned long)dst & 7)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            ullong s0, s1;
            unsigned char a0, a1;
            __m64 d0, d1;

            s0 = *src;
            s1 = *(src + 1);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);

            if ((a0 & a1) == 0xFF)
            {
                d0 = invert_colors(load8888(s0));
                d1 = invert_colors(load8888(s1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }
            else if (a0 | a1)
            {
                __m64 vdest = *(__m64 *)dst;

                d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
                d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }

            w -= 2;
            dst += 2;
            src += 2;
        }

        while (w)
        {
            __m64 s = load8888 (*src);
            __m64 d = load8888 (*dst);

            *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty();
}

void
fbCompositeSolidMask_nx8888x0565Cmmx (CARD8 op,
                                      PicturePtr pSrc,
                                      PicturePtr pMask,
                                      PicturePtr pDst,
                                      INT16 xSrc,
                                      INT16 ySrc,
                                      INT16 xMask,
                                      INT16 yMask,
                                      INT16 xDst,
                                      INT16 yDst,
                                      CARD16 width,
                                      CARD16 height)
{
    CARD32 src, srca;
    CARD16 *dstLine;
    CARD32 *maskLine;
    FbStride dstStride, maskStride;
    __m64 vsrc, vsrca;

    CHECKPOINT();

    fbComposeGetSolid(pSrc, src, pDst->format);

    srca = src >> 24;
    if (srca == 0)
        return;

    fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
    fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        CARD32 *p = (CARD32 *)maskLine;
        CARD16 *q = (CARD16 *)dstLine;

        while (twidth && ((unsigned long)q & 7))
        {
            CARD32 m = *(CARD32 *)p;

            if (m)
            {
                ullong d = *q;
                __m64 vdest = expand565 ((__m64)d, 0);
                vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
                *q = (ullong)vdest;
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 4)
        {
            CARD32 m0, m1, m2, m3;

            m0 = *p;
            m1 = *(p + 1);
            m2 = *(p + 2);
            m3 = *(p + 3);

            if ((m0 | m1 | m2 | m3))
            {
                __m64 vdest = *(__m64 *)q;

                vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
                vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
                vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
                vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);

                *(__m64 *)q = vdest;
            }
            twidth -= 4;
            p += 4;
            q += 4;
        }

        while (twidth)
        {
            CARD32 m;

            m = *(CARD32 *)p;
            if (m)
            {
                ullong d = *q;
                __m64 vdest = expand565((__m64)d, 0);
                vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
                *q = (ullong)vdest;
            }

            twidth--;
            p++;
            q++;
        }

        maskLine += maskStride;
        dstLine += dstStride;
    }

    _mm_empty ();
}
#endif

static void
fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w)
{
    int s;
    int d;
    int t;

    while (w && (unsigned long)dst & 7)
    {
        s = *src;
        d = *dst;
        t = d + s;
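        /* Branch-free 8-bit saturation follows: if t overflowed past 0xff,
         * t >> 8 is 1, so 0 - (t >> 8) is all ones and the OR clamps the
         * byte to 0xff; otherwise it leaves t unchanged. */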
|
1878 s = t | (0 - (t >> 8)); |
|
1879 *dst = s; |
|
1880 |
|
1881 dst++; |
|
1882 src++; |
|
1883 w--; |
|
1884 } |
|
1885 |
|
1886 while (w >= 8) |
|
1887 { |
|
1888 *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
|
1889 dst += 8; |
|
1890 src += 8; |
|
1891 w -= 8; |
|
1892 } |
|
1893 |
|
1894 while (w) |
|
1895 { |
|
1896 s = *src; |
|
1897 d = *dst; |
|
1898 t = d + s; |
|
1899 s = t | (0 - (t >> 8)); |
|
1900 *dst = s; |
|
1901 |
|
1902 dst++; |
|
1903 src++; |
|
1904 w--; |
|
1905 } |
|
1906 |
|
1907 _mm_empty(); |
|
1908 } |
|
1909 OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX); |
|
1910 |
|
1911 static void |
|
1912 fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w) |
|
1913 { |
|
1914 while (w && (unsigned long)dst & 7) |
|
1915 { |
|
1916 *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
|
1917 _mm_cvtsi32_si64(*dst))); |
|
1918 dst++; |
|
1919 src++; |
|
1920 w--; |
|
1921 } |
|
1922 |
|
1923 while (w >= 2) |
|
1924 { |
|
1925 *(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst); |
|
1926 dst += 2; |
|
1927 src += 2; |
|
1928 w -= 2; |
|
1929 } |
|
1930 |
|
1931 if (w) |
|
1932 { |
|
1933 *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src), |
|
1934 _mm_cvtsi32_si64(*dst))); |
|
1935 |
|
1936 } |
|
1937 |
|
1938 _mm_empty(); |
|
1939 } |
|
1940 OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE); |
|
1941 |
|
1942 #if 0 |
|
1943 #define GetStart(drw,x,y,type,stride,line,bpp) {\ |
|
1944 FbBits *__bits__; \ |
|
1945 FbStride __stride__; \ |
|
1946 int __xoff__,__yoff__; \ |
|
1947 \ |
|
1948 fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__); \ |
|
1949 (stride) = __stride__ * sizeof (FbBits) / sizeof (type); \ |
|
1950 (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__); \ |
|
1951 } |
|
1952 |
|
1953 Bool |
|
1954 fbSolidFillmmx (DrawablePtr pDraw, |
|
1955 int x, |
|
1956 int y, |
|
1957 int width, |
|
1958 int height, |
|
1959 FbBits xor) |
|
1960 { |
|
1961 FbStride stride; |
|
1962 int bpp; |
|
1963 ullong fill; |
|
1964 __m64 vfill; |
|
1965 CARD32 byte_width; |
|
1966 CARD8 *byte_line; |
|
1967 FbBits *bits; |
|
1968 int xoff, yoff; |
|
1969 |
|
1970 CHECKPOINT(); |
|
1971 |
|
1972 fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff); |
|
1973 |
|
1974 if (bpp == 16 && (xor >> 16 != (xor & 0xffff))) |
|
1975 return FALSE; |
|
1976 |
|
1977 if (bpp != 16 && bpp != 32) |
|
1978 return FALSE; |
|
1979 |
|
1980 if (bpp == 16) |
|
1981 { |
|
1982 stride = stride * sizeof (FbBits) / 2; |
|
1983 byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff)); |
|
1984 byte_width = 2 * width; |
|
1985 stride *= 2; |
|
1986 } |
|
1987 else |
|
1988 { |
|
1989 stride = stride * sizeof (FbBits) / 4; |
|
1990 byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff)); |
|
1991 byte_width = 4 * width; |
|
1992 stride *= 4; |
|
1993 } |
|
1994 |
|
1995 fill = ((ullong)xor << 32) | xor; |
|
1996 vfill = (__m64)fill; |
|
1997 |
|
1998 while (height--) |
|
1999 { |
|
2000 int w; |
|
2001 CARD8 *d = byte_line; |
|
2002 byte_line += stride; |
|
2003 w = byte_width; |
|
2004 |
|
2005 while (w >= 2 && ((unsigned long)d & 3)) |
|
2006 { |
|
2007 *(CARD16 *)d = xor; |
|
2008 w -= 2; |
|
2009 d += 2; |
|
2010 } |
|
2011 |
|
2012 while (w >= 4 && ((unsigned long)d & 7)) |
|
2013 { |
|
2014 *(CARD32 *)d = xor; |
|
2015 |
|
2016 w -= 4; |
|
2017 d += 4; |
|
2018 } |
|
2019 |
|
2020 while (w >= 64) |
|
2021 { |
|
2022 *(__m64*) (d + 0) = vfill; |
|
2023 *(__m64*) (d + 8) = vfill; |
|
2024 *(__m64*) (d + 16) = vfill; |
|
2025 *(__m64*) (d + 24) = vfill; |
|
2026 *(__m64*) (d + 32) = vfill; |
|
2027 *(__m64*) (d + 40) = vfill; |
|
2028 *(__m64*) (d + 48) = vfill; |
|
2029 *(__m64*) (d + 56) = vfill; |
|
2030 |
|
2031 w -= 64; |
|
2032 d += 64; |
|
2033 } |
|
2034 while (w >= 4) |
|
2035 { |
|
2036 *(CARD32 *)d = xor; |
|
2037 |
|
2038 w -= 4; |
|
2039 d += 4; |
|
2040 } |
|
2041 if (w >= 2) |
|
2042 { |
|
2043 *(CARD16 *)d = xor; |
|
2044 w -= 2; |
|
2045 d += 2; |
|
2046 } |
|
2047 } |
|
2048 |
|
2049 _mm_empty(); |
|
2050 return TRUE; |
|
2051 } |

Bool
fbCopyAreammx (DrawablePtr pSrc,
	       DrawablePtr pDst,
	       int src_x,
	       int src_y,
	       int dst_x,
	       int dst_y,
	       int width,
	       int height)
{
    FbBits *src_bits;
    FbStride src_stride;
    int src_bpp;
    int src_xoff;
    int src_yoff;

    FbBits *dst_bits;
    FbStride dst_stride;
    int dst_bpp;
    int dst_xoff;
    int dst_yoff;

    CARD8 *src_bytes;
    CARD8 *dst_bytes;
    int byte_width;

    fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
    fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);

    /* only matching 16bpp or 32bpp source and destination are handled */
    if (src_bpp != 16 && src_bpp != 32)
	return FALSE;

    if (dst_bpp != 16 && dst_bpp != 32)
	return FALSE;

    if (src_bpp != dst_bpp)
    {
	return FALSE;
    }

    if (src_bpp == 16)
    {
	src_stride = src_stride * sizeof (FbBits) / 2;
	dst_stride = dst_stride * sizeof (FbBits) / 2;
	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else
    {
	src_stride = src_stride * sizeof (FbBits) / 4;
	dst_stride = dst_stride * sizeof (FbBits) / 4;
	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }

    while (height--)
    {
	int w;
	CARD8 *s = src_bytes;
	CARD8 *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* head: align the destination (the source may stay unaligned) */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(CARD16 *)d = *(CARD16 *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(CARD32 *)d = *(CARD32 *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* body: eight 8-byte MMX moves per iteration */
	while (w >= 64)
	{
	    *(__m64 *)(d + 0) = *(__m64 *)(s + 0);
	    *(__m64 *)(d + 8) = *(__m64 *)(s + 8);
	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
	    w -= 64;
	    s += 64;
	    d += 64;
	}
	/* tail */
	while (w >= 4)
	{
	    *(CARD32 *)d = *(CARD32 *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(CARD16 *)d = *(CARD16 *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    _mm_empty();
    return TRUE;
}
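
/*
 * Illustrative sketch (not from the original code): the row loop above
 * always walks top-down, so fbCopyAreammx is only safe when source and
 * destination do not overlap. An overlap-safe per-row fallback would pick
 * the copy direction first. The helper name is hypothetical; it assumes
 * equal strides (same drawable) and that <string.h> is available.
 */
static void
copy_rows_overlap_safe (CARD8 *dst, const CARD8 *src,
			int stride, int byte_width, int height)
{
    if (dst > src) {
	/* copy bottom-up so source rows are read before being overwritten */
	dst += (height - 1) * stride;
	src += (height - 1) * stride;
	while (height--) {
	    memmove (dst, src, byte_width);
	    dst -= stride;
	    src -= stride;
	}
    } else {
	while (height--) {
	    memmove (dst, src, byte_width);
	    dst += stride;
	    src += stride;
	}
    }
}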

void
fbCompositeCopyAreammx (CARD8	op,
			PicturePtr	pSrc,
			PicturePtr	pMask,
			PicturePtr	pDst,
			INT16	xSrc,
			INT16	ySrc,
			INT16	xMask,
			INT16	yMask,
			INT16	xDst,
			INT16	yDst,
			CARD16	width,
			CARD16	height)
{
    /* op and the mask arguments are ignored: this entry point is a
     * straight copy of the source drawable */
    fbCopyAreammx (pSrc->pDrawable,
		   pDst->pDrawable,
		   xSrc, ySrc,
		   xDst, yDst,
		   width, height);
}

#if !defined(__amd64__) && !defined(__x86_64__)

enum CPUFeatures {
    NoFeatures = 0,
    MMX = 0x1,
    MMX_Extensions = 0x2,
    SSE = 0x6,		/* 0x4 | MMX_Extensions: SSE implies the MMX extensions */
    SSE2 = 0x8,
    CMOV = 0x10
};

static unsigned int detectCPUFeatures(void) {
    unsigned int result;
    char vendor[13];
    vendor[0] = 0;
    vendor[12] = 0;
    /* see p. 118 of amd64 instruction set manual Vol3 */
    /* toggle EFLAGS.ID to test for CPUID support; if present, read the
     * vendor string from leaf 0 and the feature bits from leaf 1 */
    __asm__ ("push %%ebx\n"
	     "pushf\n"
	     "pop %%eax\n"
	     "mov %%eax, %%ebx\n"
	     "xor $0x00200000, %%eax\n"
	     "push %%eax\n"
	     "popf\n"
	     "pushf\n"
	     "pop %%eax\n"
	     "mov $0x0, %%edx\n"
	     "xor %%ebx, %%eax\n"
	     "jz skip\n"

	     "mov $0x00000000, %%eax\n"
	     "cpuid\n"
	     "mov %%ebx, %1\n"
	     "mov %%edx, %2\n"
	     "mov %%ecx, %3\n"
	     "mov $0x00000001, %%eax\n"
	     "cpuid\n"
	     "skip:\n"
	     "pop %%ebx\n"
	     "mov %%edx, %0\n"
	     : "=r" (result),
	       "=m" (vendor[0]),
	       "=m" (vendor[4]),
	       "=m" (vendor[8])
	     :
	     : "%eax", "%ecx", "%edx"
	);

    unsigned int features = 0;
    if (result) {
	/* result now contains the standard feature bits */
	if (result & (1 << 15))
	    features |= CMOV;
	if (result & (1 << 23))
	    features |= MMX;
	if (result & (1 << 25))
	    features |= SSE;
	if (result & (1 << 26))
	    features |= SSE2;
	if ((features & MMX) && !(features & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) {
	    /* check for AMD MMX extensions */

	    unsigned int result;	/* deliberately shadows the outer result */
	    __asm__("push %%ebx\n"
		    "mov $0x80000000, %%eax\n"
		    "cpuid\n"
		    "xor %%edx, %%edx\n"
		    "cmp $0x1, %%eax\n"
		    "jge skip2\n"
		    "mov $0x80000001, %%eax\n"
		    "cpuid\n"
		    "skip2:\n"
		    "mov %%edx, %0\n"
		    "pop %%ebx\n"
		    : "=r" (result)
		    :
		    : "%eax", "%ecx", "%edx"
		);
	    if (result & (1<<22))
		features |= MMX_Extensions;
	}
    }
    return features;
}
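
/*
 * Illustrative sketch (not from the original code): with GCC or clang the
 * same standard feature bits can be read via __get_cpuid() from <cpuid.h>
 * instead of hand-rolled inline assembly. The AMD MMX-extensions probe via
 * leaf 0x80000001 is omitted here for brevity.
 */
#if 0
#include <cpuid.h>

static unsigned int
detectCPUFeatures_cpuid_h (void)
{
    unsigned int eax, ebx, ecx, edx;
    unsigned int features = 0;

    if (__get_cpuid (1, &eax, &ebx, &ecx, &edx)) {
	if (edx & (1 << 15)) features |= CMOV;
	if (edx & (1 << 23)) features |= MMX;
	if (edx & (1 << 25)) features |= SSE;
	if (edx & (1 << 26)) features |= SSE2;
    }
    return features;
}
#endif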

Bool
fbHaveMMX (void)
{
    static Bool initialized = FALSE;
    static Bool mmx_present;

    if (!initialized)
    {
	unsigned int features = detectCPUFeatures();
	mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions);
	initialized = TRUE;
    }

    return mmx_present;
}
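
/*
 * Usage note (illustrative, not from the original code): the result of
 * detectCPUFeatures() is cached in the statics above, so callers can
 * cheaply gate every MMX entry point, e.g.
 *
 *	if (fbHaveMMX ())
 *	    fbSolidFillmmx (pDraw, x, y, width, height, xor);
 */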

#endif /* !__amd64__ && !__x86_64__ */


#endif /* #if 0 */


/* On Symbian, each OilFunctionImpl record is reached through an exported
 * getter function rather than a data symbol. */
#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mmxCombineOverU() {
	return &_oil_function_impl_mmxCombineOverU;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_mmxCombineAddU() {
	return &_oil_function_impl_mmxCombineAddU;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_fbCompositeSolid_nx8888mmx() {
	return &_oil_function_impl_fbCompositeSolid_nx8888mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8000x8000mmx() {
	return &_oil_function_impl_fbCompositeSrcAdd_8000x8000mmx;
}
#endif

#ifdef __SYMBIAN32__

OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8888x8888mmx() {
	return &_oil_function_impl_fbCompositeSrcAdd_8888x8888mmx;
}
#endif
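
/*
 * Illustrative sketch (not from the original code): the repeated Symbian
 * stanzas above are mechanical, so a hypothetical helper macro could
 * generate any one of them:
 */
#if 0
#define OIL_IMPL_GETTER(func) \
OilFunctionImpl* __oil_function_impl_ ## func() { \
	return &_oil_function_impl_ ## func; \
}

OIL_IMPL_GETTER (fbCompositeSrcAdd_8888x8888mmx)
#endif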