genericopenlibs/liboil/src/fb/fbmmx.c
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 /*
       
     2  * Copyright © 2004 Red Hat, Inc.
       
     3  * Copyright © 2004 Nicholas Miell
       
     4  * Copyright © 2005 Trolltech AS
       
     5  *
       
     6  * Permission to use, copy, modify, distribute, and sell this software and its
       
     7  * documentation for any purpose is hereby granted without fee, provided that
       
     8  * the above copyright notice appear in all copies and that both that
       
     9  * copyright notice and this permission notice appear in supporting
       
    10  * documentation, and that the name of Red Hat not be used in advertising or
       
    11  * publicity pertaining to distribution of the software without specific,
       
    12  * written prior permission.  Red Hat makes no representations about the
       
    13  * suitability of this software for any purpose.  It is provided "as is"
       
    14  * without express or implied warranty.
       
    15  *
       
    16  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
       
    17  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
       
    18  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
       
    19  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
       
    20  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
       
    21  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
       
    22  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
       
    23  * SOFTWARE.
       
    24  *
       
    25  * Author:  Søren Sandmann (sandmann@redhat.com)
       
    26  * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
       
    27  * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) 
       
    28  *
       
    29  * Based on work by Owen Taylor
       
    30  */
       
    31 //Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
       
    32 
       
    33 #ifdef HAVE_CONFIG_H
       
    34 #include "config.h"
       
    35 #endif
       
    36 
       
    37 #include <liboil/liboil.h>
       
    38 #include <liboil/liboilfunction.h>
       
    39  
       
    40 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
       
    41 
       
    42 typedef uint32_t CARD32;
       
    43 typedef uint16_t CARD16;
       
    44 typedef int16_t INT16;
       
    45 typedef uint8_t CARD8;
       
    46 typedef uint64_t ullong;
       
    47 typedef CARD32* PicturePtr;
       
    48 typedef CARD32* FbBits;
       
    49 typedef int FbStride;
       
    50 
       
    51 
       
    52 #include "fbmmx.h"
       
    53 #include "fbpict.h"
       
    54 
       
    55 #define CHECKPOINT()
       
    56 
       
    57 OIL_DECLARE_CLASS (composite_in_argb);
       
    58 OIL_DECLARE_CLASS (composite_in_argb_const_src);
       
    59 OIL_DECLARE_CLASS (composite_in_argb_const_mask);
       
    60 OIL_DECLARE_CLASS (composite_over_argb);
       
    61 OIL_DECLARE_CLASS (composite_over_argb_const_src);
       
    62 OIL_DECLARE_CLASS (composite_add_argb);
       
    63 OIL_DECLARE_CLASS (composite_add_argb_const_src);
       
    64 OIL_DECLARE_CLASS (composite_in_over_argb);
       
    65 OIL_DECLARE_CLASS (composite_in_over_argb_const_src);
       
    66 OIL_DECLARE_CLASS (composite_in_over_argb_const_mask);
       
    67 OIL_DECLARE_CLASS (composite_over_u8);
       
    68 OIL_DECLARE_CLASS (composite_add_u8);
       
    69 
       
    70 
       
    71 /* --------------- MMX code paths for fbcompose.c --------------------- */
       
    72 
       
    73 #if 0
       
    74 static void
       
    75 mmxCombineMaskU (uint32_t *dest, const uint32_t *src, const uint8_t *mask, int width)
       
    76 {
       
    77     const __m64 mmx_0 = _mm_setzero_si64();
       
    78     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
    79     
       
    80     const uint32_t *end = mask + width;
       
    81     while (mask < end) {
       
    82         __m64 a = MmxTo(*mask);
       
    83         __m64 s = MmxTo(*src);
       
    84         a = MmxAlpha(a);
       
    85         MmxMul(s, a);
       
    86         *dest = MmxFrom(s);
       
    87         ++src;
       
    88         ++dest;
       
    89         ++mask;
       
    90     }
       
    91     _mm_empty();
       
    92 }
       
    93 #endif
       
    94 
       
    95 #ifdef ENABLE_BROKEN_IMPLS
       
    96 static void
       
    97 mmxCombineOverU (uint32_t *dest, const uint32_t *src, int width)
       
    98 {
       
    99     const __m64 mmx_0 = _mm_setzero_si64();
       
   100     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   101     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   102 
       
   103     const uint32_t *end = dest + width;
       
   104 
       
   105     while (dest < end) {
       
   106         __m64 x, y, a;
       
   107         x = MmxTo(*src);
       
   108         y = MmxTo(*dest);
       
   109         a = MmxAlpha(x);
       
   110         a = MmxNegate(a);
       
   111         MmxMulAdd(y, a, x);
       
   112         *dest = MmxFrom(y);
       
   113         ++dest;
       
   114         ++src;
       
   115     }
       
   116     _mm_empty();
       
   117 }
       
   118 OIL_DEFINE_IMPL_FULL(mmxCombineOverU, composite_over_argb, OIL_IMPL_FLAG_MMX);
       
   119 #endif
       
   120 
       
   121 #if 0
       
   122 static FASTCALL void
       
   123 mmxCombineOverReverseU (CARD32 *dest, const CARD32 *src, int width)
       
   124 {
       
   125     const __m64 mmx_0 = _mm_setzero_si64();
       
   126     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   127     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   128 
       
   129     const CARD32 *end = dest + width;
       
   130 
       
   131     while (dest < end) {
       
   132         __m64 x, y, a;
       
   133         x = MmxTo(*dest);
       
   134         y = MmxTo(*src);
       
   135         a = MmxAlpha(x);
       
   136         a = MmxNegate(a);
       
   137         MmxMulAdd(y, a, x);
       
   138         *dest = MmxFrom(y);
       
   139         ++dest;
       
   140         ++src;
       
   141     }
       
   142     _mm_empty();
       
   143 }
       
   144 #endif
       
   145 
       
   146 #if 0
       
   147 static void
       
   148 mmxCombineInU (CARD32 *dest, const CARD32 *src, int width)
       
   149 {
       
   150     const __m64 mmx_0 = _mm_setzero_si64();
       
   151     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   152 
       
   153     const CARD32 *end = dest + width;
       
   154 
       
   155     while (dest < end) {
       
   156         __m64 x, a;
       
   157         x = MmxTo(*src);
       
   158         a = MmxTo(*dest);
       
   159         a = MmxAlpha(a);
       
   160         MmxMul(x, a);
       
   161         *dest = MmxFrom(x);
       
   162         ++dest;
       
   163         ++src;
       
   164     }
       
   165     _mm_empty();
       
   166 }
       
   167 #endif
       
   168 
       
   169 #if 0
       
   170 static FASTCALL void
       
   171 mmxCombineInReverseU (CARD32 *dest, const CARD32 *src, int width)
       
   172 {
       
   173     const __m64 mmx_0 = _mm_setzero_si64();
       
   174     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   175 
       
   176     const CARD32 *end = dest + width;
       
   177 
       
   178     while (dest < end) {
       
   179         __m64 x, a;
       
   180         x = MmxTo(*dest);
       
   181         a = MmxTo(*src);
       
   182         a = MmxAlpha(a);
       
   183         MmxMul(x, a);
       
   184         *dest = MmxFrom(x);
       
   185         ++dest;
       
   186         ++src;
       
   187     }
       
   188     _mm_empty();
       
   189 }
       
   190 #endif
       
   191 
       
   192 #if 0
       
   193 static FASTCALL void
       
   194 mmxCombineOutU (CARD32 *dest, const CARD32 *src, int width)
       
   195 {
       
   196     const __m64 mmx_0 = _mm_setzero_si64();
       
   197     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   198     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   199 
       
   200     const CARD32 *end = dest + width;
       
   201 
       
   202     while (dest < end) {
       
   203         __m64 x, a;
       
   204         x = MmxTo(*src);
       
   205         a = MmxTo(*dest);
       
   206         a = MmxAlpha(a);
       
   207         a = MmxNegate(a);
       
   208         MmxMul(x, a);
       
   209         *dest = MmxFrom(x);
       
   210         ++dest;
       
   211         ++src;
       
   212     }
       
   213     _mm_empty();
       
   214 }
       
   215 #endif
       
   216 
       
   217 #if 0
       
   218 static FASTCALL void
       
   219 mmxCombineOutReverseU (CARD32 *dest, const CARD32 *src, int width)
       
   220 {
       
   221     const __m64 mmx_0 = _mm_setzero_si64();
       
   222     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   223     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   224 
       
   225     const CARD32 *end = dest + width;
       
   226 
       
   227     while (dest < end) {
       
   228         __m64 x, a;
       
   229         x = MmxTo(*dest);
       
   230         a = MmxTo(*src);
       
   231         a = MmxAlpha(a);
       
   232         a = MmxNegate(a);
       
   233         MmxMul(x, a);
       
   234         *dest = MmxFrom(x);
       
   235         ++dest;
       
   236         ++src;
       
   237     }
       
   238     _mm_empty();
       
   239 }
       
   240 
       
   241 static FASTCALL void
       
   242 mmxCombineAtopU (CARD32 *dest, const CARD32 *src, int width)
       
   243 {
       
   244     const __m64 mmx_0 = _mm_setzero_si64();
       
   245     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   246     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   247 
       
   248     const CARD32 *end = dest + width;
       
   249 
       
   250     while (dest < end) {
       
   251         __m64 s, da, d, sia;
       
   252         s = MmxTo(*src);
       
   253         d = MmxTo(*dest);
       
   254         sia = MmxAlpha(s);
       
   255         sia = MmxNegate(sia);
       
   256         da = MmxAlpha(d);
       
   257         MmxAddMul(s, da, d, sia);
       
   258         *dest = MmxFrom(s);
       
   259         ++dest;
       
   260         ++src;
       
   261     }
       
   262     _mm_empty();
       
   263 }
       
   264 
       
   265 static FASTCALL void
       
   266 mmxCombineAtopReverseU (CARD32 *dest, const CARD32 *src, int width)
       
   267 {
       
   268     const __m64 mmx_0 = _mm_setzero_si64();
       
   269     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   270     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   271 
       
   272     const CARD32 *end;
       
   273 
       
   274     end = dest + width;
       
   275 
       
   276     while (dest < end) {
       
   277         __m64 s, dia, d, sa;
       
   278         s = MmxTo(*src);
       
   279         d = MmxTo(*dest);
       
   280         sa = MmxAlpha(s);
       
   281         dia = MmxAlpha(d);
       
   282         dia = MmxNegate(dia);
       
   283         MmxAddMul(s, dia, d, sa);
       
   284         *dest = MmxFrom(s);
       
   285         ++dest;
       
   286         ++src;
       
   287     }
       
   288     _mm_empty();
       
   289 }
       
   290 
       
   291 static FASTCALL void
       
   292 mmxCombineXorU (CARD32 *dest, const CARD32 *src, int width)
       
   293 {
       
   294     const __m64 mmx_0 = _mm_setzero_si64();
       
   295     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   296     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   297 
       
   298     const CARD32 *end = dest + width;
       
   299 
       
   300     while (dest < end) {
       
   301         __m64 s, dia, d, sia;
       
   302         s = MmxTo(*src);
       
   303         d = MmxTo(*dest);
       
   304         sia = MmxAlpha(s);
       
   305         dia = MmxAlpha(d);
       
   306         sia = MmxNegate(sia);
       
   307         dia = MmxNegate(dia);
       
   308         MmxAddMul(s, dia, d, sia);
       
   309         *dest = MmxFrom(s);
       
   310         ++dest;
       
   311         ++src;
       
   312     }
       
   313     _mm_empty();
       
   314 }
       
   315 #endif
       
   316 
       
   317 static void
       
   318 mmxCombineAddU (uint32_t *dest, const uint32_t *src, int width)
       
   319 {
       
   320     const __m64 mmx_0 = _mm_setzero_si64();
       
   321 
       
   322     const uint32_t *end = dest + width;
       
   323     while (dest < end) {
       
   324         __m64 s, d;
       
   325         s = MmxTo(*src);
       
   326         d = MmxTo(*dest);
       
   327         s = MmxAdd(s, d);
       
   328         *dest = MmxFrom(s);
       
   329         ++dest;
       
   330         ++src;
       
   331     }
       
   332     _mm_empty();
       
   333 }
       
   334 OIL_DEFINE_IMPL_FULL(mmxCombineAddU, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
       
   335 
       
   336 #if 0
       
   337 static FASTCALL void
       
   338 mmxCombineSaturateU (CARD32 *dest, const CARD32 *src, int width)
       
   339 {
       
   340     const __m64 mmx_0 = _mm_setzero_si64();
       
   341     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   342 
       
   343     const CARD32 *end = dest + width;
       
   344     while (dest < end) {
       
   345         CARD32 s = *src;
       
   346         CARD32 d = *dest;
       
   347         __m64 ms = MmxTo(s);
       
   348         __m64 md = MmxTo(d);
       
   349         CARD32 sa = s >> 24;
       
   350         CARD32 da = ~d >> 24;
       
   351 
       
   352         if (sa > da) {
       
   353             __m64 msa = MmxTo(FbIntDiv(da, sa));
       
   354             msa = MmxAlpha(msa);
       
   355             MmxMul(ms, msa);
       
   356         }
       
   357         MmxAdd(md, ms);
       
   358         *dest = MmxFrom(md);
       
   359         ++src;
       
   360         ++dest;
       
   361     }
       
   362     _mm_empty();
       
   363 }
       
   364 
       
   365 
       
   366 static FASTCALL void
       
   367 mmxCombineSrcC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   368 {
       
   369     const __m64 mmx_0 = _mm_setzero_si64();
       
   370     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   371 
       
   372     const CARD32 *end = src + width;
       
   373     while (src < end) {
       
   374         __m64 a = MmxTo(*mask);
       
   375         __m64 s = MmxTo(*src);
       
   376         MmxMul(s, a);
       
   377         *dest = MmxFrom(s);
       
   378         ++src;
       
   379         ++mask;
       
   380         ++dest;
       
   381     }
       
   382     _mm_empty();
       
   383 }
       
   384 
       
   385 static FASTCALL void
       
   386 mmxCombineOverC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   387 {
       
   388     const __m64 mmx_0 = _mm_setzero_si64();
       
   389     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   390     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   391     
       
   392     const CARD32 *end = src + width;
       
   393     while (src < end) {
       
   394         __m64 a = MmxTo(*mask);
       
   395         __m64 s = MmxTo(*src);
       
   396         __m64 d = MmxTo(*dest);
       
   397         __m64 sa = MmxAlpha(s);
       
   398         MmxMul(s, a);
       
   399         MmxMul(a, sa);
       
   400         a = MmxNegate(a);
       
   401         MmxMulAdd(d, a, s);
       
   402         *dest = MmxFrom(d);
       
   403         ++src;
       
   404         ++dest;
       
   405         ++mask;
       
   406     }
       
   407     _mm_empty();
       
   408 }
       
   409 
       
   410 static FASTCALL void
       
   411 mmxCombineOverReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   412 {
       
   413     const __m64 mmx_0 = _mm_setzero_si64();
       
   414     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   415     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   416     
       
   417     const CARD32 *end = src + width;
       
   418     while (src < end) {
       
   419         __m64 a = MmxTo(*mask);
       
   420         __m64 s = MmxTo(*src);
       
   421         __m64 d = MmxTo(*dest);
       
   422         __m64 da = MmxAlpha(d);
       
   423         da = MmxNegate(da);
       
   424         MmxMul(s, a);
       
   425         MmxMulAdd(s, da, d);
       
   426         *dest = MmxFrom(s);
       
   427         ++src;
       
   428         ++dest;
       
   429         ++mask;
       
   430     }
       
   431     _mm_empty();
       
   432 }
       
   433 
       
   434 
       
   435 static FASTCALL void
       
   436 mmxCombineInC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   437 {
       
   438     const __m64 mmx_0 = _mm_setzero_si64();
       
   439     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   440     
       
   441     const CARD32 *end = src + width;
       
   442     while (src < end) {
       
   443         __m64 a = MmxTo(*mask);
       
   444         __m64 s = MmxTo(*src);
       
   445         __m64 d = MmxTo(*dest);
       
   446         __m64 da = MmxAlpha(d);
       
   447         MmxMul(s, a);
       
   448         MmxMul(s, da);
       
   449         *dest = MmxFrom(s);
       
   450         ++src;
       
   451         ++dest;
       
   452         ++mask;
       
   453     }
       
   454     _mm_empty();
       
   455 }
       
   456 
       
   457 static FASTCALL void
       
   458 mmxCombineInReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   459 {
       
   460     const __m64 mmx_0 = _mm_setzero_si64();
       
   461     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   462     
       
   463     const CARD32 *end = src + width;
       
   464     while (src < end) {
       
   465         __m64 a = MmxTo(*mask);
       
   466         __m64 s = MmxTo(*src);
       
   467         __m64 d = MmxTo(*dest);
       
   468         __m64 sa = MmxAlpha(s);
       
   469         MmxMul(a, sa);
       
   470         MmxMul(d, a);
       
   471         *dest = MmxFrom(d);
       
   472         ++src;
       
   473         ++dest;
       
   474         ++mask;
       
   475     }
       
   476     _mm_empty();
       
   477 }
       
   478 
       
   479 static FASTCALL void
       
   480 mmxCombineOutC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   481 {
       
   482     const __m64 mmx_0 = _mm_setzero_si64();
       
   483     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   484     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   485     
       
   486     const CARD32 *end = src + width;
       
   487     while (src < end) {
       
   488         __m64 a = MmxTo(*mask);
       
   489         __m64 s = MmxTo(*src);
       
   490         __m64 d = MmxTo(*dest);
       
   491         __m64 da = MmxAlpha(d);
       
   492         da = MmxNegate(da);
       
   493         MmxMul(s, a);
       
   494         MmxMul(s, da);
       
   495         *dest = MmxFrom(s);
       
   496         ++src;
       
   497         ++dest;
       
   498         ++mask;
       
   499     }
       
   500     _mm_empty();
       
   501 }
       
   502 
       
   503 static FASTCALL void
       
   504 mmxCombineOutReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   505 {
       
   506     const __m64 mmx_0 = _mm_setzero_si64();
       
   507     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   508     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   509     
       
   510     const CARD32 *end = src + width;
       
   511     while (src < end) {
       
   512         __m64 a = MmxTo(*mask);
       
   513         __m64 s = MmxTo(*src);
       
   514         __m64 d = MmxTo(*dest);
       
   515         __m64 sa = MmxAlpha(s);
       
   516         MmxMul(a, sa);
       
   517         a = MmxNegate(a);
       
   518         MmxMul(d, a);
       
   519         *dest = MmxFrom(d);
       
   520         ++src;
       
   521         ++dest;
       
   522         ++mask;
       
   523     }
       
   524     _mm_empty();
       
   525 }
       
   526 
       
   527 static FASTCALL void
       
   528 mmxCombineAtopC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   529 {
       
   530     const __m64 mmx_0 = _mm_setzero_si64();
       
   531     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   532     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   533     
       
   534     const CARD32 *end = src + width;
       
   535     while (src < end) {
       
   536         __m64 a = MmxTo(*mask);
       
   537         __m64 s = MmxTo(*src);
       
   538         __m64 d = MmxTo(*dest);
       
   539         __m64 da = MmxAlpha(d);
       
   540         __m64 sa = MmxAlpha(s); 
       
   541         MmxMul(s, a);
       
   542         MmxMul(a, sa);
       
   543         a = MmxNegate(a);
       
   544         MmxAddMul(d, a, s, da);
       
   545         *dest = MmxFrom(d);
       
   546         ++src;
       
   547         ++dest;
       
   548         ++mask;
       
   549     }
       
   550     _mm_empty();
       
   551 }
       
   552 
       
   553 static FASTCALL void
       
   554 mmxCombineAtopReverseC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   555 {
       
   556     const __m64 mmx_0 = _mm_setzero_si64();
       
   557     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   558     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   559     
       
   560     const CARD32 *end = src + width;
       
   561     while (src < end) {
       
   562         __m64 a = MmxTo(*mask);
       
   563         __m64 s = MmxTo(*src);
       
   564         __m64 d = MmxTo(*dest);
       
   565         __m64 da = MmxAlpha(d);
       
   566         __m64 sa = MmxAlpha(s)
       
   567         MmxMul(s, a);
       
   568         MmxMul(a, sa);
       
   569         da = MmxNegate(da);
       
   570         MmxAddMul(d, a, s, da);
       
   571         *dest = MmxFrom(d);
       
   572         ++src;
       
   573         ++dest;
       
   574         ++mask;
       
   575     }
       
   576     _mm_empty();
       
   577 }
       
   578 
       
   579 static FASTCALL void
       
   580 mmxCombineXorC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   581 {
       
   582     const __m64 mmx_0 = _mm_setzero_si64();
       
   583     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   584     const __m64 mmx_4x00ff = (__m64) 0x00ff00ff00ff00ffULL;
       
   585     
       
   586     const CARD32 *end = src + width;
       
   587     while (src < end) {
       
   588         __m64 a = MmxTo(*mask);
       
   589         __m64 s = MmxTo(*src);
       
   590         __m64 d = MmxTo(*dest);
       
   591         __m64 da = MmxAlpha(d);
       
   592         __m64 sa = MmxAlpha(s);
       
   593         MmxMul(s, a);
       
   594         MmxMul(a, sa);
       
   595         da = MmxNegate(da);
       
   596         a = MmxNegate(a);
       
   597         MmxAddMul(d, a, s, da);
       
   598         *dest = MmxFrom(d);
       
   599         ++src;
       
   600         ++dest;
       
   601         ++mask;
       
   602     }
       
   603     _mm_empty();
       
   604 }
       
   605 
       
   606 static FASTCALL void
       
   607 mmxCombineAddC (CARD32 *dest, CARD32 *src, CARD32 *mask, int width)
       
   608 {
       
   609     const __m64 mmx_0 = _mm_setzero_si64();
       
   610     const __m64 mmx_4x0080 = (__m64) 0x0080008000800080ULL;
       
   611     
       
   612     const CARD32 *end = src + width;
       
   613     while (src < end) {
       
   614         __m64 a = MmxTo(*mask);
       
   615         __m64 s = MmxTo(*src);
       
   616         __m64 d = MmxTo(*dest);
       
   617         MmxMul(s, a);
       
   618         d = MmxAdd(s, d);
       
   619         *dest = MmxFrom(d);
       
   620         ++src;
       
   621         ++dest;
       
   622         ++mask;
       
   623     }
       
   624     _mm_empty();
       
   625 }
       
   626 
       
   627 extern FbComposeFunctions composeFunctions;
       
   628 
       
   629 void fbComposeSetupMMX(void)
       
   630 {
       
   631     /* check if we have MMX support and initialize accordingly */
       
   632     if (fbHaveMMX()) {
       
   633         composeFunctions.combineU[PictOpOver] = mmxCombineOverU;
       
   634         composeFunctions.combineU[PictOpOverReverse] = mmxCombineOverReverseU;
       
   635         composeFunctions.combineU[PictOpIn] = mmxCombineInU;
       
   636         composeFunctions.combineU[PictOpInReverse] = mmxCombineInReverseU;
       
   637         composeFunctions.combineU[PictOpOut] = mmxCombineOutU;
       
   638         composeFunctions.combineU[PictOpOutReverse] = mmxCombineOutReverseU;
       
   639         composeFunctions.combineU[PictOpAtop] = mmxCombineAtopU;
       
   640         composeFunctions.combineU[PictOpAtopReverse] = mmxCombineAtopReverseU;
       
   641         composeFunctions.combineU[PictOpXor] = mmxCombineXorU;
       
   642         composeFunctions.combineU[PictOpAdd] = mmxCombineAddU;
       
   643         composeFunctions.combineU[PictOpSaturate] = mmxCombineSaturateU;
       
   644 
       
   645         composeFunctions.combineC[PictOpSrc] = mmxCombineSrcC;
       
   646         composeFunctions.combineC[PictOpOver] = mmxCombineOverC;
       
   647         composeFunctions.combineC[PictOpOverReverse] = mmxCombineOverReverseC;
       
   648         composeFunctions.combineC[PictOpIn] = mmxCombineInC;
       
   649         composeFunctions.combineC[PictOpInReverse] = mmxCombineInReverseC;
       
   650         composeFunctions.combineC[PictOpOut] = mmxCombineOutC;
       
   651         composeFunctions.combineC[PictOpOutReverse] = mmxCombineOutReverseC;
       
   652         composeFunctions.combineC[PictOpAtop] = mmxCombineAtopC;
       
   653         composeFunctions.combineC[PictOpAtopReverse] = mmxCombineAtopReverseC;
       
   654         composeFunctions.combineC[PictOpXor] = mmxCombineXorC;
       
   655         composeFunctions.combineC[PictOpAdd] = mmxCombineAddC;
       
   656 
       
   657         composeFunctions.combineMaskU = mmxCombineMaskU;
       
   658     } 
       
   659 }
       
   660 #endif
       
   661 
       
   662 
       
   663 /* ------------------ MMX code paths called from fbpict.c ----------------------- */
       
   664 
       
   665 typedef union {
       
   666   __m64 m64;
       
   667   uint64_t ull;
       
   668 } m64_ull;
       
   669 
       
   670 typedef struct
       
   671 {
       
   672     m64_ull mmx_4x00ff;
       
   673     m64_ull mmx_4x0080;
       
   674     m64_ull mmx_565_rgb;
       
   675     m64_ull mmx_565_unpack_multiplier;
       
   676     m64_ull mmx_565_r;
       
   677     m64_ull mmx_565_g;
       
   678     m64_ull mmx_565_b;
       
   679     m64_ull mmx_mask_0;
       
   680     m64_ull mmx_mask_1;
       
   681     m64_ull mmx_mask_2;
       
   682     m64_ull mmx_mask_3;
       
   683     m64_ull mmx_full_alpha;
       
   684     m64_ull mmx_ffff0000ffff0000;
       
   685     m64_ull mmx_0000ffff00000000;
       
   686     m64_ull mmx_000000000000ffff;
       
   687 } MMXData;
       
   688 
       
   689 static const MMXData c =
       
   690 {
       
   691     .mmx_4x00ff.ull =			0x00ff00ff00ff00ffULL,
       
   692     .mmx_4x0080.ull =			0x0080008000800080ULL,
       
   693     .mmx_565_rgb.ull =			0x000001f0003f001fULL,
       
   694     .mmx_565_r.ull =			0x000000f800000000ULL,
       
   695     .mmx_565_g.ull =			0x0000000000fc0000ULL,
       
   696     .mmx_565_b.ull =			0x00000000000000f8ULL,
       
   697     .mmx_mask_0.ull =			0xffffffffffff0000ULL,
       
   698     .mmx_mask_1.ull =			0xffffffff0000ffffULL,
       
   699     .mmx_mask_2.ull =			0xffff0000ffffffffULL,
       
   700     .mmx_mask_3.ull =			0x0000ffffffffffffULL,
       
   701     .mmx_full_alpha.ull =			0x00ff000000000000ULL,
       
   702     .mmx_565_unpack_multiplier.ull =	0x0000008404100840ULL,
       
   703     .mmx_ffff0000ffff0000.ull =		0xffff0000ffff0000ULL,
       
   704     .mmx_0000ffff00000000.ull =		0x0000ffff00000000ULL,
       
   705     .mmx_000000000000ffff.ull =		0x000000000000ffffULL,
       
   706 };
       
   707 
       
   708 #define MC(x) ((__m64) c.mmx_##x.m64)
       
   709 
       
   710 static __inline__ __m64
       
   711 shift (__m64 v, int s)
       
   712 {
       
   713     if (s > 0)
       
   714 	return _mm_slli_si64 (v, s);
       
   715     else if (s < 0)
       
   716 	return _mm_srli_si64 (v, -s);
       
   717     else
       
   718 	return v;
       
   719 }
       
   720 
       
   721 static __inline__ __m64
       
   722 negate (__m64 mask)
       
   723 {
       
   724     return _mm_xor_si64 (mask, MC(4x00ff));
       
   725 }
       
   726 
       
   727 static __inline__ __m64
       
   728 pix_multiply (__m64 a, __m64 b)
       
   729 {
       
   730     __m64 res;
       
   731     
       
   732     res = _mm_mullo_pi16 (a, b);
       
   733     res = _mm_adds_pu16 (res, MC(4x0080));
       
   734     res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
       
   735     res = _mm_srli_pi16 (res, 8);
       
   736     
       
   737     return res;
       
   738 }
       
   739 
       
   740 static __inline__ __m64
       
   741 expand_alpha (__m64 pixel)
       
   742 {
       
   743     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 3, 3, 3));
       
   744 }
       
   745 
       
   746 static __inline__ __m64
       
   747 expand_alpha_rev (__m64 pixel)
       
   748 {
       
   749     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(0, 0, 0, 0));
       
   750 }    
       
   751 
       
   752 static __inline__ __m64
       
   753 invert_colors (__m64 pixel)
       
   754 {
       
   755     return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE(3, 0, 1, 2));
       
   756 }
       
   757 
       
   758 /* Notes about writing mmx code
       
   759  *
       
   760  * Give memory operands as the second operand. If you give it as the
       
   761  * first, gcc will first load it into a register, then use that
       
   762  * register
       
   763  *
       
   764  *   i.e. use
       
   765  *
       
   766  *         _mm_mullo_pi16 (x, mmx_constant);
       
   767  *
       
   768  *   not
       
   769  *
       
   770  *         _mm_mullo_pi16 (mmx_constant, x);
       
   771  *
       
   772  * Also try to minimize dependencies. i.e. when you need a value, try
       
   773  * to calculate it from a value that was calculated as early as
       
   774  * possible.
       
   775  */
       
   776 
       
   777 static __inline__ __m64
       
   778 over (__m64 src, __m64 srca, __m64 dest)
       
   779 {
       
   780     return  _mm_adds_pu8 (src, pix_multiply(dest, negate(srca)));
       
   781 }
       
   782 
       
   783 static __inline__ __m64
       
   784 over_rev_non_pre (__m64 src, __m64 dest)
       
   785 {
       
   786     __m64 srca = expand_alpha (src);
       
   787     __m64 srcfaaa = _mm_or_si64 (srca, MC(full_alpha));
       
   788     
       
   789     return over(pix_multiply(invert_colors(src), srcfaaa), srca, dest);
       
   790 }
       
   791 
       
   792 static __inline__ __m64
       
   793 in (__m64 src,
       
   794     __m64 mask)
       
   795 {
       
   796     return pix_multiply (src, mask);
       
   797 }
       
   798 
       
   799 static __inline__ __m64
       
   800 in_over (__m64 src,
       
   801 	 __m64 srca,
       
   802 	 __m64 mask,
       
   803 	 __m64 dest)
       
   804 {
       
   805     return over(in(src, mask), pix_multiply(srca, mask), dest);
       
   806 }
       
   807 
       
   808 static __inline__ __m64
       
   809 load8888 (CARD32 v)
       
   810 {
       
   811     return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64());
       
   812 }
       
   813 
       
   814 static __inline__ __m64
       
   815 pack8888 (__m64 lo, __m64 hi)
       
   816 {
       
   817     __m64 r;
       
   818     r = _mm_packs_pu16 (lo, hi);
       
   819     return r;
       
   820 }
       
   821 
       
   822 static __inline__ CARD32
       
   823 store8888 (__m64 v)
       
   824 {
       
   825     return _mm_cvtsi64_si32(pack8888(v, _mm_setzero_si64()));
       
   826 }
       
   827 
       
   828 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into
       
   829  *
       
   830  *    00RR00GG00BB
       
   831  * 
       
   832  * --- Expanding 565 in the low word ---
       
   833  * 
       
   834  * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
       
   835  * m = m & (01f0003f001f);
       
   836  * m = m * (008404100840);
       
   837  * m = m >> 8;
       
   838  * 
       
   839  * Note the trick here - the top word is shifted by another nibble to
       
   840  * avoid it bumping into the middle word
       
   841  */
       
   842 static __inline__ __m64
       
   843 expand565 (__m64 pixel, int pos)
       
   844 {
       
   845     __m64 p = pixel;
       
   846     __m64 t1, t2;
       
   847     
       
   848     /* move pixel to low 16 bit and zero the rest */
       
   849     p = shift (shift (p, (3 - pos) * 16), -48); 
       
   850     
       
   851     t1 = shift (p, 36 - 11);
       
   852     t2 = shift (p, 16 - 5);
       
   853     
       
   854     p = _mm_or_si64 (t1, p);
       
   855     p = _mm_or_si64 (t2, p);
       
   856     p = _mm_and_si64 (p, MC(565_rgb));
       
   857     
       
   858     pixel = _mm_mullo_pi16 (p, MC(565_unpack_multiplier));
       
   859     return _mm_srli_pi16 (pixel, 8);
       
   860 }
       
   861 
       
   862 static __inline__ __m64
       
   863 expand8888 (__m64 in, int pos)
       
   864 {
       
   865     if (pos == 0)
       
   866 	return _mm_unpacklo_pi8 (in, _mm_setzero_si64());
       
   867     else
       
   868 	return _mm_unpackhi_pi8 (in, _mm_setzero_si64());
       
   869 }
       
   870 
       
   871 static __inline__ __m64
       
   872 pack565 (__m64 pixel, __m64 target, int pos)
       
   873 {
       
   874     __m64 p = pixel;
       
   875     __m64 t = target;
       
   876     __m64 r, g, b;
       
   877     
       
   878     r = _mm_and_si64 (p, MC(565_r));
       
   879     g = _mm_and_si64 (p, MC(565_g));
       
   880     b = _mm_and_si64 (p, MC(565_b));
       
   881     
       
   882     r = shift (r, - (32 - 8) + pos * 16);
       
   883     g = shift (g, - (16 - 3) + pos * 16);
       
   884     b = shift (b, - (0  + 3) + pos * 16);
       
   885     
       
   886     if (pos == 0)
       
   887 	t = _mm_and_si64 (t, MC(mask_0));
       
   888     else if (pos == 1)
       
   889 	t = _mm_and_si64 (t, MC(mask_1));
       
   890     else if (pos == 2)
       
   891 	t = _mm_and_si64 (t, MC(mask_2));
       
   892     else if (pos == 3)
       
   893 	t = _mm_and_si64 (t, MC(mask_3));
       
   894     
       
   895     p = _mm_or_si64 (r, t);
       
   896     p = _mm_or_si64 (g, p);
       
   897     
       
   898     return _mm_or_si64 (b, p);
       
   899 }
       
   900 
       
   901 #ifdef ENABLE_BROKEN_IMPLS
       
   902 /* broken.  See Debian bug #340932 */
       
   903 static void
       
   904 fbCompositeSolid_nx8888mmx (uint32_t *dst, uint32_t *src, int w)
       
   905 {
       
   906     __m64	vsrc, vsrca;
       
   907 
       
   908     vsrc = load8888 (*src);
       
   909     vsrca = expand_alpha (vsrc);
       
   910 
       
   911     while (w && (unsigned long)dst & 7)
       
   912     {
       
   913         *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
       
   914         
       
   915         w--;
       
   916         dst++;
       
   917     }
       
   918     
       
   919     while (w >= 2)
       
   920     {
       
   921         __m64 vdest;
       
   922         __m64 dest0, dest1;
       
   923         
       
   924         vdest = *(__m64 *)dst;
       
   925         
       
   926         dest0 = over(vsrc, vsrca, expand8888(vdest, 0));
       
   927         dest1 = over(vsrc, vsrca, expand8888(vdest, 1));
       
   928         
       
   929         *(__m64 *)dst = pack8888(dest0, dest1);
       
   930         
       
   931         dst += 2;
       
   932         w -= 2;
       
   933     }
       
   934     
       
   935     while (w)
       
   936     {
       
   937         *dst = store8888(over(vsrc, vsrca, load8888(*dst)));
       
   938         
       
   939         w--;
       
   940         dst++;
       
   941     }
       
   942     
       
   943     _mm_empty();
       
   944 }
       
   945 OIL_DEFINE_IMPL_FULL(fbCompositeSolid_nx8888mmx, composite_over_argb_const_src,
       
   946     OIL_IMPL_FLAG_MMX| OIL_IMPL_FLAG_MMXEXT);
       
   947 #endif
       
   948 
       
   949 #if 0
       
   950 void
       
   951 fbCompositeSolid_nx0565mmx (CARD8	op,
       
   952 			    PicturePtr pSrc,
       
   953 			    PicturePtr pMask,
       
   954 			    PicturePtr pDst,
       
   955 			    INT16	xSrc,
       
   956 			    INT16	ySrc,
       
   957 			    INT16	xMask,
       
   958 			    INT16	yMask,
       
   959 			    INT16	xDst,
       
   960 			    INT16	yDst,
       
   961 			    CARD16	width,
       
   962 			    CARD16	height)
       
   963 {
       
   964     CARD32	src;
       
   965     CARD16	*dstLine, *dst;
       
   966     CARD16	w;
       
   967     FbStride	dstStride;
       
   968     __m64	vsrc, vsrca;
       
   969     
       
   970     CHECKPOINT();
       
   971     
       
   972     fbComposeGetSolid(pSrc, src, pDst->format);
       
   973     
       
   974     if (src >> 24 == 0)
       
   975 	return;
       
   976     
       
   977     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
       
   978     
       
   979     vsrc = load8888 (src);
       
   980     vsrca = expand_alpha (vsrc);
       
   981     
       
   982     while (height--)
       
   983     {
       
   984 	dst = dstLine;
       
   985 	dstLine += dstStride;
       
   986 	w = width;
       
   987 	
       
   988 	CHECKPOINT();
       
   989 	
       
   990 	while (w && (unsigned long)dst & 7)
       
   991 	{
       
   992 	    ullong d = *dst;
       
   993 	    __m64 vdest = expand565 ((__m64)d, 0);
       
   994 	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
       
   995 	    *dst = (ullong)vdest;
       
   996 	    
       
   997 	    w--;
       
   998 	    dst++;
       
   999 	}
       
  1000 	
       
  1001 	while (w >= 4)
       
  1002 	{
       
  1003 	    __m64 vdest;
       
  1004 	    
       
  1005 	    vdest = *(__m64 *)dst;
       
  1006 	    
       
  1007 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 0)), vdest, 0);
       
  1008 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 1)), vdest, 1);
       
  1009 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 2)), vdest, 2);
       
  1010 	    vdest = pack565 (over(vsrc, vsrca, expand565(vdest, 3)), vdest, 3);
       
  1011 	    
       
  1012 	    *(__m64 *)dst = vdest;
       
  1013 	    
       
  1014 	    dst += 4;
       
  1015 	    w -= 4;
       
  1016 	}
       
  1017 	
       
  1018 	CHECKPOINT();
       
  1019 	
       
  1020 	while (w)
       
  1021 	{
       
  1022 	    ullong d = *dst;
       
  1023 	    __m64 vdest = expand565 ((__m64)d, 0);
       
  1024 	    vdest = pack565(over(vsrc, vsrca, vdest), vdest, 0);
       
  1025 	    *dst = (ullong)vdest;
       
  1026 	    
       
  1027 	    w--;
       
  1028 	    dst++;
       
  1029 	}
       
  1030     }
       
  1031     
       
  1032     _mm_empty();
       
  1033 }
       
  1034 #endif
       
  1035 
       
  1036 #if 0
       
       /* Composite a solid source through a per-pixel 32-bit mask onto the
        * destination: every loop below reads mask word(s), skips the pixel(s)
        * when the mask is zero, and otherwise stores
        * in_over (vsrc, vsrca, load8888 (m), dest).
        *
        * NOTE(review): this body does not look finished.  The local `src`
        * redeclares the parameter of the same name, vsrc/vsrca are never
        * assigned, and twidth, p and q are not declared anywhere in this
        * function.  It sits inside an #if 0 block, so it is never compiled.
        */
  1037 static void
       
  1038 fbCompositeSolidMask_nx8888x8888Cmmx (uint32_t *dst, uint32_t *src, uint8_t *mask, int w)
       
  1039 {
       
  1040     CARD32	src, srca;
       
  1041     CARD32	*dstLine;
       
  1042     CARD32	*maskLine;
       
  1043     FbStride	dstStride, maskStride;
       
  1044     __m64	vsrc, vsrca;
       
  1045     
       
  1046     
       
           /* head: one pixel at a time until q is 8-byte aligned */
  1047     while (twidth && (unsigned long)q & 7)
       
  1048     {
       
  1049         CARD32 m = *(CARD32 *)p;
       
  1050         
       
  1051         if (m)
       
  1052         {
       
  1053             __m64 vdest = load8888(*q);
       
  1054             vdest = in_over(vsrc, vsrca, load8888(m), vdest);
       
  1055             *q = (ullong)pack8888(vdest, _mm_setzero_si64());
       
  1056         }
       
  1057         
       
  1058         twidth--;
       
  1059         p++;
       
  1060         q++;
       
  1061     }
       
  1062     
       
           /* body: two pixels per 64-bit load/store; skipped when both mask
            * words are zero */
  1063     while (twidth >= 2)
       
  1064     {
       
  1065         CARD32 m0, m1;
       
  1066         m0 = *p;
       
  1067         m1 = *(p + 1);
       
  1068         
       
  1069         if (m0 | m1)
       
  1070         {
       
  1071             __m64 dest0, dest1;
       
  1072             __m64 vdest = *(__m64 *)q;
       
  1073             
       
  1074             dest0 = in_over(vsrc, vsrca, load8888(m0),
       
  1075                             expand8888 (vdest, 0));
       
  1076             dest1 = in_over(vsrc, vsrca, load8888(m1),
       
  1077                             expand8888 (vdest, 1));
       
  1078             
       
  1079             *(__m64 *)q = pack8888(dest0, dest1);
       
  1080         }
       
  1081         
       
  1082         p += 2;
       
  1083         q += 2;
       
  1084         twidth -= 2;
       
  1085     }
       
  1086     
       
           /* tail: remaining pixel(s), same operation as the head loop */
  1087     while (twidth)
       
  1088     {
       
  1089         CARD32 m = *(CARD32 *)p;
       
  1090         
       
  1091         if (m)
       
  1092         {
       
  1093             __m64 vdest = load8888(*q);
       
  1094             vdest = in_over(vsrc, vsrca, load8888(m), vdest);
       
  1095             *q = (ullong)pack8888(vdest, _mm_setzero_si64());
       
  1096         }
       
  1097         
       
  1098         twidth--;
       
  1099         p++;
       
  1100         q++;
       
  1101     }
       
  1102     
       
           /* leave MMX state so that x87 floating point can be used again */
  1103     _mm_empty();
       
  1104 }
       
  1105 #endif
       
  1106 
       
  1107 #if 0
       
       /* Composite 32-bit source pixels OVER the destination, scaled by one
        * constant 8-bit mask value that is replicated into all four channels.
        *
        * NOTE(review): this body does not look finished.  maskLine, vmask,
        * srca, height, dst, dstLine, dstStride, srcLine, srcStride and w are
        * not declared in this function, and the first statement assigns an
        * integer expression to the pointer parameter `mask`.  The function is
        * inside an #if 0 block, so it is never compiled.
        */
  1108 static void
       
  1109 fbCompositeSrc_8888x8x8888mmx (uint32_t *dest, uint32_t *src, uint8_t *mask,
       
  1110     int width)
       
  1111 {
       
  1112 
       
           /* replicate the mask byte into every byte of a 32-bit word */
  1113     mask = *maskLine << 24 | *maskLine << 16 | *maskLine << 8 | *maskLine;
       
  1114     vmask = load8888 (mask);
       
           /* the same constant is passed as the source alpha for every pixel */
  1115     srca = MC(4x00ff);
       
  1116     
       
  1117     while (height--)
       
  1118     {
       
  1119 	dst = dstLine;
       
  1120 	dstLine += dstStride;
       
  1121 	src = srcLine;
       
  1122 	srcLine += srcStride;
       
  1123 	w = width;
       
  1124 
       
       	/* head: one pixel at a time until dst is 8-byte aligned */
  1125 	while (w && (unsigned long)dst & 7)
       
  1126 	{
       
  1127 	    __m64 s = load8888 (*src);
       
  1128 	    __m64 d = load8888 (*dst);
       
  1129 	    
       
  1130 	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
       
  1131 	    
       
  1132 	    w--;
       
  1133 	    dst++;
       
  1134 	    src++;
       
  1135 	}
       
  1136 
       
       	/* body: 16 pixels per iteration, as eight 64-bit words (two pixels
       	 * each) loaded from dst and src, combined, and stored back */
  1137 	while (w >= 16)
       
  1138 	{
       
  1139 	    __m64 vd0 = *(__m64 *)(dst + 0);
       
  1140 	    __m64 vd1 = *(__m64 *)(dst + 2);
       
  1141 	    __m64 vd2 = *(__m64 *)(dst + 4);
       
  1142 	    __m64 vd3 = *(__m64 *)(dst + 6);
       
  1143 	    __m64 vd4 = *(__m64 *)(dst + 8);
       
  1144 	    __m64 vd5 = *(__m64 *)(dst + 10);
       
  1145 	    __m64 vd6 = *(__m64 *)(dst + 12);
       
  1146 	    __m64 vd7 = *(__m64 *)(dst + 14);
       
  1147 
       
  1148 	    __m64 vs0 = *(__m64 *)(src + 0);
       
  1149 	    __m64 vs1 = *(__m64 *)(src + 2);
       
  1150 	    __m64 vs2 = *(__m64 *)(src + 4);
       
  1151 	    __m64 vs3 = *(__m64 *)(src + 6);
       
  1152 	    __m64 vs4 = *(__m64 *)(src + 8);
       
  1153 	    __m64 vs5 = *(__m64 *)(src + 10);
       
  1154 	    __m64 vs6 = *(__m64 *)(src + 12);
       
  1155 	    __m64 vs7 = *(__m64 *)(src + 14);
       
  1156 
       
  1157 	    vd0 = (__m64)pack8888 (
       
  1158 		in_over (expand8888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
       
  1159 		in_over (expand8888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
       
  1160 	
       
  1161 	    vd1 = (__m64)pack8888 (
       
  1162 		in_over (expand8888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
       
  1163 		in_over (expand8888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
       
  1164 	
       
  1165 	    vd2 = (__m64)pack8888 (
       
  1166 		in_over (expand8888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
       
  1167 		in_over (expand8888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
       
  1168 	
       
  1169 	    vd3 = (__m64)pack8888 (
       
  1170 		in_over (expand8888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
       
  1171 		in_over (expand8888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
       
  1172 	
       
  1173 	    vd4 = (__m64)pack8888 (
       
  1174 		in_over (expand8888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
       
  1175 		in_over (expand8888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
       
  1176 	
       
  1177 	    vd5 = (__m64)pack8888 (
       
  1178 		in_over (expand8888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
       
  1179 		in_over (expand8888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
       
  1180 	
       
  1181 	    vd6 = (__m64)pack8888 (
       
  1182 		in_over (expand8888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
       
  1183 		in_over (expand8888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
       
  1184 	
       
  1185 	    vd7 = (__m64)pack8888 (
       
  1186 		in_over (expand8888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
       
  1187 		in_over (expand8888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
       
  1188 
       
  1189     	    *(__m64 *)(dst + 0) = vd0;
       
  1190 	    *(__m64 *)(dst + 2) = vd1;
       
  1191 	    *(__m64 *)(dst + 4) = vd2;
       
  1192 	    *(__m64 *)(dst + 6) = vd3;
       
  1193 	    *(__m64 *)(dst + 8) = vd4;
       
  1194 	    *(__m64 *)(dst + 10) = vd5;
       
  1195 	    *(__m64 *)(dst + 12) = vd6;
       
  1196 	    *(__m64 *)(dst + 14) = vd7;
       
  1197 	
       
  1198 	    w -= 16;
       
  1199 	    dst += 16;
       
  1200 	    src += 16;
       
  1201 	}
       
  1202 	
       
       	/* tail: up to 15 remaining pixels, one at a time */
  1203 	while (w)
       
  1204 	{
       
  1205 	    __m64 s = load8888 (*src);
       
  1206 	    __m64 d = load8888 (*dst);
       
  1207 	    
       
  1208 	    *dst = (ullong)pack8888 (in_over (s, srca, vmask, d), (__m64)_mm_setzero_si64());
       
  1209 	    
       
  1210 	    w--;
       
  1211 	    dst++;
       
  1212 	    src++;
       
  1213 	}
       
  1214     }
       
  1215 
       
           /* leave MMX state so that x87 floating point can be used again */
  1216     _mm_empty(); 
       
  1217 }
       
  1218 
       
  1219 void
       
  1220 fbCompositeSrc_8888x8888mmx (CARD8	op,
       
  1221 			     PicturePtr pSrc,
       
  1222 			     PicturePtr pMask,
       
  1223 			     PicturePtr pDst,
       
  1224 			     INT16	xSrc,
       
  1225 			     INT16	ySrc,
       
  1226 			     INT16      xMask,
       
  1227 			     INT16      yMask,
       
  1228 			     INT16      xDst,
       
  1229 			     INT16      yDst,
       
  1230 			     CARD16     width,
       
  1231 			     CARD16     height)
       
  1232 {
       
  1233     CARD32	*dstLine, *dst;
       
  1234     CARD32	*srcLine, *src;
       
  1235     FbStride	dstStride, srcStride;
       
  1236     CARD16	w;
       
  1237     __m64  srca;
       
  1238     
       
  1239     CHECKPOINT();
       
  1240     
       
  1241     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
       
  1242     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
       
  1243 
       
  1244     srca = MC (4x00ff);
       
  1245     
       
  1246     while (height--)
       
  1247     {
       
  1248 	dst = dstLine;
       
  1249 	dstLine += dstStride;
       
  1250 	src = srcLine;
       
  1251 	srcLine += srcStride;
       
  1252 	w = width;
       
  1253 
       
  1254 	while (w && (unsigned long)dst & 7)
       
  1255 	{
       
  1256 	    __m64 s = load8888 (*src);
       
  1257 	    __m64 d = load8888 (*dst);
       
  1258 	    
       
  1259 	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d), (__m64)_mm_setzero_si64());
       
  1260 	    
       
  1261 	    w--;
       
  1262 	    dst++;
       
  1263 	    src++;
       
  1264 	}
       
  1265 
       
  1266 	while (w >= 2)
       
  1267 	{
       
  1268 	    __m64 vd = *(__m64 *)(dst + 0);
       
  1269 	    __m64 vs = *(__m64 *)(src + 0);
       
  1270 	    __m64 vs0 = expand8888 (vs, 0);
       
  1271 	    __m64 vs1 = expand8888 (vs, 1);
       
  1272 
       
  1273 	    *(__m64 *)dst = (__m64)pack8888 (
       
  1274 		over (vs0, expand_alpha (vs0), expand8888 (vd, 0)),
       
  1275 		over (vs1, expand_alpha (vs1), expand8888 (vd, 1)));
       
  1276 	    
       
  1277 	    w -= 2;
       
  1278 	    dst += 2;
       
  1279 	    src += 2;
       
  1280 	}
       
  1281 	
       
  1282 	while (w)
       
  1283 	{
       
  1284 	    __m64 s = load8888 (*src);
       
  1285 	    __m64 d = load8888 (*dst);
       
  1286 	    
       
  1287 	    *dst = (ullong)pack8888 (over (s, expand_alpha (s), d),
       
  1288 				     (__m64)_mm_setzero_si64());
       
  1289 	    
       
  1290 	    w--;
       
  1291 	    dst++;
       
  1292 	    src++;
       
  1293 	}
       
  1294     }
       
  1295 
       
  1296     _mm_empty(); 
       
  1297 }
       
  1298 
       
  1299 void
       
  1300 fbCompositeSolidMask_nx8x8888mmx (CARD8      op,
       
  1301 				  PicturePtr pSrc,
       
  1302 				  PicturePtr pMask,
       
  1303 				  PicturePtr pDst,
       
  1304 				  INT16      xSrc,
       
  1305 				  INT16      ySrc,
       
  1306 				  INT16      xMask,
       
  1307 				  INT16      yMask,
       
  1308 				  INT16      xDst,
       
  1309 				  INT16      yDst,
       
  1310 				  CARD16     width,
       
  1311 				  CARD16     height)
       
  1312 {
       
  1313     CARD32	src, srca;
       
  1314     CARD32	*dstLine, *dst;
       
  1315     CARD8	*maskLine, *mask;
       
  1316     FbStride	dstStride, maskStride;
       
  1317     CARD16	w;
       
  1318     __m64	vsrc, vsrca;
       
  1319     ullong	srcsrc;
       
  1320     
       
  1321     CHECKPOINT();
       
  1322     
       
  1323     fbComposeGetSolid(pSrc, src, pDst->format);
       
  1324     
       
  1325     srca = src >> 24;
       
  1326     if (srca == 0)
       
  1327 	return;
       
  1328     
       
  1329     srcsrc = (unsigned long long)src << 32 | src;
       
  1330     
       
  1331     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
       
  1332     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
       
  1333     
       
  1334     vsrc = load8888 (src);
       
  1335     vsrca = expand_alpha (vsrc);
       
  1336     
       
  1337     while (height--)
       
  1338     {
       
  1339 	dst = dstLine;
       
  1340 	dstLine += dstStride;
       
  1341 	mask = maskLine;
       
  1342 	maskLine += maskStride;
       
  1343 	w = width;
       
  1344 	
       
  1345 	CHECKPOINT();
       
  1346 	
       
  1347 	while (w && (unsigned long)dst & 7)
       
  1348 	{
       
  1349 	    ullong m = *mask;
       
  1350 	    
       
  1351 	    if (m)
       
  1352 	    {
       
  1353 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), load8888(*dst));
       
  1354 		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
       
  1355 	    }
       
  1356 	    
       
  1357 	    w--;
       
  1358 	    mask++;
       
  1359 	    dst++;
       
  1360 	}
       
  1361 	
       
  1362 	CHECKPOINT();
       
  1363 	
       
  1364 	while (w >= 2)
       
  1365 	{
       
  1366 	    ullong m0, m1;
       
  1367 	    m0 = *mask;
       
  1368 	    m1 = *(mask + 1);
       
  1369 	    
       
  1370 	    if (srca == 0xff && (m0 & m1) == 0xff)
       
  1371 	    {
       
  1372 		*(unsigned long long *)dst = srcsrc;
       
  1373 	    }
       
  1374 	    else if (m0 | m1)
       
  1375 	    {
       
  1376 		__m64 vdest;
       
  1377 		__m64 dest0, dest1;
       
  1378 		
       
  1379 		vdest = *(__m64 *)dst;
       
  1380 		
       
  1381 		dest0 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m0), expand8888(vdest, 0));
       
  1382 		dest1 = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m1), expand8888(vdest, 1));
       
  1383 		
       
  1384 		*(__m64 *)dst = pack8888(dest0, dest1);
       
  1385 	    }
       
  1386 	    
       
  1387 	    mask += 2;
       
  1388 	    dst += 2;
       
  1389 	    w -= 2;
       
  1390 	}
       
  1391 	
       
  1392 	CHECKPOINT();
       
  1393 	
       
  1394 	while (w)
       
  1395 	{
       
  1396 	    ullong m = *mask;
       
  1397 	    
       
  1398 	    if (m)
       
  1399 	    {
       
  1400 		__m64 vdest = load8888(*dst);
       
  1401 		vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), vdest);
       
  1402 		*dst = (ullong)pack8888(vdest, _mm_setzero_si64());
       
  1403 	    }
       
  1404 	    
       
  1405 	    w--;
       
  1406 	    mask++;
       
  1407 	    dst++;
       
  1408 	}
       
  1409     }
       
  1410     
       
  1411     _mm_empty();
       
  1412 }
       
  1413 
       
  1414 
       
  1415 void
       
  1416 fbCompositeSolidMask_nx8x0565mmx (CARD8      op,
       
  1417 				  PicturePtr pSrc,
       
  1418 				  PicturePtr pMask,
       
  1419 				  PicturePtr pDst,
       
  1420 				  INT16      xSrc,
       
  1421 				  INT16      ySrc,
       
  1422 				  INT16      xMask,
       
  1423 				  INT16      yMask,
       
  1424 				  INT16      xDst,
       
  1425 				  INT16      yDst,
       
  1426 				  CARD16     width,
       
  1427 				  CARD16     height)
       
  1428 {
       
  1429     CARD32	src, srca;
       
  1430     CARD16	*dstLine, *dst;
       
  1431     CARD8	*maskLine, *mask;
       
  1432     FbStride	dstStride, maskStride;
       
  1433     CARD16	w;
       
  1434     __m64	vsrc, vsrca;
       
  1435     unsigned long long srcsrcsrcsrc, src16;
       
  1436     
       
  1437     CHECKPOINT();
       
  1438     
       
  1439     fbComposeGetSolid(pSrc, src, pDst->format);
       
  1440     
       
  1441     srca = src >> 24;
       
  1442     if (srca == 0)
       
  1443 	return;
       
  1444     
       
  1445     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
       
  1446     fbComposeGetStart (pMask, xMask, yMask, CARD8, maskStride, maskLine, 1);
       
  1447     
       
  1448     vsrc = load8888 (src);
       
  1449     vsrca = expand_alpha (vsrc);
       
  1450     
       
  1451     src16 = (ullong)pack565(vsrc, _mm_setzero_si64(), 0);
       
  1452     
       
  1453     srcsrcsrcsrc = (ullong)src16 << 48 | (ullong)src16 << 32 |
       
  1454 	(ullong)src16 << 16 | (ullong)src16;
       
  1455     
       
  1456     while (height--)
       
  1457     {
       
  1458 	dst = dstLine;
       
  1459 	dstLine += dstStride;
       
  1460 	mask = maskLine;
       
  1461 	maskLine += maskStride;
       
  1462 	w = width;
       
  1463 	
       
  1464 	CHECKPOINT();
       
  1465 	
       
  1466 	while (w && (unsigned long)dst & 7)
       
  1467 	{
       
  1468 	    ullong m = *mask;
       
  1469 	    
       
  1470 	    if (m)
       
  1471 	    {
       
  1472 		ullong d = *dst;
       
  1473 		__m64 vd = (__m64)d;
       
  1474 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
       
  1475 		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
       
  1476 	    }
       
  1477 	    
       
  1478 	    w--;
       
  1479 	    mask++;
       
  1480 	    dst++;
       
  1481 	}
       
  1482 	
       
  1483 	CHECKPOINT();
       
  1484 	
       
  1485 	while (w >= 4)
       
  1486 	{
       
  1487 	    ullong m0, m1, m2, m3;
       
  1488 	    m0 = *mask;
       
  1489 	    m1 = *(mask + 1);
       
  1490 	    m2 = *(mask + 2);
       
  1491 	    m3 = *(mask + 3);
       
  1492 	    
       
  1493 	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
       
  1494 	    {
       
  1495 		*(unsigned long long *)dst = srcsrcsrcsrc;
       
  1496 	    }
       
  1497 	    else if (m0 | m1 | m2 | m3)
       
  1498 	    {
       
  1499 		__m64 vdest;
       
  1500 		__m64 vm0, vm1, vm2, vm3;
       
  1501 		
       
  1502 		vdest = *(__m64 *)dst;
       
  1503 		
       
  1504 		vm0 = (__m64)m0;
       
  1505 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm0), expand565(vdest, 0)), vdest, 0);
       
  1506 		vm1 = (__m64)m1;
       
  1507 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm1), expand565(vdest, 1)), vdest, 1);
       
  1508 		vm2 = (__m64)m2;
       
  1509 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm2), expand565(vdest, 2)), vdest, 2);
       
  1510 		vm3 = (__m64)m3;
       
  1511 		vdest = pack565(in_over(vsrc, vsrca, expand_alpha_rev(vm3), expand565(vdest, 3)), vdest, 3);
       
  1512 		
       
  1513 		*(__m64 *)dst = vdest;
       
  1514 	    }
       
  1515 	    
       
  1516 	    w -= 4;
       
  1517 	    mask += 4;
       
  1518 	    dst += 4;
       
  1519 	}
       
  1520 	
       
  1521 	CHECKPOINT();
       
  1522 	
       
  1523 	while (w)
       
  1524 	{
       
  1525 	    ullong m = *mask;
       
  1526 	    
       
  1527 	    if (m)
       
  1528 	    {
       
  1529 		ullong d = *dst;
       
  1530 		__m64 vd = (__m64)d;
       
  1531 		__m64 vdest = in_over(vsrc, vsrca, expand_alpha_rev ((__m64)m), expand565(vd, 0));
       
  1532 		*dst = (ullong)pack565(vdest, _mm_setzero_si64(), 0);
       
  1533 	    }
       
  1534 	    
       
  1535 	    w--;
       
  1536 	    mask++;
       
  1537 	    dst++;
       
  1538 	}
       
  1539     }
       
  1540     
       
  1541     _mm_empty();
       
  1542 }
       
  1543 
       
  1544 void
       
  1545 fbCompositeSrc_8888RevNPx0565mmx (CARD8      op,
       
  1546 				  PicturePtr pSrc,
       
  1547 				  PicturePtr pMask,
       
  1548 				  PicturePtr pDst,
       
  1549 				  INT16      xSrc,
       
  1550 				  INT16      ySrc,
       
  1551 				  INT16      xMask,
       
  1552 				  INT16      yMask,
       
  1553 				  INT16      xDst,
       
  1554 				  INT16      yDst,
       
  1555 				  CARD16     width,
       
  1556 				  CARD16     height)
       
  1557 {
       
  1558     CARD16	*dstLine, *dst;
       
  1559     CARD32	*srcLine, *src;
       
  1560     FbStride	dstStride, srcStride;
       
  1561     CARD16	w;
       
  1562     
       
  1563     CHECKPOINT();
       
  1564     
       
  1565     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
       
  1566     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
       
  1567     
       
  1568     assert (pSrc->pDrawable == pMask->pDrawable);
       
  1569     
       
  1570     while (height--)
       
  1571     {
       
  1572 	dst = dstLine;
       
  1573 	dstLine += dstStride;
       
  1574 	src = srcLine;
       
  1575 	srcLine += srcStride;
       
  1576 	w = width;
       
  1577 	
       
  1578 	CHECKPOINT();
       
  1579 	
       
  1580 	while (w && (unsigned long)dst & 7)
       
  1581 	{
       
  1582 	    __m64 vsrc = load8888 (*src);
       
  1583 	    ullong d = *dst;
       
  1584 	    __m64 vdest = expand565 ((__m64)d, 0);
       
  1585 	    
       
  1586 	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
       
  1587 	    
       
  1588 	    *dst = (ullong)vdest;
       
  1589 	    
       
  1590 	    w--;
       
  1591 	    dst++;
       
  1592 	    src++;
       
  1593 	}
       
  1594 	
       
  1595 	CHECKPOINT();
       
  1596 	
       
  1597 	while (w >= 4)
       
  1598 	{
       
  1599 	    CARD32 s0, s1, s2, s3;
       
  1600 	    unsigned char a0, a1, a2, a3;
       
  1601 	    
       
  1602 	    s0 = *src;
       
  1603 	    s1 = *(src + 1);
       
  1604 	    s2 = *(src + 2);
       
  1605 	    s3 = *(src + 3);
       
  1606 	    
       
  1607 	    a0 = (s0 >> 24);
       
  1608 	    a1 = (s1 >> 24);
       
  1609 	    a2 = (s2 >> 24);
       
  1610 	    a3 = (s3 >> 24);
       
  1611 	    
       
  1612 	    if ((a0 & a1 & a2 & a3) == 0xFF)
       
  1613 	    {
       
  1614 		__m64 vdest;
       
  1615 		vdest = pack565(invert_colors(load8888(s0)), _mm_setzero_si64(), 0);
       
  1616 		vdest = pack565(invert_colors(load8888(s1)), vdest, 1);
       
  1617 		vdest = pack565(invert_colors(load8888(s2)), vdest, 2);
       
  1618 		vdest = pack565(invert_colors(load8888(s3)), vdest, 3);
       
  1619 		
       
  1620 		*(__m64 *)dst = vdest;
       
  1621 	    }
       
  1622 	    else if (a0 | a1 | a2 | a3)
       
  1623 	    {
       
  1624 		__m64 vdest = *(__m64 *)dst;
       
  1625 		
       
  1626 		vdest = pack565(over_rev_non_pre(load8888(s0), expand565(vdest, 0)), vdest, 0);
       
  1627 	        vdest = pack565(over_rev_non_pre(load8888(s1), expand565(vdest, 1)), vdest, 1);
       
  1628 		vdest = pack565(over_rev_non_pre(load8888(s2), expand565(vdest, 2)), vdest, 2);
       
  1629 		vdest = pack565(over_rev_non_pre(load8888(s3), expand565(vdest, 3)), vdest, 3);
       
  1630 		
       
  1631 		*(__m64 *)dst = vdest;
       
  1632 	    }
       
  1633 	    
       
  1634 	    w -= 4;
       
  1635 	    dst += 4;
       
  1636 	    src += 4;
       
  1637 	}
       
  1638 	
       
  1639 	CHECKPOINT();
       
  1640 	
       
  1641 	while (w)
       
  1642 	{
       
  1643 	    __m64 vsrc = load8888 (*src);
       
  1644 	    ullong d = *dst;
       
  1645 	    __m64 vdest = expand565 ((__m64)d, 0);
       
  1646 	    
       
  1647 	    vdest = pack565(over_rev_non_pre(vsrc, vdest), vdest, 0);
       
  1648 	    
       
  1649 	    *dst = (ullong)vdest;
       
  1650 	    
       
  1651 	    w--;
       
  1652 	    dst++;
       
  1653 	    src++;
       
  1654 	}
       
  1655     }
       
  1656     
       
  1657     _mm_empty();
       
  1658 }
       
  1659 
       
  1660 /* "8888RevNP" is GdkPixbuf's format: ABGR, non premultiplied */
       
  1661 
       
  1662 void
       
  1663 fbCompositeSrc_8888RevNPx8888mmx (CARD8      op,
       
  1664 				  PicturePtr pSrc,
       
  1665 				  PicturePtr pMask,
       
  1666 				  PicturePtr pDst,
       
  1667 				  INT16      xSrc,
       
  1668 				  INT16      ySrc,
       
  1669 				  INT16      xMask,
       
  1670 				  INT16      yMask,
       
  1671 				  INT16      xDst,
       
  1672 				  INT16      yDst,
       
  1673 				  CARD16     width,
       
  1674 				  CARD16     height)
       
  1675 {
       
  1676     CARD32	*dstLine, *dst;
       
  1677     CARD32	*srcLine, *src;
       
  1678     FbStride	dstStride, srcStride;
       
  1679     CARD16	w;
       
  1680     
       
  1681     CHECKPOINT();
       
  1682     
       
  1683     fbComposeGetStart (pDst, xDst, yDst, CARD32, dstStride, dstLine, 1);
       
  1684     fbComposeGetStart (pSrc, xSrc, ySrc, CARD32, srcStride, srcLine, 1);
       
  1685     
       
  1686     assert (pSrc->pDrawable == pMask->pDrawable);
       
  1687     
       
  1688     while (height--)
       
  1689     {
       
  1690 	dst = dstLine;
       
  1691 	dstLine += dstStride;
       
  1692 	src = srcLine;
       
  1693 	srcLine += srcStride;
       
  1694 	w = width;
       
  1695 	
       
  1696 	while (w && (unsigned long)dst & 7)
       
  1697 	{
       
  1698 	    __m64 s = load8888 (*src);
       
  1699 	    __m64 d = load8888 (*dst);
       
  1700 	    
       
  1701 	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
       
  1702 	    
       
  1703 	    w--;
       
  1704 	    dst++;
       
  1705 	    src++;
       
  1706 	}
       
  1707 	
       
  1708 	while (w >= 2)
       
  1709 	{
       
  1710 	    ullong s0, s1;
       
  1711 	    unsigned char a0, a1;
       
  1712 	    __m64 d0, d1;
       
  1713 	    
       
  1714 	    s0 = *src;
       
  1715 	    s1 = *(src + 1);
       
  1716 	    
       
  1717 	    a0 = (s0 >> 24);
       
  1718 	    a1 = (s1 >> 24);
       
  1719 	    
       
  1720 	    if ((a0 & a1) == 0xFF)
       
  1721 	    {
       
  1722 		d0 = invert_colors(load8888(s0));
       
  1723 		d1 = invert_colors(load8888(s1));
       
  1724 		
       
  1725 		*(__m64 *)dst = pack8888 (d0, d1);
       
  1726 	    }
       
  1727 	    else if (a0 | a1)
       
  1728 	    {
       
  1729 		__m64 vdest = *(__m64 *)dst;
       
  1730 		
       
  1731 		d0 = over_rev_non_pre (load8888(s0), expand8888 (vdest, 0));
       
  1732 		d1 = over_rev_non_pre (load8888(s1), expand8888 (vdest, 1));
       
  1733 		
       
  1734 		*(__m64 *)dst = pack8888 (d0, d1);
       
  1735 	    }
       
  1736 	    
       
  1737 	    w -= 2;
       
  1738 	    dst += 2;
       
  1739 	    src += 2;
       
  1740 	}
       
  1741 	
       
  1742 	while (w)
       
  1743 	{
       
  1744 	    __m64 s = load8888 (*src);
       
  1745 	    __m64 d = load8888 (*dst);
       
  1746 	    
       
  1747 	    *dst = (ullong)pack8888 (over_rev_non_pre (s, d), _mm_setzero_si64());
       
  1748 	    
       
  1749 	    w--;
       
  1750 	    dst++;
       
  1751 	    src++;
       
  1752 	}
       
  1753     }
       
  1754     
       
  1755     _mm_empty();
       
  1756 }
       
  1757 
       
  1758 void
       
  1759 fbCompositeSolidMask_nx8888x0565Cmmx (CARD8      op,
       
  1760 				      PicturePtr pSrc,
       
  1761 				      PicturePtr pMask,
       
  1762 				      PicturePtr pDst,
       
  1763 				      INT16      xSrc,
       
  1764 				      INT16      ySrc,
       
  1765 				      INT16      xMask,
       
  1766 				      INT16      yMask,
       
  1767 				      INT16      xDst,
       
  1768 				      INT16      yDst,
       
  1769 				      CARD16     width,
       
  1770 				      CARD16     height)
       
  1771 {
       
  1772     CARD32	src, srca;
       
  1773     CARD16	*dstLine;
       
  1774     CARD32	*maskLine;
       
  1775     FbStride	dstStride, maskStride;
       
  1776     __m64  vsrc, vsrca;
       
  1777     
       
  1778     CHECKPOINT();
       
  1779     
       
  1780     fbComposeGetSolid(pSrc, src, pDst->format);
       
  1781     
       
  1782     srca = src >> 24;
       
  1783     if (srca == 0)
       
  1784 	return;
       
  1785     
       
  1786     fbComposeGetStart (pDst, xDst, yDst, CARD16, dstStride, dstLine, 1);
       
  1787     fbComposeGetStart (pMask, xMask, yMask, CARD32, maskStride, maskLine, 1);
       
  1788     
       
  1789     vsrc = load8888 (src);
       
  1790     vsrca = expand_alpha (vsrc);
       
  1791     
       
  1792     while (height--)
       
  1793     {
       
  1794 	int twidth = width;
       
  1795 	CARD32 *p = (CARD32 *)maskLine;
       
  1796 	CARD16 *q = (CARD16 *)dstLine;
       
  1797 	
       
  1798 	while (twidth && ((unsigned long)q & 7))
       
  1799 	{
       
  1800 	    CARD32 m = *(CARD32 *)p;
       
  1801 	    
       
  1802 	    if (m)
       
  1803 	    {
       
  1804 		ullong d = *q;
       
  1805 		__m64 vdest = expand565 ((__m64)d, 0);
       
  1806 		vdest = pack565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
       
  1807 		*q = (ullong)vdest;
       
  1808 	    }
       
  1809 	    
       
  1810 	    twidth--;
       
  1811 	    p++;
       
  1812 	    q++;
       
  1813 	}
       
  1814 	
       
  1815 	while (twidth >= 4)
       
  1816 	{
       
  1817 	    CARD32 m0, m1, m2, m3;
       
  1818 	    
       
  1819 	    m0 = *p;
       
  1820 	    m1 = *(p + 1);
       
  1821 	    m2 = *(p + 2);
       
  1822 	    m3 = *(p + 3);
       
  1823 	    
       
  1824 	    if ((m0 | m1 | m2 | m3))
       
  1825 	    {
       
  1826 		__m64 vdest = *(__m64 *)q;
       
  1827 		
       
  1828 		vdest = pack565(in_over(vsrc, vsrca, load8888(m0), expand565(vdest, 0)), vdest, 0);
       
  1829 		vdest = pack565(in_over(vsrc, vsrca, load8888(m1), expand565(vdest, 1)), vdest, 1);
       
  1830 		vdest = pack565(in_over(vsrc, vsrca, load8888(m2), expand565(vdest, 2)), vdest, 2);
       
  1831 		vdest = pack565(in_over(vsrc, vsrca, load8888(m3), expand565(vdest, 3)), vdest, 3);
       
  1832 		
       
  1833 		*(__m64 *)q = vdest;
       
  1834 	    }
       
  1835 	    twidth -= 4;
       
  1836 	    p += 4;
       
  1837 	    q += 4;
       
  1838 	}
       
  1839 	
       
  1840 	while (twidth)
       
  1841 	{
       
  1842 	    CARD32 m;
       
  1843 	    
       
  1844 	    m = *(CARD32 *)p;
       
  1845 	    if (m)
       
  1846 	    {
       
  1847 		ullong d = *q;
       
  1848 		__m64 vdest = expand565((__m64)d, 0);
       
  1849 		vdest = pack565 (in_over(vsrc, vsrca, load8888(m), vdest), vdest, 0);
       
  1850 		*q = (ullong)vdest;
       
  1851 	    }
       
  1852 	    
       
  1853 	    twidth--;
       
  1854 	    p++;
       
  1855 	    q++;
       
  1856 	}
       
  1857 	
       
  1858 	maskLine += maskStride;
       
  1859 	dstLine += dstStride;
       
  1860     }
       
  1861     
       
  1862     _mm_empty ();
       
  1863 }
       
  1864 #endif
       
  1865 
       
  /*
   * Saturating add of two 8-bit buffers: dst[i] = min(dst[i] + src[i], 255).
   *
   *   dst  buffer that is read and overwritten
   *   src  buffer that is added in
   *   w    number of bytes to process; zero or negative counts do nothing
   *
   * Bytes are handled one at a time until `dst` is 8-byte aligned, then
   * eight at a time with a single MMX saturating add, then one at a time
   * for the remainder.  `src` is read through an __m64 pointer without
   * being aligned itself.
   */
  static void
  fbCompositeSrcAdd_8000x8000mmx (uint8_t *dst, uint8_t *src, int w)
  {
      int sum;

      /* Head: scalar until the destination is 8-byte aligned.  Testing
       * `w > 0` (rather than `w != 0`) keeps a negative count from running
       * past the buffers. */
      for (; w > 0 && ((unsigned long)dst & 7); w--, dst++, src++)
      {
          sum = *dst + *src;
          *dst = (uint8_t)(sum > 0xff ? 0xff : sum);
      }

      /* Body: eight bytes per iteration. */
      for (; w >= 8; w -= 8, dst += 8, src += 8)
      {
          *(__m64*)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
      }

      /* Tail: fewer than eight bytes remain. */
      for (; w > 0; w--, dst++, src++)
      {
          sum = *dst + *src;
          *dst = (uint8_t)(sum > 0xff ? 0xff : sum);
      }

      _mm_empty();
  }
       
  /* Registration of fbCompositeSrcAdd_8000x8000mmx for the composite_add_u8
   * class with the MMX flag (presumably so it is only selected on CPUs
   * reporting MMX -- confirm against the OIL_DEFINE_IMPL_FULL definition). */
  OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8000x8000mmx, composite_add_u8, OIL_IMPL_FLAG_MMX);
       
  1910 
       
  /*
   * Per-channel saturating add of two 32-bit pixel buffers: each of the
   * four bytes of dst[i] becomes min(dst byte + src byte, 255).
   *
   *   dst  buffer that is read and overwritten
   *   src  buffer that is added in
   *   w    number of 32-bit pixels; zero or negative counts do nothing
   *
   * Single pixels are processed until `dst` is 8-byte aligned, then two
   * pixels per 64-bit MMX add, then the one pixel that may remain.
   */
  static void
  fbCompositeSrcAdd_8888x8888mmx (uint32_t *dst, uint32_t *src, int w)
  {
      /* Head: `w > 0` (rather than `w != 0`) keeps a negative count from
       * touching memory. */
      for (; w > 0 && ((unsigned long)dst & 7); w--, dst++, src++)
      {
          *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
                                               _mm_cvtsi32_si64(*dst)));
      }

      /* Body: two pixels per iteration. */
      for (; w >= 2; w -= 2, dst += 2, src += 2)
      {
          *(__m64 *)dst = _mm_adds_pu8(*(__m64*)src, *(__m64*)dst);
      }

      /* Tail: at most one pixel is left here. */
      if (w > 0)
      {
          *dst = _mm_cvtsi64_si32(_mm_adds_pu8(_mm_cvtsi32_si64(*src),
                                               _mm_cvtsi32_si64(*dst)));
      }

      _mm_empty();
  }
       
  /* Registration of fbCompositeSrcAdd_8888x8888mmx for the
   * composite_add_argb class with both the MMX and SSE flags.
   * NOTE(review): the function body only uses MMX intrinsics; confirm
   * whether the SSE flag is intentional. */
  OIL_DEFINE_IMPL_FULL (fbCompositeSrcAdd_8888x8888mmx, composite_add_argb, OIL_IMPL_FLAG_MMX | OIL_IMPL_FLAG_SSE);
       
  1941 
       
  1942 #if 0
       
  /*
   * GetStart(drw, x, y, type, stride, line, bpp)
   *
   * Expands to a braced block that calls fbGetDrawable() on `drw` and then
   * stores into the caller's lvalues:
   *   stride  the drawable's stride converted from FbBits units to units
   *           of `type`
   *   line    a `type *` pointing at coordinate (x, y), after subtracting
   *           the x/y offsets reported by fbGetDrawable()
   * `bpp` is passed through to fbGetDrawable() as its bits-per-pixel
   * output argument.  `stride`, `x` and `y` are evaluated more than once
   * or inside expressions, so pass side-effect-free arguments.
   */
  #define GetStart(drw,x,y,type,stride,line,bpp) {\
      FbBits	*__bits__;									\
      FbStride	__stride__;									\
      int		__xoff__,__yoff__;								\
  												\
      fbGetDrawable((drw),__bits__,__stride__,bpp,__xoff__,__yoff__);				\
      (stride) = __stride__ * sizeof (FbBits) / sizeof (type);					\
      (line) = ((type *) __bits__) + (stride) * ((y) - __yoff__) + ((x) - __xoff__);		\
  }
       
  1952 
       
  1953 Bool
       
  1954 fbSolidFillmmx (DrawablePtr	pDraw,
       
  1955 		int		x,
       
  1956 		int		y,
       
  1957 		int		width,
       
  1958 		int		height,
       
  1959 		FbBits		xor)
       
  1960 { 
       
  1961     FbStride	stride;
       
  1962     int		bpp;
       
  1963     ullong	fill;
       
  1964     __m64	vfill;
       
  1965     CARD32	byte_width;
       
  1966     CARD8	*byte_line;
       
  1967     FbBits      *bits;
       
  1968     int		xoff, yoff;
       
  1969     
       
  1970     CHECKPOINT();
       
  1971     
       
  1972     fbGetDrawable(pDraw, bits, stride, bpp, xoff, yoff);
       
  1973     
       
  1974     if (bpp == 16 && (xor >> 16 != (xor & 0xffff)))
       
  1975 	return FALSE;
       
  1976     
       
  1977     if (bpp != 16 && bpp != 32)
       
  1978 	return FALSE;
       
  1979     
       
  1980     if (bpp == 16)
       
  1981     {
       
  1982 	stride = stride * sizeof (FbBits) / 2;
       
  1983 	byte_line = (CARD8 *)(((CARD16 *)bits) + stride * (y - yoff) + (x - xoff));
       
  1984 	byte_width = 2 * width;
       
  1985 	stride *= 2;
       
  1986     }
       
  1987     else
       
  1988     {
       
  1989 	stride = stride * sizeof (FbBits) / 4;
       
  1990 	byte_line = (CARD8 *)(((CARD32 *)bits) + stride * (y - yoff) + (x - xoff));
       
  1991 	byte_width = 4 * width;
       
  1992 	stride *= 4;
       
  1993     }
       
  1994     
       
  1995     fill = ((ullong)xor << 32) | xor;
       
  1996     vfill = (__m64)fill;
       
  1997     
       
  1998     while (height--)
       
  1999     {
       
  2000 	int w;
       
  2001 	CARD8 *d = byte_line;
       
  2002 	byte_line += stride;
       
  2003 	w = byte_width;
       
  2004 	
       
  2005 	while (w >= 2 && ((unsigned long)d & 3))
       
  2006 	{
       
  2007 	    *(CARD16 *)d = xor;
       
  2008 	    w -= 2;
       
  2009 	    d += 2;
       
  2010 	}
       
  2011 	
       
  2012 	while (w >= 4 && ((unsigned long)d & 7))
       
  2013 	{
       
  2014 	    *(CARD32 *)d = xor;
       
  2015 	    
       
  2016 	    w -= 4;
       
  2017 	    d += 4;
       
  2018 	}
       
  2019 	
       
  2020 	while (w >= 64)
       
  2021 	{
       
  2022 	    *(__m64*) (d +  0) = vfill;
       
  2023 	    *(__m64*) (d +  8) = vfill;
       
  2024 	    *(__m64*) (d + 16) = vfill;
       
  2025 	    *(__m64*) (d + 24) = vfill;
       
  2026 	    *(__m64*) (d + 32) = vfill;
       
  2027 	    *(__m64*) (d + 40) = vfill;
       
  2028 	    *(__m64*) (d + 48) = vfill;
       
  2029 	    *(__m64*) (d + 56) = vfill;
       
  2030 	    
       
  2031 	    w -= 64;
       
  2032 	    d += 64;
       
  2033 	}
       
  2034 	while (w >= 4)
       
  2035 	{
       
  2036 	    *(CARD32 *)d = xor;
       
  2037 	    
       
  2038 	    w -= 4;
       
  2039 	    d += 4;
       
  2040 	}
       
  2041 	if (w >= 2)
       
  2042 	{
       
  2043 	    *(CARD16 *)d = xor;
       
  2044 	    w -= 2;
       
  2045 	    d += 2;
       
  2046 	}
       
  2047     }
       
  2048     
       
  2049     _mm_empty();
       
  2050     return TRUE;
       
  2051 }
       
  2052 
       
  2053 Bool
       
  2054 fbCopyAreammx (DrawablePtr	pSrc,
       
  2055 	       DrawablePtr	pDst,
       
  2056 	       int		src_x,
       
  2057 	       int		src_y,
       
  2058 	       int		dst_x,
       
  2059 	       int		dst_y,
       
  2060 	       int		width,
       
  2061 	       int		height)
       
  2062 {
       
  2063     FbBits *	src_bits;
       
  2064     FbStride	src_stride;
       
  2065     int		src_bpp;
       
  2066     int		src_xoff;
       
  2067     int		src_yoff;
       
  2068 
       
  2069     FbBits *	dst_bits;
       
  2070     FbStride	dst_stride;
       
  2071     int		dst_bpp;
       
  2072     int		dst_xoff;
       
  2073     int		dst_yoff;
       
  2074 
       
  2075     CARD8 *	src_bytes;
       
  2076     CARD8 *	dst_bytes;
       
  2077     int		byte_width;
       
  2078     
       
  2079     fbGetDrawable(pSrc, src_bits, src_stride, src_bpp, src_xoff, src_yoff);
       
  2080     fbGetDrawable(pDst, dst_bits, dst_stride, dst_bpp, dst_xoff, dst_yoff);
       
  2081 
       
  2082     if (src_bpp != 16 && src_bpp != 32)
       
  2083 	return FALSE;
       
  2084 
       
  2085     if (dst_bpp != 16 && dst_bpp != 32)
       
  2086 	return FALSE;
       
  2087 
       
  2088     if (src_bpp != dst_bpp)
       
  2089     {
       
  2090 	return FALSE;
       
  2091     }
       
  2092     
       
  2093     if (src_bpp == 16)
       
  2094     {
       
  2095 	src_stride = src_stride * sizeof (FbBits) / 2;
       
  2096 	dst_stride = dst_stride * sizeof (FbBits) / 2;
       
  2097 	src_bytes = (CARD8 *)(((CARD16 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
       
  2098 	dst_bytes = (CARD8 *)(((CARD16 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
       
  2099 	byte_width = 2 * width;
       
  2100 	src_stride *= 2;
       
  2101 	dst_stride *= 2;
       
  2102     }
       
  2103     else
       
  2104     {
       
  2105 	src_stride = src_stride * sizeof (FbBits) / 4;
       
  2106 	dst_stride = dst_stride * sizeof (FbBits) / 4;
       
  2107 	src_bytes = (CARD8 *)(((CARD32 *)src_bits) + src_stride * (src_y - src_yoff) + (src_x - src_xoff));
       
  2108 	dst_bytes = (CARD8 *)(((CARD32 *)dst_bits) + dst_stride * (dst_y - dst_yoff) + (dst_x - dst_xoff));
       
  2109 	byte_width = 4 * width;
       
  2110 	src_stride *= 4;
       
  2111 	dst_stride *= 4;
       
  2112     }
       
  2113 
       
  2114     while (height--)
       
  2115     {
       
  2116 	int w;
       
  2117 	CARD8 *s = src_bytes;
       
  2118 	CARD8 *d = dst_bytes;
       
  2119 	src_bytes += src_stride;
       
  2120 	dst_bytes += dst_stride;
       
  2121 	w = byte_width;
       
  2122 	
       
  2123 	while (w >= 2 && ((unsigned long)d & 3))
       
  2124 	{
       
  2125 	    *(CARD16 *)d = *(CARD16 *)s;
       
  2126 	    w -= 2;
       
  2127 	    s += 2;
       
  2128 	    d += 2;
       
  2129 	}
       
  2130 	
       
  2131 	while (w >= 4 && ((unsigned long)d & 7))
       
  2132 	{
       
  2133 	    *(CARD32 *)d = *(CARD32 *)s;
       
  2134 	    
       
  2135 	    w -= 4;
       
  2136 	    s += 4;
       
  2137 	    d += 4;
       
  2138 	}
       
  2139 	
       
  2140 	while (w >= 64)
       
  2141 	{
       
  2142 	    *(__m64 *)(d + 0)  = *(__m64 *)(s + 0);
       
  2143 	    *(__m64 *)(d + 8)  = *(__m64 *)(s + 8);
       
  2144 	    *(__m64 *)(d + 16) = *(__m64 *)(s + 16);
       
  2145 	    *(__m64 *)(d + 24) = *(__m64 *)(s + 24);
       
  2146 	    *(__m64 *)(d + 32) = *(__m64 *)(s + 32);
       
  2147 	    *(__m64 *)(d + 40) = *(__m64 *)(s + 40);
       
  2148 	    *(__m64 *)(d + 48) = *(__m64 *)(s + 48);
       
  2149 	    *(__m64 *)(d + 56) = *(__m64 *)(s + 56);
       
  2150 	    w -= 64;
       
  2151 	    s += 64;
       
  2152 	    d += 64;
       
  2153 	}
       
  2154 	while (w >= 4)
       
  2155 	{
       
  2156 	    *(CARD32 *)d = *(CARD32 *)s;
       
  2157 
       
  2158 	    w -= 4;
       
  2159 	    s += 4;
       
  2160 	    d += 4;
       
  2161 	}
       
  2162 	if (w >= 2)
       
  2163 	{
       
  2164 	    *(CARD16 *)d = *(CARD16 *)s;
       
  2165 	    w -= 2;
       
  2166 	    s += 2;
       
  2167 	    d += 2;
       
  2168 	}
       
  2169     }
       
  2170     
       
  2171     _mm_empty();
       
  2172     return TRUE;
       
  2173 }
       
  2174 
       
  2175 void
       
  2176 fbCompositeCopyAreammx (CARD8		op,
       
  2177 			PicturePtr	pSrc,
       
  2178 			PicturePtr	pMask,
       
  2179 			PicturePtr	pDst,
       
  2180 			INT16		xSrc,
       
  2181 			INT16		ySrc,
       
  2182 			INT16		xMask,
       
  2183 			INT16		yMask,
       
  2184 			INT16		xDst,
       
  2185 			INT16		yDst,
       
  2186 			CARD16		width,
       
  2187 			CARD16		height)
       
  2188 {
       
  2189     fbCopyAreammx (pSrc->pDrawable,
       
  2190 		   pDst->pDrawable,
       
  2191 		   xSrc, ySrc,
       
  2192 		   xDst, yDst,
       
  2193 		   width, height);
       
  2194 }
       
  2195 
       
  2196 #if !defined(__amd64__) && !defined(__x86_64__)
       
  2197 
       
  /*
   * Bit mask of CPU capabilities reported by detectCPUFeatures().
   *
   * SSE is not a single bit: its value (0x6) contains the MMX_Extensions
   * bit, so a feature word with SSE set also passes a test for
   * MMX_Extensions.
   */
  enum CPUFeatures {
      NoFeatures     = 0,
      MMX            = 1 << 0,
      MMX_Extensions = 1 << 1,
      SSE            = (1 << 2) | MMX_Extensions,
      SSE2           = 1 << 3,
      CMOV           = 1 << 4
  };
       
  2206 
       
  2207 static unsigned int detectCPUFeatures(void) {
       
  2208     unsigned int result;
       
  2209     char vendor[13];
       
  2210     vendor[0] = 0;
       
  2211     vendor[12] = 0;
       
  2212     /* see p. 118 of amd64 instruction set manual Vol3 */
       
  2213     __asm__ ("push %%ebx\n"
       
  2214              "pushf\n"
       
  2215              "pop %%eax\n"
       
  2216              "mov %%eax, %%ebx\n"
       
  2217              "xor $0x00200000, %%eax\n"
       
  2218              "push %%eax\n"
       
  2219              "popf\n"
       
  2220              "pushf\n"
       
  2221              "pop %%eax\n"
       
  2222              "mov $0x0, %%edx\n"
       
  2223              "xor %%ebx, %%eax\n"
       
  2224              "jz skip\n"
       
  2225 
       
  2226              "mov $0x00000000, %%eax\n"
       
  2227              "cpuid\n"
       
  2228              "mov %%ebx, %1\n"
       
  2229              "mov %%edx, %2\n"
       
  2230              "mov %%ecx, %3\n"
       
  2231              "mov $0x00000001, %%eax\n"
       
  2232              "cpuid\n"
       
  2233              "skip:\n"
       
  2234              "pop %%ebx\n"
       
  2235              "mov %%edx, %0\n"
       
  2236              : "=r" (result), 
       
  2237                "=m" (vendor[0]), 
       
  2238                "=m" (vendor[4]), 
       
  2239                "=m" (vendor[8])
       
  2240              :
       
  2241              : "%eax", "%ecx", "%edx"
       
  2242         );
       
  2243 
       
  2244     unsigned int features = 0;
       
  2245     if (result) {
       
  2246         /* result now contains the standard feature bits */
       
  2247         if (result & (1 << 15))
       
  2248             features |= CMOV;
       
  2249         if (result & (1 << 23))
       
  2250             features |= MMX;
       
  2251         if (result & (1 << 25))
       
  2252             features |= SSE;
       
  2253         if (result & (1 << 26))
       
  2254             features |= SSE2;
       
  2255         if ((result & MMX) && !(result & SSE) && (strcmp(vendor, "AuthenticAMD") == 0)) {
       
  2256             /* check for AMD MMX extensions */
       
  2257 
       
  2258             unsigned int result;            
       
  2259             __asm__("push %%ebx\n"
       
  2260                     "mov $0x80000000, %%eax\n"
       
  2261                     "cpuid\n"
       
  2262                     "xor %%edx, %%edx\n"
       
  2263                     "cmp $0x1, %%eax\n"
       
  2264                     "jge skip2\n"
       
  2265                     "mov $0x80000001, %%eax\n"
       
  2266                     "cpuid\n"
       
  2267                     "skip2:\n"
       
  2268                     "mov %%edx, %0\n"
       
  2269                     "pop %%ebx\n"
       
  2270                     : "=r" (result)
       
  2271                     :
       
  2272                     : "%eax", "%ecx", "%edx"
       
  2273                 );
       
  2274             if (result & (1<<22))
       
  2275                 features |= MMX_Extensions;
       
  2276         }
       
  2277     }
       
  2278     return features;
       
  2279 }
       
  2280 
       
  2281 Bool
       
  2282 fbHaveMMX (void)
       
  2283 {
       
  2284     static Bool initialized = FALSE;
       
  2285     static Bool mmx_present;
       
  2286     
       
  2287     if (!initialized)
       
  2288     {
       
  2289         unsigned int features = detectCPUFeatures();
       
  2290 	mmx_present = (features & (MMX|MMX_Extensions)) == (MMX|MMX_Extensions);
       
  2291         initialized = TRUE;
       
  2292     }
       
  2293     
       
  2294     return mmx_present;
       
  2295 }
       
  2296 #endif /* __amd64__ */
       
  2297 
       
  2298 
       
  2299 #endif
       
  2300 
       
  2301 
       
  #ifdef	__SYMBIAN32__
  /*
   * Symbian accessor returning the address of the implementation record
   * for mmxCombineOverU.
   *
   * The previous text carried the class name (composite_over_argb) into
   * both the function name and the returned expression, which is not valid
   * C.  Assumes the record is named _oil_function_impl_mmxCombineOverU, as
   * the original return expression referenced -- confirm against
   * OIL_DEFINE_IMPL_FULL.
   */
  OilFunctionImpl* __oil_function_impl_mmxCombineOverU(void) {
      return &_oil_function_impl_mmxCombineOverU;
  }
  #endif
       
  2308 
       
  #ifdef	__SYMBIAN32__
  /*
   * Symbian accessor returning the address of the implementation record
   * for mmxCombineAddU.
   *
   * The previous text carried the class name (composite_add_argb) into
   * both the function name and the returned expression, which is not valid
   * C.  Assumes the record is named _oil_function_impl_mmxCombineAddU, as
   * the original return expression referenced -- confirm against
   * OIL_DEFINE_IMPL_FULL.
   */
  OilFunctionImpl* __oil_function_impl_mmxCombineAddU(void) {
      return &_oil_function_impl_mmxCombineAddU;
  }
  #endif
       
  2315 
       
  #ifdef	__SYMBIAN32__
  /*
   * Symbian accessor returning the address of the implementation record
   * for fbCompositeSolid_nx8888mmx.
   *
   * The previous text carried the class name
   * (composite_over_argb_const_src) into both the function name and the
   * returned expression, which is not valid C.  Assumes the record is
   * named _oil_function_impl_fbCompositeSolid_nx8888mmx, as the original
   * return expression referenced -- confirm against OIL_DEFINE_IMPL_FULL.
   */
  OilFunctionImpl* __oil_function_impl_fbCompositeSolid_nx8888mmx(void) {
      return &_oil_function_impl_fbCompositeSolid_nx8888mmx;
  }
  #endif
       
  2322 
       
  #ifdef	__SYMBIAN32__
  /*
   * Symbian accessor returning the address of the implementation record
   * for fbCompositeSrcAdd_8000x8000mmx.
   *
   * The previous text carried the class name (composite_add_u8) into both
   * the function name and the returned expression, which is not valid C.
   * Assumes the record is named
   * _oil_function_impl_fbCompositeSrcAdd_8000x8000mmx, as the original
   * return expression referenced -- confirm against OIL_DEFINE_IMPL_FULL.
   */
  OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8000x8000mmx(void) {
      return &_oil_function_impl_fbCompositeSrcAdd_8000x8000mmx;
  }
  #endif
       
  2329 
       
  #ifdef	__SYMBIAN32__
  /*
   * Symbian accessor returning the address of the implementation record
   * for fbCompositeSrcAdd_8888x8888mmx.
   *
   * The previous text carried the class name (composite_add_argb) into
   * both the function name and the returned expression, which is not valid
   * C and also clashed with the mmxCombineAddU accessor.  Assumes the
   * record is named _oil_function_impl_fbCompositeSrcAdd_8888x8888mmx, as
   * the original return expression referenced -- confirm against
   * OIL_DEFINE_IMPL_FULL.
   */
  OilFunctionImpl* __oil_function_impl_fbCompositeSrcAdd_8888x8888mmx(void) {
      return &_oil_function_impl_fbCompositeSrcAdd_8888x8888mmx;
  }
  #endif
       
  2336