genericopenlibs/liboil/src/composite_sse_4pix.c
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 /*
       
     2  * Copyright (c) 2005
       
     3  *	Eric Anholt.  All rights reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
       
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
       
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    24  * SUCH DAMAGE.
       
    25  */
       
    26 
       
    27 #ifdef HAVE_CONFIG_H
       
    28 #include "config.h"
       
    29 #endif
       
    30 #include <liboilclasses.h>
       
    31 #include <liboilfunction.h>
       
    32 #include <emmintrin.h>
       
    33 #include "liboil/liboilcolorspace.h"
       
    34 
       
    35 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
       
    36 
       
    37 #ifdef ENABLE_BROKEN_IMPLS
       
    38 
       
    /* Overlays a 128-bit SSE register value with two 64-bit integers so that
     * vector constants can be spelled with an ordinary static initializer.
     */
    union m128_int {
      __m128i m128;
      uint64_t ull[2];
    };

    /* Read-only pool of vector constants used by the helpers in this file:
     *   sse_16xff   every bit set (sixteen 0xff bytes)
     *   sse_8x0080  eight 16-bit lanes, each holding 0x0080
     */
    static const struct _SSEData {
      union m128_int sse_16xff;
      union m128_int sse_8x0080;
    } c = {
      .sse_16xff.ull  = { ~0ULL, ~0ULL },
      .sse_8x0080.ull = { 0x0080008000800080ULL, 0x0080008000800080ULL },
    };

    /* MC(name) expands to the __m128i view of the pool entry c.sse_<name>. */
    #define MC(x) (c.sse_##x.m128)
       
    53 
       
    /* Scalar (non-SSE2) per-channel compositing, used by the one-pixel-at-a-time
     * loops that run before and after each vector loop in this file.
     *   COMPOSITE_OVER(d,s,m)  d + s - oil_muldiv_255(d, m)
     *   COMPOSITE_ADD(d,s)     oil_clamp_255(d + s)
     *   COMPOSITE_IN(s,m)      oil_muldiv_255(s, m)
     * oil_muldiv_255() and oil_clamp_255() are not defined in this file
     * (presumably they come from liboilcolorspace.h -- confirm).
     * COMPOSITE_OVER expands its d argument twice, so pass side-effect-free
     * expressions only.
     */
    #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d), (m)))
    #define COMPOSITE_ADD(d,s)    (oil_clamp_255((d) + (s)))
    #define COMPOSITE_IN(s,m)     (oil_muldiv_255((s), (m)))
       
    58 
       
    59 /* This SSE2 code is based around operations on four pixels at a time.  The
       
    60  * exception is muldiv_255_sse2, which needs to expand the four pixels into
       
    61  * 2 sets of 2 pixels at 16 bits per channel each, for the purpose of doing
       
    62  * the appropriate rounding on division.
       
    63  */
       
    64 
       
    65 /* Shuffles the given value such that the alpha for each pixel appears in each
       
    66  * channel of the pixel.
       
    67  */
       
    68 SSE_FUNCTION static inline __m128i
       
    69 argb_A_sse2(__m128i a)
       
    70 {
       
    71 #if 0
       
    72   /* Shift the alpha channel of each pixel into the low byte */
       
    73   a = _mm_srli_epi32(a, 24);
       
    74   /* Now, shift and or so we can get it into all the channels */
       
    75   a = _mm_or_si128(a, _mm_slli_epi32(a, 8));
       
    76   a = _mm_or_si128(a, _mm_slli_epi32(a, 16));
       
    77   return a;
       
    78 #else
       
    79   /* Move the alpha channel into the low byte */
       
    80   a = _mm_srli_epi32(a, 24);
       
    81   /* Pack our four alpha channels down into the lower 32 bits */
       
    82   a = _mm_packus_epi16(a, _mm_setzero_si128());
       
    83   a = _mm_packus_epi16(a, _mm_setzero_si128());
       
    84   /* And expand it back out into four pixels of all channels the same */
       
    85   a = _mm_unpacklo_epi8(a, a);
       
    86   return _mm_unpacklo_epi16(a, a);
       
    87 #endif
       
    88 }
       
    89 
       
    90 /* Multiplies the unpacked 16-bits-per-channel pixel data in a
       
    91  * channel-by-channel by b, and divides the result by 255, with rounding.
       
    92  */
       
    93 SSE_FUNCTION static inline __m128i
       
    94 inner_muldiv_255_sse2(__m128i a, __m128i b)
       
    95 {
       
    96   __m128i ret;
       
    97   __m128i roundconst = MC(8x0080);
       
    98 
       
    99   ret = _mm_mullo_epi16(a, b);
       
   100   ret = _mm_adds_epu16(ret, roundconst);
       
   101   ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
       
   102   ret = _mm_srli_epi16(ret, 8);
       
   103 
       
   104   return ret;
       
   105 }
       
   106 
       
   107 SSE_FUNCTION static inline __m128i
       
   108 muldiv_255_sse2(__m128i a, __m128i b)
       
   109 {
       
   110   __m128i alow, blow, ahigh, bhigh, low, high;
       
   111 
       
   112   alow = _mm_unpacklo_epi8(a, _mm_setzero_si128());
       
   113   blow = _mm_unpacklo_epi8(b, _mm_setzero_si128());
       
   114   ahigh = _mm_unpackhi_epi8(a, _mm_setzero_si128());
       
   115   bhigh = _mm_unpackhi_epi8(b, _mm_setzero_si128());
       
   116   low = inner_muldiv_255_sse2(alow, blow);
       
   117   high = inner_muldiv_255_sse2(ahigh, bhigh);
       
   118   return _mm_packus_epi16(low, high);
       
   119 }
       
   120 
       
   121 SSE_FUNCTION static inline __m128i
       
   122 negate_argb_sse2(__m128i a)
       
   123 {
       
   124   return _mm_xor_si128(a, MC(16xff));
       
   125 }
       
   126 
       
   127 SSE_FUNCTION static inline __m128i
       
   128 load_argb_sse2(const uint32_t *src)
       
   129 {
       
   130   return _mm_loadu_si128((__m128i *)src);
       
   131 }
       
   132 
       
   133 SSE_FUNCTION static inline __m128i
       
   134 set1_argb_sse2(uint32_t src)
       
   135 {
       
   136   return _mm_set1_epi32(src);
       
   137 }
       
   138 
       
   139 SSE_FUNCTION static inline __m128i
       
   140 load_u8_mask(const uint8_t *m)
       
   141 {
       
   142   __m128i a;
       
   143   a = _mm_cvtsi32_si128(*(uint32_t *)m);
       
   144   a = _mm_unpacklo_epi8(a, a);
       
   145   a = _mm_unpacklo_epi16(a, a);
       
   146   return a;
       
   147 }
       
   148 
       
   149 SSE_FUNCTION static inline __m128i
       
   150 set1_u8_mask(uint8_t m)
       
   151 {
       
   152   return _mm_set1_epi8(m);
       
   153 }
       
   154 
       
   155 SSE_FUNCTION static void
       
   156 store_argb_sse2(uint32_t *dest, __m128i pix)
       
   157 {
       
   158   _mm_store_si128((__m128i *)dest, pix);
       
   159 }
       
   160 
       
   161 SSE_FUNCTION static __m128i 
       
   162 over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
       
   163 {
       
   164   return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
       
   165 }
       
   166 
       
   167 SSE_FUNCTION static void
       
   168 composite_in_argb_sse (uint32_t *dest, const uint32_t *src, const uint8_t *mask,
       
   169     int n)
       
   170 {
       
   171   for (; ((long)dest & 15) && (n > 0); n--) {
       
   172     uint32_t s = *src++;
       
   173     uint8_t m = *mask++;
       
   174 
       
   175     *dest++ = oil_argb(
       
   176 	COMPOSITE_IN(oil_argb_A(s), m),
       
   177 	COMPOSITE_IN(oil_argb_R(s), m),
       
   178 	COMPOSITE_IN(oil_argb_G(s), m),
       
   179 	COMPOSITE_IN(oil_argb_B(s), m));
       
   180   }
       
   181   for (; n >= 4; n -= 4) {
       
   182     __m128i s, m;
       
   183     s = load_argb_sse2(src);
       
   184     m = load_u8_mask(mask);
       
   185     store_argb_sse2(dest, muldiv_255_sse2(s, m));
       
   186     src += 4;
       
   187     mask += 4;
       
   188     dest += 4;
       
   189   }
       
   190   for (; n > 0; n--) {
       
   191     uint32_t s = *src++;
       
   192     uint8_t m = *mask++;
       
   193 
       
   194     *dest++ = oil_argb(
       
   195 	COMPOSITE_IN(oil_argb_A(s), m),
       
   196 	COMPOSITE_IN(oil_argb_R(s), m),
       
   197 	COMPOSITE_IN(oil_argb_G(s), m),
       
   198 	COMPOSITE_IN(oil_argb_B(s), m));
       
   199   }
       
   200 }
       
   201 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_sse, composite_in_argb,
       
   202     OIL_IMPL_FLAG_SSE2);
       
   203 
       
   204 SSE_FUNCTION static void
       
   205 composite_in_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
       
   206     const uint8_t *mask, int n)
       
   207 {
       
   208   __m128i s;
       
   209 
       
   210   s = set1_argb_sse2(*src);
       
   211 
       
   212   for (; ((long)dest & 15) && (n > 0); n--) {
       
   213     uint8_t m = *mask++;
       
   214 
       
   215     *dest++ = oil_argb(
       
   216 	COMPOSITE_IN(oil_argb_A(*src), m),
       
   217 	COMPOSITE_IN(oil_argb_R(*src), m),
       
   218 	COMPOSITE_IN(oil_argb_G(*src), m),
       
   219 	COMPOSITE_IN(oil_argb_B(*src), m));
       
   220   }
       
   221   for (; n >= 4; n -= 4) {
       
   222     __m128i m;
       
   223     m = load_u8_mask(mask);
       
   224     store_argb_sse2(dest, muldiv_255_sse2(s, m));
       
   225     mask += 4;
       
   226     dest += 4;
       
   227   }
       
   228   for (; n > 0; n--) {
       
   229     uint8_t m = *mask++;
       
   230 
       
   231     *dest++ = oil_argb(
       
   232 	COMPOSITE_IN(oil_argb_A(*src), m),
       
   233 	COMPOSITE_IN(oil_argb_R(*src), m),
       
   234 	COMPOSITE_IN(oil_argb_G(*src), m),
       
   235 	COMPOSITE_IN(oil_argb_B(*src), m));
       
   236   }
       
   237 }
       
   238 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_src_sse,
       
   239     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
       
   240 
       
   241 SSE_FUNCTION static void
       
   242 composite_in_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
       
   243     const uint8_t *mask, int n)
       
   244 {
       
   245   __m128i m;
       
   246 
       
   247   m = set1_u8_mask(*mask);
       
   248 
       
   249   for (; ((long)dest & 15) && (n > 0); n--) {
       
   250     uint32_t s = *src++;
       
   251 
       
   252     *dest++ = oil_argb(
       
   253 	COMPOSITE_IN(oil_argb_A(s), mask[0]),
       
   254 	COMPOSITE_IN(oil_argb_R(s), mask[0]),
       
   255 	COMPOSITE_IN(oil_argb_G(s), mask[0]),
       
   256 	COMPOSITE_IN(oil_argb_B(s), mask[0]));
       
   257   }
       
   258   for (; n >= 4; n -= 4) {
       
   259     __m128i s;
       
   260     s = load_argb_sse2(src);
       
   261     store_argb_sse2(dest,  muldiv_255_sse2(s, m));
       
   262     src += 4;
       
   263     dest += 4;
       
   264   }
       
   265   for (; n > 0; n--) {
       
   266     uint32_t s = *src++;
       
   267 
       
   268     *dest++ = oil_argb(
       
   269 	COMPOSITE_IN(oil_argb_A(s), mask[0]),
       
   270 	COMPOSITE_IN(oil_argb_R(s), mask[0]),
       
   271 	COMPOSITE_IN(oil_argb_G(s), mask[0]),
       
   272 	COMPOSITE_IN(oil_argb_B(s), mask[0]));
       
   273   }
       
   274 }
       
   275 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_argb_const_mask_sse,
       
   276     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
       
   277 
       
   278 SSE_FUNCTION static void
       
   279 composite_over_argb_sse (uint32_t *dest, const uint32_t *src, int n)
       
   280 {
       
   281   for (; ((long)dest & 15) && (n > 0); n--) {
       
   282     uint32_t d = *dest, s = *src++;
       
   283     uint8_t srca = oil_argb_A(s);
       
   284     d = oil_argb(
       
   285 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
       
   286 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
       
   287 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
       
   288 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
       
   289     *dest++ = d;
       
   290   }
       
   291   for (; n >= 4; n -= 4) {
       
   292     __m128i d, s;
       
   293     s = load_argb_sse2(src);
       
   294     d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
       
   295     store_argb_sse2(dest, d);
       
   296     src += 4;
       
   297     dest += 4;
       
   298   }
       
   299   for (; n > 0; n--) {
       
   300     uint32_t d = *dest, s = *src++;
       
   301     uint8_t srca = oil_argb_A(s);
       
   302     d = oil_argb(
       
   303 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
       
   304 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
       
   305 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
       
   306 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
       
   307     *dest++ = d;
       
   308   }
       
   309 }
       
   310 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_sse, composite_over_argb,
       
   311     OIL_IMPL_FLAG_SSE2);
       
   312 
       
   313 SSE_FUNCTION static void
       
   314 composite_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src, int n)
       
   315 {
       
   316   __m128i s, sa;
       
   317   uint32_t srca;
       
   318 
       
   319   srca = oil_argb_A(*src);
       
   320   s = set1_argb_sse2(*src);
       
   321   sa = negate_argb_sse2(argb_A_sse2(s));
       
   322   for (; ((long)dest & 15) && (n > 0); n--) {
       
   323     uint32_t d = *dest;
       
   324     d = oil_argb(
       
   325 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
       
   326 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
       
   327 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
       
   328 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
       
   329     *dest++ = d;
       
   330   }
       
   331   for (; n >= 4; n -= 4) {
       
   332     __m128i d;
       
   333     d = _mm_adds_epu8(s, muldiv_255_sse2(*(__m128i *)dest, sa));
       
   334     store_argb_sse2(dest, d);
       
   335     dest += 4;
       
   336   }
       
   337   for (; n > 0; n--) {
       
   338     uint32_t d = *dest;
       
   339     d = oil_argb(
       
   340 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
       
   341 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
       
   342 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
       
   343 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
       
   344     *dest++ = d;
       
   345   }
       
   346 }
       
   347 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_argb_const_src_sse,
       
   348     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
       
   349 
       
   350 SSE_FUNCTION static void
       
   351 composite_in_over_argb_sse (uint32_t *dest, const uint32_t *src,
       
   352     const uint8_t *mask, int n)
       
   353 {
       
   354   for (; ((long)dest & 15) && (n > 0); n--) {
       
   355     uint32_t d = *dest, s = *src++, m = *mask++, color;
       
   356     uint8_t srca;
       
   357 
       
   358     color = oil_argb(
       
   359         COMPOSITE_IN(oil_argb_A(s), m),
       
   360         COMPOSITE_IN(oil_argb_R(s), m),
       
   361         COMPOSITE_IN(oil_argb_G(s), m),
       
   362         COMPOSITE_IN(oil_argb_B(s), m));
       
   363     srca = oil_argb_A(color);
       
   364     d = oil_argb(
       
   365 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   366 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   367 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   368 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   369     *dest++ = d;
       
   370   }
       
   371   for (; n >= 4; n -= 4) {
       
   372     __m128i d, s, m;
       
   373     s = load_argb_sse2(src);
       
   374     m = load_u8_mask(mask);
       
   375     s = muldiv_255_sse2(s, m);
       
   376     d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
       
   377     store_argb_sse2(dest, d);
       
   378     src += 4;
       
   379     mask += 4;
       
   380     dest += 4;
       
   381   }
       
   382   for (; n > 0; n--) {
       
   383     uint32_t d = *dest, s = *src++, m = *mask++, color;
       
   384     uint8_t srca;
       
   385 
       
   386     color = oil_argb(
       
   387         COMPOSITE_IN(oil_argb_A(s), m),
       
   388         COMPOSITE_IN(oil_argb_R(s), m),
       
   389         COMPOSITE_IN(oil_argb_G(s), m),
       
   390         COMPOSITE_IN(oil_argb_B(s), m));
       
   391     srca = oil_argb_A(color);
       
   392     d = oil_argb(
       
   393 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   394 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   395 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   396 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   397     *dest++ = d;
       
   398   }
       
   399 }
       
   400 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_sse, composite_in_over_argb,
       
   401     OIL_IMPL_FLAG_SSE2);
       
   402 
       
   403 SSE_FUNCTION static void
       
   404 composite_in_over_argb_const_src_sse (uint32_t *dest, const uint32_t *src,
       
   405     const uint8_t *mask, int n)
       
   406 {
       
   407   __m128i s;
       
   408 
       
   409   s = set1_argb_sse2(*src);
       
   410 
       
   411   for (; ((long)dest & 15) && (n > 0); n--) {
       
   412     uint32_t d = *dest, m = *mask++, color;
       
   413     uint8_t srca;
       
   414 
       
   415     color = oil_argb(
       
   416         COMPOSITE_IN(oil_argb_A(*src), m),
       
   417         COMPOSITE_IN(oil_argb_R(*src), m),
       
   418         COMPOSITE_IN(oil_argb_G(*src), m),
       
   419         COMPOSITE_IN(oil_argb_B(*src), m));
       
   420     srca = oil_argb_A(color);
       
   421     d = oil_argb(
       
   422 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   423 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   424 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   425 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   426     *dest++ = d;
       
   427   }
       
   428   for (; n >= 4; n -= 4) {
       
   429     __m128i d, color, m;
       
   430     m = load_u8_mask(mask);
       
   431     color = muldiv_255_sse2(s, m);
       
   432     d = over_argb_sse2(*(__m128i *)dest, color, argb_A_sse2(color));
       
   433     store_argb_sse2(dest, d);
       
   434     mask += 4;
       
   435     dest += 4;
       
   436   }
       
   437   for (; n > 0; n--) {
       
   438     uint32_t d = *dest, m = *mask++, color;
       
   439     uint8_t srca;
       
   440 
       
   441     color = oil_argb(
       
   442         COMPOSITE_IN(oil_argb_A(*src), m),
       
   443         COMPOSITE_IN(oil_argb_R(*src), m),
       
   444         COMPOSITE_IN(oil_argb_G(*src), m),
       
   445         COMPOSITE_IN(oil_argb_B(*src), m));
       
   446     srca = oil_argb_A(color);
       
   447     d = oil_argb(
       
   448 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   449 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   450 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   451 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   452     *dest++ = d;
       
   453   }
       
   454 }
       
   455 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_src_sse,
       
   456     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
       
   457 
       
   458 SSE_FUNCTION static void
       
   459 composite_in_over_argb_const_mask_sse (uint32_t *dest, const uint32_t *src,
       
   460     const uint8_t *mask, int n)
       
   461 {
       
   462   __m128i m;
       
   463 
       
   464   m = set1_u8_mask(*mask);
       
   465 
       
   466   for (; ((long)dest & 15) && (n > 0); n--) {
       
   467     uint32_t d = *dest, s = *src++, color;
       
   468     uint8_t srca;
       
   469 
       
   470     color = oil_argb(
       
   471         COMPOSITE_IN(oil_argb_A(s), *mask),
       
   472         COMPOSITE_IN(oil_argb_R(s), *mask),
       
   473         COMPOSITE_IN(oil_argb_G(s), *mask),
       
   474         COMPOSITE_IN(oil_argb_B(s), *mask));
       
   475     srca = oil_argb_A(color);
       
   476     d = oil_argb(
       
   477 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   478 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   479 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   480 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   481     *dest++ = d;
       
   482   }
       
   483   for (; n >= 4; n -= 4) {
       
   484     __m128i d, s;
       
   485     s = load_argb_sse2(src);
       
   486     s = muldiv_255_sse2(s, m);
       
   487     d = over_argb_sse2(*(__m128i *)dest, s, argb_A_sse2(s));
       
   488     store_argb_sse2(dest, d);
       
   489     src += 4;
       
   490     dest += 4;
       
   491   }
       
   492   for (; n > 0; n--) {
       
   493     uint32_t d = *dest, s = *src++, color;
       
   494     uint8_t srca;
       
   495 
       
   496     color = oil_argb(
       
   497         COMPOSITE_IN(oil_argb_A(s), *mask),
       
   498         COMPOSITE_IN(oil_argb_R(s), *mask),
       
   499         COMPOSITE_IN(oil_argb_G(s), *mask),
       
   500         COMPOSITE_IN(oil_argb_B(s), *mask));
       
   501     srca = oil_argb_A(color);
       
   502     d = oil_argb(
       
   503 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   504 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   505 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   506 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   507     *dest++ = d;
       
   508   }
       
   509 }
       
   510 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_in_over_argb_const_mask_sse,
       
   511     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
       
   512 
       
   513 SSE_FUNCTION static void
       
   514 composite_over_u8_sse (uint8_t *dest, const uint8_t *src, int n)
       
   515 {
       
   516   /* Initial operations to align the destination pointer */
       
   517   for (; ((long)dest & 15) && (n > 0); n--) {
       
   518     *dest = COMPOSITE_OVER(*dest, *src, *src);
       
   519     src++;
       
   520     dest++;
       
   521   }
       
   522   /* over_u8 can be dealt with using our argb code, with srca = s */
       
   523   for (; n >= 16; n -= 16) {
       
   524     __m128i d, s;
       
   525     d = *(__m128i *)dest;
       
   526     s = load_argb_sse2((uint32_t *)src);
       
   527     store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
       
   528     src += 16;
       
   529     dest += 16;
       
   530   }
       
   531   for (; n > 0; n--) {
       
   532     *dest = COMPOSITE_OVER(*dest, *src, *src);
       
   533     src++;
       
   534     dest++;
       
   535   }
       
   536 }
       
   537 OIL_DEFINE_IMPL_FULL_WRAPPER (composite_over_u8_sse, composite_over_u8,
       
   538     OIL_IMPL_FLAG_SSE2);
       
   539 
       
   540 #endif
       
   541