genericopenlibs/liboil/src/composite_sse_2pix.c
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 /*
       
     2  * Copyright (c) 2005
       
     3  *	Eric Anholt.  All rights reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
       
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
       
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    24  * SUCH DAMAGE.
       
    25  */
       
    26 
       
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <emmintrin.h>
#include "liboil/liboilcolorspace.h"
       
    34 
       
    35 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
       
    36 
       
    37 /* non-SSE2 compositing support */
       
    38 #define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
       
    39 #define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
       
    40 #define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))
       
    41 
       
    42 /* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
       
    43  * the channel value in the low byte.  This means 2 pixels per pass.
       
    44  */
       
    45 
       
    46 #ifdef ENABLE_BROKEN_IMPLS
       
    47 
       
    48 union m128_int {
       
    49   __m128i m128;
       
    50   uint64_t ull[2];
       
    51 };
       
    52 
       
    53 static const struct _SSEData {
       
    54   union m128_int sse_8x00ff;
       
    55   union m128_int sse_8x0080;
       
    56 } c = {
       
    57     .sse_8x00ff.ull =	{0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL},
       
    58     .sse_8x0080.ull =	{0x0080008000800080ULL, 0x0080008000800080ULL},
       
    59 };
       
    60 
       
    61 #define MC(x) (c.sse_##x.m128)
       
    62 
       
    63 /* Shuffles the given value such that the alpha for each pixel appears in each
       
    64  * channel of the pixel.
       
    65  */
       
    66 SSE_FUNCTION static inline __m128i
       
    67 argb_A_sse2(__m128i a)
       
    68 {
       
    69   a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3));
       
    70   a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(3,3,3,3));
       
    71   return a;
       
    72 }
       
    73 
       
    74 /* Multiplies the pixel data in a channel-by-channel by b, and divides the
       
    75  * result by 255, with rounding.
       
    76  */
       
    77 SSE_FUNCTION static inline __m128i
       
    78 muldiv_255_sse2(__m128i a, __m128i b)
       
    79 {
       
    80   __m128i ret;
       
    81   __m128i roundconst = MC(8x0080);
       
    82 
       
    83   ret = _mm_mullo_epi16(a, b);
       
    84   ret = _mm_adds_epu16(ret, roundconst);
       
    85   ret = _mm_adds_epu16(ret, _mm_srli_epi16(ret, 8));
       
    86   ret = _mm_srli_epi16(ret, 8);
       
    87 
       
    88   return ret;
       
    89 }
       
    90 
       
    91 SSE_FUNCTION static inline __m128i
       
    92 negate_argb_sse2(__m128i a)
       
    93 {
       
    94   return _mm_xor_si128(a, MC(8x00ff));
       
    95 }
       
    96 
       
    97 /* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers */
       
    98 SSE_FUNCTION static inline __m128i
       
    99 load_argb_sse2(const uint32_t *src)
       
   100 {
       
   101   __m128i pix;
       
   102 
       
   103   pix = _mm_loadl_epi64((__m128i *)src);
       
   104   pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
       
   105   return pix;
       
   106 }
       
   107 
       
   108 SSE_FUNCTION static inline __m128i
       
   109 set1_argb_sse2(uint32_t src)
       
   110 {
       
   111   __m128i pix;
       
   112 
       
   113   pix = _mm_set1_epi32(src);
       
   114   pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
       
   115   return pix;
       
   116 }
       
   117 
       
   118 SSE_FUNCTION static inline __m128i
       
   119 load_u8_mask(const uint8_t *m)
       
   120 {
       
   121   return _mm_unpacklo_epi64(_mm_set1_epi16(m[0]), _mm_set1_epi16(m[1]));
       
   122 }
       
   123 
       
   124 SSE_FUNCTION static inline __m128i
       
   125 set1_u8_mask(uint8_t m)
       
   126 {
       
   127   return _mm_unpacklo_epi8(_mm_set1_epi8(m), _mm_setzero_si128());
       
   128 }
       
   129 
       
   130 /* Stores the 2 unpacked pixels in pix into the (unaligned) *dest */
       
   131 SSE_FUNCTION static void
       
   132 store_argb_sse2(uint32_t *dest, __m128i pix)
       
   133 {
       
   134   pix = _mm_packus_epi16(pix, pix);
       
   135   _mm_storel_epi64((__m128i *)dest, pix);
       
   136 }
       
   137 
       
   138 SSE_FUNCTION static __m128i 
       
   139 over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
       
   140 {
       
   141   return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
       
   142 }
       
   143 
       
   144 SSE_FUNCTION static void
       
   145 composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
       
   146     const uint8_t *mask, int n)
       
   147 {
       
   148   for (; n >= 2; n -= 2) {
       
   149     __m128i s, m;
       
   150     s = load_argb_sse2(src);
       
   151     m = load_u8_mask(mask);
       
   152     store_argb_sse2(dest, muldiv_255_sse2(s, m));
       
   153     src += 2;
       
   154     mask += 2;
       
   155     dest += 2;
       
   156   }
       
   157   for (; n > 0; n--) {
       
   158     uint32_t s = *src++;
       
   159     uint8_t m = *mask++;
       
   160 
       
   161     *dest++ = oil_argb(
       
   162 	COMPOSITE_IN(oil_argb_A(s), m),
       
   163 	COMPOSITE_IN(oil_argb_R(s), m),
       
   164 	COMPOSITE_IN(oil_argb_G(s), m),
       
   165 	COMPOSITE_IN(oil_argb_B(s), m));
       
   166   }
       
   167 }
       
   168 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
       
   169     OIL_IMPL_FLAG_SSE2);
       
   170 
       
   171 SSE_FUNCTION static void
       
   172 composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
       
   173     const uint8_t *mask, int n)
       
   174 {
       
   175   __m128i s;
       
   176 
       
   177   s = set1_argb_sse2(*src);
       
   178 
       
   179   for (; n >= 2; n -= 2) {
       
   180     __m128i m;
       
   181     m = load_u8_mask(mask);
       
   182     store_argb_sse2(dest, muldiv_255_sse2(s, m));
       
   183     mask += 2;
       
   184     dest += 2;
       
   185   }
       
   186   for (; n > 0; n--) {
       
   187     uint8_t m = *mask++;
       
   188 
       
   189     *dest++ = oil_argb(
       
   190 	COMPOSITE_IN(oil_argb_A(*src), m),
       
   191 	COMPOSITE_IN(oil_argb_R(*src), m),
       
   192 	COMPOSITE_IN(oil_argb_G(*src), m),
       
   193 	COMPOSITE_IN(oil_argb_B(*src), m));
       
   194   }
       
   195 }
       
   196 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
       
   197     composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);
       
   198 
       
   199 #ifdef SSE_ALIGN
       
   200 SSE_FUNCTION static void
       
   201 composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
       
   202     const uint8_t *mask, int n)
       
   203 {
       
   204   __m128i m;
       
   205 
       
   206   m = set1_u8_mask(*mask);
       
   207 
       
   208   for (; n >= 2; n -= 2) {
       
   209     __m128i s;
       
   210     s = load_argb_sse2(src);
       
   211     store_argb_sse2(dest,  muldiv_255_sse2(s, m));
       
   212     src += 2;
       
   213     dest += 2;
       
   214   }
       
   215   for (; n > 0; n--) {
       
   216     uint32_t s = *src++;
       
   217 
       
   218     *dest++ = oil_argb(
       
   219 	COMPOSITE_IN(oil_argb_A(s), mask[0]),
       
   220 	COMPOSITE_IN(oil_argb_R(s), mask[0]),
       
   221 	COMPOSITE_IN(oil_argb_G(s), mask[0]),
       
   222 	COMPOSITE_IN(oil_argb_B(s), mask[0]));
       
   223   }
       
   224 }
       
   225 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
       
   226     composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
       
   227 #endif
       
   228 
       
   229 SSE_FUNCTION static void
       
   230 composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
       
   231 {
       
   232   for (; n >= 2; n -= 2) {
       
   233     __m128i d, s;
       
   234     s = load_argb_sse2(src);
       
   235     d = load_argb_sse2(dest);
       
   236     d = over_argb_sse2(d, s, argb_A_sse2(s));
       
   237     store_argb_sse2(dest, d);
       
   238     src += 2;
       
   239     dest += 2;
       
   240   }
       
   241   for (; n > 0; n--) {
       
   242     uint32_t d = *dest, s = *src++;
       
   243     uint8_t srca = oil_argb_A(s);
       
   244     d = oil_argb(
       
   245 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
       
   246 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
       
   247 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
       
   248 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
       
   249     *dest++ = d;
       
   250   }
       
   251 }
       
   252 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
       
   253     OIL_IMPL_FLAG_SSE2);
       
   254 
       
   255 SSE_FUNCTION static void
       
   256 composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
       
   257     int n)
       
   258 {
       
   259   __m128i s, sa;
       
   260   uint32_t srca;
       
   261 
       
   262   srca = oil_argb_A(*src);
       
   263   s = set1_argb_sse2(*src);
       
   264   sa = negate_argb_sse2(argb_A_sse2(s));
       
   265   for (; n >= 2; n -= 2) {
       
   266     __m128i d;
       
   267     d = load_argb_sse2(dest);
       
   268     d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa));
       
   269     store_argb_sse2(dest, d);
       
   270     dest += 2;
       
   271   }
       
   272   for (; n > 0; n--) {
       
   273     uint32_t d = *dest;
       
   274     d = oil_argb(
       
   275 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
       
   276 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
       
   277 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
       
   278 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
       
   279     *dest++ = d;
       
   280   }
       
   281 }
       
   282 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
       
   283     composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
       
   284 
       
   285 SSE_FUNCTION static void
       
   286 composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
       
   287     const uint8_t *mask, int n)
       
   288 {
       
   289   for (; n >= 2; n -= 2) {
       
   290     __m128i d, s, m;
       
   291     s = load_argb_sse2(src);
       
   292     m = load_u8_mask(mask);
       
   293     d = load_argb_sse2(dest);
       
   294     s = muldiv_255_sse2(s, m);
       
   295     d = over_argb_sse2(d, s, argb_A_sse2(s));
       
   296     store_argb_sse2(dest, d);
       
   297     src += 2;
       
   298     mask += 2;
       
   299     dest += 2;
       
   300   }
       
   301   for (; n > 0; n--) {
       
   302     uint32_t d = *dest, s = *src++, m = *mask++, color;
       
   303     uint8_t srca;
       
   304 
       
   305     color = oil_argb(
       
   306         COMPOSITE_IN(oil_argb_A(s), m),
       
   307         COMPOSITE_IN(oil_argb_R(s), m),
       
   308         COMPOSITE_IN(oil_argb_G(s), m),
       
   309         COMPOSITE_IN(oil_argb_B(s), m));
       
   310     srca = oil_argb_A(color);
       
   311     d = oil_argb(
       
   312 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   313 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   314 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   315 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   316     *dest++ = d;
       
   317   }
       
   318 }
       
   319 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
       
   320     OIL_IMPL_FLAG_SSE2);
       
   321 
       
   322 SSE_FUNCTION static void
       
   323 composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
       
   324     const uint8_t *mask, int n)
       
   325 {
       
   326   __m128i s;
       
   327 
       
   328   s = set1_argb_sse2(*src);
       
   329 
       
   330   for (; n >= 2; n -= 2) {
       
   331     __m128i d, color, m;
       
   332     m = load_u8_mask(mask);
       
   333     d = load_argb_sse2(dest);
       
   334     color = muldiv_255_sse2(s, m);
       
   335     d = over_argb_sse2(d, color, argb_A_sse2(color));
       
   336     store_argb_sse2(dest, d);
       
   337     mask += 2;
       
   338     dest += 2;
       
   339   }
       
   340   for (; n > 0; n--) {
       
   341     uint32_t d = *dest, m = *mask++, color;
       
   342     uint8_t srca;
       
   343 
       
   344     color = oil_argb(
       
   345         COMPOSITE_IN(oil_argb_A(*src), m),
       
   346         COMPOSITE_IN(oil_argb_R(*src), m),
       
   347         COMPOSITE_IN(oil_argb_G(*src), m),
       
   348         COMPOSITE_IN(oil_argb_B(*src), m));
       
   349     srca = oil_argb_A(color);
       
   350     d = oil_argb(
       
   351 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   352 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   353 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   354 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   355     *dest++ = d;
       
   356   }
       
   357 }
       
   358 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
       
   359     composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);
       
   360 
       
   361 SSE_FUNCTION static void
       
   362 composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
       
   363     const uint8_t *mask, int n)
       
   364 {
       
   365   __m128i m;
       
   366 
       
   367   m = set1_u8_mask(*mask);
       
   368 
       
   369   for (; n >= 2; n -= 2) {
       
   370     __m128i d, s;
       
   371     s = load_argb_sse2(src);
       
   372     d = load_argb_sse2(dest);
       
   373     s = muldiv_255_sse2(s, m);
       
   374     d = over_argb_sse2(d, s, argb_A_sse2(s));
       
   375     store_argb_sse2(dest, d);
       
   376     src += 2;
       
   377     dest += 2;
       
   378   }
       
   379   for (; n > 0; n--) {
       
   380     uint32_t d = *dest, s = *src++, color;
       
   381     uint8_t srca;
       
   382 
       
   383     color = oil_argb(
       
   384         COMPOSITE_IN(oil_argb_A(s), *mask),
       
   385         COMPOSITE_IN(oil_argb_R(s), *mask),
       
   386         COMPOSITE_IN(oil_argb_G(s), *mask),
       
   387         COMPOSITE_IN(oil_argb_B(s), *mask));
       
   388     srca = oil_argb_A(color);
       
   389     d = oil_argb(
       
   390 	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
       
   391 	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
       
   392 	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
       
   393 	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
       
   394     *dest++ = d;
       
   395   }
       
   396 }
       
   397 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
       
   398     composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);
       
   399 
       
   400 SSE_FUNCTION static void
       
   401 composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
       
   402 {
       
   403   /* Initial operations to align the destination pointer */
       
   404   for (; ((long)dest & 15) && (n > 0); n--) {
       
   405     *dest = COMPOSITE_OVER(*dest, *src, *src);
       
   406     src++;
       
   407     dest++;
       
   408   }
       
   409   /* over_u8 can be dealt with using our argb code, with srca = s */
       
   410   for (; n >= 8; n -= 8) {
       
   411     __m128i d, s;
       
   412     d = load_argb_sse2((uint32_t *)dest);
       
   413     s = load_argb_sse2((uint32_t *)src);
       
   414     store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
       
   415     src += 8;
       
   416     dest += 8;
       
   417   }
       
   418   for (; n > 0; n--) {
       
   419     *dest = COMPOSITE_OVER(*dest, *src, *src);
       
   420     src++;
       
   421     dest++;
       
   422   }
       
   423 }
       
   424 OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,
       
   425     OIL_IMPL_FLAG_SSE2);
       
   426 #endif
       
   427