/*
 * genericopenlibs/liboil/src/composite_sse_2pix.c
 * author Dremov Kirill (Nokia-D-MSW/Tampere) <kirill.dremov@nokia.com>
 * Fri, 16 Apr 2010 16:46:38 +0300
 * changeset 18 47c74d1534e1
 * permissions -rw-r--r--
 * Revision: 201011 Kit: 201015
 */

/*
 * Copyright (c) 2005
 *	Eric Anholt.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <liboilclasses.h>
#include <liboilfunction.h>
#include <emmintrin.h>
#include "liboil/liboilcolorspace.h"

/* Attribute placed on every SSE2 routine in this file.
 * force_align_arg_pointer is a GCC x86 function attribute that makes the
 * function realign the run-time stack in its prologue.
 */
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))

/* non-SSE2 compositing support: per-channel scalar expressions used by the
 * loops that handle pixels the 2-pixel SSE2 loops leave over.
 *
 *   COMPOSITE_OVER(d,s,m)  d + s - oil_muldiv_255(d, m)
 *   COMPOSITE_ADD(d,s)     d + s passed through oil_clamp_255
 *   COMPOSITE_IN(s,m)      oil_muldiv_255(s, m)
 *
 * oil_muldiv_255 and oil_clamp_255 are defined outside this file
 * (presumably in the liboil colorspace header -- confirm).
 * COMPOSITE_OVER expands its first argument twice, so do not pass
 * expressions with side effects.
 */
#define COMPOSITE_OVER(d,s,m) ((d) + (s) - oil_muldiv_255((d),(m)))
#define COMPOSITE_ADD(d,s) oil_clamp_255((d) + (s))
#define COMPOSITE_IN(s,m) oil_muldiv_255((s),(m))

/* rgba values in SSE2 code will be unpacked as 16-bit integers per channel with
 * the channel value in the low byte.  This means 2 pixels per pass.
 */

#ifdef ENABLE_BROKEN_IMPLS

/* A 128-bit value that can be initialised statically through its two 64-bit
 * halves and read back as an SSE2 register value.
 */
union m128_int {
  __m128i m128;
  uint64_t ull[2];
};

/* Constant vectors used by the helpers in this file.  Each member holds
 * eight 16-bit lanes with the value spelled in its name.
 */
static const struct _SSEData {
  union m128_int sse_8x00ff;
  union m128_int sse_8x0080;
} c = {
  .sse_8x00ff = { .ull = { 0x00ff00ff00ff00ffULL, 0x00ff00ff00ff00ffULL } },
  .sse_8x0080 = { .ull = { 0x0080008000800080ULL, 0x0080008000800080ULL } },
};

/* MC(8x00ff) yields the __m128i view of c.sse_8x00ff, and likewise for the
 * other members.
 */
#define MC(x) (c.sse_##x.m128)

/* Broadcasts 16-bit lane 3 of each 64-bit half (the alpha of each unpacked
 * pixel) into all four lanes of that half, so every channel of a pixel
 * carries that pixel's alpha.
 */
SSE_FUNCTION static inline __m128i
argb_A_sse2(__m128i a)
{
  /* The high-half shuffle runs first and leaves the low half untouched;
   * the low-half shuffle then leaves the high half untouched. */
  return _mm_shufflelo_epi16(
      _mm_shufflehi_epi16(a, _MM_SHUFFLE(3,3,3,3)),
      _MM_SHUFFLE(3,3,3,3));
}

/* For each 16-bit lane, multiplies a by b and divides the product by 255
 * with rounding, using  t = a*b + 0x80;  result = (t + (t >> 8)) >> 8.
 */
SSE_FUNCTION static inline __m128i
muldiv_255_sse2(__m128i a, __m128i b)
{
  const __m128i half = MC(8x0080);
  __m128i biased, folded;

  /* t = a*b + 0x80 (unsigned saturating add) */
  biased = _mm_adds_epu16(_mm_mullo_epi16(a, b), half);
  /* t + (t >> 8); the final shift below drops the low byte */
  folded = _mm_adds_epu16(biased, _mm_srli_epi16(biased, 8));

  return _mm_srli_epi16(folded, 8);
}

/* XORs every 16-bit lane with 0x00ff, i.e. inverts the low byte of each
 * lane and leaves the high byte unchanged.
 */
SSE_FUNCTION static inline __m128i
negate_argb_sse2(__m128i a)
{
  const __m128i low_byte_mask = MC(8x00ff);

  return _mm_xor_si128(a, low_byte_mask);
}

/* Loads the 2 (unaligned) pixels at *src into unpacked SSE2 registers.
 *
 * Reads 8 bytes starting at src and widens each byte to a 16-bit lane by
 * interleaving with zero, so each lane holds one channel in its low byte.
 */
SSE_FUNCTION static inline __m128i
load_argb_sse2(const uint32_t *src)
{
  __m128i pix;

  /* The cast keeps the const qualifier: the load only reads through src. */
  pix = _mm_loadl_epi64((const __m128i *)src);
  pix = _mm_unpacklo_epi8(pix, _mm_setzero_si128());
  return pix;
}

/* Returns the unpacked form of the single pixel src repeated in both pixel
 * positions: src is replicated across the register, then the low 8 bytes
 * are widened to 16-bit lanes by interleaving with zero.
 */
SSE_FUNCTION static inline __m128i
set1_argb_sse2(uint32_t src)
{
  const __m128i replicated = _mm_set1_epi32(src);

  return _mm_unpacklo_epi8(replicated, _mm_setzero_si128());
}

/* Builds the mask vector for two pixels: the four low 16-bit lanes hold
 * m[0] and the four high lanes hold m[1].  Reads exactly two bytes from m.
 */
SSE_FUNCTION static inline __m128i
load_u8_mask(const uint8_t *m)
{
  const __m128i first_pixel = _mm_set1_epi16(m[0]);
  const __m128i second_pixel = _mm_set1_epi16(m[1]);

  return _mm_unpacklo_epi64(first_pixel, second_pixel);
}

/* Returns a vector whose eight 16-bit lanes all hold the mask value m
 * (m in the low byte, zero in the high byte).
 *
 * _mm_set1_epi16 produces that layout directly.  It avoids routing the
 * unsigned byte through the `char` parameter of _mm_set1_epi8 and the extra
 * unpack-with-zero step, and mirrors how load_u8_mask builds its lanes.
 */
SSE_FUNCTION static inline __m128i
set1_u8_mask(uint8_t m)
{
  return _mm_set1_epi16(m);
}

/* Packs the 2 unpacked pixels in pix back to bytes (with unsigned
 * saturation) and writes the resulting 8 bytes to the (unaligned) *dest.
 */
SSE_FUNCTION static void
store_argb_sse2(uint32_t *dest, __m128i pix)
{
  const __m128i packed = _mm_packus_epi16(pix, pix);

  _mm_storel_epi64((__m128i *)dest, packed);
}

SSE_FUNCTION static __m128i 
over_argb_sse2(__m128i dest, __m128i src, __m128i srca)
{
  return _mm_adds_epu8(src, muldiv_255_sse2(dest, negate_argb_sse2(srca)));
}

SSE_FUNCTION static void
composite_in_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  for (; n >= 2; n -= 2) {
    __m128i s, m;
    s = load_argb_sse2(src);
    m = load_u8_mask(mask);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    src += 2;
    mask += 2;
    dest += 2;
  }
  for (; n > 0; n--) {
    uint32_t s = *src++;
    uint8_t m = *mask++;

    *dest++ = oil_argb(
	COMPOSITE_IN(oil_argb_A(s), m),
	COMPOSITE_IN(oil_argb_R(s), m),
	COMPOSITE_IN(oil_argb_G(s), m),
	COMPOSITE_IN(oil_argb_B(s), m));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_sse_2pix, composite_in_argb,
    OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  s = set1_argb_sse2(*src);

  for (; n >= 2; n -= 2) {
    __m128i m;
    m = load_u8_mask(mask);
    store_argb_sse2(dest, muldiv_255_sse2(s, m));
    mask += 2;
    dest += 2;
  }
  for (; n > 0; n--) {
    uint8_t m = *mask++;

    *dest++ = oil_argb(
	COMPOSITE_IN(oil_argb_A(*src), m),
	COMPOSITE_IN(oil_argb_R(*src), m),
	COMPOSITE_IN(oil_argb_G(*src), m),
	COMPOSITE_IN(oil_argb_B(*src), m));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_src_sse_2pix,
    composite_in_argb_const_src, OIL_IMPL_FLAG_SSE2);

/* NOTE(review): this implementation is only compiled when SSE_ALIGN is
 * defined; the reason for the guard is not evident from this file.
 */
#ifdef SSE_ALIGN
/* dest[i] = src[i] IN *mask for n ARGB pixels: every source pixel is
 * scaled by the same 8-bit mask value.
 */
SSE_FUNCTION static void
composite_in_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  const __m128i coverage = set1_u8_mask(*mask);
  int i;

  /* SSE2 path: two pixels per iteration */
  while (n >= 2) {
    const __m128i pixels = load_argb_sse2(src);

    store_argb_sse2(dest, muldiv_255_sse2(pixels, coverage));
    dest += 2;
    src += 2;
    n -= 2;
  }

  /* Scalar path for whatever is left */
  for (i = 0; i < n; i++) {
    const uint32_t s = src[i];

    dest[i] = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), mask[0]),
        COMPOSITE_IN(oil_argb_R(s), mask[0]),
        COMPOSITE_IN(oil_argb_G(s), mask[0]),
        COMPOSITE_IN(oil_argb_B(s), mask[0]));
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_argb_const_mask_sse_2pix,
    composite_in_argb_const_mask, OIL_IMPL_FLAG_SSE2);
#endif

SSE_FUNCTION static void
composite_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src, int n)
{
  for (; n >= 2; n -= 2) {
    __m128i d, s;
    s = load_argb_sse2(src);
    d = load_argb_sse2(dest);
    d = over_argb_sse2(d, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 2;
    dest += 2;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++;
    uint8_t srca = oil_argb_A(s);
    d = oil_argb(
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(s), srca),
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(s), srca),
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(s), srca),
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(s), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_sse_2pix, composite_over_argb,
    OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
    int n)
{
  __m128i s, sa;
  uint32_t srca;

  srca = oil_argb_A(*src);
  s = set1_argb_sse2(*src);
  sa = negate_argb_sse2(argb_A_sse2(s));
  for (; n >= 2; n -= 2) {
    __m128i d;
    d = load_argb_sse2(dest);
    d = _mm_adds_epu8(s, muldiv_255_sse2(d, sa));
    store_argb_sse2(dest, d);
    dest += 2;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest;
    d = oil_argb(
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(*src), srca),
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(*src), srca),
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(*src), srca),
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(*src), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_argb_const_src_sse_2pix,
    composite_over_argb_const_src, OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_over_argb_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  for (; n >= 2; n -= 2) {
    __m128i d, s, m;
    s = load_argb_sse2(src);
    m = load_u8_mask(mask);
    d = load_argb_sse2(dest);
    s = muldiv_255_sse2(s, m);
    d = over_argb_sse2(d, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 2;
    mask += 2;
    dest += 2;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), m),
        COMPOSITE_IN(oil_argb_R(s), m),
        COMPOSITE_IN(oil_argb_G(s), m),
        COMPOSITE_IN(oil_argb_B(s), m));
    srca = oil_argb_A(color);
    d = oil_argb(
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_sse_2pix, composite_in_over_argb,
    OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_over_argb_const_src_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i s;

  s = set1_argb_sse2(*src);

  for (; n >= 2; n -= 2) {
    __m128i d, color, m;
    m = load_u8_mask(mask);
    d = load_argb_sse2(dest);
    color = muldiv_255_sse2(s, m);
    d = over_argb_sse2(d, color, argb_A_sse2(color));
    store_argb_sse2(dest, d);
    mask += 2;
    dest += 2;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, m = *mask++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(*src), m),
        COMPOSITE_IN(oil_argb_R(*src), m),
        COMPOSITE_IN(oil_argb_G(*src), m),
        COMPOSITE_IN(oil_argb_B(*src), m));
    srca = oil_argb_A(color);
    d = oil_argb(
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_src_sse_2pix,
    composite_in_over_argb_const_src, OIL_IMPL_FLAG_SSE2);

SSE_FUNCTION static void
composite_in_over_argb_const_mask_sse_2pix (uint32_t *dest, const uint32_t *src,
    const uint8_t *mask, int n)
{
  __m128i m;

  m = set1_u8_mask(*mask);

  for (; n >= 2; n -= 2) {
    __m128i d, s;
    s = load_argb_sse2(src);
    d = load_argb_sse2(dest);
    s = muldiv_255_sse2(s, m);
    d = over_argb_sse2(d, s, argb_A_sse2(s));
    store_argb_sse2(dest, d);
    src += 2;
    dest += 2;
  }
  for (; n > 0; n--) {
    uint32_t d = *dest, s = *src++, color;
    uint8_t srca;

    color = oil_argb(
        COMPOSITE_IN(oil_argb_A(s), *mask),
        COMPOSITE_IN(oil_argb_R(s), *mask),
        COMPOSITE_IN(oil_argb_G(s), *mask),
        COMPOSITE_IN(oil_argb_B(s), *mask));
    srca = oil_argb_A(color);
    d = oil_argb(
	COMPOSITE_OVER(oil_argb_A(d), oil_argb_A(color), srca),
	COMPOSITE_OVER(oil_argb_R(d), oil_argb_R(color), srca),
	COMPOSITE_OVER(oil_argb_G(d), oil_argb_G(color), srca),
	COMPOSITE_OVER(oil_argb_B(d), oil_argb_B(color), srca));
    *dest++ = d;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_in_over_argb_const_mask_sse_2pix,
    composite_in_over_argb_const_mask, OIL_IMPL_FLAG_SSE2);

/* dest[i] = src[i] OVER dest[i] for n single-channel 8-bit values, where
 * each source value also serves as its own alpha.
 *
 * Runs in three phases: a scalar lead-in until dest is 16-byte aligned,
 * an SSE2 loop handling 8 values per iteration, and a scalar tail.
 */
SSE_FUNCTION static void
composite_over_u8_sse_2pix (uint8_t *dest, const uint8_t *src, int n)
{
  /* Initial operations to align the destination pointer.  The address is
   * inspected through uintptr_t, which is wide enough for a pointer on
   * every data model (long is not on LLP64 targets). */
  for (; ((uintptr_t)dest & 15) && (n > 0); n--) {
    *dest = COMPOSITE_OVER(*dest, *src, *src);
    src++;
    dest++;
  }
  /* over_u8 can be dealt with using our argb code, with srca = s.  Each
   * load widens 8 bytes to 16-bit lanes, so 8 values are handled per pass. */
  for (; n >= 8; n -= 8) {
    __m128i d, s;
    d = load_argb_sse2((const uint32_t *)dest);
    s = load_argb_sse2((const uint32_t *)src);
    store_argb_sse2((uint32_t *)dest, over_argb_sse2(d, s, s));
    src += 8;
    dest += 8;
  }
  /* Scalar tail for the remaining (fewer than 8) values */
  for (; n > 0; n--) {
    *dest = COMPOSITE_OVER(*dest, *src, *src);
    src++;
    dest++;
  }
}
OIL_DEFINE_IMPL_FULL_WRAPPER(composite_over_u8_sse_2pix, composite_over_u8,
    OIL_IMPL_FLAG_SSE2);
#endif