genericopenlibs/liboil/src/math_sse_unroll2.c
changeset 18 47c74d1534e1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/genericopenlibs/liboil/src/math_sse_unroll2.c	Fri Apr 16 16:46:38 2010 +0300
@@ -0,0 +1,422 @@
+/*
+ * Copyright (c) 2005
+ *	Eric Anholt.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+// Portions Copyright (c) 2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved.
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+#include <liboil/liboilclasses.h>
+#include <liboil/liboilfunction.h>
+#include <emmintrin.h>
+#include <xmmintrin.h>
+
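+/* force_align_arg_pointer tells GCC to realign the stack to 16 bytes on
+ * entry, so any __m128 values spilled to the stack stay aligned even when
+ * the caller only guarantees 4-byte alignment (the common case on 32-bit
+ * x86). */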
+#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
+
+SSE_FUNCTION static void
+add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ + *src2++;
+  }
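+  /* dest is now 16-byte aligned (or n has reached 0), so aligned stores are
+   * safe below; src1 and src2 may still be unaligned, hence the unaligned
+   * loads. */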
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ + *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE);
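+
+/* Usage sketch (for context only, not part of this file): implementations
+ * registered with OIL_DEFINE_IMPL_FULL are normally reached through the
+ * liboil dispatcher rather than called directly, roughly:
+ *
+ *   #include <liboil/liboil.h>
+ *
+ *   float d[64], a[64], b[64];   // filled elsewhere
+ *   oil_init ();                 // selects the fastest impl for this CPU
+ *   oil_add_f32 (d, a, b, 64);   // may dispatch to add_f32_sse_unroll2
+ */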
+
+SSE_FUNCTION static void
+subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ - *src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ - *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ * *src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ * *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ / *src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ / *src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1 < *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
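+  /* Like the scalar (a < b ? a : b) form above and below, _mm_min_ps(a, b)
+   * returns its second argument when a NaN is involved, so the SIMD and
+   * scalar paths agree. */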
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_min_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_min_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1 < *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1 > *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm1 = _mm_loadu_ps(src2);
+    xmm0 = _mm_max_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm1 = _mm_loadu_ps(src2 + 4);
+    xmm0 = _mm_max_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+    src2 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1 > *src2 ? *src1 : *src2;
+    src1++;
+    src2++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = 1.0 / *src1++;
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    /* While _mm_rcp_ps sounds promising, the results it gives are rather
+     * different from the 1.0 / src1 reference implementation, so do that.
+     */
+    xmm0 = _mm_set_ps1(1.0);
+    xmm1 = _mm_loadu_ps(src1);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_set_ps1(1.0);
+    xmm1 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_div_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = 1.0 / *src1++;
+  }
+}
+OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE);
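+
+/* For reference only (an alternative, not used above): _mm_rcp_ps is accurate
+ * to roughly 12 bits, which is why the exact division is kept.  A single
+ * Newton-Raphson step would sharpen the estimate if raw speed ever mattered
+ * more than matching the reference:
+ *
+ *   __m128 x = _mm_rcp_ps (a);                          // rough 1/a
+ *   x = _mm_mul_ps (x, _mm_sub_ps (_mm_set_ps1 (2.0f),
+ *                                  _mm_mul_ps (a, x))); // x *= (2 - a*x)
+ *
+ * even then the result would not match 1.0 / src1 bit-for-bit.
+ */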
+
+SSE_FUNCTION static void
+negative_f32_sse_unroll2 (float *dest, float *src1, int n)
+{
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = -(*src1++);
+  }
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0, xmm1;
+    xmm0 = _mm_setzero_ps();
+    xmm1 = _mm_loadu_ps(src1);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_setzero_ps();
+    xmm1 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_sub_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = -(*src1++);
+  }
+}
+OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
+{
+  __m128 xmm1;
+
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ + *val;
+  }
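+  /* Broadcast *val into all four lanes once, outside the unrolled loop. */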
+  xmm1 = _mm_load_ps1(val);
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_add_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ + *val;
+  }
+}
+OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
+{
+  __m128 xmm1;
+
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ * *val;
+  }
+  xmm1 = _mm_load_ps1(val);
+  for (; n >= 8; n -= 8) {
+    __m128 xmm0;
+    xmm0 = _mm_loadu_ps(src1);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest, xmm0);
+    xmm0 = _mm_loadu_ps(src1 + 4);
+    xmm0 = _mm_mul_ps(xmm0, xmm1);
+    _mm_store_ps(dest + 4, xmm0);
+    dest += 8;
+    src1 += 8;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ * *val;
+  }
+}
+OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
+
+SSE_FUNCTION static void
+scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
+{
+  __m128d xmm1;
+
+  /* Initial operations to align the destination pointer */
+  for (; ((long)dest & 15) && (n > 0); n--) {
+    *dest++ = *src1++ * *val;
+  }
+  xmm1 = _mm_load_pd1(val);
+  for (; n >= 4; n -= 4) {
+    __m128d xmm0;
+    xmm0 = _mm_loadu_pd(src1);
+    xmm0 = _mm_mul_pd(xmm0, xmm1);
+    _mm_store_pd(dest, xmm0);
+    xmm0 = _mm_loadu_pd(src1 + 2);
+    xmm0 = _mm_mul_pd(xmm0, xmm1);
+    _mm_store_pd(dest + 2, xmm0);
+    dest += 4;
+    src1 += 4;
+  }
+  for (; n > 0; n--) {
+    *dest++ = *src1++ * *val;
+  }
+}
+OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
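+
+/* Note: with SSE2 each __m128d holds two doubles, so the unrolled loop above
+ * handles four elements per iteration, versus eight floats per iteration in
+ * the single-precision variants. */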
+
+
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_add_f32_sse_unroll2() {
+		return &_oil_function_impl_add_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_subtract_f32_sse_unroll2() {
+		return &_oil_function_impl_subtract_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_multiply_f32_sse_unroll2() {
+		return &_oil_function_impl_multiply_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_divide_f32_sse_unroll2() {
+		return &_oil_function_impl_divide_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_minimum_f32_sse_unroll2() {
+		return &_oil_function_impl_minimum_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_maximum_f32_sse_unroll2() {
+		return &_oil_function_impl_maximum_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_inverse_f32_sse_unroll2() {
+		return &_oil_function_impl_inverse_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_negative_f32_sse_unroll2() {
+		return &_oil_function_impl_negative_f32_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse_unroll2() {
+		return &_oil_function_impl_scalaradd_f32_ns_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse_unroll2() {
+		return &_oil_function_impl_scalarmultiply_f32_ns_sse_unroll2;
+}
+#endif
+
+#ifdef	__SYMBIAN32__
+ 
+OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2() {
+		return &_oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2;
+}
+#endif
+