genericopenlibs/liboil/src/math_sse_unroll2.c
branchRCL_3
changeset 56 acd3cd4aaceb
equal deleted inserted replaced
54:4332f0f7be53 56:acd3cd4aaceb
       
     1 /*
       
     2  * Copyright (c) 2005
       
     3  *	Eric Anholt.  All rights reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
       
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
       
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    24  * SUCH DAMAGE.
       
    25  */
       
    26 //Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
       
    27 
       
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include <stdint.h>
#include <liboil/liboilclasses.h>
#include <liboil/liboilfunction.h>
#include <emmintrin.h>
#include <xmmintrin.h>
       
    35 
       
/* Force a 16-byte-aligned stack on entry so SSE spills and aligned stores
 * are safe even when called from code built with a smaller stack ABI. */
#define SSE_FUNCTION __attribute__((force_align_arg_pointer))
       
    37 
       
    38 SSE_FUNCTION static void
       
    39 add_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
       
    40 {
       
    41   /* Initial operations to align the destination pointer */
       
    42   for (; ((long)dest & 15) && (n > 0); n--) {
       
    43     *dest++ = *src1++ + *src2++;
       
    44   }
       
    45   for (; n >= 8; n -= 8) {
       
    46     __m128 xmm0, xmm1;
       
    47     xmm0 = _mm_loadu_ps(src1);
       
    48     xmm1 = _mm_loadu_ps(src2);
       
    49     xmm0 = _mm_add_ps(xmm0, xmm1);
       
    50     _mm_store_ps(dest, xmm0);
       
    51     xmm0 = _mm_loadu_ps(src1 + 4);
       
    52     xmm1 = _mm_loadu_ps(src2 + 4);
       
    53     xmm0 = _mm_add_ps(xmm0, xmm1);
       
    54     _mm_store_ps(dest + 4, xmm0);
       
    55     dest += 8;
       
    56     src1 += 8;
       
    57     src2 += 8;
       
    58   }
       
    59   for (; n > 0; n--) {
       
    60     *dest++ = *src1++ + *src2++;
       
    61   }
       
    62 }
       
    63 OIL_DEFINE_IMPL_FULL (add_f32_sse_unroll2, add_f32, OIL_IMPL_FLAG_SSE);
       
    64 
       
    65 SSE_FUNCTION static void
       
    66 subtract_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
       
    67 {
       
    68   /* Initial operations to align the destination pointer */
       
    69   for (; ((long)dest & 15) && (n > 0); n--) {
       
    70     *dest++ = *src1++ - *src2++;
       
    71   }
       
    72   for (; n >= 8; n -= 8) {
       
    73     __m128 xmm0, xmm1;
       
    74     xmm0 = _mm_loadu_ps(src1);
       
    75     xmm1 = _mm_loadu_ps(src2);
       
    76     xmm0 = _mm_sub_ps(xmm0, xmm1);
       
    77     _mm_store_ps(dest, xmm0);
       
    78     xmm0 = _mm_loadu_ps(src1 + 4);
       
    79     xmm1 = _mm_loadu_ps(src2 + 4);
       
    80     xmm0 = _mm_sub_ps(xmm0, xmm1);
       
    81     _mm_store_ps(dest + 4, xmm0);
       
    82     dest += 8;
       
    83     src1 += 8;
       
    84     src2 += 8;
       
    85   }
       
    86   for (; n > 0; n--) {
       
    87     *dest++ = *src1++ - *src2++;
       
    88   }
       
    89 }
       
    90 OIL_DEFINE_IMPL_FULL (subtract_f32_sse_unroll2, subtract_f32, OIL_IMPL_FLAG_SSE);
       
    91 
       
    92 SSE_FUNCTION static void
       
    93 multiply_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
       
    94 {
       
    95   /* Initial operations to align the destination pointer */
       
    96   for (; ((long)dest & 15) && (n > 0); n--) {
       
    97     *dest++ = *src1++ * *src2++;
       
    98   }
       
    99   for (; n >= 8; n -= 8) {
       
   100     __m128 xmm0, xmm1;
       
   101     xmm0 = _mm_loadu_ps(src1);
       
   102     xmm1 = _mm_loadu_ps(src2);
       
   103     xmm0 = _mm_mul_ps(xmm0, xmm1);
       
   104     _mm_store_ps(dest, xmm0);
       
   105     xmm0 = _mm_loadu_ps(src1 + 4);
       
   106     xmm1 = _mm_loadu_ps(src2 + 4);
       
   107     xmm0 = _mm_mul_ps(xmm0, xmm1);
       
   108     _mm_store_ps(dest + 4, xmm0);
       
   109     dest += 8;
       
   110     src1 += 8;
       
   111     src2 += 8;
       
   112   }
       
   113   for (; n > 0; n--) {
       
   114     *dest++ = *src1++ * *src2++;
       
   115   }
       
   116 }
       
   117 OIL_DEFINE_IMPL_FULL (multiply_f32_sse_unroll2, multiply_f32, OIL_IMPL_FLAG_SSE);
       
   118 
       
   119 SSE_FUNCTION static void
       
   120 divide_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
       
   121 {
       
   122   /* Initial operations to align the destination pointer */
       
   123   for (; ((long)dest & 15) && (n > 0); n--) {
       
   124     *dest++ = *src1++ / *src2++;
       
   125   }
       
   126   for (; n >= 8; n -= 8) {
       
   127     __m128 xmm0, xmm1;
       
   128     xmm0 = _mm_loadu_ps(src1);
       
   129     xmm1 = _mm_loadu_ps(src2);
       
   130     xmm0 = _mm_div_ps(xmm0, xmm1);
       
   131     _mm_store_ps(dest, xmm0);
       
   132     xmm0 = _mm_loadu_ps(src1 + 4);
       
   133     xmm1 = _mm_loadu_ps(src2 + 4);
       
   134     xmm0 = _mm_div_ps(xmm0, xmm1);
       
   135     _mm_store_ps(dest + 4, xmm0);
       
   136     dest += 8;
       
   137     src1 += 8;
       
   138     src2 += 8;
       
   139   }
       
   140   for (; n > 0; n--) {
       
   141     *dest++ = *src1++ / *src2++;
       
   142   }
       
   143 }
       
   144 OIL_DEFINE_IMPL_FULL (divide_f32_sse_unroll2, divide_f32, OIL_IMPL_FLAG_SSE);
       
   145 
       
   146 SSE_FUNCTION static void
       
   147 minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
       
   148 {
       
   149   /* Initial operations to align the destination pointer */
       
   150   for (; ((long)dest & 15) && (n > 0); n--) {
       
   151     *dest++ = *src1 < *src2 ? *src1 : *src2;
       
   152     src1++;
       
   153     src2++;
       
   154   }
       
   155   for (; n >= 8; n -= 8) {
       
   156     __m128 xmm0, xmm1;
       
   157     xmm0 = _mm_loadu_ps(src1);
       
   158     xmm1 = _mm_loadu_ps(src2);
       
   159     xmm0 = _mm_min_ps(xmm0, xmm1);
       
   160     _mm_store_ps(dest, xmm0);
       
   161     xmm0 = _mm_loadu_ps(src1 + 4);
       
   162     xmm1 = _mm_loadu_ps(src2 + 4);
       
   163     xmm0 = _mm_min_ps(xmm0, xmm1);
       
   164     _mm_store_ps(dest + 4, xmm0);
       
   165     dest += 8;
       
   166     src1 += 8;
       
   167     src2 += 8;
       
   168   }
       
   169   for (; n > 0; n--) {
       
   170     *dest++ = *src1 < *src2 ? *src1 : *src2;
       
   171     src1++;
       
   172     src2++;
       
   173   }
       
   174 }
       
   175 OIL_DEFINE_IMPL_FULL (minimum_f32_sse_unroll2, minimum_f32, OIL_IMPL_FLAG_SSE);
       
   176 
       
   177 SSE_FUNCTION static void
       
   178 maximum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
       
   179 {
       
   180   /* Initial operations to align the destination pointer */
       
   181   for (; ((long)dest & 15) && (n > 0); n--) {
       
   182     *dest++ = *src1 > *src2 ? *src1 : *src2;
       
   183     src1++;
       
   184     src2++;
       
   185   }
       
   186   for (; n >= 8; n -= 8) {
       
   187     __m128 xmm0, xmm1;
       
   188     xmm0 = _mm_loadu_ps(src1);
       
   189     xmm1 = _mm_loadu_ps(src2);
       
   190     xmm0 = _mm_max_ps(xmm0, xmm1);
       
   191     _mm_store_ps(dest, xmm0);
       
   192     xmm0 = _mm_loadu_ps(src1 + 4);
       
   193     xmm1 = _mm_loadu_ps(src2 + 4);
       
   194     xmm0 = _mm_max_ps(xmm0, xmm1);
       
   195     _mm_store_ps(dest + 4, xmm0);
       
   196     dest += 8;
       
   197     src1 += 8;
       
   198     src2 += 8;
       
   199   }
       
   200   for (; n > 0; n--) {
       
   201     *dest++ = *src1 > *src2 ? *src1 : *src2;
       
   202     src1++;
       
   203     src2++;
       
   204   }
       
   205 }
       
   206 OIL_DEFINE_IMPL_FULL (maximum_f32_sse_unroll2, maximum_f32, OIL_IMPL_FLAG_SSE);
       
   207 
       
   208 SSE_FUNCTION static void
       
   209 inverse_f32_sse_unroll2 (float *dest, float *src1, int n)
       
   210 {
       
   211   /* Initial operations to align the destination pointer */
       
   212   for (; ((long)dest & 15) && (n > 0); n--) {
       
   213     *dest++ = 1.0 / *src1++;
       
   214   }
       
   215   for (; n >= 8; n -= 8) {
       
   216     __m128 xmm0, xmm1;
       
   217     /* While _mm_rcp_ps sounds promising, the results it gives are rather
       
   218      * different from the 1.0 / src1 reference implementation, so do that.
       
   219      */
       
   220     xmm0 = _mm_set_ps1(1.0);
       
   221     xmm1 = _mm_loadu_ps(src1);
       
   222     xmm0 = _mm_div_ps(xmm0, xmm1);
       
   223     _mm_store_ps(dest, xmm0);
       
   224     xmm0 = _mm_set_ps1(1.0);
       
   225     xmm1 = _mm_loadu_ps(src1 + 4);
       
   226     xmm0 = _mm_div_ps(xmm0, xmm1);
       
   227     _mm_store_ps(dest + 4, xmm0);
       
   228     dest += 8;
       
   229     src1 += 8;
       
   230   }
       
   231   for (; n > 0; n--) {
       
   232     *dest++ = 1.0 / *src1++;
       
   233   }
       
   234 }
       
   235 OIL_DEFINE_IMPL_FULL (inverse_f32_sse_unroll2, inverse_f32, OIL_IMPL_FLAG_SSE);
       
   236 
       
   237 SSE_FUNCTION static void
       
   238 negative_f32_sse_unroll2 (float *dest, float *src1, int n)
       
   239 {
       
   240   /* Initial operations to align the destination pointer */
       
   241   for (; ((long)dest & 15) && (n > 0); n--) {
       
   242     *dest++ = -(*src1++);
       
   243   }
       
   244   for (; n >= 8; n -= 8) {
       
   245     __m128 xmm0, xmm1;
       
   246     xmm0 = _mm_setzero_ps();
       
   247     xmm1 = _mm_loadu_ps(src1);
       
   248     xmm0 = _mm_sub_ps(xmm0, xmm1);
       
   249     _mm_store_ps(dest, xmm0);
       
   250     xmm0 = _mm_setzero_ps();
       
   251     xmm1 = _mm_loadu_ps(src1 + 4);
       
   252     xmm0 = _mm_sub_ps(xmm0, xmm1);
       
   253     _mm_store_ps(dest + 4, xmm0);
       
   254     dest += 8;
       
   255     src1 += 8;
       
   256   }
       
   257   for (; n > 0; n--) {
       
   258     *dest++ = -(*src1++);
       
   259   }
       
   260 }
       
   261 OIL_DEFINE_IMPL_FULL (negative_f32_sse_unroll2, negative_f32, OIL_IMPL_FLAG_SSE);
       
   262 
       
   263 SSE_FUNCTION static void
       
   264 scalaradd_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
       
   265 {
       
   266   __m128 xmm1;
       
   267 
       
   268   /* Initial operations to align the destination pointer */
       
   269   for (; ((long)dest & 15) && (n > 0); n--) {
       
   270     *dest++ = *src1++ + *val;
       
   271   }
       
   272   xmm1 = _mm_load_ps1(val);
       
   273   for (; n >= 8; n -= 8) {
       
   274     __m128 xmm0;
       
   275     xmm0 = _mm_loadu_ps(src1);
       
   276     xmm0 = _mm_add_ps(xmm0, xmm1);
       
   277     _mm_store_ps(dest, xmm0);
       
   278     xmm0 = _mm_loadu_ps(src1 + 4);
       
   279     xmm0 = _mm_add_ps(xmm0, xmm1);
       
   280     _mm_store_ps(dest + 4, xmm0);
       
   281     dest += 8;
       
   282     src1 += 8;
       
   283   }
       
   284   for (; n > 0; n--) {
       
   285     *dest++ = *src1++ + *val;
       
   286   }
       
   287 }
       
   288 OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
       
   289 
       
   290 SSE_FUNCTION static void
       
   291 scalarmultiply_f32_ns_sse_unroll2 (float *dest, float *src1, float *val, int n)
       
   292 {
       
   293   __m128 xmm1;
       
   294 
       
   295   /* Initial operations to align the destination pointer */
       
   296   for (; ((long)dest & 15) && (n > 0); n--) {
       
   297     *dest++ = *src1++ * *val;
       
   298   }
       
   299   xmm1 = _mm_load_ps1(val);
       
   300   for (; n >= 8; n -= 8) {
       
   301     __m128 xmm0;
       
   302     xmm0 = _mm_loadu_ps(src1);
       
   303     xmm0 = _mm_mul_ps(xmm0, xmm1);
       
   304     _mm_store_ps(dest, xmm0);
       
   305     xmm0 = _mm_loadu_ps(src1 + 4);
       
   306     xmm0 = _mm_mul_ps(xmm0, xmm1);
       
   307     _mm_store_ps(dest + 4, xmm0);
       
   308     dest += 8;
       
   309     src1 += 8;
       
   310   }
       
   311   for (; n > 0; n--) {
       
   312     *dest++ = *src1++ * *val;
       
   313   }
       
   314 }
       
   315 OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
       
   316 
       
   317 SSE_FUNCTION static void
       
   318 scalarmultiply_f64_ns_sse2_unroll2 (double *dest, double *src1, double *val, int n)
       
   319 {
       
   320   __m128d xmm1;
       
   321 
       
   322   /* Initial operations to align the destination pointer */
       
   323   for (; ((long)dest & 15) && (n > 0); n--) {
       
   324     *dest++ = *src1++ * *val;
       
   325   }
       
   326   xmm1 = _mm_load_pd1(val);
       
   327   for (; n >= 4; n -= 4) {
       
   328     __m128d xmm0;
       
   329     xmm0 = _mm_loadu_pd(src1);
       
   330     xmm0 = _mm_mul_pd(xmm0, xmm1);
       
   331     _mm_store_pd(dest, xmm0);
       
   332     xmm0 = _mm_loadu_pd(src1 + 2);
       
   333     xmm0 = _mm_mul_pd(xmm0, xmm1);
       
   334     _mm_store_pd(dest + 2, xmm0);
       
   335     dest += 4;
       
   336     src1 += 4;
       
   337   }
       
   338   for (; n > 0; n--) {
       
   339     *dest++ = *src1++ * *val;
       
   340   }
       
   341 }
       
   342 OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
       
   343 
       
   344 
       
   345 
       
   346 #ifdef	__SYMBIAN32__
       
   347  
       
   348 OilFunctionImpl* __oil_function_impl_add_f32_sse_unroll2, add_f32() {
       
   349 		return &_oil_function_impl_add_f32_sse_unroll2, add_f32;
       
   350 }
       
   351 #endif
       
   352 
       
   353 #ifdef	__SYMBIAN32__
       
   354  
       
   355 OilFunctionImpl* __oil_function_impl_subtract_f32_sse_unroll2, subtract_f32() {
       
   356 		return &_oil_function_impl_subtract_f32_sse_unroll2, subtract_f32;
       
   357 }
       
   358 #endif
       
   359 
       
   360 #ifdef	__SYMBIAN32__
       
   361  
       
   362 OilFunctionImpl* __oil_function_impl_multiply_f32_sse_unroll2, multiply_f32() {
       
   363 		return &_oil_function_impl_multiply_f32_sse_unroll2, multiply_f32;
       
   364 }
       
   365 #endif
       
   366 
       
   367 #ifdef	__SYMBIAN32__
       
   368  
       
   369 OilFunctionImpl* __oil_function_impl_divide_f32_sse_unroll2, divide_f32() {
       
   370 		return &_oil_function_impl_divide_f32_sse_unroll2, divide_f32;
       
   371 }
       
   372 #endif
       
   373 
       
   374 #ifdef	__SYMBIAN32__
       
   375  
       
   376 OilFunctionImpl* __oil_function_impl_minimum_f32_sse_unroll2, minimum_f32() {
       
   377 		return &_oil_function_impl_minimum_f32_sse_unroll2, minimum_f32;
       
   378 }
       
   379 #endif
       
   380 
       
   381 #ifdef	__SYMBIAN32__
       
   382  
       
   383 OilFunctionImpl* __oil_function_impl_maximum_f32_sse_unroll2, maximum_f32() {
       
   384 		return &_oil_function_impl_maximum_f32_sse_unroll2, maximum_f32;
       
   385 }
       
   386 #endif
       
   387 
       
   388 #ifdef	__SYMBIAN32__
       
   389  
       
   390 OilFunctionImpl* __oil_function_impl_inverse_f32_sse_unroll2, inverse_f32() {
       
   391 		return &_oil_function_impl_inverse_f32_sse_unroll2, inverse_f32;
       
   392 }
       
   393 #endif
       
   394 
       
   395 #ifdef	__SYMBIAN32__
       
   396  
       
   397 OilFunctionImpl* __oil_function_impl_negative_f32_sse_unroll2, negative_f32() {
       
   398 		return &_oil_function_impl_negative_f32_sse_unroll2, negative_f32;
       
   399 }
       
   400 #endif
       
   401 
       
   402 #ifdef	__SYMBIAN32__
       
   403  
       
   404 OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns() {
       
   405 		return &_oil_function_impl_scalaradd_f32_ns_sse_unroll2, scalaradd_f32_ns;
       
   406 }
       
   407 #endif
       
   408 
       
   409 #ifdef	__SYMBIAN32__
       
   410  
       
   411 OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns() {
       
   412 		return &_oil_function_impl_scalarmultiply_f32_ns_sse_unroll2, scalarmultiply_f32_ns;
       
   413 }
       
   414 #endif
       
   415 
       
   416 #ifdef	__SYMBIAN32__
       
   417  
       
   418 OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns() {
       
   419 		return &_oil_function_impl_scalarmultiply_f64_ns_sse2_unroll2, scalarmultiply_f64_ns;
       
   420 }
       
   421 #endif
       
   422