genericopenlibs/liboil/src/math_sse.c
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 /*
       
     2  * Copyright (c) 2005
       
     3  *	Eric Anholt.  All rights reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
       
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
       
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    24  * SUCH DAMAGE.
       
    25  */
       
    26 //Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
       
    27 
       
    28 #ifdef HAVE_CONFIG_H
       
    29 #include "config.h"
       
    30 #endif
       
    31 #include <liboilclasses.h>
       
    32 #include <liboilfunction.h>
       
    33 #include <emmintrin.h>
       
    34 #include <xmmintrin.h>
       
    35 
       
    36 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
       
    37 
       
    38 SSE_FUNCTION static void
       
    39 add_f32_sse (float *dest, float *src1, float *src2, int n)
       
    40 {
       
    41   /* Initial operations to align the destination pointer */
       
    42   for (; ((long)dest & 15) && (n > 0); n--) {
       
    43     *dest++ = *src1++ + *src2++;
       
    44   }
       
    45   for (; n >= 4; n -= 4) {
       
    46     __m128 xmm0, xmm1;
       
    47     xmm0 = _mm_loadu_ps(src1);
       
    48     xmm1 = _mm_loadu_ps(src2);
       
    49     xmm0 = _mm_add_ps(xmm0, xmm1);
       
    50     _mm_store_ps(dest, xmm0);
       
    51     dest += 4;
       
    52     src1 += 4;
       
    53     src2 += 4;
       
    54   }
       
    55   for (; n > 0; n--) {
       
    56     *dest++ = *src1++ + *src2++;
       
    57   }
       
    58 }
       
    59 OIL_DEFINE_IMPL_FULL (add_f32_sse, add_f32, OIL_IMPL_FLAG_SSE);
       
    60 
       
    61 SSE_FUNCTION static void
       
    62 add_f64_sse2 (double *dest, double *src1, double *src2, int n)
       
    63 {
       
    64   __m128d xmm0, xmm1;
       
    65   while (((long)dest & 15) && (0 < n)) {
       
    66     *dest++ = *src1++ + *src2++;
       
    67     n--;
       
    68   }
       
    69   while (1 < n) {
       
    70     xmm0 = _mm_loadu_pd(src1);
       
    71     xmm1 = _mm_loadu_pd(src2);
       
    72     xmm0 = _mm_add_pd(xmm0, xmm1);
       
    73     _mm_store_pd(dest, xmm0);
       
    74     dest += 2;
       
    75     src1 += 2;
       
    76     src2 += 2;
       
    77     n -= 2;
       
    78   }
       
    79   while (0 < n) {
       
    80     *dest++ = *src1++ + *src2++;
       
    81     n--;
       
    82   }
       
    83 }
       
    84 OIL_DEFINE_IMPL_FULL (add_f64_sse2, add_f64, OIL_IMPL_FLAG_SSE2);
       
    85 
       
    86 SSE_FUNCTION static void
       
    87 add_f64_sse2_unroll (double *dest, double *src1, double *src2, int n)
       
    88 {
       
    89   __m128d xmm0, xmm1;
       
    90   while (((long)dest & 15) && (0 < n)) {
       
    91     *dest++ = *src1++ + *src2++;
       
    92     n--;
       
    93   }
       
    94   while (3 < n) {
       
    95     xmm0 = _mm_loadu_pd(src1);
       
    96     xmm1 = _mm_loadu_pd(src2);
       
    97     xmm0 = _mm_add_pd(xmm0, xmm1);
       
    98     _mm_store_pd(dest, xmm0);
       
    99 
       
   100     xmm0 = _mm_loadu_pd(src1+2);
       
   101     xmm1 = _mm_loadu_pd(src2+2);
       
   102     xmm0 = _mm_add_pd(xmm0, xmm1);
       
   103     _mm_store_pd(dest+2, xmm0);
       
   104     dest += 4;
       
   105     src1 += 4;
       
   106     src2 += 4;
       
   107     n -= 4;
       
   108   }
       
   109   while (1 < n) {
       
   110     xmm0 = _mm_loadu_pd(src1);
       
   111     xmm1 = _mm_loadu_pd(src2);
       
   112     xmm0 = _mm_add_pd(xmm0, xmm1);
       
   113     _mm_store_pd(dest, xmm0);
       
   114     dest += 2;
       
   115     src1 += 2;
       
   116     src2 += 2;
       
   117     n -= 2;
       
   118   }
       
   119   while (0 < n) {
       
   120     *dest++ = *src1++ + *src2++;
       
   121     n--;
       
   122   }
       
   123 }
       
   124 OIL_DEFINE_IMPL_FULL (add_f64_sse2_unroll, add_f64, OIL_IMPL_FLAG_SSE2);
       
   125 
       
   126 SSE_FUNCTION static void
       
   127 subtract_f32_sse (float *dest, float *src1, float *src2, int n)
       
   128 {
       
   129   /* Initial operations to align the destination pointer */
       
   130   for (; ((long)dest & 15) && (n > 0); n--) {
       
   131     *dest++ = *src1++ - *src2++;
       
   132   }
       
   133   for (; n >= 4; n -= 4) {
       
   134     __m128 xmm0, xmm1;
       
   135     xmm0 = _mm_loadu_ps(src1);
       
   136     xmm1 = _mm_loadu_ps(src2);
       
   137     xmm0 = _mm_sub_ps(xmm0, xmm1);
       
   138     _mm_store_ps(dest, xmm0);
       
   139     dest += 4;
       
   140     src1 += 4;
       
   141     src2 += 4;
       
   142   }
       
   143   for (; n > 0; n--) {
       
   144     *dest++ = *src1++ - *src2++;
       
   145   }
       
   146 }
       
   147 OIL_DEFINE_IMPL_FULL (subtract_f32_sse, subtract_f32, OIL_IMPL_FLAG_SSE);
       
   148 
       
   149 SSE_FUNCTION static void
       
   150 multiply_f32_sse (float *dest, float *src1, float *src2, int n)
       
   151 {
       
   152   /* Initial operations to align the destination pointer */
       
   153   for (; ((long)dest & 15) && (n > 0); n--) {
       
   154     *dest++ = *src1++ * *src2++;
       
   155   }
       
   156   for (; n >= 4; n -= 4) {
       
   157     __m128 xmm0, xmm1;
       
   158     xmm0 = _mm_loadu_ps(src1);
       
   159     xmm1 = _mm_loadu_ps(src2);
       
   160     xmm0 = _mm_mul_ps(xmm0, xmm1);
       
   161     _mm_store_ps(dest, xmm0);
       
   162     dest += 4;
       
   163     src1 += 4;
       
   164     src2 += 4;
       
   165   }
       
   166   for (; n > 0; n--) {
       
   167     *dest++ = *src1++ * *src2++;
       
   168   }
       
   169 }
       
   170 OIL_DEFINE_IMPL_FULL (multiply_f32_sse, multiply_f32, OIL_IMPL_FLAG_SSE);
       
   171 
       
   172 SSE_FUNCTION static void
       
   173 divide_f32_sse (float *dest, float *src1, float *src2, int n)
       
   174 {
       
   175   /* Initial operations to align the destination pointer */
       
   176   for (; ((long)dest & 15) && (n > 0); n--) {
       
   177     *dest++ = *src1++ / *src2++;
       
   178   }
       
   179   for (; n >= 4; n -= 4) {
       
   180     __m128 xmm0, xmm1;
       
   181     xmm0 = _mm_loadu_ps(src1);
       
   182     xmm1 = _mm_loadu_ps(src2);
       
   183     xmm0 = _mm_div_ps(xmm0, xmm1);
       
   184     _mm_store_ps(dest, xmm0);
       
   185     dest += 4;
       
   186     src1 += 4;
       
   187     src2 += 4;
       
   188   }
       
   189   for (; n > 0; n--) {
       
   190     *dest++ = *src1++ / *src2++;
       
   191   }
       
   192 }
       
   193 OIL_DEFINE_IMPL_FULL (divide_f32_sse, divide_f32, OIL_IMPL_FLAG_SSE);
       
   194 
       
   195 SSE_FUNCTION static void
       
   196 minimum_f32_sse (float *dest, float *src1, float *src2, int n)
       
   197 {
       
   198   /* Initial operations to align the destination pointer */
       
   199   for (; ((long)dest & 15) && (n > 0); n--) {
       
   200     *dest++ = *src1 < *src2 ? *src1 : *src2;
       
   201     src1++;
       
   202     src2++;
       
   203   }
       
   204   for (; n >= 4; n -= 4) {
       
   205     __m128 xmm0, xmm1;
       
   206     xmm0 = _mm_loadu_ps(src1);
       
   207     xmm1 = _mm_loadu_ps(src2);
       
   208     xmm0 = _mm_min_ps(xmm0, xmm1);
       
   209     _mm_store_ps(dest, xmm0);
       
   210     dest += 4;
       
   211     src1 += 4;
       
   212     src2 += 4;
       
   213   }
       
   214   for (; n > 0; n--) {
       
   215     *dest++ = *src1 < *src2 ? *src1 : *src2;
       
   216     src1++;
       
   217     src2++;
       
   218   }
       
   219 }
       
   220 OIL_DEFINE_IMPL_FULL (minimum_f32_sse, minimum_f32, OIL_IMPL_FLAG_SSE);
       
   221 
       
   222 SSE_FUNCTION static void
       
   223 maximum_f32_sse (float *dest, float *src1, float *src2, int n)
       
   224 {
       
   225   /* Initial operations to align the destination pointer */
       
   226   for (; ((long)dest & 15) && (n > 0); n--) {
       
   227     *dest++ = *src1 > *src2 ? *src1 : *src2;
       
   228     src1++;
       
   229     src2++;
       
   230   }
       
   231   for (; n >= 4; n -= 4) {
       
   232     __m128 xmm0, xmm1;
       
   233     xmm0 = _mm_loadu_ps(src1);
       
   234     xmm1 = _mm_loadu_ps(src2);
       
   235     xmm0 = _mm_max_ps(xmm0, xmm1);
       
   236     _mm_store_ps(dest, xmm0);
       
   237     dest += 4;
       
   238     src1 += 4;
       
   239     src2 += 4;
       
   240   }
       
   241   for (; n > 0; n--) {
       
   242     *dest++ = *src1 > *src2 ? *src1 : *src2;
       
   243     src1++;
       
   244     src2++;
       
   245   }
       
   246 }
       
   247 OIL_DEFINE_IMPL_FULL (maximum_f32_sse, maximum_f32, OIL_IMPL_FLAG_SSE);
       
   248 
       
   249 SSE_FUNCTION static void
       
   250 inverse_f32_sse (float *dest, float *src1, int n)
       
   251 {
       
   252   /* Initial operations to align the destination pointer */
       
   253   for (; ((long)dest & 15) && (n > 0); n--) {
       
   254     *dest++ = 1.0 / *src1++;
       
   255   }
       
   256   for (; n >= 4; n -= 4) {
       
   257     __m128 xmm0, xmm1;
       
   258     /* While _mm_rcp_ps sounds promising, the results it gives are rather
       
   259      * different from the 1.0 / src1 reference implementation, so do that.
       
   260      */
       
   261     xmm0 = _mm_set_ps1(1.0);
       
   262     xmm1 = _mm_loadu_ps(src1);
       
   263     xmm0 = _mm_div_ps(xmm0, xmm1);
       
   264     _mm_store_ps(dest, xmm0);
       
   265     dest += 4;
       
   266     src1 += 4;
       
   267   }
       
   268   for (; n > 0; n--) {
       
   269     *dest++ = 1.0 / *src1++;
       
   270   }
       
   271 }
       
   272 OIL_DEFINE_IMPL_FULL (inverse_f32_sse, inverse_f32, OIL_IMPL_FLAG_SSE);
       
   273 
       
   274 SSE_FUNCTION static void
       
   275 negative_f32_sse (float *dest, float *src1, int n)
       
   276 {
       
   277   /* Initial operations to align the destination pointer */
       
   278   for (; ((long)dest & 15) && (n > 0); n--) {
       
   279     *dest++ = -(*src1++);
       
   280   }
       
   281   for (; n >= 4; n -= 4) {
       
   282     __m128 xmm0, xmm1;
       
   283     xmm0 = _mm_setzero_ps();
       
   284     xmm1 = _mm_loadu_ps(src1);
       
   285     xmm0 = _mm_sub_ps(xmm0, xmm1);
       
   286     _mm_store_ps(dest, xmm0);
       
   287     dest += 4;
       
   288     src1 += 4;
       
   289   }
       
   290   for (; n > 0; n--) {
       
   291     *dest++ = -(*src1++);
       
   292   }
       
   293 }
       
   294 OIL_DEFINE_IMPL_FULL (negative_f32_sse, negative_f32, OIL_IMPL_FLAG_SSE);
       
   295 
       
   296 SSE_FUNCTION static void
       
   297 scalaradd_f32_ns_sse (float *dest, float *src1, float *val, int n)
       
   298 {
       
   299   __m128 xmm1;
       
   300 
       
   301   /* Initial operations to align the destination pointer */
       
   302   for (; ((long)dest & 15) && (n > 0); n--) {
       
   303     *dest++ = *src1++ + *val;
       
   304   }
       
   305   xmm1 = _mm_load_ps1(val);
       
   306   for (; n >= 4; n -= 4) {
       
   307     __m128 xmm0;
       
   308     xmm0 = _mm_loadu_ps(src1);
       
   309     xmm0 = _mm_add_ps(xmm0, xmm1);
       
   310     _mm_store_ps(dest, xmm0);
       
   311     dest += 4;
       
   312     src1 += 4;
       
   313   }
       
   314   for (; n > 0; n--) {
       
   315     *dest++ = *src1++ + *val;
       
   316   }
       
   317 }
       
   318 OIL_DEFINE_IMPL_FULL (scalaradd_f32_ns_sse, scalaradd_f32_ns, OIL_IMPL_FLAG_SSE);
       
   319 
       
   320 SSE_FUNCTION static void
       
   321 scalarmultiply_f32_ns_sse (float *dest, float *src1, float *val, int n)
       
   322 {
       
   323   __m128 xmm1;
       
   324 
       
   325   /* Initial operations to align the destination pointer */
       
   326   for (; ((long)dest & 15) && (n > 0); n--) {
       
   327     *dest++ = *src1++ * *val;
       
   328   }
       
   329   xmm1 = _mm_load_ps1(val);
       
   330   for (; n >= 4; n -= 4) {
       
   331     __m128 xmm0;
       
   332     xmm0 = _mm_loadu_ps(src1);
       
   333     xmm0 = _mm_mul_ps(xmm0, xmm1);
       
   334     _mm_store_ps(dest, xmm0);
       
   335     dest += 4;
       
   336     src1 += 4;
       
   337   }
       
   338   for (; n > 0; n--) {
       
   339     *dest++ = *src1++ * *val;
       
   340   }
       
   341 }
       
   342 OIL_DEFINE_IMPL_FULL (scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns, OIL_IMPL_FLAG_SSE);
       
   343 
       
   344 SSE_FUNCTION static void
       
   345 scalarmultiply_f64_ns_sse2 (double *dest, double *src1, double *val, int n)
       
   346 {
       
   347   __m128d xmm1;
       
   348 
       
   349   /* Initial operations to align the destination pointer */
       
   350   for (; ((long)dest & 15) && (n > 0); n--) {
       
   351     *dest++ = *src1++ * *val;
       
   352   }
       
   353   xmm1 = _mm_load_pd1(val);
       
   354   for (; n >= 2; n -= 2) {
       
   355     __m128d xmm0;
       
   356     xmm0 = _mm_loadu_pd(src1);
       
   357     xmm0 = _mm_mul_pd(xmm0, xmm1);
       
   358     _mm_store_pd(dest, xmm0);
       
   359     dest += 2;
       
   360     src1 += 2;
       
   361   }
       
   362   for (; n > 0; n--) {
       
   363     *dest++ = *src1++ * *val;
       
   364   }
       
   365 }
       
   366 OIL_DEFINE_IMPL_FULL (scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns, OIL_IMPL_FLAG_SSE2);
       
   367 
       
   368 
       
   369 
       
   370 #ifdef	__SYMBIAN32__
       
   371  
       
   372 OilFunctionImpl* __oil_function_impl_add_f32_sse, add_f32() {
       
   373 		return &_oil_function_impl_add_f32_sse, add_f32;
       
   374 }
       
   375 #endif
       
   376 
       
   377 #ifdef	__SYMBIAN32__
       
   378  
       
   379 OilFunctionImpl* __oil_function_impl_add_f64_sse2, add_f64() {
       
   380 		return &_oil_function_impl_add_f64_sse2, add_f64;
       
   381 }
       
   382 #endif
       
   383 
       
   384 #ifdef	__SYMBIAN32__
       
   385  
       
   386 OilFunctionImpl* __oil_function_impl_add_f64_sse2_unroll, add_f64() {
       
   387 		return &_oil_function_impl_add_f64_sse2_unroll, add_f64;
       
   388 }
       
   389 #endif
       
   390 
       
   391 #ifdef	__SYMBIAN32__
       
   392  
       
   393 OilFunctionImpl* __oil_function_impl_subtract_f32_sse, subtract_f32() {
       
   394 		return &_oil_function_impl_subtract_f32_sse, subtract_f32;
       
   395 }
       
   396 #endif
       
   397 
       
   398 #ifdef	__SYMBIAN32__
       
   399  
       
   400 OilFunctionImpl* __oil_function_impl_multiply_f32_sse, multiply_f32() {
       
   401 		return &_oil_function_impl_multiply_f32_sse, multiply_f32;
       
   402 }
       
   403 #endif
       
   404 
       
   405 #ifdef	__SYMBIAN32__
       
   406  
       
   407 OilFunctionImpl* __oil_function_impl_divide_f32_sse, divide_f32() {
       
   408 		return &_oil_function_impl_divide_f32_sse, divide_f32;
       
   409 }
       
   410 #endif
       
   411 
       
   412 #ifdef	__SYMBIAN32__
       
   413  
       
   414 OilFunctionImpl* __oil_function_impl_minimum_f32_sse, minimum_f32() {
       
   415 		return &_oil_function_impl_minimum_f32_sse, minimum_f32;
       
   416 }
       
   417 #endif
       
   418 
       
   419 #ifdef	__SYMBIAN32__
       
   420  
       
   421 OilFunctionImpl* __oil_function_impl_maximum_f32_sse, maximum_f32() {
       
   422 		return &_oil_function_impl_maximum_f32_sse, maximum_f32;
       
   423 }
       
   424 #endif
       
   425 
       
   426 #ifdef	__SYMBIAN32__
       
   427  
       
   428 OilFunctionImpl* __oil_function_impl_inverse_f32_sse, inverse_f32() {
       
   429 		return &_oil_function_impl_inverse_f32_sse, inverse_f32;
       
   430 }
       
   431 #endif
       
   432 
       
   433 #ifdef	__SYMBIAN32__
       
   434  
       
   435 OilFunctionImpl* __oil_function_impl_negative_f32_sse, negative_f32() {
       
   436 		return &_oil_function_impl_negative_f32_sse, negative_f32;
       
   437 }
       
   438 #endif
       
   439 
       
   440 #ifdef	__SYMBIAN32__
       
   441  
       
   442 OilFunctionImpl* __oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns() {
       
   443 		return &_oil_function_impl_scalaradd_f32_ns_sse, scalaradd_f32_ns;
       
   444 }
       
   445 #endif
       
   446 
       
   447 #ifdef	__SYMBIAN32__
       
   448  
       
   449 OilFunctionImpl* __oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns() {
       
   450 		return &_oil_function_impl_scalarmultiply_f32_ns_sse, scalarmultiply_f32_ns;
       
   451 }
       
   452 #endif
       
   453 
       
   454 #ifdef	__SYMBIAN32__
       
   455  
       
   456 OilFunctionImpl* __oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns() {
       
   457 		return &_oil_function_impl_scalarmultiply_f64_ns_sse2, scalarmultiply_f64_ns;
       
   458 }
       
   459 #endif
       
   460