genericopenlibs/liboil/src/clamp_sse.c
branchRCL_3
changeset 56 acd3cd4aaceb
equal deleted inserted replaced
54:4332f0f7be53 56:acd3cd4aaceb
       
     1 /*
       
     2  * Copyright (c) 2005
       
     3  *	Eric Anholt.  All rights reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
       
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
       
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    24  * SUCH DAMAGE.
       
    25  */
       
    26 //Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
       
    27 
       
    28 #ifdef HAVE_CONFIG_H
       
    29 #include "config.h"
       
    30 #endif
       
    31 #include "liboil/liboilclasses.h"
       
    32 #include "liboil/liboilfunction.h"
       
    33 #include <emmintrin.h>
       
    34 #include <xmmintrin.h>
       
    35 
       
    36 /* TODO: If we have gcc 4.2 or above, do this. Otherwise, disable all SSE use */
       
    37 #define SSE_FUNCTION __attribute__((force_align_arg_pointer))
       
    38 
       
    39 SSE_FUNCTION static void
       
    40 clamp_u8_sse (uint8_t *dest, uint8_t *src1, int n, uint8_t *src2_1,
       
    41     uint8_t *src3_1)
       
    42 {
       
    43   __m128i xmm1, xmm2;
       
    44   uint8_t min = *src2_1;
       
    45   uint8_t max = *src3_1;
       
    46 
       
    47   /* Initial operations to align the destination pointer */
       
    48   for (; ((long)dest & 15) && (n > 0); n--) {
       
    49     uint8_t x = *src1++;
       
    50     if (x < min)
       
    51       x = min;
       
    52     if (x > max)
       
    53       x = max;
       
    54     *dest++ = x;
       
    55   }
       
    56   xmm1 = _mm_set1_epi8(min);
       
    57   xmm2 = _mm_set1_epi8(max);
       
    58   for (; n >= 16; n -= 16) {
       
    59     __m128i xmm0;
       
    60     xmm0 = _mm_loadu_si128((__m128i *)src1);
       
    61     xmm0 = _mm_max_epu8(xmm0, xmm1);
       
    62     xmm0 = _mm_min_epu8(xmm0, xmm2);
       
    63     _mm_store_si128((__m128i *)dest, xmm0);
       
    64     dest += 16;
       
    65     src1 += 16;
       
    66   }
       
    67   for (; n > 0; n--) {
       
    68     uint8_t x = *src1++;
       
    69     if (x < min)
       
    70       x = min;
       
    71     if (x > max)
       
    72       x = max;
       
    73     *dest++ = x;
       
    74   }
       
    75 }
       
    76 OIL_DEFINE_IMPL_FULL (clamp_u8_sse, clamp_u8, OIL_IMPL_FLAG_SSE2);
       
    77 
       
    78 SSE_FUNCTION static void
       
    79 clamp_s16_sse (int16_t *dest, int16_t *src1, int n, int16_t *src2_1,
       
    80     int16_t *src3_1)
       
    81 {
       
    82   __m128i xmm1, xmm2;
       
    83   int16_t min = *src2_1;
       
    84   int16_t max = *src3_1;
       
    85 
       
    86   /* Initial operations to align the destination pointer */
       
    87   for (; ((long)dest & 15) && (n > 0); n--) {
       
    88     int16_t x = *src1++;
       
    89     if (x < min)
       
    90       x = min;
       
    91     if (x > max)
       
    92       x = max;
       
    93     *dest++ = x;
       
    94   }
       
    95   xmm1 = _mm_set1_epi16(min);
       
    96   xmm2 = _mm_set1_epi16(max);
       
    97   for (; n >= 8; n -= 8) {
       
    98     __m128i xmm0;
       
    99     xmm0 = _mm_loadu_si128((__m128i *)src1);
       
   100     xmm0 = _mm_max_epi16(xmm0, xmm1);
       
   101     xmm0 = _mm_min_epi16(xmm0, xmm2);
       
   102     _mm_store_si128((__m128i *)dest, xmm0);
       
   103     dest += 8;
       
   104     src1 += 8;
       
   105   }
       
   106   for (; n > 0; n--) {
       
   107     int16_t x = *src1++;
       
   108     if (x < min)
       
   109       x = min;
       
   110     if (x > max)
       
   111       x = max;
       
   112     *dest++ = x;
       
   113   }
       
   114 }
       
   115 OIL_DEFINE_IMPL_FULL (clamp_s16_sse, clamp_s16, OIL_IMPL_FLAG_SSE2);
       
   116 
       
   117 SSE_FUNCTION static void
       
   118 clamp_f32_sse (float *dest, const float *src1, int n, const float *src2_1,
       
   119     const float *src3_1)
       
   120 {
       
   121   __m128 xmm1, xmm2;
       
   122   float min = *src2_1;
       
   123   float max = *src3_1;
       
   124 
       
   125   /* Initial operations to align the destination pointer */
       
   126   for (; ((long)dest & 15) && (n > 0); n--) {
       
   127     float x = *src1++;
       
   128     if (x < min)
       
   129       x = min;
       
   130     if (x > max)
       
   131       x = max;
       
   132     *dest++ = x;
       
   133   }
       
   134   xmm1 = _mm_set_ps1(min);
       
   135   xmm2 = _mm_set_ps1(max);
       
   136   for (; n >= 4; n -= 4) {
       
   137     __m128 xmm0;
       
   138     xmm0 = _mm_loadu_ps(src1);
       
   139     xmm0 = _mm_max_ps(xmm0, xmm1);
       
   140     xmm0 = _mm_min_ps(xmm0, xmm2);
       
   141     _mm_store_ps(dest, xmm0);
       
   142     dest += 4;
       
   143     src1 += 4;
       
   144   }
       
   145   for (; n > 0; n--) {
       
   146     float x = *src1++;
       
   147     if (x < min)
       
   148       x = min;
       
   149     if (x > max)
       
   150       x = max;
       
   151     *dest++ = x;
       
   152   }
       
   153 }
       
   154 OIL_DEFINE_IMPL_FULL (clamp_f32_sse, clamp_f32, OIL_IMPL_FLAG_SSE);
       
   155 
       
   156 SSE_FUNCTION static void
       
   157 clamp_f64_sse (double *dest, const double *src1, int n, const double *src2_1,
       
   158     const double *src3_1)
       
   159 {
       
   160   __m128d xmm1, xmm2;
       
   161   double min = *src2_1;
       
   162   double max = *src3_1;
       
   163 
       
   164   /* Initial operations to align the destination pointer */
       
   165   for (; ((long)dest & 15) && (n > 0); n--) {
       
   166     double x = *src1++;
       
   167     if (x < min)
       
   168       x = min;
       
   169     if (x > max)
       
   170       x = max;
       
   171     *dest++ = x;
       
   172   }
       
   173   xmm1 = _mm_set1_pd(min);
       
   174   xmm2 = _mm_set1_pd(max);
       
   175   for (; n >= 2; n -= 2) {
       
   176     __m128d xmm0;
       
   177     xmm0 = _mm_loadu_pd(src1);
       
   178     xmm0 = _mm_max_pd(xmm0, xmm1);
       
   179     xmm0 = _mm_min_pd(xmm0, xmm2);
       
   180     _mm_store_pd(dest, xmm0);
       
   181     dest += 2;
       
   182     src1 += 2;
       
   183   }
       
   184   for (; n > 0; n--) {
       
   185     double x = *src1++;
       
   186     if (x < min)
       
   187       x = min;
       
   188     if (x > max)
       
   189       x = max;
       
   190     *dest++ = x;
       
   191   }
       
   192 }
       
   193 OIL_DEFINE_IMPL_FULL (clamp_f64_sse, clamp_f64,
       
   194     OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
       
   195 
       
   196 SSE_FUNCTION static void
       
   197 clamplow_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
       
   198     const uint8_t *src2_1)
       
   199 {
       
   200   __m128i xmm1;
       
   201   uint8_t min = *src2_1;
       
   202 
       
   203   /* Initial operations to align the destination pointer */
       
   204   for (; ((long)dest & 15) && (n > 0); n--) {
       
   205     uint8_t x = *src1++;
       
   206     if (x < min)
       
   207       x = min;
       
   208     *dest++ = x;
       
   209   }
       
   210   xmm1 = _mm_set1_epi8(min);
       
   211   for (; n >= 16; n -= 16) {
       
   212     __m128i xmm0;
       
   213     xmm0 = _mm_loadu_si128((__m128i *)src1);
       
   214     xmm0 = _mm_max_epu8(xmm0, xmm1);
       
   215     _mm_store_si128((__m128i *)dest, xmm0);
       
   216     dest += 16;
       
   217     src1 += 16;
       
   218   }
       
   219   for (; n > 0; n--) {
       
   220     uint8_t x = *src1++;
       
   221     if (x < min)
       
   222       x = min;
       
   223     *dest++ = x;
       
   224   }
       
   225 }
       
   226 OIL_DEFINE_IMPL_FULL (clamplow_u8_sse, clamplow_u8, OIL_IMPL_FLAG_SSE2);
       
   227 
       
   228 SSE_FUNCTION static void
       
   229 clamplow_s16_sse (int16_t *dest, const int16_t *src1, int n,
       
   230     const int16_t *src2_1)
       
   231 {
       
   232   __m128i xmm1;
       
   233   int16_t min = *src2_1;
       
   234 
       
   235   /* Initial operations to align the destination pointer */
       
   236   for (; ((long)dest & 15) && (n > 0); n--) {
       
   237     int16_t x = *src1++;
       
   238     if (x < min)
       
   239       x = min;
       
   240     *dest++ = x;
       
   241   }
       
   242   xmm1 = _mm_set1_epi16(min);
       
   243   for (; n >= 8; n -= 8) {
       
   244     __m128i xmm0;
       
   245     xmm0 = _mm_loadu_si128((__m128i *)src1);
       
   246     xmm0 = _mm_max_epi16(xmm0, xmm1);
       
   247     _mm_store_si128((__m128i *)dest, xmm0);
       
   248     dest += 8;
       
   249     src1 += 8;
       
   250   }
       
   251   for (; n > 0; n--) {
       
   252     int16_t x = *src1++;
       
   253     if (x < min)
       
   254       x = min;
       
   255     *dest++ = x;
       
   256   }
       
   257 }
       
   258 OIL_DEFINE_IMPL_FULL (clamplow_s16_sse, clamplow_s16, OIL_IMPL_FLAG_SSE2);
       
   259 
       
   260 SSE_FUNCTION static void
       
   261 clamplow_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
       
   262 {
       
   263   __m128 xmm1;
       
   264   float min = *src2_1;
       
   265 
       
   266   /* Initial operations to align the destination pointer */
       
   267   for (; ((long)dest & 15) && (n > 0); n--) {
       
   268     float x = *src1++;
       
   269     if (x < min)
       
   270       x = min;
       
   271     *dest++ = x;
       
   272   }
       
   273   xmm1 = _mm_set_ps1(min);
       
   274   for (; n >= 4; n -= 4) {
       
   275     __m128 xmm0;
       
   276     xmm0 = _mm_loadu_ps(src1);
       
   277     xmm0 = _mm_max_ps(xmm0, xmm1);
       
   278     _mm_store_ps(dest, xmm0);
       
   279     dest += 4;
       
   280     src1 += 4;
       
   281   }
       
   282   for (; n > 0; n--) {
       
   283     float x = *src1++;
       
   284     if (x < min)
       
   285       x = min;
       
   286     *dest++ = x;
       
   287   }
       
   288 }
       
   289 OIL_DEFINE_IMPL_FULL (clamplow_f32_sse, clamplow_f32, OIL_IMPL_FLAG_SSE);
       
   290 
       
   291 SSE_FUNCTION static void
       
   292 clamplow_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
       
   293 {
       
   294   __m128d xmm1;
       
   295   double min = *src2_1;
       
   296 
       
   297   /* Initial operations to align the destination pointer */
       
   298   for (; ((long)dest & 15) && (n > 0); n--) {
       
   299     double x = *src1++;
       
   300     if (x < min)
       
   301       x = min;
       
   302     *dest++ = x;
       
   303   }
       
   304   xmm1 = _mm_set1_pd(min);
       
   305   for (; n >= 2; n -= 2) {
       
   306     __m128d xmm0;
       
   307     xmm0 = _mm_loadu_pd(src1);
       
   308     xmm0 = _mm_max_pd(xmm0, xmm1);
       
   309     _mm_store_pd(dest, xmm0);
       
   310     dest += 2;
       
   311     src1 += 2;
       
   312   }
       
   313   for (; n > 0; n--) {
       
   314     double x = *src1++;
       
   315     if (x < min)
       
   316       x = min;
       
   317     *dest++ = x;
       
   318   }
       
   319 }
       
   320 OIL_DEFINE_IMPL_FULL (clamplow_f64_sse, clamplow_f64,
       
   321     OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
       
   322 
       
   323 SSE_FUNCTION static void
       
   324 clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
       
   325     const uint8_t *src2_1)
       
   326 {
       
   327   __m128i xmm1;
       
   328   uint8_t max = *src2_1;
       
   329 
       
   330   /* Initial operations to align the destination pointer */
       
   331   for (; ((long)dest & 15) && (n > 0); n--) {
       
   332     uint8_t x = *src1++;
       
   333     if (x > max)
       
   334       x = max;
       
   335     *dest++ = x;
       
   336   }
       
   337   xmm1 = _mm_set1_epi8(max);
       
   338   for (; n >= 16; n -= 16) {
       
   339     __m128i xmm0;
       
   340     xmm0 = _mm_loadu_si128((__m128i *)src1);
       
   341     xmm0 = _mm_min_epu8(xmm0, xmm1);
       
   342     _mm_store_si128((__m128i *)dest, xmm0);
       
   343     dest += 16;
       
   344     src1 += 16;
       
   345   }
       
   346   for (; n > 0; n--) {
       
   347     uint8_t x = *src1++;
       
   348     if (x > max)
       
   349       x = max;
       
   350     *dest++ = x;
       
   351   }
       
   352 }
       
   353 OIL_DEFINE_IMPL_FULL (clamphigh_u8_sse, clamphigh_u8, OIL_IMPL_FLAG_SSE2);
       
   354 
       
   355 SSE_FUNCTION static void
       
   356 clamphigh_s16_sse (int16_t *dest, const int16_t *src1, int n,
       
   357     const int16_t *src2_1)
       
   358 {
       
   359   __m128i xmm1;
       
   360   int16_t max = *src2_1;
       
   361 
       
   362   /* Initial operations to align the destination pointer */
       
   363   for (; ((long)dest & 15) && (n > 0); n--) {
       
   364     int16_t x = *src1++;
       
   365     if (x > max)
       
   366       x = max;
       
   367     *dest++ = x;
       
   368   }
       
   369   xmm1 = _mm_set1_epi16(max);
       
   370   for (; n >= 8; n -= 8) {
       
   371     __m128i xmm0;
       
   372     xmm0 = _mm_loadu_si128((__m128i *)src1);
       
   373     xmm0 = _mm_min_epi16(xmm0, xmm1);
       
   374     _mm_store_si128((__m128i *)dest, xmm0);
       
   375     dest += 8;
       
   376     src1 += 8;
       
   377   }
       
   378   for (; n > 0; n--) {
       
   379     int16_t x = *src1++;
       
   380     if (x > max)
       
   381       x = max;
       
   382     *dest++ = x;
       
   383   }
       
   384 }
       
   385 OIL_DEFINE_IMPL_FULL (clamphigh_s16_sse, clamphigh_s16, OIL_IMPL_FLAG_SSE2);
       
   386 
       
   387 SSE_FUNCTION static void
       
   388 clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
       
   389 {
       
   390   __m128 xmm1;
       
   391   float max = *src2_1;
       
   392 
       
   393   /* Initial operations to align the destination pointer */
       
   394   for (; ((long)dest & 15) && (n > 0); n--) {
       
   395     float x = *src1++;
       
   396     if (x > max)
       
   397       x = max;
       
   398     *dest++ = x;
       
   399   }
       
   400   xmm1 = _mm_set_ps1(max);
       
   401   for (; n >= 4; n -= 4) {
       
   402     __m128 xmm0;
       
   403     xmm0 = _mm_loadu_ps(src1);
       
   404     xmm0 = _mm_min_ps(xmm0, xmm1);
       
   405     _mm_store_ps(dest, xmm0);
       
   406     dest += 4;
       
   407     src1 += 4;
       
   408   }
       
   409   for (; n > 0; n--) {
       
   410     float x = *src1++;
       
   411     if (x > max)
       
   412       x = max;
       
   413     *dest++ = x;
       
   414   }
       
   415 }
       
   416 OIL_DEFINE_IMPL_FULL (clamphigh_f32_sse, clamphigh_f32, OIL_IMPL_FLAG_SSE);
       
   417 
       
   418 SSE_FUNCTION static void
       
   419 clamphigh_f64_sse (double *dest, const double *src1, int n, const double *src2_1)
       
   420 {
       
   421   __m128d xmm1;
       
   422   double max = *src2_1;
       
   423 
       
   424   /* Initial operations to align the destination pointer */
       
   425   for (; ((long)dest & 15) && (n > 0); n--) {
       
   426     double x = *src1++;
       
   427     if (x > max)
       
   428       x = max;
       
   429     *dest++ = x;
       
   430   }
       
   431   xmm1 = _mm_set1_pd(max);
       
   432   for (; n >= 2; n -= 2) {
       
   433     __m128d xmm0;
       
   434     xmm0 = _mm_loadu_pd(src1);
       
   435     xmm0 = _mm_min_pd(xmm0, xmm1);
       
   436     _mm_store_pd(dest, xmm0);
       
   437     dest += 2;
       
   438     src1 += 2;
       
   439   }
       
   440   for (; n > 0; n--) {
       
   441     double x = *src1++;
       
   442     if (x > max)
       
   443       x = max;
       
   444     *dest++ = x;
       
   445   }
       
   446 }
       
   447 OIL_DEFINE_IMPL_FULL (clamphigh_f64_sse, clamphigh_f64,
       
   448     OIL_IMPL_FLAG_SSE | OIL_IMPL_FLAG_SSE2);
       
   449 
       
   450 
       
   451 #ifdef	__SYMBIAN32__
       
   452  
       
   453 OilFunctionImpl* __oil_function_impl_clamp_u8_sse, clamp_u8() {
       
   454 		return &_oil_function_impl_clamp_u8_sse, clamp_u8;
       
   455 }
       
   456 #endif
       
   457 
       
   458 #ifdef	__SYMBIAN32__
       
   459  
       
   460 OilFunctionImpl* __oil_function_impl_clamp_s16_sse, clamp_s16() {
       
   461 		return &_oil_function_impl_clamp_s16_sse, clamp_s16;
       
   462 }
       
   463 #endif
       
   464 
       
   465 #ifdef	__SYMBIAN32__
       
   466  
       
   467 OilFunctionImpl* __oil_function_impl_clamp_f32_sse, clamp_f32() {
       
   468 		return &_oil_function_impl_clamp_f32_sse, clamp_f32;
       
   469 }
       
   470 #endif
       
   471 
       
   472 #ifdef	__SYMBIAN32__
       
   473  
       
   474 OilFunctionImpl* __oil_function_impl_clamp_f64_sse, clamp_f64() {
       
   475 		return &_oil_function_impl_clamp_f64_sse, clamp_f64;
       
   476 }
       
   477 #endif
       
   478 
       
   479 #ifdef	__SYMBIAN32__
       
   480  
       
   481 OilFunctionImpl* __oil_function_impl_clamplow_u8_sse, clamplow_u8() {
       
   482 		return &_oil_function_impl_clamplow_u8_sse, clamplow_u8;
       
   483 }
       
   484 #endif
       
   485 
       
   486 #ifdef	__SYMBIAN32__
       
   487  
       
   488 OilFunctionImpl* __oil_function_impl_clamplow_s16_sse, clamplow_s16() {
       
   489 		return &_oil_function_impl_clamplow_s16_sse, clamplow_s16;
       
   490 }
       
   491 #endif
       
   492 
       
   493 #ifdef	__SYMBIAN32__
       
   494  
       
   495 OilFunctionImpl* __oil_function_impl_clamplow_f32_sse, clamplow_f32() {
       
   496 		return &_oil_function_impl_clamplow_f32_sse, clamplow_f32;
       
   497 }
       
   498 #endif
       
   499 
       
   500 #ifdef	__SYMBIAN32__
       
   501  
       
   502 OilFunctionImpl* __oil_function_impl_clamplow_f64_sse, clamplow_f64() {
       
   503 		return &_oil_function_impl_clamplow_f64_sse, clamplow_f64;
       
   504 }
       
   505 #endif
       
   506 
       
   507 #ifdef	__SYMBIAN32__
       
   508  
       
   509 OilFunctionImpl* __oil_function_impl_clamphigh_u8_sse, clamphigh_u8() {
       
   510 		return &_oil_function_impl_clamphigh_u8_sse, clamphigh_u8;
       
   511 }
       
   512 #endif
       
   513 
       
   514 #ifdef	__SYMBIAN32__
       
   515  
       
   516 OilFunctionImpl* __oil_function_impl_clamphigh_s16_sse, clamphigh_s16() {
       
   517 		return &_oil_function_impl_clamphigh_s16_sse, clamphigh_s16;
       
   518 }
       
   519 #endif
       
   520 
       
   521 #ifdef	__SYMBIAN32__
       
   522  
       
   523 OilFunctionImpl* __oil_function_impl_clamphigh_f32_sse, clamphigh_f32() {
       
   524 		return &_oil_function_impl_clamphigh_f32_sse, clamphigh_f32;
       
   525 }
       
   526 #endif
       
   527 
       
   528 #ifdef	__SYMBIAN32__
       
   529  
       
   530 OilFunctionImpl* __oil_function_impl_clamphigh_f64_sse, clamphigh_f64() {
       
   531 		return &_oil_function_impl_clamphigh_f64_sse, clamphigh_f64;
       
   532 }
       
   533 #endif
       
   534