genericopenlibs/liboil/src/i386/abs_i386.c
changeset 31 ce057bb09d0b
parent 18 47c74d1534e1
equal deleted inserted replaced
30:e20de85af2ee 31:ce057bb09d0b
       
     1 /*
       
     2  * LIBOIL - Library of Optimized Inner Loops
       
     3  * Copyright (c) 2003,2004 David A. Schleef <ds@schleef.org>
       
     4  * All rights reserved.
       
     5  *
       
     6  * Redistribution and use in source and binary forms, with or without
       
     7  * modification, are permitted provided that the following conditions
       
     8  * are met:
       
     9  * 1. Redistributions of source code must retain the above copyright
       
    10  *    notice, this list of conditions and the following disclaimer.
       
    11  * 2. Redistributions in binary form must reproduce the above copyright
       
    12  *    notice, this list of conditions and the following disclaimer in the
       
    13  *    documentation and/or other materials provided with the distribution.
       
    14  * 
       
    15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
       
    16  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
       
    17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
       
    19  * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
       
    20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
       
    21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
       
    23  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
       
    24  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
       
    25  * POSSIBILITY OF SUCH DAMAGE.
       
    26  */
       
    27 //Portions Copyright (c)  2008-2009 Nokia Corporation and/or its subsidiary(-ies). All rights reserved. 
       
    28 
       
    29 #ifdef HAVE_CONFIG_H
       
    30 #include "config.h"
       
    31 #endif
       
    32 
       
    33 #include <liboil/liboilfunction.h>
       
    34 #include "liboil/simdpack/simdpack.h"
       
    35 
       
/* Branch-based absolute value.  Evaluates x more than once, so callers
 * must not pass expressions with side effects. */
#define ABS(x) ((x)>0 ? (x) : -(x))
       
    37 
       
#if 0
/*
 * abs_u16_s16: scalar i386 implementation using CMOV.
 * Per element: sign-extend the 16-bit source into %eax, build its
 * negation in %edx, and cmovle replaces the value with the negation
 * when value <= -1, giving a branch-free absolute value.
 * NOTE(review): src/dest advance by a fixed 2 bytes; the dstr/sstr
 * stride parameters are ignored — presumably one reason this variant
 * is compiled out.  The loop is do-while shaped, so it processes one
 * element even when n == 0 on entry.
 */
static void
abs_u16_s16_i386asm (uint16_t * dest, int dstr, int16_t * src, int sstr, int n)
{
  __asm__ __volatile__ ("\n"
      "	.p2align 4,,15			\n"
      "1:  movswl	(%0), %%eax		\n"
      "    addl	$2, %0			\n"
      "    movl	%%eax, %%edx		\n"
      "    negl	%%edx			\n"
      "    cmpl	$-1, %%eax		\n"
      "    cmovle	%%edx, %%eax		\n"
      "    movw	%%ax, (%1)		\n"
      "    addl	$2, %1			\n"
      "    decl	%2			\n"
      "    testl	%2, %2			\n"
      "    jg	1b			\n":"+r" (src), "+r" (dest), "+r" (n)
      ::"eax", "edx");
}

/* Register with liboil; requires the CMOV CPU feature flag. */
OIL_DEFINE_IMPL_FULL (abs_u16_s16_i386asm, abs_u16_s16, OIL_IMPL_FLAG_CMOV);
#endif
       
    60 
       
#if 0
/*
 * abs_u16_s16_i386asm after scheduling by the "uberopt" tool: the same
 * CMOV absolute-value loop with the instruction order rearranged (the
 * "UBER n: m" comments record each instruction's index and the indices
 * of the instructions it depends on).  Like the original, it advances
 * src/dest by a fixed 2 bytes and ignores the dstr/sstr strides.
 */
static void
abs_u16_s16_i386asm_uber4 (uint16_t * dest, int dstr, int16_t * src,
    int sstr, int n)
{
  __asm__ __volatile__ ("\n"
      "	.p2align 4,,15			\n"
      "1:                               \n"
      "    movswl	(%0), %%eax	\n" /* UBER 0:     */
      "    addl	$2, %0			\n" /* UBER 1: 0   */
      "    movl	%%eax, %%edx		\n" /* UBER 2: 0   */
      "    decl	%2			\n" /* UBER 7:     */
      "    negl	%%edx			\n" /* UBER 3: 2   */
      "    cmpl	$-1, %%eax ; cmovle %%edx, %%eax \n" /* UBER 4: 3 */
      "    movw	%%ax, (%1)		\n" /* UBER 5: 4   */
      "    addl	$2, %1			\n" /* UBER 6: 5   */
      "    testl	%2, %2		\n"
      "    jg	1b			\n"
      :"+r" (src), "+r" (dest), "+r" (n)
      ::"eax", "edx");
}
OIL_DEFINE_IMPL_FULL (abs_u16_s16_i386asm_uber4, abs_u16_s16, OIL_IMPL_FLAG_CMOV);
#endif
       
    85 
       
#if 0
/*
 * CMOV absolute-value loop with explicit registers: src in %edi, dest
 * handed in via %eax and moved into a manually saved/restored %ebp.
 * Advances src/dest by a fixed 2 bytes, ignoring dstr/sstr.
 * NOTE(review): pushing and writing %ebp inside inline asm conflicts
 * with frame-pointer use by the compiler — presumably one reason this
 * variant is compiled out.
 */
static void
abs_u16_s16_i386asm2 (uint16_t * dest, int dstr, int16_t * src, int sstr, int n)
{
  __asm__ __volatile__ ("\n"
      "	pushl	%%ebp			\n"
      "	movl	%%eax, %%ebp		\n"
      "	.p2align 4,,15			\n"
      "1:	movswl	(%%edi), %%eax		\n"
      "	addl	$2, %%edi		\n"
      "	movl	%%eax, %%edx		\n"
      "	negl	%%edx			\n"
      "	cmpl	$-1, %%eax		\n"
      "	cmovle	%%edx, %%eax		\n"
      "	movw	%%ax, (%%ebp)		\n"
      "	addl	$2, %%ebp		\n"
      "	decl	%2			\n"
      "	testl	%2, %2			\n"
      "	jg	1b			\n"
      "	popl	%%ebp			\n":"+D" (src), "+a" (dest), "+S" (n)
      ::"ecx", "edx");
}
OIL_DEFINE_IMPL_FULL (abs_u16_s16_i386asm2, abs_u16_s16, OIL_IMPL_FLAG_CMOV);
#endif
       
   110 
       
/*
 * abs_u16_s16: branch-free scalar i386 assembly (no CMOV required).
 * Per element, with the value v sign-extended into %eax and copied to
 * %edx:
 *   sar $0xf, %ax   -> low word of %eax becomes 0xffff if v < 0, else 0
 *                      (the upper bits already match from the movswl)
 *   and %edx, %eax  -> %eax == v when v < 0, else 0
 *   add/sub         -> %edx = v - 2*%eax, i.e. -v when v < 0, else v
 * Honours the dstr/sstr stride parameters, unlike the disabled
 * variants above.
 * NOTE(review): the loop is do-while shaped ("decl; jne"), so n == 0
 * would underflow and iterate ~2^32 times — callers presumably
 * guarantee n > 0.
 * The body is empty on Symbian WINSCW/WINS emulator builds, where this
 * GCC-style inline assembly is unavailable.
 */
static void
abs_u16_s16_i386asm3 (uint16_t * dest, int dstr, int16_t * src, int sstr, int n)
{
#if !defined(__WINSCW__) && !defined(__WINS__)      
  __asm__ __volatile__ ("\n"
      "	.p2align 4,,15			\n"
      "1:  movswl (%1), %%eax           \n"
      "    add %3, %1                   \n"
      "    mov %%eax, %%edx             \n"
      "    sar $0xf, %%ax               \n"
      "    and %%edx, %%eax             \n"
      "    add %%eax, %%eax             \n"
      "    sub %%eax, %%edx             \n"
      "    mov %%dx, (%0)               \n"
      "    add %4, %0                   \n"
      "    decl %2                      \n"
      "    jne 1b                       \n"
      : "+r" (dest), "+r" (src), "+m" (n)
      : "m" (dstr), "m" (sstr)
      : "eax", "edx");
#endif
}
/* Register with liboil as a plain assembly implementation. */
OIL_DEFINE_IMPL_ASM (abs_u16_s16_i386asm3, abs_u16_s16);
       
   134 
       
   135 
       
   136 
       
/*
 * abs_u16_s16: MMX implementation.
 * A scalar prologue consumes elements until the remaining count is a
 * multiple of 4.  The first asm statement preloads %mm2/%mm3 with the
 * {-32768 x4} and {32767 x4} constants; the main loop gathers four
 * strided source values into a contiguous tmp[4], computes absolute
 * values with saturating adds/subtracts against those constants, and
 * scatters tmp[] back out with the destination stride.
 * NOTE(review): %mm2/%mm3 are loaded in one asm statement and read in
 * a later one with no constraint tying them together; this relies on
 * the compiler not touching MMX registers in between — true for GCC in
 * practice, but not guaranteed by the asm contract.
 * The body is empty on Symbian WINSCW/WINS emulator builds.
 */
static void
abs_u16_s16_mmx (uint16_t * dest, int dstr, int16_t * src, int sstr, int n)
{
#if !defined(__WINSCW__) && !defined(__WINS__)      
  /* Saturation constants: p[0] = four -32768, p[1] = four 32767. */
  static const int16_t p[][4] = {
    { -32768, -32768, -32768, -32768 },
    { 32767, 32767, 32767, 32767 }
  };
  int16_t tmp[4];

  /* Scalar prologue: peel elements until n is a multiple of 4. */
  while (n & 3) {
    *dest = ABS (*src);
    OIL_INCREMENT (dest, dstr);
    OIL_INCREMENT (src, sstr);
    n--;
  }
  n /= 4;
 
  /* Preload the constants into %mm2/%mm3 for the loop below. */
  __asm__ __volatile__ ("\n"
      "	movq	(%0), %%mm2		\n"
      "	movq	8(%0), %%mm3		\n"
      :: "r" (p));

  while (n--) {
    /* Gather four strided source elements into a contiguous buffer. */
    tmp[0] = *src;
    OIL_INCREMENT (src, sstr);
    tmp[1] = *src;
    OIL_INCREMENT (src, sstr);
    tmp[2] = *src;
    OIL_INCREMENT (src, sstr);
    tmp[3] = *src;
    OIL_INCREMENT (src, sstr);
    /* Absolute value via saturating arithmetic against %mm2/%mm3. */
    __asm__ __volatile__ ("\n"
        "	movq	(%0), %%mm1		\n"
        "	movq	%%mm1, %%mm0		\n"
        "	paddsw	%%mm2, %%mm0		\n"
        "	paddsw	%%mm3, %%mm1		\n"
        "	psubsw	%%mm2, %%mm0		\n"
        "	psubsw	%%mm3, %%mm1		\n"
        "	psubw	%%mm1, %%mm0		\n"
        "	movq	%%mm0, (%0)		\n"
        : : "r" (tmp)
        : "memory" );
    /* Scatter the four results with the destination stride. */
    *dest = tmp[0];
    OIL_INCREMENT (dest, dstr);
    *dest = tmp[1];
    OIL_INCREMENT (dest, dstr);
    *dest = tmp[2];
    OIL_INCREMENT (dest, dstr);
    *dest = tmp[3];
    OIL_INCREMENT (dest, dstr);
  }
  /* Clear MMX state so later x87 floating-point code works. */
  asm volatile ("emms");
#endif  
}

/* Register with liboil; requires the MMX CPU feature flag. */
OIL_DEFINE_IMPL_FULL (abs_u16_s16_mmx, abs_u16_s16, OIL_IMPL_FLAG_MMX);
       
   194 
       
#if 0
/*
 * Unrolled MMX variant: 8 elements per iteration across two
 * interleaved quadword pipelines, with fixed registers (src in %edi,
 * dest in %eax, constants table address in %ecx).  Uses the same
 * saturating add/subtract absolute-value trick as abs_u16_s16_mmx.
 * NOTE(review): the vector loop reads src and writes dest
 * contiguously, ignoring the dstr/sstr strides that the scalar
 * prologue honours — presumably why this variant is compiled out.
 */
static void
abs_u16_s16_mmxx (uint16_t * dest, int dstr, int16_t * src, int sstr, int n)
{
  /* Saturation constants: four -32768 followed by four 32767. */
  short p[] = { -32768, -32768, -32768, -32768,
    32767, 32767, 32767, 32767
  };

  /* Scalar prologue: peel elements until n is a multiple of 8. */
  while (n & 7) {
    *dest = ABS (*src);
    OIL_INCREMENT (dest, dstr);
    OIL_INCREMENT (src, sstr);
    n--;
  }
  n /= 8;
  __asm__ __volatile__ ("\n"
      "	movq	(%3), %%mm2		\n"
      "	movq	8(%3), %%mm3		\n"
      "	.p2align 4,,15			\n"
      "1:	movq	(%%edi), %%mm0		\n"
      "	movq	(%%edi), %%mm1		\n"
      "	paddsw	%%mm2, %%mm0		\n"
      "	paddsw	%%mm3, %%mm1		\n"
      "	psubsw	%%mm2, %%mm0		\n"
      "	psubsw	%%mm3, %%mm1		\n"
      "	psubw	%%mm1, %%mm0		\n"
      "	movq	%%mm0, (%%eax)		\n"
      "	 movq	8(%%edi), %%mm4		\n"
      "	 movq	8(%%edi), %%mm5		\n"
      "	 addl	$16, %%edi		\n"
      "	 paddsw	%%mm2, %%mm4		\n"
      "	 paddsw	%%mm3, %%mm5		\n"
      "	 psubsw	%%mm2, %%mm4		\n"
      "	 psubsw	%%mm3, %%mm5		\n"
      "	 psubw	%%mm5, %%mm4		\n"
      "	 movq	%%mm4, 8(%%eax)		\n"
      "	 addl	$16, %%eax		\n"
      "	decl	%2			\n"
      "	testl	%2, %2			\n"
      "	jg	1b			\n":"+D" (src), "+a" (dest), "+S" (n)
      :"c" (p));
  /* Clear MMX state so later x87 floating-point code works. */
  asm volatile ("emms");
}
OIL_DEFINE_IMPL_FULL (abs_u16_s16_mmxx, abs_u16_s16, OIL_IMPL_FLAG_MMX);
#endif
       
   240 
       
#ifdef ENABLE_BROKEN_IMPLS
/*
 * MMXEXT variant: abs(x) = pmaxsw(x, 0 - x), 8 elements per iteration
 * across two interleaved quadwords; src in %edi, dest handed in via
 * %eax and moved into a manually saved/restored %ebp.
 * NOTE(review): the vector loop ignores the dstr/sstr strides and
 * writes %ebp from inline asm; this is only built when
 * ENABLE_BROKEN_IMPLS is defined, and the exact breakage is not
 * documented here — verify before enabling.
 */
static void
abs_u16_s16_mmx2 (uint16_t * dest, int dstr, int16_t * src, int sstr, int n)
{
  /* Scalar prologue: peel elements until n is a multiple of 8. */
  while (n & 7) {
    *dest = ABS (*src);
    OIL_INCREMENT (dest, dstr);
    OIL_INCREMENT (src, sstr);
    n--;
  }
  n /= 8;
  __asm__ __volatile__ ("\n"
      "	pushl	%%ebp			\n"
      "	movl	%%eax, %%ebp		\n"
      "	.p2align 4,,15			\n"
      "1:	movq	(%%edi), %%mm0		\n"
      "	pxor	%%mm1, %%mm1		\n"
      "	 movq	8(%%edi), %%mm2		\n"
      "	 addl	$16, %%edi		\n"
      "	psubw	%%mm0, %%mm1		\n"
      "	 pxor	%%mm3, %%mm3		\n"
      "	pmaxsw	%%mm0, %%mm1		\n"
      "	 psubw	%%mm2, %%mm3		\n"
      "	movq	%%mm1, (%%ebp)		\n"
      "	 pmaxsw	%%mm2, %%mm3		\n"
      "	 movq	%%mm3, 8(%%ebp)		\n"
      "	 addl	$16, %%ebp		\n"
      "	decl	%2			\n"
      "	testl	%2, %2			\n"
      "	jg	1b			\n"
      "	popl	%%ebp			\n":"+D" (src), "+a" (dest), "+S" (n)
      ::"ecx", "edx");
  /* Clear MMX state so later x87 floating-point code works. */
  asm volatile ("emms");
}
OIL_DEFINE_IMPL_FULL (abs_u16_s16_mmx2, abs_u16_s16, OIL_IMPL_FLAG_MMXEXT);
#endif
       
   277 
       
#ifdef ENABLE_BROKEN_IMPLS
/*
 * SSE2 variant: abs(x) = pmaxsw(x, 0 - x) in an %xmm register.
 * NOTE(review): "movq" transfers only 64 bits (4 int16 values) per
 * iteration, yet src/dest advance 16 bytes and the counter assumes 8
 * elements — every second quadword appears to be skipped.  It also
 * ignores the dstr/sstr strides and writes %ebp from inline asm.
 * Presumably these are why it is only built when ENABLE_BROKEN_IMPLS
 * is defined.
 */
static void
abs_u16_s16_sse2 (uint16_t * dest, int dstr, int16_t * src, int sstr, int n)
{
  /* Scalar prologue: peel elements until n is a multiple of 8. */
  while (n & 7) {
    *dest = ABS (*src);
    OIL_INCREMENT (dest, dstr);
    OIL_INCREMENT (src, sstr);
    n--;
  }
  n /= 8;
  __asm__ __volatile__ ("\n"
      "	pushl	%%ebp			\n"
      "	movl	%%eax, %%ebp		\n"
      "	.p2align 4,,15			\n"
      "1:	movq	(%%edi), %%xmm0		\n"
      "	addl	$16, %%edi		\n"
      "	pxor	%%xmm1, %%xmm1		\n"
      "	psubw	%%xmm0, %%xmm1		\n"
      "	pmaxsw	%%xmm0, %%xmm1		\n"
      "	movq	%%xmm1, (%%ebp)		\n"
      "	addl	$16, %%ebp		\n"
      "	decl	%2			\n"
      "	testl	%2, %2			\n"
      "	jg	1b			\n"
      "	popl	%%ebp			\n":"+D" (src), "+a" (dest), "+S" (n)
      ::"ecx", "edx");
}
OIL_DEFINE_IMPL_FULL (abs_u16_s16_sse2, abs_u16_s16, OIL_IMPL_FLAG_SSE2);
#endif
       
   308 
       
   309 
       
   310 
       
   311 #ifdef	__SYMBIAN32__
       
   312  
       
   313 OilFunctionImpl* __oil_function_impl_abs_u16_s16_i386asm, abs_u16_s16() {
       
   314 		return &_oil_function_impl_abs_u16_s16_i386asm, abs_u16_s16;
       
   315 }
       
   316 #endif
       
   317 
       
   318 #ifdef	__SYMBIAN32__
       
   319  
       
   320 OilFunctionImpl* __oil_function_impl_abs_u16_s16_i386asm_uber4, abs_u16_s16() {
       
   321 		return &_oil_function_impl_abs_u16_s16_i386asm_uber4, abs_u16_s16;
       
   322 }
       
   323 #endif
       
   324 
       
   325 #ifdef	__SYMBIAN32__
       
   326  
       
   327 OilFunctionImpl* __oil_function_impl_abs_u16_s16_i386asm2, abs_u16_s16() {
       
   328 		return &_oil_function_impl_abs_u16_s16_i386asm2, abs_u16_s16;
       
   329 }
       
   330 #endif
       
   331 
       
   332 #ifdef	__SYMBIAN32__
       
   333  
       
   334 OilFunctionImpl* __oil_function_impl_abs_u16_s16_mmx, abs_u16_s16() {
       
   335 		return &_oil_function_impl_abs_u16_s16_mmx, abs_u16_s16;
       
   336 }
       
   337 #endif
       
   338 
       
   339 #ifdef	__SYMBIAN32__
       
   340  
       
   341 OilFunctionImpl* __oil_function_impl_abs_u16_s16_mmxx, abs_u16_s16() {
       
   342 		return &_oil_function_impl_abs_u16_s16_mmxx, abs_u16_s16;
       
   343 }
       
   344 #endif
       
   345 
       
   346 #ifdef	__SYMBIAN32__
       
   347  
       
   348 OilFunctionImpl* __oil_function_impl_abs_u16_s16_mmx2, abs_u16_s16() {
       
   349 		return &_oil_function_impl_abs_u16_s16_mmx2, abs_u16_s16;
       
   350 }
       
   351 #endif
       
   352 
       
   353 #ifdef	__SYMBIAN32__
       
   354  
       
   355 OilFunctionImpl* __oil_function_impl_abs_u16_s16_sse2, abs_u16_s16() {
       
   356 		return &_oil_function_impl_abs_u16_s16_sse2, abs_u16_s16;
       
   357 }
       
   358 #endif
       
   359 
       
   360 
       
   361 
       
#ifdef	__SYMBIAN32__
 
/*
 * Symbian export stub: hands the liboil runtime the implementation
 * descriptor created by OIL_DEFINE_IMPL_ASM for abs_u16_s16_i386asm3.
 */
OilFunctionImpl* __oil_function_impl_abs_u16_s16_i386asm3() {
		return &_oil_function_impl_abs_u16_s16_i386asm3;
}
#endif
       
   368