genericopenlibs/liboil/src/arm/math_vfp_asm.s
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 /*
       
     2  * Copyright (c) 2007
       
     3  *	Josep Torra <josep@fluendo.com>.  All rights reserved.
       
     4  *
       
     5  * Redistribution and use in source and binary forms, with or without
       
     6  * modification, are permitted provided that the following conditions
       
     7  * are met:
       
     8  * 1. Redistributions of source code must retain the above copyright
       
     9  *    notice, this list of conditions and the following disclaimer.
       
    10  * 2. Redistributions in binary form must reproduce the above copyright
       
    11  *    notice, this list of conditions and the following disclaimer in the
       
    12  *    documentation and/or other materials provided with the distribution.
       
    13  *
       
    14  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
       
    15  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
       
    16  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
       
    17  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
       
    18  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
       
    19  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
       
    20  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
       
    21  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
       
    22  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
       
    23  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
       
    24  * SUCH DAMAGE.
       
    25  */
       
    26 
       
    27 #if __VFP_FP__
       
    28 /* 
       
    29 ** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
       
    30 **
       
    31 ** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
       
    32 ** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
       
    33 ** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
       
    34 ** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
       
    35 ** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
       
    36 ** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
       
    37 ** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
       
    38 ** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
       
    39 **
       
    40 ** d:   $r0     |   s1: $r1     | s2:  $r2     |   n:  $r3     |
       
    41 **
       
    42 */
       
    43 
       
    44 #define UNROLL_F32_TEMPLATE(fname,finst) \
       
    45   .global vfp_ ## fname ## ;                                                  \
       
    46   vfp_ ## fname ## :                                                          \
       
    47     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
       
    48     ands          ip, r3, #7;               /* ip = n % 8 */                  \
       
    49     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
       
    50   vfp_ ## fname ## _loop1:                                                    \
       
    51     fldmias       r1!, {s0};                                                  \
       
    52     fldmias       r2!, {s1};                                                  \
       
    53     ## finst ##s  s2, s0, s1;                                                 \
       
    54     fstmias       r0!, {s2};                                                  \
       
    55     subs          ip, ip, #1;                                                 \
       
    56     bne           vfp_ ## fname ## _loop1;                                    \
       
    57   vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
       
    58     movs          ip, r3, lsr #3;           /* ip = n / 8 */                  \
       
    59     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
       
    60     fmrx          lr, fpscr;                /* read fpscr register into arm */\
       
    61     mov           fp, #7;                                                     \
       
    62     orr           fp, lr, fp, lsl #16;      /* set vector lenght to 8 */      \
       
    63     fmxr          fpscr, fp;                                                  \
       
    64   vfp_ ## fname ## _loop2:                                                    \
       
    65     fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
       
    66     fldmias       r2!, {s16, s17, s18, s19, s20, s21, s22, s23};              \
       
    67     ## finst ##s  s24, s8, s16;                                               \
       
    68     fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
       
    69     subs          ip, ip, #1;                                                 \
       
    70     bne           vfp_ ## fname ## _loop2;                                    \
       
    71     fmxr          fpscr, lr;                /* restore original fpscr */      \
       
    72   vfp_ ## fname ## _end:                                                      \
       
    73     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
       
    74 
       
    75 #define UNROLL_F64_TEMPLATE(fname,finst) \
       
    76   .global vfp_ ## fname ## ;                                                  \
       
    77   vfp_ ## fname ## :                                                          \
       
    78     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
       
    79     ands          ip, r3, #3;               /* ip = n % 3 */                  \
       
    80     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
       
    81   vfp_ ## fname ## _loop1:                                                    \
       
    82     fldmiad       r1!, {d0};                                                  \
       
    83     fldmiad       r2!, {d1};                                                  \
       
    84     ## finst ##d  d2, d0, d1;                                                 \
       
    85     fstmiad       r0!, {d2};                                                  \
       
    86     subs          ip, ip, #1;                                                 \
       
    87     bne           vfp_ ## fname ## _loop1;                                    \
       
    88   vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
       
    89     movs          ip, r3, lsr #2;           /* ip = n / 4 */                  \
       
    90     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
       
    91     fmrx          lr, fpscr;                /* read fpscr register into arm */\
       
    92     mov           fp, #3;                                                     \
       
    93     orr           fp, lr, fp, lsl #16;      /* set vector lenght to 8 */      \
       
    94     fmxr          fpscr, fp;                                                  \
       
    95   vfp_ ## fname ## _loop2:                                                    \
       
    96     fldmiad       r1!, {d4, d5, d6, d7};                                      \
       
    97     fldmiad       r2!, {d8, d9, d10, d11};                                    \
       
    98     ## finst ##d  d12, d4, d8;                                                \
       
    99     fstmiad       r0!, {d12, d13, d14, d15};                                  \
       
   100     subs          ip, ip, #1;                                                 \
       
   101     bne           vfp_ ## fname ## _loop2;                                    \
       
   102     fmxr          fpscr, lr;                /* restore original fpscr */      \
       
   103   vfp_ ## fname ## _end:                                                      \
       
   104     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
       
   105 
       
    106 .align 2 /* GNU as on ARM: align to 2^2 = 4 bytes before the emitted functions */
       
    107 UNROLL_F32_TEMPLATE(add_f32,fadd); /* emits vfp_add_f32 (uses fadds) */
       
    108 UNROLL_F64_TEMPLATE(add_f64,fadd); /* emits vfp_add_f64 (uses faddd) */
       
    109 
       
    110 UNROLL_F32_TEMPLATE(divide_f32,fdiv); /* emits vfp_divide_f32 (uses fdivs) */
       
    111 UNROLL_F64_TEMPLATE(divide_f64,fdiv); /* emits vfp_divide_f64 (uses fdivd) */
       
    112 
       
    113 UNROLL_F32_TEMPLATE(multiply_f32,fmul); /* emits vfp_multiply_f32 (uses fmuls) */
       
    114 UNROLL_F64_TEMPLATE(multiply_f64,fmul); /* emits vfp_multiply_f64 (uses fmuld) */
       
    115 
       
    116 UNROLL_F32_TEMPLATE(subtract_f32,fsub); /* emits vfp_subtract_f32 (uses fsubs) */
       
    117 UNROLL_F64_TEMPLATE(subtract_f64,fsub); /* emits vfp_subtract_f64 (uses fsubd) */
       
    118 
       
    119 #undef UNROLL_F32_TEMPLATE /* the template names are redefined for the next function family */
       
    120 #undef UNROLL_F64_TEMPLATE
       
   121 
       
   122 /* 
       
   123 **
       
   124 ** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
       
   125 ** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
       
   126 ** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
       
   127 ** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
       
   128 **
       
   129 ** d:   $r0     |   s1: $r1     | s2_1:  $r2   |   n:  $r3     |
       
   130 **
       
   131 */
       
   132 #define UNROLL_F32_TEMPLATE(fname,finst) \
       
   133   .global vfp_ ## fname ## ;                                                  \
       
   134   vfp_ ## fname ## :                                                          \
       
   135     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
       
   136     fldmias       r2, {s1};                 /* load scalar value */           \
       
   137     ands          ip, r3, #7;               /* ip = n % 8 */                  \
       
   138     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
       
   139   vfp_ ## fname ## _loop1:                                                    \
       
   140     fldmias       r1!, {s0};                                                  \
       
   141     ## finst ##s  s2, s0, s1;                                                 \
       
   142     fstmias       r0!, {s2};                                                  \
       
   143     subs          ip, ip, #1;                                                 \
       
   144     bne           vfp_ ## fname ## _loop1;                                    \
       
   145   vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
       
   146     movs          ip, r3, lsr #3;           /* ip = n / 8 */                  \
       
   147     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
       
   148     fmrx          lr, fpscr;                /* read fpscr register into arm */\
       
   149     mov           fp, #7;                                                     \
       
   150     orr           fp, lr, fp, lsl #16;      /* set vector lenght to 8 */      \
       
   151     fmxr          fpscr, fp;                                                  \
       
   152   vfp_ ## fname ## _loop2:                                                    \
       
   153     fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
       
   154     ## finst ##s  s24, s8, s1;                                                \
       
   155     fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
       
   156     subs          ip, ip, #1;                                                 \
       
   157     bne           vfp_ ## fname ## _loop2;                                    \
       
   158     fmxr          fpscr, lr;                /* restore original fpscr */      \
       
   159   vfp_ ## fname ## _end:                                                      \
       
   160     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
       
   161 
       
   162 #define UNROLL_F64_TEMPLATE(fname,finst) \
       
   163   .global vfp_ ## fname ## ;                                                  \
       
   164   vfp_ ## fname ## :                                                          \
       
   165     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
       
   166     fldmiad       r2, {d1};                 /* load scalar value */           \
       
   167     ands          ip, r3, #3;               /* ip = n % 3 */                  \
       
   168     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
       
   169   vfp_ ## fname ## _loop1:                                                    \
       
   170     fldmiad       r1!, {d0};                                                  \
       
   171     ## finst ##d  d2, d0, d1;                                                 \
       
   172     fstmiad       r0!, {d2};                                                  \
       
   173     subs          ip, ip, #1;                                                 \
       
   174     bne           vfp_ ## fname ## _loop1;                                    \
       
   175   vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
       
   176     movs          ip, r3, lsr #2;           /* ip = n / 4 */                  \
       
   177     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
       
   178     fmrx          lr, fpscr;                /* read fpscr register into arm */\
       
   179     mov           fp, #3;                                                     \
       
   180     orr           fp, lr, fp, lsl #16;      /* set vector lenght to 4 */      \
       
   181     fmxr          fpscr, fp;                                                  \
       
   182   vfp_ ## fname ## _loop2:                                                    \
       
   183     fldmiad       r1!, {d4, d5, d6, d7};                                      \
       
   184     ## finst ##d  d12, d4, d1;                                                \
       
   185     fstmiad       r0!, {d12, d13, d14, d15};                                  \
       
   186     subs          ip, ip, #1;                                                 \
       
   187     bne           vfp_ ## fname ## _loop2;                                    \
       
   188     fmxr          fpscr, lr;                /* restore original fpscr */      \
       
   189   vfp_ ## fname ## _end:                                                      \
       
   190     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
       
   191 
       
    192 UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd); /* emits vfp_scalaradd_f32_ns (uses fadds) */
       
    193 UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd); /* emits vfp_scalaradd_f64_ns (uses faddd) */
       
    194 
       
    195 UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns,fmul); /* emits vfp_scalarmultiply_f32_ns (uses fmuls) */
       
    196 UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul); /* emits vfp_scalarmultiply_f64_ns (uses fmuld) */
       
    197 
       
    198 #undef UNROLL_F32_TEMPLATE /* the template names are redefined for the next function family */
       
    199 #undef UNROLL_F64_TEMPLATE
       
   200 
       
   201 /* 
       
   202 **
       
   203 ** void vfp_abs_f32_f32_ns(float *d, const float *s, int n);
       
   204 ** void vfp_abs_f64_f64_ns(double *d, const double *s, int n);
       
   205 ** void vfp_negative_f32(float *d, const float *s, int n);
       
   206 ** void vfp_negative_f64(double *d, const double *s, int n);
       
   207 **
       
   208 ** d:   $r0     |   s: $r1      |   n:  $r2     |
       
   209 **
       
   210 */
       
   211 #define UNROLL_F32_TEMPLATE(fname,finst) \
       
   212   .global vfp_ ## fname ## ;                                                  \
       
   213   vfp_ ## fname ## :                                                          \
       
   214     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
       
   215     ands          ip, r2, #7;               /* ip = n % 8 */                  \
       
   216     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
       
   217   vfp_ ## fname ## _loop1:                                                    \
       
   218     fldmias       r1!, {s0};                                                  \
       
   219     ## finst ##s  s2, s0;                                                     \
       
   220     fstmias       r0!, {s2};                                                  \
       
   221     subs          ip, ip, #1;                                                 \
       
   222     bne           vfp_ ## fname ## _loop1;                                    \
       
   223   vfp_ ## fname ## _unroll:                 /* unroll by 8 */                 \
       
   224     movs          ip, r2, lsr #3;           /* ip = n / 8 */                  \
       
   225     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
       
   226     fmrx          lr, fpscr;                /* read fpscr register into arm */\
       
   227     mov           fp, #7;                                                     \
       
   228     orr           fp, lr, fp, lsl #16;      /* set vector lenght to 8 */      \
       
   229     fmxr          fpscr, fp;                                                  \
       
   230   vfp_ ## fname ## _loop2:                                                    \
       
   231     fldmias       r1!, {s8, s9, s10, s11, s12, s13, s14, s15};                \
       
   232     ## finst ##s  s24, s8;                                                    \
       
   233     fstmias       r0!, {s24, s25, s26, s27, s28, s29, s30, s31};              \
       
   234     subs          ip, ip, #1;                                                 \
       
   235     bne           vfp_ ## fname ## _loop2;                                    \
       
   236     fmxr          fpscr, lr;                /* restore original fpscr */      \
       
   237   vfp_ ## fname ## _end:                                                      \
       
   238     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
       
   239 
       
   240 #define UNROLL_F64_TEMPLATE(fname,finst) \
       
   241   .global vfp_ ## fname ## ;                                                  \
       
   242   vfp_ ## fname ## :                                                          \
       
   243     stmdb         sp!, {fp, lr};            /* save registers to stack */     \
       
   244     ands          ip, r2, #3;               /* ip = n % 3 */                  \
       
   245     beq           vfp_ ## fname ## _unroll; /* if ip == 0 goto prep_loop2 */  \
       
   246   vfp_ ## fname ## _loop1:                                                    \
       
   247     fldmiad       r1!, {d0};                                                  \
       
   248     ## finst ##d  d2, d0;                                                     \
       
   249     fstmiad       r0!, {d2};                                                  \
       
   250     subs          ip, ip, #1;                                                 \
       
   251     bne           vfp_ ## fname ## _loop1;                                    \
       
   252   vfp_ ## fname ## _unroll:                 /* unroll by 4 */                 \
       
   253     movs          ip, r2, lsr #2;           /* ip = n / 4 */                  \
       
   254     beq           vfp_ ## fname ## _end;    /* if ip == 0 goto finish */      \
       
   255     fmrx          lr, fpscr;                /* read fpscr register into arm */\
       
   256     mov           fp, #3;                                                     \
       
   257     orr           fp, lr, fp, lsl #16;      /* set vector lenght to 4 */      \
       
   258     fmxr          fpscr, fp;                                                  \
       
   259   vfp_ ## fname ## _loop2:                                                    \
       
   260     fldmiad       r1!, {d4, d5, d6, d7};                                      \
       
   261     ## finst ##d  d12, d4;                                                    \
       
   262     fstmiad       r0!, {d12, d13, d14, d15};                                  \
       
   263     subs          ip, ip, #1;                                                 \
       
   264     bne           vfp_ ## fname ## _loop2;                                    \
       
   265     fmxr          fpscr, lr;                /* restore original fpscr */      \
       
   266   vfp_ ## fname ## _end:                                                      \
       
   267     ldmia         sp!, {fp, pc};        /* recovering from stack and return */   
       
   268 
       
    269 UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs); /* emits vfp_abs_f32_f32_ns (uses fabss) */
       
    270 UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs); /* emits vfp_abs_f64_f64_ns (uses fabsd) */
       
    271 
       
    272 UNROLL_F32_TEMPLATE(negative_f32,fneg); /* emits vfp_negative_f32 (uses fnegs) */
       
    273 UNROLL_F64_TEMPLATE(negative_f64,fneg); /* emits vfp_negative_f64 (uses fnegd) */
       
    274 
       
    275 #undef UNROLL_F32_TEMPLATE /* drop the template names once the last family is emitted */
       
    276 #undef UNROLL_F64_TEMPLATE
       
   277 #endif