/*
 * Copyright (c) 2007
 * Josep Torra <josep@fluendo.com>. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#if __VFP_FP__
/*
** compile with -mcpu=arm1136j-s -mfpu=vfp -mfloat-abi=softfp
**
** void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_add_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n);
** void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n);
** void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n);
**
** d: $r0 | s1: $r1 | s2: $r2 | n: $r3 |
**
*/
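
/*
** Usage sketch (illustrative only, not part of the original sources): from C
** these routines can be declared and called as below; the buffer names and
** the element count are made-up example values.
**
**   extern void vfp_add_f32 (float *d, const float *s1, const float *s2, int n);
**
**   float a[16], b[16], sum[16];   // example buffers, filled elsewhere
**   vfp_add_f32 (sum, a, b, 16);   // sum[i] = a[i] + b[i] for 0 <= i < 16
*/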

#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr};            /* save registers to stack */ \
    ands ip, r3, #7;                /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll;   /* if ip == 0 goto unrolled loop */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    fldmias r2!, {s1}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll:         /* unroll by 8 */ \
    movs ip, r3, lsr #3;            /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end;      /* if ip == 0 goto finish */ \
    fmrx lr, fpscr;                 /* read fpscr into an arm register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16;        /* set vector length to 8 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}; \
    ## finst ##s s24, s8, s16; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr;                 /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc};            /* restore registers from stack and return */
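
/*
** Note on the unrolled loop above (explanatory, VFP short-vector mode): the
** FPSCR LEN field (bits [18:16]) holds the vector length minus one, so
** writing 7 there makes VFP arithmetic whose destination lies in the upper
** register banks operate on 8 elements at a time; the single finst then
** combines s8-s15 with s16-s23 into s24-s31.  The previous FPSCR value is
** kept in lr and written back before returning.
*/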

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr};            /* save registers to stack */ \
    ands ip, r3, #3;                /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll;   /* if ip == 0 goto unrolled loop */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    fldmiad r2!, {d1}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll:         /* unroll by 4 */ \
    movs ip, r3, lsr #2;            /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end;      /* if ip == 0 goto finish */ \
    fmrx lr, fpscr;                 /* read fpscr into an arm register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16;        /* set vector length to 4 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    fldmiad r2!, {d8, d9, d10, d11}; \
    ## finst ##d d12, d4, d8; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr;                 /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc};            /* restore registers from stack and return */

.align 2
UNROLL_F32_TEMPLATE(add_f32,fadd);
UNROLL_F64_TEMPLATE(add_f64,fadd);

UNROLL_F32_TEMPLATE(divide_f32,fdiv);
UNROLL_F64_TEMPLATE(divide_f64,fdiv);

UNROLL_F32_TEMPLATE(multiply_f32,fmul);
UNROLL_F64_TEMPLATE(multiply_f64,fmul);

UNROLL_F32_TEMPLATE(subtract_f32,fsub);
UNROLL_F64_TEMPLATE(subtract_f64,fsub);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE

/*
**
** void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n);
** void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n);
** void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n);
** void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n);
**
** d: $r0 | s1: $r1 | s2_1: $r2 | n: $r3 |
**
*/
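
/*
** Usage sketch (illustrative only, not part of the original sources): s2_1
** points to a single scalar that is combined with every element of s1.  The
** buffer names and sizes below are made-up example values.
**
**   extern void vfp_scalarmultiply_f32_ns (float *d, const float *s1,
**                                          const float *s2_1, int n);
**
**   float gain = 0.5f;
**   float in[64], out[64];                           // example buffers
**   vfp_scalarmultiply_f32_ns (out, in, &gain, 64);  // out[i] = in[i] * gain
*/
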
#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr};            /* save registers to stack */ \
    fldmias r2, {s1};               /* load scalar value */ \
    ands ip, r3, #7;                /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll;   /* if ip == 0 goto unrolled loop */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0, s1; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll:         /* unroll by 8 */ \
    movs ip, r3, lsr #3;            /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end;      /* if ip == 0 goto finish */ \
    fmrx lr, fpscr;                 /* read fpscr into an arm register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16;        /* set vector length to 8 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8, s1; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr;                 /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc};            /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr};            /* save registers to stack */ \
    fldmiad r2, {d1};               /* load scalar value */ \
    ands ip, r3, #3;                /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll;   /* if ip == 0 goto unrolled loop */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0, d1; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll:         /* unroll by 4 */ \
    movs ip, r3, lsr #2;            /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end;      /* if ip == 0 goto finish */ \
    fmrx lr, fpscr;                 /* read fpscr into an arm register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16;        /* set vector length to 4 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4, d1; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr;                 /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc};            /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(scalaradd_f32_ns,fadd);
UNROLL_F64_TEMPLATE(scalaradd_f64_ns,fadd);

UNROLL_F32_TEMPLATE(scalarmultiply_f32_ns,fmul);
UNROLL_F64_TEMPLATE(scalarmultiply_f64_ns,fmul);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE

/*
**
** void vfp_abs_f32_f32_ns(float *d, const float *s, int n);
** void vfp_abs_f64_f64_ns(double *d, const double *s, int n);
** void vfp_negative_f32(float *d, const float *s, int n);
** void vfp_negative_f64(double *d, const double *s, int n);
**
** d: $r0 | s: $r1 | n: $r2 |
**
*/
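
/*
** Usage sketch (illustrative only, not part of the original sources): these
** routines take a single source array.  The buffer names and size below are
** made-up example values.
**
**   extern void vfp_negative_f32 (float *d, const float *s, int n);
**
**   float in[32], out[32];            // example buffers
**   vfp_negative_f32 (out, in, 32);   // out[i] = -in[i]
*/
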
#define UNROLL_F32_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr};            /* save registers to stack */ \
    ands ip, r2, #7;                /* ip = n % 8 */ \
    beq vfp_ ## fname ## _unroll;   /* if ip == 0 goto unrolled loop */ \
  vfp_ ## fname ## _loop1: \
    fldmias r1!, {s0}; \
    ## finst ##s s2, s0; \
    fstmias r0!, {s2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll:         /* unroll by 8 */ \
    movs ip, r2, lsr #3;            /* ip = n / 8 */ \
    beq vfp_ ## fname ## _end;      /* if ip == 0 goto finish */ \
    fmrx lr, fpscr;                 /* read fpscr into an arm register */ \
    mov fp, #7; \
    orr fp, lr, fp, lsl #16;        /* set vector length to 8 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}; \
    ## finst ##s s24, s8; \
    fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr;                 /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc};            /* restore registers from stack and return */

#define UNROLL_F64_TEMPLATE(fname,finst) \
  .global vfp_ ## fname ## ; \
  vfp_ ## fname ## : \
    stmdb sp!, {fp, lr};            /* save registers to stack */ \
    ands ip, r2, #3;                /* ip = n % 4 */ \
    beq vfp_ ## fname ## _unroll;   /* if ip == 0 goto unrolled loop */ \
  vfp_ ## fname ## _loop1: \
    fldmiad r1!, {d0}; \
    ## finst ##d d2, d0; \
    fstmiad r0!, {d2}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop1; \
  vfp_ ## fname ## _unroll:         /* unroll by 4 */ \
    movs ip, r2, lsr #2;            /* ip = n / 4 */ \
    beq vfp_ ## fname ## _end;      /* if ip == 0 goto finish */ \
    fmrx lr, fpscr;                 /* read fpscr into an arm register */ \
    mov fp, #3; \
    orr fp, lr, fp, lsl #16;        /* set vector length to 4 */ \
    fmxr fpscr, fp; \
  vfp_ ## fname ## _loop2: \
    fldmiad r1!, {d4, d5, d6, d7}; \
    ## finst ##d d12, d4; \
    fstmiad r0!, {d12, d13, d14, d15}; \
    subs ip, ip, #1; \
    bne vfp_ ## fname ## _loop2; \
    fmxr fpscr, lr;                 /* restore original fpscr */ \
  vfp_ ## fname ## _end: \
    ldmia sp!, {fp, pc};            /* restore registers from stack and return */

UNROLL_F32_TEMPLATE(abs_f32_f32_ns,fabs);
UNROLL_F64_TEMPLATE(abs_f64_f64_ns,fabs);

UNROLL_F32_TEMPLATE(negative_f32,fneg);
UNROLL_F64_TEMPLATE(negative_f64,fneg);

#undef UNROLL_F32_TEMPLATE
#undef UNROLL_F64_TEMPLATE
#endif