#ifdef __ARMCC__

#define __CPU_ARM
#define __CPU_HAS_VFP
#include <arm_vfp.h>
#include <e32std.h>

extern "C" {
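/*
 * Each routine below uses the same strip-mining pattern: first handle the
 * n % K leftover elements one at a time, then program the FPSCR LEN field
 * (bits 16-18, value K-1) so that every VFP arithmetic instruction operates
 * on a short vector of K registers (K = 8 singles, or 4 doubles), and
 * finally restore the caller's FPSCR. In C terms, vfp_add_f32() computes:
 *
 *     for (int i = 0; i < n; i++)
 *         d[i] = s1[i] + s2[i];
 *
 * The VFP_* macros come from arm_vfp.h and emit the instruction encodings
 * directly; the commented-out asm() line above each macro shows the
 * intended VFP mnemonic.
 */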
EXPORT_C __NAKED__ void vfp_add_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #7");                 /* ip = n % 8 */
asm("beq vfp_add_f32_unroll");          /* if ip == 0 skip to unrolled loop */
asm("vfp_add_f32_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);

//asm("fadds s2, s0, s1");
VFP_FADDS(CC_AL,2,0,1);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_add_f32_loop1");
asm("vfp_add_f32_unroll: movs ip, r3, lsr #3");  /* unroll by 8; ip = n / 8 */
asm("beq vfp_add_f32_end");             /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_add_f32_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);

//asm("fadds s24, s8, s16");
VFP_FADDS(CC_AL,24,8,16);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_add_f32_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_add_f32_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
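/*
 * Usage sketch (illustrative, not part of the original file): each export
 * takes a destination buffer, source buffer(s) and an element count, e.g.
 *
 *     float a[100], b[100], r[100];
 *     // ... fill a and b ...
 *     vfp_add_f32(r, a, b, 100);   // r[i] = a[i] + b[i]
 *
 * The divide/multiply/subtract variants below differ only in the VFP
 * arithmetic instruction issued.
 */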
EXPORT_C __NAKED__ void vfp_divide_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #7");                 /* ip = n % 8 */
asm("beq vfp_divide_f32_unroll");       /* if ip == 0 skip to unrolled loop */
asm("vfp_divide_f32_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);

//asm("fdivs s2, s0, s1");
VFP_FDIVS(CC_AL,2,0,1);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_divide_f32_loop1");
asm("vfp_divide_f32_unroll: movs ip, r3, lsr #3");  /* unroll by 8; ip = n / 8 */
asm("beq vfp_divide_f32_end");          /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_divide_f32_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);

//asm("fdivs s24, s8, s16");
VFP_FDIVS(CC_AL,24,8,16);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_divide_f32_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_divide_f32_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_multiply_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #7");                 /* ip = n % 8 */
asm("beq vfp_multiply_f32_unroll");     /* if ip == 0 skip to unrolled loop */
asm("vfp_multiply_f32_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);

//asm("fmuls s2, s0, s1");
VFP_FMULS(CC_AL,2,0,1);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_multiply_f32_loop1");
asm("vfp_multiply_f32_unroll: movs ip, r3, lsr #3");  /* unroll by 8; ip = n / 8 */
asm("beq vfp_multiply_f32_end");        /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_multiply_f32_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);

//asm("fmuls s24, s8, s16");
VFP_FMULS(CC_AL,24,8,16);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_multiply_f32_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_multiply_f32_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_subtract_f32 (float *d, const float *s1, const float *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #7");                 /* ip = n % 8 */
asm("beq vfp_subtract_f32_unroll");     /* if ip == 0 skip to unrolled loop */
asm("vfp_subtract_f32_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fldmias r2!, {s1}");
VFP_FLDMIAS(CC_AL,2,1,1);

//asm("fsubs s2, s0, s1");
VFP_FSUBS(CC_AL,2,0,1);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_subtract_f32_loop1");
asm("vfp_subtract_f32_unroll: movs ip, r3, lsr #3");  /* unroll by 8; ip = n / 8 */
asm("beq vfp_subtract_f32_end");        /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_subtract_f32_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fldmias r2!, {s16, s17, s18, s19, s20, s21, s22, s23}");
VFP_FLDMIAS(CC_AL,2,16,8);

//asm("fsubs s24, s8, s16");
VFP_FSUBS(CC_AL,24,8,16);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_subtract_f32_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_subtract_f32_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_add_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #3");                 /* ip = n % 4 */
asm("beq vfp_add_f64_unroll");          /* if ip == 0 skip to unrolled loop */
asm("vfp_add_f64_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);

//asm("faddd d2, d0, d1");
VFP_FADDD(,2,0,1);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_add_f64_loop1");
asm("vfp_add_f64_unroll:");             /* unroll by 4 */
asm("movs ip, r3, lsr #2");             /* ip = n / 4 */
asm("beq vfp_add_f64_end");             /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_add_f64_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);

//asm("faddd d12, d4, d8");
VFP_FADDD(,12,4,8);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_add_f64_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_add_f64_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
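/*
 * The f64 variants unroll by 4 rather than 8: under FPSCR.LEN the
 * double-precision registers form vector banks of four (d4-d7, d8-d11,
 * d12-d15), so a short vector of 4 is the widest available for doubles.
 */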
EXPORT_C __NAKED__ void vfp_abs_f32_f32_ns(float *d, const float *s, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r2, #7");                 /* ip = n % 8 */
asm("beq vfp_abs_f32_f32_ns_unroll");   /* if ip == 0 skip to unrolled loop */
asm("vfp_abs_f32_f32_ns_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fabss s2, s0");
VFP_FABSS(CC_AL,2,0);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_abs_f32_f32_ns_loop1");
asm("vfp_abs_f32_f32_ns_unroll:");      /* unroll by 8 */
asm("movs ip, r2, lsr #3");             /* ip = n / 8 */
asm("beq vfp_abs_f32_f32_ns_end");      /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_abs_f32_f32_ns_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fabss s24, s8");
VFP_FABSS(CC_AL,24,8);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_abs_f32_f32_ns_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_abs_f32_f32_ns_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_negative_f32(float *d, const float *s, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r2, #7");                 /* ip = n % 8 */
asm("beq vfp_negative_f32_unroll");     /* if ip == 0 skip to unrolled loop */
asm("vfp_negative_f32_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fnegs s2, s0");
VFP_FNEGS(CC_AL,2,0);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_negative_f32_loop1");
asm("vfp_negative_f32_unroll:");        /* unroll by 8 */
asm("movs ip, r2, lsr #3");             /* ip = n / 8 */
asm("beq vfp_negative_f32_end");        /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_negative_f32_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fnegs s24, s8");
VFP_FNEGS(CC_AL,24,8);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_negative_f32_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_negative_f32_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_abs_f64_f64_ns(double *d, const double *s, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r2, #3");                 /* ip = n % 4 */
asm("beq vfp_abs_f64_f64_ns_unroll");   /* if ip == 0 skip to unrolled loop */
asm("vfp_abs_f64_f64_ns_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("fabsd d2, d0");
VFP_FABSD(,2,0);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_abs_f64_f64_ns_loop1");
asm("vfp_abs_f64_f64_ns_unroll:");      /* unroll by 4 */
asm("movs ip, r2, lsr #2");             /* ip = n / 4 */
asm("beq vfp_abs_f64_f64_ns_end");      /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_abs_f64_f64_ns_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("fabsd d12, d4");
VFP_FABSD(,12,4);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_abs_f64_f64_ns_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_abs_f64_f64_ns_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_negative_f64(double *d, const double *s, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r2, #3");                 /* ip = n % 4 */
asm("beq vfp_negative_f64_unroll");     /* if ip == 0 skip to unrolled loop */
asm("vfp_negative_f64_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("fnegd d2, d0");
VFP_FNEGD(,2,0);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_negative_f64_loop1");
asm("vfp_negative_f64_unroll:");        /* unroll by 4 */
asm("movs ip, r2, lsr #2");             /* ip = n / 4 */
asm("beq vfp_negative_f64_end");        /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_negative_f64_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("fnegd d12, d4");
VFP_FNEGD(,12,4);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_negative_f64_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_negative_f64_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
//Rakhi changes
EXPORT_C __NAKED__ void vfp_divide_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #3");                 /* ip = n % 4 */
asm("beq vfp_divide_f64_unroll");       /* if ip == 0 skip to unrolled loop */
asm("vfp_divide_f64_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);

//asm("fdivd d2, d0, d1");
VFP_FDIVD(,2,0,1);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_divide_f64_loop1");
asm("vfp_divide_f64_unroll:");          /* unroll by 4 */
asm("movs ip, r3, lsr #2");             /* ip = n / 4 */
asm("beq vfp_divide_f64_end");          /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_divide_f64_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);

//asm("fdivd d12, d4, d8");
VFP_FDIVD(,12,4,8);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_divide_f64_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_divide_f64_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_multiply_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #3");                 /* ip = n % 4 */
asm("beq vfp_multiply_f64_unroll");     /* if ip == 0 skip to unrolled loop */
asm("vfp_multiply_f64_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);

//asm("fmuld d2, d0, d1");
VFP_FMULD(,2,0,1);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_multiply_f64_loop1");
asm("vfp_multiply_f64_unroll:");        /* unroll by 4 */
asm("movs ip, r3, lsr #2");             /* ip = n / 4 */
asm("beq vfp_multiply_f64_end");        /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_multiply_f64_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);

//asm("fmuld d12, d4, d8");
VFP_FMULD(,12,4,8);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_multiply_f64_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_multiply_f64_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_subtract_f64 (double *d, const double *s1, const double *s2, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */
asm("ands ip, r3, #3");                 /* ip = n % 4 */
asm("beq vfp_subtract_f64_unroll");     /* if ip == 0 skip to unrolled loop */
asm("vfp_subtract_f64_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("fldmiad r2!, {d1}");
VFP_FLDMIAD(CC_AL,2,1,1);

//asm("fsubd d2, d0, d1");
VFP_FSUBD(,2,0,1);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_subtract_f64_loop1");
asm("vfp_subtract_f64_unroll:");        /* unroll by 4 */
asm("movs ip, r3, lsr #2");             /* ip = n / 4 */
asm("beq vfp_subtract_f64_end");        /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_subtract_f64_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("fldmiad r2!, {d8, d9, d10, d11}");
VFP_FLDMIAD(CC_AL,2,8,4);

//asm("fsubd d12, d4, d8");
VFP_FSUBD(,12,4,8);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_subtract_f64_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_subtract_f64_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
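/*
 * The *_ns scalar variants load the scalar operand once into bank 0
 * (s1 or d1) before the loops. When FPSCR.LEN is set, a source register
 * in bank 0 is still read as a scalar, so each vector element of s1 is
 * combined with the same scalar value from s2_1.
 */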
EXPORT_C __NAKED__ void vfp_scalaradd_f32_ns (float *d, const float *s1, const float *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */

//asm("fldmias r2, {s1}");              /* load scalar value */
VFP_FLDMIAS(CC_AL,2,1,1);

asm("ands ip, r3, #7");                 /* ip = n % 8 */
asm("beq vfp_scalaradd_f32_ns_unroll"); /* if ip == 0 skip to unrolled loop */
asm("vfp_scalaradd_f32_ns_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fadds s2, s0, s1");
VFP_FADDS(CC_AL,2,0,1);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f32_ns_loop1");
asm("vfp_scalaradd_f32_ns_unroll:");    /* unroll by 8 */
asm("movs ip, r3, lsr #3");             /* ip = n / 8 */
asm("beq vfp_scalaradd_f32_ns_end");    /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_scalaradd_f32_ns_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fadds s24, s8, s1");
VFP_FADDS(CC_AL,24,8,1);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f32_ns_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_scalaradd_f32_ns_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_scalarmultiply_f32_ns (float *d, const float *s1, const float *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */

//asm("fldmias r2, {s1}");              /* load scalar value */
VFP_FLDMIAS(CC_AL,2,1,1);

asm("ands ip, r3, #7");                 /* ip = n % 8 */
asm("beq vfp_scalarmultiply_f32_ns_unroll"); /* if ip == 0 skip to unrolled loop */
asm("vfp_scalarmultiply_f32_ns_loop1:");

//asm("fldmias r1!, {s0}");
VFP_FLDMIAS(CC_AL,1,0,1);

//asm("fmuls s2, s0, s1");
VFP_FMULS(CC_AL,2,0,1);

//asm("fstmias r0!, {s2}");
VFP_FSTMIAS(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f32_ns_loop1");
asm("vfp_scalarmultiply_f32_ns_unroll:"); /* unroll by 8 */
asm("movs ip, r3, lsr #3");             /* ip = n / 8 */
asm("beq vfp_scalarmultiply_f32_ns_end"); /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #7");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 8 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_scalarmultiply_f32_ns_loop2:");

//asm("fldmias r1!, {s8, s9, s10, s11, s12, s13, s14, s15}");
VFP_FLDMIAS(CC_AL,1,8,8);

//asm("fmuls s24, s8, s1");
VFP_FMULS(CC_AL,24,8,1);

//asm("fstmias r0!, {s24, s25, s26, s27, s28, s29, s30, s31}");
VFP_FSTMIAS(CC_AL,0,24,8);

asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f32_ns_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_scalarmultiply_f32_ns_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_scalaradd_f64_ns (double *d, const double *s1, const double *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */

//asm("fldmiad r2, {d1}");              /* load scalar value */
VFP_FLDMIAD(CC_AL,2,1,1);

asm("ands ip, r3, #3");                 /* ip = n % 4 */
asm("beq vfp_scalaradd_f64_ns_unroll"); /* if ip == 0 skip to unrolled loop */
asm("vfp_scalaradd_f64_ns_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("faddd d2, d0, d1");
VFP_FADDD(,2,0,1);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f64_ns_loop1");
asm("vfp_scalaradd_f64_ns_unroll:");    /* unroll by 4 */
asm("movs ip, r3, lsr #2");             /* ip = n / 4 */
asm("beq vfp_scalaradd_f64_ns_end");    /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_scalaradd_f64_ns_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("faddd d12, d4, d1");
VFP_FADDD(,12,4,1);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_scalaradd_f64_ns_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_scalaradd_f64_ns_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
EXPORT_C __NAKED__ void vfp_scalarmultiply_f64_ns (double *d, const double *s1, const double *s2_1, int n)
{
asm("stmdb sp!, {fp, lr}");             /* save registers to stack */

//asm("fldmiad r2, {d1}");              /* load scalar value */
VFP_FLDMIAD(CC_AL,2,1,1);

asm("ands ip, r3, #3");                 /* ip = n % 4 */
asm("beq vfp_scalarmultiply_f64_ns_unroll"); /* if ip == 0 skip to unrolled loop */
asm("vfp_scalarmultiply_f64_ns_loop1:");

//asm("fldmiad r1!, {d0}");
VFP_FLDMIAD(CC_AL,1,0,1);

//asm("fmuld d2, d0, d1");
VFP_FMULD(,2,0,1);

//asm("fstmiad r0!, {d2}");
VFP_FSTMIAD(CC_AL,0,2,1);

asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f64_ns_loop1");
asm("vfp_scalarmultiply_f64_ns_unroll:"); /* unroll by 4 */
asm("movs ip, r3, lsr #2");             /* ip = n / 4 */
asm("beq vfp_scalarmultiply_f64_ns_end"); /* if ip == 0 goto finish */

//asm("fmrx lr, fpscr");                /* read fpscr register into arm */
VFP_FMRX(,14,VFP_XREG_FPSCR);

asm("mov fp, #3");
asm("orr fp, lr, fp, lsl #16");         /* set vector length to 4 */

//asm("fmxr fpscr, fp");
VFP_FMXR(,VFP_XREG_FPSCR,11);

asm("vfp_scalarmultiply_f64_ns_loop2:");

//asm("fldmiad r1!, {d4, d5, d6, d7}");
VFP_FLDMIAD(CC_AL,1,4,4);

//asm("fmuld d12, d4, d1");
VFP_FMULD(,12,4,1);

//asm("fstmiad r0!, {d12, d13, d14, d15}");
VFP_FSTMIAD(CC_AL,0,12,4);

asm("subs ip, ip, #1");
asm("bne vfp_scalarmultiply_f64_ns_loop2");

//asm("fmxr fpscr, lr");                /* restore original fpscr */
VFP_FMXR(,VFP_XREG_FPSCR,14);

asm("vfp_scalarmultiply_f64_ns_end:");
asm("ldmia sp!, {fp, pc}");             /* restore from stack and return */
}
}

#endif