/*
 * ARM NEON vector operations.
 *
 * Copyright (c) 2007, 2008 CodeSourcery.
 * Written by Paul Brook
 *
 * This code is licenced under the GNU GPL v2.
 */
#include <stdlib.h>
#include <stdio.h>

#include "cpu.h"
#include "exec-all.h"
#include "helpers.h"

#define SIGNBIT (uint32_t)0x80000000
#define SIGNBIT64 ((uint64_t)1 << 63)

|
#define SET_QC() env->vfp.xregs[ARM_VFP_FPSCR] |= CPSR_Q
|
20 |
|
21 static float_status neon_float_status; |
|
22 #define NFS &neon_float_status |
|
23 |
|
24 /* Helper routines to perform bitwise copies between float and int. */ |
|
25 static inline float32 vfp_itos(uint32_t i) |
|
26 { |
|
27 union { |
|
28 uint32_t i; |
|
29 float32 s; |
|
30 } v; |
|
31 |
|
32 v.i = i; |
|
33 return v.s; |
|
34 } |
|
35 |
|
36 static inline uint32_t vfp_stoi(float32 s) |
|
37 { |
|
38 union { |
|
39 uint32_t i; |
|
40 float32 s; |
|
41 } v; |
|
42 |
|
43 v.s = s; |
|
44 return v.i; |
|
45 } |
|
46 |
|
47 #define NEON_TYPE1(name, type) \ |
|
48 typedef struct \ |
|
49 { \ |
|
50 type v1; \ |
|
51 } neon_##name; |
|
52 #ifdef WORDS_BIGENDIAN |
|
53 #define NEON_TYPE2(name, type) \ |
|
54 typedef struct \ |
|
55 { \ |
|
56 type v2; \ |
|
57 type v1; \ |
|
58 } neon_##name; |
|
59 #define NEON_TYPE4(name, type) \ |
|
60 typedef struct \ |
|
61 { \ |
|
62 type v4; \ |
|
63 type v3; \ |
|
64 type v2; \ |
|
65 type v1; \ |
|
66 } neon_##name; |
|
67 #else |
|
68 #define NEON_TYPE2(name, type) \ |
|
69 typedef struct \ |
|
70 { \ |
|
71 type v1; \ |
|
72 type v2; \ |
|
73 } neon_##name; |
|
74 #define NEON_TYPE4(name, type) \ |
|
75 typedef struct \ |
|
76 { \ |
|
77 type v1; \ |
|
78 type v2; \ |
|
79 type v3; \ |
|
80 type v4; \ |
|
81 } neon_##name; |
|
82 #endif |
|
83 |
|
84 NEON_TYPE4(s8, int8_t) |
|
85 NEON_TYPE4(u8, uint8_t) |
|
86 NEON_TYPE2(s16, int16_t) |
|
87 NEON_TYPE2(u16, uint16_t) |
|
88 NEON_TYPE1(s32, int32_t) |
|
89 NEON_TYPE1(u32, uint32_t) |
|
90 #undef NEON_TYPE4 |
|
91 #undef NEON_TYPE2 |
|
92 #undef NEON_TYPE1 |
|
93 |
|
94 /* Copy from a uint32_t to a vector structure type. */ |
|
95 #define NEON_UNPACK(vtype, dest, val) do { \ |
|
96 union { \ |
|
97 vtype v; \ |
|
98 uint32_t i; \ |
|
99 } conv_u; \ |
|
100 conv_u.i = (val); \ |
|
101 dest = conv_u.v; \ |
|
102 } while(0) |
|
103 |
|
104 /* Copy from a vector structure type to a uint32_t. */ |
|
105 #define NEON_PACK(vtype, dest, val) do { \ |
|
106 union { \ |
|
107 vtype v; \ |
|
108 uint32_t i; \ |
|
109 } conv_u; \ |
|
110 conv_u.v = (val); \ |
|
111 dest = conv_u.i; \ |
|
112 } while(0) |
|
113 |
|
114 #define NEON_DO1 \ |
|
115 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); |
|
116 #define NEON_DO2 \ |
|
117 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ |
|
118 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); |
|
119 #define NEON_DO4 \ |
|
120 NEON_FN(vdest.v1, vsrc1.v1, vsrc2.v1); \ |
|
121 NEON_FN(vdest.v2, vsrc1.v2, vsrc2.v2); \ |
|
122 NEON_FN(vdest.v3, vsrc1.v3, vsrc2.v3); \ |
|
123 NEON_FN(vdest.v4, vsrc1.v4, vsrc2.v4); |
|
124 |
|
125 #define NEON_VOP_BODY(vtype, n) \ |
|
126 { \ |
|
127 uint32_t res; \ |
|
128 vtype vsrc1; \ |
|
129 vtype vsrc2; \ |
|
130 vtype vdest; \ |
|
131 NEON_UNPACK(vtype, vsrc1, arg1); \ |
|
132 NEON_UNPACK(vtype, vsrc2, arg2); \ |
|
133 NEON_DO##n; \ |
|
134 NEON_PACK(vtype, res, vdest); \ |
|
135 return res; \ |
|
136 } |
|
137 |
|
138 #define NEON_VOP(name, vtype, n) \ |
|
139 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ |
|
140 NEON_VOP_BODY(vtype, n) |
|
141 |
|
142 #define NEON_VOP_ENV(name, vtype, n) \ |
|
143 uint32_t HELPER(glue(neon_,name))(CPUState *env, uint32_t arg1, uint32_t arg2) \ |
|
144 NEON_VOP_BODY(vtype, n) |
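/* For illustration (editor's note, not part of the original source): with
 * #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1
 * in scope, the invocation "NEON_VOP(hadd_u8, neon_u8, 4)" below expands to
 * roughly:
 *
 *   uint32_t HELPER(neon_hadd_u8)(uint32_t arg1, uint32_t arg2)
 *   {
 *       uint32_t res;
 *       neon_u8 vsrc1, vsrc2, vdest;
 *       NEON_UNPACK(neon_u8, vsrc1, arg1);
 *       NEON_UNPACK(neon_u8, vsrc2, arg2);
 *       vdest.v1 = (vsrc1.v1 + vsrc2.v1) >> 1;   /. ...likewise v2..v4 ./
 *       NEON_PACK(neon_u8, res, vdest);
 *       return res;
 *   }
 *
 * i.e. each helper operates on four 8-bit (or two 16-bit, or one 32-bit)
 * lanes packed into a 32-bit argument.
 */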
|
145 |
|
146 /* Pairwise operations. */ |
|
147 /* For 32-bit elements each segment only contains a single element, so |
|
148 the elementwise and pairwise operations are the same. */ |
|
149 #define NEON_PDO2 \ |
|
150 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ |
|
151 NEON_FN(vdest.v2, vsrc2.v1, vsrc2.v2); |
|
152 #define NEON_PDO4 \ |
|
153 NEON_FN(vdest.v1, vsrc1.v1, vsrc1.v2); \ |
|
154 NEON_FN(vdest.v2, vsrc1.v3, vsrc1.v4); \ |
|
155 NEON_FN(vdest.v3, vsrc2.v1, vsrc2.v2); \ |
|
156 NEON_FN(vdest.v4, vsrc2.v3, vsrc2.v4); \ |
|
157 |
|
158 #define NEON_POP(name, vtype, n) \ |
|
159 uint32_t HELPER(glue(neon_,name))(uint32_t arg1, uint32_t arg2) \ |
|
160 { \ |
|
161 uint32_t res; \ |
|
162 vtype vsrc1; \ |
|
163 vtype vsrc2; \ |
|
164 vtype vdest; \ |
|
165 NEON_UNPACK(vtype, vsrc1, arg1); \ |
|
166 NEON_UNPACK(vtype, vsrc2, arg2); \ |
|
167 NEON_PDO##n; \ |
|
168 NEON_PACK(vtype, res, vdest); \ |
|
169 return res; \ |
|
170 } |
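/* Note (added for clarity, following NEON_PDO4 above): a pairwise op combines
 * adjacent lanes, with the first operand supplying the low half of the result
 * and the second operand the high half.  For four 8-bit lanes,
 * padd_u8(arg1, arg2) therefore computes
 *
 *   result.v1 = arg1.v1 + arg1.v2;
 *   result.v2 = arg1.v3 + arg1.v4;
 *   result.v3 = arg2.v1 + arg2.v2;
 *   result.v4 = arg2.v3 + arg2.v4;
 */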
|
171 |
|
172 /* Unary operators. */ |
|
173 #define NEON_VOP1(name, vtype, n) \ |
|
174 uint32_t HELPER(glue(neon_,name))(uint32_t arg) \ |
|
175 { \ |
|
176 vtype vsrc1; \ |
|
177 vtype vdest; \ |
|
178 NEON_UNPACK(vtype, vsrc1, arg); \ |
|
179 NEON_DO##n; \ |
|
180 NEON_PACK(vtype, arg, vdest); \ |
|
181 return arg; \ |
|
182 } |
|
183 |
|
184 |
|
185 #define NEON_USAT(dest, src1, src2, type) do { \ |
|
186 uint32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ |
|
187 if (tmp != (type)tmp) { \ |
|
188 SET_QC(); \ |
|
189 dest = ~0; \ |
|
190 } else { \ |
|
191 dest = tmp; \ |
|
192 }} while(0) |
|
193 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) |
|
194 NEON_VOP_ENV(qadd_u8, neon_u8, 4) |
|
195 #undef NEON_FN |
|
196 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) |
|
197 NEON_VOP_ENV(qadd_u16, neon_u16, 2) |
|
198 #undef NEON_FN |
|
199 #undef NEON_USAT |
|
200 |
|
201 #define NEON_SSAT(dest, src1, src2, type) do { \ |
|
202 int32_t tmp = (uint32_t)src1 + (uint32_t)src2; \ |
|
203 if (tmp != (type)tmp) { \ |
|
204 SET_QC(); \ |
|
205 if (src2 > 0) { \ |
|
206 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ |
|
207 } else { \ |
|
208 tmp = 1 << (sizeof(type) * 8 - 1); \ |
|
209 } \ |
|
210 } \ |
|
211 dest = tmp; \ |
|
212 } while(0) |
|
213 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) |
|
214 NEON_VOP_ENV(qadd_s8, neon_s8, 4) |
|
215 #undef NEON_FN |
|
216 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) |
|
217 NEON_VOP_ENV(qadd_s16, neon_s16, 2) |
|
218 #undef NEON_FN |
|
219 #undef NEON_SSAT |
|
220 |
|
221 #define NEON_USAT(dest, src1, src2, type) do { \ |
|
222 uint32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ |
|
223 if (tmp != (type)tmp) { \ |
|
224 SET_QC(); \ |
|
225 dest = 0; \ |
|
226 } else { \ |
|
227 dest = tmp; \ |
|
228 }} while(0) |
|
229 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint8_t) |
|
230 NEON_VOP_ENV(qsub_u8, neon_u8, 4) |
|
231 #undef NEON_FN |
|
232 #define NEON_FN(dest, src1, src2) NEON_USAT(dest, src1, src2, uint16_t) |
|
233 NEON_VOP_ENV(qsub_u16, neon_u16, 2) |
|
234 #undef NEON_FN |
|
235 #undef NEON_USAT |
|
236 |
|
237 #define NEON_SSAT(dest, src1, src2, type) do { \ |
|
238 int32_t tmp = (uint32_t)src1 - (uint32_t)src2; \ |
|
239 if (tmp != (type)tmp) { \ |
|
240 SET_QC(); \ |
|
241 if (src2 < 0) { \ |
|
242 tmp = (1 << (sizeof(type) * 8 - 1)) - 1; \ |
|
243 } else { \ |
|
244 tmp = 1 << (sizeof(type) * 8 - 1); \ |
|
245 } \ |
|
246 } \ |
|
247 dest = tmp; \ |
|
248 } while(0) |
|
249 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int8_t) |
|
250 NEON_VOP_ENV(qsub_s8, neon_s8, 4) |
|
251 #undef NEON_FN |
|
252 #define NEON_FN(dest, src1, src2) NEON_SSAT(dest, src1, src2, int16_t) |
|
253 NEON_VOP_ENV(qsub_s16, neon_s16, 2) |
|
254 #undef NEON_FN |
|
255 #undef NEON_SSAT |
|
256 |
|
257 #define NEON_FN(dest, src1, src2) dest = (src1 + src2) >> 1 |
|
258 NEON_VOP(hadd_s8, neon_s8, 4) |
|
259 NEON_VOP(hadd_u8, neon_u8, 4) |
|
260 NEON_VOP(hadd_s16, neon_s16, 2) |
|
261 NEON_VOP(hadd_u16, neon_u16, 2) |
|
262 #undef NEON_FN |
|
263 |
|
264 int32_t HELPER(neon_hadd_s32)(int32_t src1, int32_t src2) |
|
265 { |
|
266 int32_t dest; |
|
267 |
|
268 dest = (src1 >> 1) + (src2 >> 1); |
|
269 if (src1 & src2 & 1) |
|
270 dest++; |
|
271 return dest; |
|
272 } |
|
273 |
|
274 uint32_t HELPER(neon_hadd_u32)(uint32_t src1, uint32_t src2) |
|
275 { |
|
276 uint32_t dest; |
|
277 |
|
278 dest = (src1 >> 1) + (src2 >> 1); |
|
279 if (src1 & src2 & 1) |
|
280 dest++; |
|
281 return dest; |
|
282 } |
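/* Note (added for clarity): the two 32-bit halving-add helpers above rely on
 * the identity (a + b) >> 1 == (a >> 1) + (b >> 1) + (a & b & 1), which
 * avoids the overflow that computing a + b directly in 32 bits could cause.
 * E.g. a = 7, b = 5: (7 >> 1) + (5 >> 1) = 3 + 2 = 5, plus the carry bit
 * (7 & 5 & 1) = 1 gives 6 == (7 + 5) >> 1.
 */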
|
283 |
|
284 #define NEON_FN(dest, src1, src2) dest = (src1 + src2 + 1) >> 1 |
|
285 NEON_VOP(rhadd_s8, neon_s8, 4) |
|
286 NEON_VOP(rhadd_u8, neon_u8, 4) |
|
287 NEON_VOP(rhadd_s16, neon_s16, 2) |
|
288 NEON_VOP(rhadd_u16, neon_u16, 2) |
|
289 #undef NEON_FN |
|
290 |
|
291 int32_t HELPER(neon_rhadd_s32)(int32_t src1, int32_t src2) |
|
292 { |
|
293 int32_t dest; |
|
294 |
|
295 dest = (src1 >> 1) + (src2 >> 1); |
|
296 if ((src1 | src2) & 1) |
|
297 dest++; |
|
298 return dest; |
|
299 } |
|
300 |
|
301 uint32_t HELPER(neon_rhadd_u32)(uint32_t src1, uint32_t src2) |
|
302 { |
|
303 uint32_t dest; |
|
304 |
|
305 dest = (src1 >> 1) + (src2 >> 1); |
|
306 if ((src1 | src2) & 1) |
|
307 dest++; |
|
308 return dest; |
|
309 } |
|
310 |
|
311 #define NEON_FN(dest, src1, src2) dest = (src1 - src2) >> 1 |
|
312 NEON_VOP(hsub_s8, neon_s8, 4) |
|
313 NEON_VOP(hsub_u8, neon_u8, 4) |
|
314 NEON_VOP(hsub_s16, neon_s16, 2) |
|
315 NEON_VOP(hsub_u16, neon_u16, 2) |
|
316 #undef NEON_FN |
|
317 |
|
318 int32_t HELPER(neon_hsub_s32)(int32_t src1, int32_t src2) |
|
319 { |
|
320 int32_t dest; |
|
321 |
|
322 dest = (src1 >> 1) - (src2 >> 1); |
|
323 if ((~src1) & src2 & 1) |
|
324 dest--; |
|
325 return dest; |
|
326 } |
|
327 |
|
328 uint32_t HELPER(neon_hsub_u32)(uint32_t src1, uint32_t src2) |
|
329 { |
|
330 uint32_t dest; |
|
331 |
|
332 dest = (src1 >> 1) - (src2 >> 1); |
|
333 if ((~src1) & src2 & 1) |
|
334 dest--; |
|
335 return dest; |
|
336 } |
|
337 |
|
338 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? ~0 : 0 |
|
339 NEON_VOP(cgt_s8, neon_s8, 4) |
|
340 NEON_VOP(cgt_u8, neon_u8, 4) |
|
341 NEON_VOP(cgt_s16, neon_s16, 2) |
|
342 NEON_VOP(cgt_u16, neon_u16, 2) |
|
343 NEON_VOP(cgt_s32, neon_s32, 1) |
|
344 NEON_VOP(cgt_u32, neon_u32, 1) |
|
345 #undef NEON_FN |
|
346 |
|
347 #define NEON_FN(dest, src1, src2) dest = (src1 >= src2) ? ~0 : 0 |
|
348 NEON_VOP(cge_s8, neon_s8, 4) |
|
349 NEON_VOP(cge_u8, neon_u8, 4) |
|
350 NEON_VOP(cge_s16, neon_s16, 2) |
|
351 NEON_VOP(cge_u16, neon_u16, 2) |
|
352 NEON_VOP(cge_s32, neon_s32, 1) |
|
353 NEON_VOP(cge_u32, neon_u32, 1) |
|
354 #undef NEON_FN |
|
355 |
|
356 #define NEON_FN(dest, src1, src2) dest = (src1 < src2) ? src1 : src2 |
|
357 NEON_VOP(min_s8, neon_s8, 4) |
|
358 NEON_VOP(min_u8, neon_u8, 4) |
|
359 NEON_VOP(min_s16, neon_s16, 2) |
|
360 NEON_VOP(min_u16, neon_u16, 2) |
|
361 NEON_VOP(min_s32, neon_s32, 1) |
|
362 NEON_VOP(min_u32, neon_u32, 1) |
|
363 NEON_POP(pmin_s8, neon_s8, 4) |
|
364 NEON_POP(pmin_u8, neon_u8, 4) |
|
365 NEON_POP(pmin_s16, neon_s16, 2) |
|
366 NEON_POP(pmin_u16, neon_u16, 2) |
|
367 #undef NEON_FN |
|
368 |
|
369 #define NEON_FN(dest, src1, src2) dest = (src1 > src2) ? src1 : src2 |
|
370 NEON_VOP(max_s8, neon_s8, 4) |
|
371 NEON_VOP(max_u8, neon_u8, 4) |
|
372 NEON_VOP(max_s16, neon_s16, 2) |
|
373 NEON_VOP(max_u16, neon_u16, 2) |
|
374 NEON_VOP(max_s32, neon_s32, 1) |
|
375 NEON_VOP(max_u32, neon_u32, 1) |
|
376 NEON_POP(pmax_s8, neon_s8, 4) |
|
377 NEON_POP(pmax_u8, neon_u8, 4) |
|
378 NEON_POP(pmax_s16, neon_s16, 2) |
|
379 NEON_POP(pmax_u16, neon_u16, 2) |
|
380 #undef NEON_FN |
|
381 |
|
382 #define NEON_FN(dest, src1, src2) \ |
|
383 dest = (src1 > src2) ? (src1 - src2) : (src2 - src1) |
|
384 NEON_VOP(abd_s8, neon_s8, 4) |
|
385 NEON_VOP(abd_u8, neon_u8, 4) |
|
386 NEON_VOP(abd_s16, neon_s16, 2) |
|
387 NEON_VOP(abd_u16, neon_u16, 2) |
|
388 NEON_VOP(abd_s32, neon_s32, 1) |
|
389 NEON_VOP(abd_u32, neon_u32, 1) |
|
390 #undef NEON_FN |
|
391 |
|
392 #define NEON_FN(dest, src1, src2) do { \ |
|
393 int8_t tmp; \ |
|
394 tmp = (int8_t)src2; \ |
|
395 if (abs(tmp) >= sizeof(src1) * 8) { \ |
|
396 dest = 0; \ |
|
397 } else if (tmp < 0) { \ |
|
398 dest = src1 >> -tmp; \ |
|
399 } else { \ |
|
400 dest = src1 << tmp; \ |
|
401 }} while (0) |
|
402 NEON_VOP(shl_u8, neon_u8, 4) |
|
403 NEON_VOP(shl_u16, neon_u16, 2) |
|
404 NEON_VOP(shl_u32, neon_u32, 1) |
|
405 #undef NEON_FN |
|
406 |
|
407 uint64_t HELPER(neon_shl_u64)(uint64_t val, uint64_t shiftop) |
|
408 { |
|
409 int8_t shift = (int8_t)shiftop; |
|
410 if (shift >= 64 || shift <= -64) { |
|
411 val = 0; |
|
412 } else if (shift < 0) { |
|
413 val >>= -shift; |
|
414 } else { |
|
415 val <<= shift; |
|
416 } |
|
417 return val; |
|
418 } |
|
419 |
|
420 #define NEON_FN(dest, src1, src2) do { \ |
|
421 int8_t tmp; \ |
|
422 tmp = (int8_t)src2; \ |
|
423 if (tmp >= sizeof(src1) * 8) { \ |
|
424 dest = 0; \ |
|
425 } else if (tmp <= -(int) (sizeof(src1) * 8)) { \ |
|
426 dest = src1 >> (sizeof(src1) * 8 - 1); \ |
|
427 } else if (tmp < 0) { \ |
|
428 dest = src1 >> -tmp; \ |
|
429 } else { \ |
|
430 dest = src1 << tmp; \ |
|
431 }} while (0) |
|
432 NEON_VOP(shl_s8, neon_s8, 4) |
|
433 NEON_VOP(shl_s16, neon_s16, 2) |
|
434 NEON_VOP(shl_s32, neon_s32, 1) |
|
435 #undef NEON_FN |
|
436 |
|
437 uint64_t HELPER(neon_shl_s64)(uint64_t valop, uint64_t shiftop) |
|
438 { |
|
439 int8_t shift = (int8_t)shiftop; |
|
440 int64_t val = valop; |
|
441 if (shift >= 64) { |
|
442 val = 0; |
|
443 } else if (shift <= -64) { |
|
444 val >>= 63; |
|
445 } else if (shift < 0) { |
|
446 val >>= -shift; |
|
447 } else { |
|
448 val <<= shift; |
|
449 } |
|
450 return val; |
|
451 } |
|
452 |
|
453 #define NEON_FN(dest, src1, src2) do { \ |
|
454 int8_t tmp; \ |
|
455 tmp = (int8_t)src2; \ |
|
456 if (tmp >= sizeof(src1) * 8) { \ |
|
457 dest = 0; \ |
|
458 } else if (tmp < -(int) (sizeof(src1) * 8)) { \ |
|
459 dest = src1 >> (sizeof(src1) * 8 - 1); \ |
|
460 } else if (tmp == -(int) (sizeof(src1) * 8)) { \ |
|
461 dest = src1 >> (tmp - 1); \ |
|
462 dest++; \ |
|
463 src2 >>= 1; \ |
|
464 } else if (tmp < 0) { \ |
|
465 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ |
|
466 } else { \ |
|
467 dest = src1 << tmp; \ |
|
468 }} while (0) |
|
469 NEON_VOP(rshl_s8, neon_s8, 4) |
|
470 NEON_VOP(rshl_s16, neon_s16, 2) |
|
471 NEON_VOP(rshl_s32, neon_s32, 1) |
|
472 #undef NEON_FN |
|
473 |
|
474 uint64_t HELPER(neon_rshl_s64)(uint64_t valop, uint64_t shiftop) |
|
475 { |
|
476 int8_t shift = (int8_t)shiftop; |
|
477 int64_t val = valop; |
|
478 if (shift >= 64) { |
|
479 val = 0; |
|
480 } else if (shift < -64) { |
|
481 val >>= 63; |
|
482 } else if (shift == -63) { |
|
483 val >>= 63; |
|
484 val++; |
|
485 val >>= 1; |
|
486 } else if (shift < 0) { |
|
487 val = (val + ((int64_t)1 << (-1 - shift))) >> -shift; |
|
488 } else { |
|
489 val <<= shift; |
|
490 } |
|
491 return val; |
|
492 } |
|
493 |
|
494 #define NEON_FN(dest, src1, src2) do { \ |
|
495 int8_t tmp; \ |
|
496 tmp = (int8_t)src2; \ |
|
497 if (abs(tmp) >= sizeof(src1) * 8) { \ |
|
498 dest = 0; \ |
|
499 } else if (tmp == -(int) (sizeof(src1) * 8)) { \ |
|
500 dest = src1 >> (tmp - 1); \ |
|
501 } else if (tmp < 0) { \ |
|
502 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ |
|
503 } else { \ |
|
504 dest = src1 << tmp; \ |
|
505 }} while (0) |
|
506 NEON_VOP(rshl_u8, neon_u8, 4) |
|
507 NEON_VOP(rshl_u16, neon_u16, 2) |
|
508 NEON_VOP(rshl_u32, neon_u32, 1) |
|
509 #undef NEON_FN |
|
510 |
|
uint64_t HELPER(neon_rshl_u64)(uint64_t val, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    if (shift >= 64 || shift < -64) {
        val = 0;
    } else if (shift == -64) {
        /* Rounding a 1-bit result just preserves that bit. */
        val >>= 63;
    } else if (shift < 0) {
        val = (val + ((uint64_t)1 << (-1 - shift))) >> -shift;
    } else {
        val <<= shift;
    }
    return val;
}
|
527 |
|
528 #define NEON_FN(dest, src1, src2) do { \ |
|
529 int8_t tmp; \ |
|
530 tmp = (int8_t)src2; \ |
|
531 if (tmp >= sizeof(src1) * 8) { \ |
|
532 if (src1) { \ |
|
533 SET_QC(); \ |
|
534 dest = ~0; \ |
|
535 } else { \ |
|
536 dest = 0; \ |
|
537 } \ |
|
538 } else if (tmp <= -(int) (sizeof(src1) * 8)) { \ |
|
539 dest = 0; \ |
|
540 } else if (tmp < 0) { \ |
|
541 dest = src1 >> -tmp; \ |
|
542 } else { \ |
|
543 dest = src1 << tmp; \ |
|
544 if ((dest >> tmp) != src1) { \ |
|
545 SET_QC(); \ |
|
546 dest = ~0; \ |
|
547 } \ |
|
548 }} while (0) |
|
549 NEON_VOP_ENV(qshl_u8, neon_u8, 4) |
|
550 NEON_VOP_ENV(qshl_u16, neon_u16, 2) |
|
551 NEON_VOP_ENV(qshl_u32, neon_u32, 1) |
|
552 #undef NEON_FN |
|
553 |
|
554 uint64_t HELPER(neon_qshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop) |
|
555 { |
|
556 int8_t shift = (int8_t)shiftop; |
|
557 if (shift >= 64) { |
|
558 if (val) { |
|
559 val = ~(uint64_t)0; |
|
560 SET_QC(); |
|
561 } else { |
|
562 val = 0; |
|
563 } |
|
564 } else if (shift <= -64) { |
|
565 val = 0; |
|
566 } else if (shift < 0) { |
|
567 val >>= -shift; |
|
568 } else { |
|
569 uint64_t tmp = val; |
|
570 val <<= shift; |
|
571 if ((val >> shift) != tmp) { |
|
572 SET_QC(); |
|
573 val = ~(uint64_t)0; |
|
574 } |
|
575 } |
|
576 return val; |
|
577 } |
|
578 |
|
579 #define NEON_FN(dest, src1, src2) do { \ |
|
580 int8_t tmp; \ |
|
581 tmp = (int8_t)src2; \ |
|
582 if (tmp >= sizeof(src1) * 8) { \ |
|
583 if (src1) \ |
|
584 SET_QC(); \ |
|
585 dest = src1 >> 31; \ |
|
586 } else if (tmp <= -(int) (sizeof(src1) * 8)) { \ |
|
587 dest = src1 >> 31; \ |
|
588 } else if (tmp < 0) { \ |
|
589 dest = src1 >> -tmp; \ |
|
590 } else { \ |
|
591 dest = src1 << tmp; \ |
|
592 if ((dest >> tmp) != src1) { \ |
|
593 SET_QC(); \ |
|
594 dest = src2 >> 31; \ |
|
595 } \ |
|
596 }} while (0) |
|
597 NEON_VOP_ENV(qshl_s8, neon_s8, 4) |
|
598 NEON_VOP_ENV(qshl_s16, neon_s16, 2) |
|
599 NEON_VOP_ENV(qshl_s32, neon_s32, 1) |
|
600 #undef NEON_FN |
|
601 |
|
uint64_t HELPER(neon_qshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;
    if (shift >= 64) {
        if (val) {
            SET_QC();
            val = (val >> 63) ^ ~SIGNBIT64;
        }
    } else if (shift <= -64) {
        val >>= 63;
    } else if (shift < 0) {
        val >>= -shift;
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = (tmp >> 63) ^ ~SIGNBIT64;
        }
    }
    return val;
}
|
625 |
|
626 |
|
627 /* FIXME: This is wrong. */ |
|
628 #define NEON_FN(dest, src1, src2) do { \ |
|
629 int8_t tmp; \ |
|
630 tmp = (int8_t)src2; \ |
|
631 if (tmp < 0) { \ |
|
632 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ |
|
633 } else { \ |
|
634 dest = src1 << tmp; \ |
|
635 if ((dest >> tmp) != src1) { \ |
|
636 SET_QC(); \ |
|
637 dest = ~0; \ |
|
638 } \ |
|
639 }} while (0) |
|
640 NEON_VOP_ENV(qrshl_u8, neon_u8, 4) |
|
641 NEON_VOP_ENV(qrshl_u16, neon_u16, 2) |
|
642 NEON_VOP_ENV(qrshl_u32, neon_u32, 1) |
|
643 #undef NEON_FN |
|
644 |
|
uint64_t HELPER(neon_qrshl_u64)(CPUState *env, uint64_t val, uint64_t shiftop)
{
    int8_t shift = (int8_t)shiftop;
    if (shift < 0) {
        val = (val + (1 << (-1 - shift))) >> -shift;
    } else {
        uint64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = ~0;
        }
    }
    return val;
}
|
660 |
|
661 #define NEON_FN(dest, src1, src2) do { \ |
|
662 int8_t tmp; \ |
|
663 tmp = (int8_t)src2; \ |
|
664 if (tmp < 0) { \ |
|
665 dest = (src1 + (1 << (-1 - tmp))) >> -tmp; \ |
|
666 } else { \ |
|
667 dest = src1 << tmp; \ |
|
668 if ((dest >> tmp) != src1) { \ |
|
669 SET_QC(); \ |
|
670 dest = src1 >> 31; \ |
|
671 } \ |
|
672 }} while (0) |
|
673 NEON_VOP_ENV(qrshl_s8, neon_s8, 4) |
|
674 NEON_VOP_ENV(qrshl_s16, neon_s16, 2) |
|
675 NEON_VOP_ENV(qrshl_s32, neon_s32, 1) |
|
676 #undef NEON_FN |
|
677 |
|
uint64_t HELPER(neon_qrshl_s64)(CPUState *env, uint64_t valop, uint64_t shiftop)
{
    int8_t shift = (uint8_t)shiftop;
    int64_t val = valop;

    if (shift < 0) {
        val = (val + (1 << (-1 - shift))) >> -shift;
    } else {
        int64_t tmp = val;
        val <<= shift;
        if ((val >> shift) != tmp) {
            SET_QC();
            val = tmp >> 31;
        }
    }
    return val;
}
|
695 |
|
696 uint32_t HELPER(neon_add_u8)(uint32_t a, uint32_t b) |
|
697 { |
|
698 uint32_t mask; |
|
699 mask = (a ^ b) & 0x80808080u; |
|
700 a &= ~0x80808080u; |
|
701 b &= ~0x80808080u; |
|
702 return (a + b) ^ mask; |
|
703 } |
|
704 |
|
705 uint32_t HELPER(neon_add_u16)(uint32_t a, uint32_t b) |
|
706 { |
|
707 uint32_t mask; |
|
708 mask = (a ^ b) & 0x80008000u; |
|
709 a &= ~0x80008000u; |
|
710 b &= ~0x80008000u; |
|
711 return (a + b) ^ mask; |
|
712 } |
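/* Note (added for clarity): the two helpers above add packed lanes without
 * letting carries cross lane boundaries.  Clearing the top bit of every lane
 * before the add stops carry propagation into the next lane, and XORing the
 * saved top bits back in reproduces each lane's own (wrapping) top-bit sum.
 * E.g. for add_u8 with lane values 0x80 and 0x01: mask = 0x80, cleared sum =
 * 0x00 + 0x01 = 0x01, result = 0x01 ^ 0x80 = 0x81, as expected.
 */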
|
713 |
|
714 #define NEON_FN(dest, src1, src2) dest = src1 + src2 |
|
715 NEON_POP(padd_u8, neon_u8, 4) |
|
716 NEON_POP(padd_u16, neon_u16, 2) |
|
717 #undef NEON_FN |
|
718 |
|
719 #define NEON_FN(dest, src1, src2) dest = src1 - src2 |
|
720 NEON_VOP(sub_u8, neon_u8, 4) |
|
721 NEON_VOP(sub_u16, neon_u16, 2) |
|
722 #undef NEON_FN |
|
723 |
|
724 #define NEON_FN(dest, src1, src2) dest = src1 * src2 |
|
725 NEON_VOP(mul_u8, neon_u8, 4) |
|
726 NEON_VOP(mul_u16, neon_u16, 2) |
|
727 #undef NEON_FN |
|
728 |
|
729 /* Polynomial multiplication is like integer multiplication except the |
|
730 partial products are XORed, not added. */ |
|
731 uint32_t HELPER(neon_mul_p8)(uint32_t op1, uint32_t op2) |
|
732 { |
|
733 uint32_t mask; |
|
734 uint32_t result; |
|
735 result = 0; |
|
736 while (op1) { |
|
737 mask = 0; |
|
738 if (op1 & 1) |
|
739 mask |= 0xff; |
|
740 if (op1 & (1 << 8)) |
|
741 mask |= (0xff << 8); |
|
742 if (op1 & (1 << 16)) |
|
743 mask |= (0xff << 16); |
|
744 if (op1 & (1 << 24)) |
|
745 mask |= (0xff << 24); |
|
746 result ^= op2 & mask; |
|
747 op1 = (op1 >> 1) & 0x7f7f7f7f; |
|
748 op2 = (op2 << 1) & 0xfefefefe; |
|
749 } |
|
750 return result; |
|
751 } |
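/* Note (added for clarity): each loop iteration above conditionally XORs a
 * shifted copy of op2 into the result, one partial product per set bit of
 * op1, independently for each byte lane (the 0x7f7f7f7f/0xfefefefe masks keep
 * a lane's bits from leaking into its neighbour).  Carry-less multiplication
 * differs from integer multiplication once partial products overlap, e.g.
 * 0x03 * 0x03 = 0x05 here (0x03 ^ 0x06), whereas the integer product is 0x09.
 * Each lane's product is also truncated to 8 bits.
 */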
|
752 |
|
753 #define NEON_FN(dest, src1, src2) dest = (src1 & src2) ? -1 : 0 |
|
754 NEON_VOP(tst_u8, neon_u8, 4) |
|
755 NEON_VOP(tst_u16, neon_u16, 2) |
|
756 NEON_VOP(tst_u32, neon_u32, 1) |
|
757 #undef NEON_FN |
|
758 |
|
759 #define NEON_FN(dest, src1, src2) dest = (src1 == src2) ? -1 : 0 |
|
760 NEON_VOP(ceq_u8, neon_u8, 4) |
|
761 NEON_VOP(ceq_u16, neon_u16, 2) |
|
762 NEON_VOP(ceq_u32, neon_u32, 1) |
|
763 #undef NEON_FN |
|
764 |
|
765 #define NEON_FN(dest, src, dummy) dest = (src < 0) ? -src : src |
|
766 NEON_VOP1(abs_s8, neon_s8, 4) |
|
767 NEON_VOP1(abs_s16, neon_s16, 2) |
|
768 #undef NEON_FN |
|
769 |
|
770 /* Count Leading Sign/Zero Bits. */ |
|
771 static inline int do_clz8(uint8_t x) |
|
772 { |
|
773 int n; |
|
774 for (n = 8; x; n--) |
|
775 x >>= 1; |
|
776 return n; |
|
777 } |
|
778 |
|
779 static inline int do_clz16(uint16_t x) |
|
780 { |
|
781 int n; |
|
782 for (n = 16; x; n--) |
|
783 x >>= 1; |
|
784 return n; |
|
785 } |
|
786 |
|
787 #define NEON_FN(dest, src, dummy) dest = do_clz8(src) |
|
788 NEON_VOP1(clz_u8, neon_u8, 4) |
|
789 #undef NEON_FN |
|
790 |
|
791 #define NEON_FN(dest, src, dummy) dest = do_clz16(src) |
|
792 NEON_VOP1(clz_u16, neon_u16, 2) |
|
793 #undef NEON_FN |
|
794 |
|
795 #define NEON_FN(dest, src, dummy) dest = do_clz8((src < 0) ? ~src : src) - 1 |
|
796 NEON_VOP1(cls_s8, neon_s8, 4) |
|
797 #undef NEON_FN |
|
798 |
|
799 #define NEON_FN(dest, src, dummy) dest = do_clz16((src < 0) ? ~src : src) - 1 |
|
800 NEON_VOP1(cls_s16, neon_s16, 2) |
|
801 #undef NEON_FN |
|
802 |
|
803 uint32_t HELPER(neon_cls_s32)(uint32_t x) |
|
804 { |
|
805 int count; |
|
806 if ((int32_t)x < 0) |
|
807 x = ~x; |
|
808 for (count = 32; x; count--) |
|
809 x = x >> 1; |
|
810 return count - 1; |
|
811 } |
|
812 |
|
813 /* Bit count. */ |
|
814 uint32_t HELPER(neon_cnt_u8)(uint32_t x) |
|
815 { |
|
816 x = (x & 0x55555555) + ((x >> 1) & 0x55555555); |
|
817 x = (x & 0x33333333) + ((x >> 2) & 0x33333333); |
|
818 x = (x & 0x0f0f0f0f) + ((x >> 4) & 0x0f0f0f0f); |
|
819 return x; |
|
820 } |
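/* Note (added for clarity): this is the usual SWAR population count, done in
 * parallel for all four byte lanes: adjacent bits are summed into 2-bit
 * fields, then 2-bit fields into 4-bit fields, then 4-bit fields into a
 * per-byte count.  E.g. a lane holding 0xf0 becomes 0xa0, then 0x40, then
 * 0x04, i.e. four set bits.
 */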
|
821 |
|
#define NEON_QDMULH16(dest, src1, src2, round) do { \
    uint32_t tmp = (int32_t)(int16_t) src1 * (int16_t) src2; \
    if ((tmp ^ (tmp << 1)) & SIGNBIT) { \
        SET_QC(); \
        tmp = (tmp >> 31) ^ ~SIGNBIT; \
    } else { \
        tmp <<= 1; \
    } \
    if (round) { \
        int32_t old = tmp; \
        tmp += 1 << 15; \
        if ((int32_t)tmp < old) { \
            SET_QC(); \
            tmp = SIGNBIT - 1; \
        } \
    } \
    dest = tmp >> 16; \
    } while(0)
|
839 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 0) |
|
840 NEON_VOP_ENV(qdmulh_s16, neon_s16, 2) |
|
841 #undef NEON_FN |
|
842 #define NEON_FN(dest, src1, src2) NEON_QDMULH16(dest, src1, src2, 1) |
|
843 NEON_VOP_ENV(qrdmulh_s16, neon_s16, 2) |
|
844 #undef NEON_FN |
|
845 #undef NEON_QDMULH16 |
|
846 |
|
847 #define NEON_QDMULH32(dest, src1, src2, round) do { \ |
|
848 uint64_t tmp = (int64_t)(int32_t) src1 * (int32_t) src2; \ |
|
849 if ((tmp ^ (tmp << 1)) & SIGNBIT64) { \ |
|
850 SET_QC(); \ |
|
851 tmp = (tmp >> 63) ^ ~SIGNBIT64; \ |
|
852 } else { \ |
|
853 tmp <<= 1; \ |
|
854 } \ |
|
855 if (round) { \ |
|
856 int64_t old = tmp; \ |
|
857 tmp += (int64_t)1 << 31; \ |
|
858 if ((int64_t)tmp < old) { \ |
|
859 SET_QC(); \ |
|
860 tmp = SIGNBIT64 - 1; \ |
|
861 } \ |
|
862 } \ |
|
863 dest = tmp >> 32; \ |
|
864 } while(0) |
|
865 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 0) |
|
866 NEON_VOP_ENV(qdmulh_s32, neon_s32, 1) |
|
867 #undef NEON_FN |
|
868 #define NEON_FN(dest, src1, src2) NEON_QDMULH32(dest, src1, src2, 1) |
|
869 NEON_VOP_ENV(qrdmulh_s32, neon_s32, 1) |
|
870 #undef NEON_FN |
|
871 #undef NEON_QDMULH32 |
|
872 |
|
873 uint32_t HELPER(neon_narrow_u8)(uint64_t x) |
|
874 { |
|
875 return (x & 0xffu) | ((x >> 8) & 0xff00u) | ((x >> 16) & 0xff0000u) |
|
876 | ((x >> 24) & 0xff000000u); |
|
877 } |
|
878 |
|
879 uint32_t HELPER(neon_narrow_u16)(uint64_t x) |
|
880 { |
|
881 return (x & 0xffffu) | ((x >> 16) & 0xffff0000u); |
|
882 } |
|
883 |
|
884 uint32_t HELPER(neon_narrow_high_u8)(uint64_t x) |
|
885 { |
|
886 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) |
|
887 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); |
|
888 } |
|
889 |
|
890 uint32_t HELPER(neon_narrow_high_u16)(uint64_t x) |
|
891 { |
|
892 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); |
|
893 } |
|
894 |
|
895 uint32_t HELPER(neon_narrow_round_high_u8)(uint64_t x) |
|
896 { |
|
897 x &= 0xff80ff80ff80ff80ull; |
|
898 x += 0x0080008000800080ull; |
|
899 return ((x >> 8) & 0xff) | ((x >> 16) & 0xff00) |
|
900 | ((x >> 24) & 0xff0000) | ((x >> 32) & 0xff000000); |
|
901 } |
|
902 |
|
903 uint32_t HELPER(neon_narrow_round_high_u16)(uint64_t x) |
|
904 { |
|
905 x &= 0xffff8000ffff8000ull; |
|
906 x += 0x0000800000008000ull; |
|
907 return ((x >> 16) & 0xffff) | ((x >> 32) & 0xffff0000); |
|
908 } |
|
909 |
|
910 uint32_t HELPER(neon_narrow_sat_u8)(CPUState *env, uint64_t x) |
|
911 { |
|
912 uint16_t s; |
|
913 uint8_t d; |
|
914 uint32_t res = 0; |
|
915 #define SAT8(n) \ |
|
916 s = x >> n; \ |
|
917 if (s > 0xff) { \ |
|
918 d = 0xff; \ |
|
919 SET_QC(); \ |
|
920 } else { \ |
|
921 d = s; \ |
|
922 } \ |
|
923 res |= (uint32_t)d << (n / 2); |
|
924 |
|
925 SAT8(0); |
|
926 SAT8(16); |
|
927 SAT8(32); |
|
928 SAT8(48); |
|
929 #undef SAT8 |
|
930 return res; |
|
931 } |
|
932 |
|
933 uint32_t HELPER(neon_narrow_sat_s8)(CPUState *env, uint64_t x) |
|
934 { |
|
935 int16_t s; |
|
936 uint8_t d; |
|
937 uint32_t res = 0; |
|
938 #define SAT8(n) \ |
|
939 s = x >> n; \ |
|
940 if (s != (int8_t)s) { \ |
|
941 d = (s >> 15) ^ 0x7f; \ |
|
942 SET_QC(); \ |
|
943 } else { \ |
|
944 d = s; \ |
|
945 } \ |
|
946 res |= (uint32_t)d << (n / 2); |
|
947 |
|
948 SAT8(0); |
|
949 SAT8(16); |
|
950 SAT8(32); |
|
951 SAT8(48); |
|
952 #undef SAT8 |
|
953 return res; |
|
954 } |
|
955 |
|
956 uint32_t HELPER(neon_narrow_sat_u16)(CPUState *env, uint64_t x) |
|
957 { |
|
958 uint32_t high; |
|
959 uint32_t low; |
|
960 low = x; |
|
961 if (low > 0xffff) { |
|
962 low = 0xffff; |
|
963 SET_QC(); |
|
964 } |
|
965 high = x >> 32; |
|
966 if (high > 0xffff) { |
|
967 high = 0xffff; |
|
968 SET_QC(); |
|
969 } |
|
970 return low | (high << 16); |
|
971 } |
|
972 |
|
973 uint32_t HELPER(neon_narrow_sat_s16)(CPUState *env, uint64_t x) |
|
974 { |
|
975 int32_t low; |
|
976 int32_t high; |
|
977 low = x; |
|
978 if (low != (int16_t)low) { |
|
979 low = (low >> 31) ^ 0x7fff; |
|
980 SET_QC(); |
|
981 } |
|
982 high = x >> 32; |
|
983 if (high != (int16_t)high) { |
|
984 high = (high >> 31) ^ 0x7fff; |
|
985 SET_QC(); |
|
986 } |
|
987 return (uint16_t)low | (high << 16); |
|
988 } |
|
989 |
|
990 uint32_t HELPER(neon_narrow_sat_u32)(CPUState *env, uint64_t x) |
|
991 { |
|
992 if (x > 0xffffffffu) { |
|
993 SET_QC(); |
|
994 return 0xffffffffu; |
|
995 } |
|
996 return x; |
|
997 } |
|
998 |
|
999 uint32_t HELPER(neon_narrow_sat_s32)(CPUState *env, uint64_t x) |
|
1000 { |
|
1001 if ((int64_t)x != (int32_t)x) { |
|
1002 SET_QC(); |
|
1003 return (x >> 63) ^ 0x7fffffff; |
|
1004 } |
|
1005 return x; |
|
1006 } |
|
1007 |
|
1008 uint64_t HELPER(neon_widen_u8)(uint32_t x) |
|
1009 { |
|
1010 uint64_t tmp; |
|
1011 uint64_t ret; |
|
1012 ret = (uint8_t)x; |
|
1013 tmp = (uint8_t)(x >> 8); |
|
1014 ret |= tmp << 16; |
|
1015 tmp = (uint8_t)(x >> 16); |
|
1016 ret |= tmp << 32; |
|
1017 tmp = (uint8_t)(x >> 24); |
|
1018 ret |= tmp << 48; |
|
1019 return ret; |
|
1020 } |
|
1021 |
|
1022 uint64_t HELPER(neon_widen_s8)(uint32_t x) |
|
1023 { |
|
1024 uint64_t tmp; |
|
1025 uint64_t ret; |
|
1026 ret = (uint16_t)(int8_t)x; |
|
1027 tmp = (uint16_t)(int8_t)(x >> 8); |
|
1028 ret |= tmp << 16; |
|
1029 tmp = (uint16_t)(int8_t)(x >> 16); |
|
1030 ret |= tmp << 32; |
|
1031 tmp = (uint16_t)(int8_t)(x >> 24); |
|
1032 ret |= tmp << 48; |
|
1033 return ret; |
|
1034 } |
|
1035 |
|
1036 uint64_t HELPER(neon_widen_u16)(uint32_t x) |
|
1037 { |
|
1038 uint64_t high = (uint16_t)(x >> 16); |
|
1039 return ((uint16_t)x) | (high << 32); |
|
1040 } |
|
1041 |
|
1042 uint64_t HELPER(neon_widen_s16)(uint32_t x) |
|
1043 { |
|
1044 uint64_t high = (int16_t)(x >> 16); |
|
1045 return ((uint32_t)(int16_t)x) | (high << 32); |
|
1046 } |
|
1047 |
|
1048 uint64_t HELPER(neon_addl_u16)(uint64_t a, uint64_t b) |
|
1049 { |
|
1050 uint64_t mask; |
|
1051 mask = (a ^ b) & 0x8000800080008000ull; |
|
1052 a &= ~0x8000800080008000ull; |
|
1053 b &= ~0x8000800080008000ull; |
|
1054 return (a + b) ^ mask; |
|
1055 } |
|
1056 |
|
1057 uint64_t HELPER(neon_addl_u32)(uint64_t a, uint64_t b) |
|
1058 { |
|
1059 uint64_t mask; |
|
1060 mask = (a ^ b) & 0x8000000080000000ull; |
|
1061 a &= ~0x8000000080000000ull; |
|
1062 b &= ~0x8000000080000000ull; |
|
1063 return (a + b) ^ mask; |
|
1064 } |
|
1065 |
|
1066 uint64_t HELPER(neon_paddl_u16)(uint64_t a, uint64_t b) |
|
1067 { |
|
1068 uint64_t tmp; |
|
1069 uint64_t tmp2; |
|
1070 |
|
1071 tmp = a & 0x0000ffff0000ffffull; |
|
1072 tmp += (a >> 16) & 0x0000ffff0000ffffull; |
|
1073 tmp2 = b & 0xffff0000ffff0000ull; |
|
1074 tmp2 += (b << 16) & 0xffff0000ffff0000ull; |
|
1075 return ( tmp & 0xffff) |
|
1076 | ((tmp >> 16) & 0xffff0000ull) |
|
1077 | ((tmp2 << 16) & 0xffff00000000ull) |
|
1078 | ( tmp2 & 0xffff000000000000ull); |
|
1079 } |
|
1080 |
|
1081 uint64_t HELPER(neon_paddl_u32)(uint64_t a, uint64_t b) |
|
1082 { |
|
1083 uint32_t low = a + (a >> 32); |
|
1084 uint32_t high = b + (b >> 32); |
|
1085 return low + ((uint64_t)high << 32); |
|
1086 } |
|
1087 |
|
1088 uint64_t HELPER(neon_subl_u16)(uint64_t a, uint64_t b) |
|
1089 { |
|
1090 uint64_t mask; |
|
1091 mask = (a ^ ~b) & 0x8000800080008000ull; |
|
1092 a |= 0x8000800080008000ull; |
|
1093 b &= ~0x8000800080008000ull; |
|
1094 return (a - b) ^ mask; |
|
1095 } |
|
1096 |
|
1097 uint64_t HELPER(neon_subl_u32)(uint64_t a, uint64_t b) |
|
1098 { |
|
1099 uint64_t mask; |
|
1100 mask = (a ^ ~b) & 0x8000000080000000ull; |
|
1101 a |= 0x8000000080000000ull; |
|
1102 b &= ~0x8000000080000000ull; |
|
1103 return (a - b) ^ mask; |
|
1104 } |
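/* Note (added for clarity): these are the subtraction analogues of the
 * packed-add helpers: forcing the top bit of each lane of 'a' to 1 and
 * clearing it in 'b' guarantees no borrow ever crosses a lane boundary, and
 * the final XOR with the top bits of (a ^ ~b) restores each lane's true
 * top bit.
 */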
|
1105 |
|
1106 uint64_t HELPER(neon_addl_saturate_s32)(CPUState *env, uint64_t a, uint64_t b) |
|
1107 { |
|
1108 uint32_t x, y; |
|
1109 uint32_t low, high; |
|
1110 |
|
1111 x = a; |
|
1112 y = b; |
|
1113 low = x + y; |
|
1114 if (((low ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { |
|
1115 SET_QC(); |
|
1116 low = ((int32_t)x >> 31) ^ ~SIGNBIT; |
|
1117 } |
|
1118 x = a >> 32; |
|
1119 y = b >> 32; |
|
1120 high = x + y; |
|
1121 if (((high ^ x) & SIGNBIT) && !((x ^ y) & SIGNBIT)) { |
|
1122 SET_QC(); |
|
1123 high = ((int32_t)x >> 31) ^ ~SIGNBIT; |
|
1124 } |
|
1125 return low | ((uint64_t)high << 32); |
|
1126 } |
|
1127 |
|
1128 uint64_t HELPER(neon_addl_saturate_s64)(CPUState *env, uint64_t a, uint64_t b) |
|
1129 { |
|
1130 uint64_t result; |
|
1131 |
|
1132 result = a + b; |
|
1133 if (((result ^ a) & SIGNBIT64) && !((a ^ b) & SIGNBIT64)) { |
|
1134 SET_QC(); |
|
1135 result = ((int64_t)a >> 63) ^ ~SIGNBIT64; |
|
1136 } |
|
1137 return result; |
|
1138 } |
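/* Note (added for clarity): the saturation test used by the two helpers
 * above is the classic signed-overflow check: overflow can only happen when
 * both addends have the same sign, and it did happen iff the sum's sign
 * differs from theirs.  On overflow, ((int64_t)a >> 63) ^ ~SIGNBIT64 yields
 * INT64_MAX for positive operands and INT64_MIN for negative ones.
 */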
|
1139 |
|
1140 #define DO_ABD(dest, x, y, type) do { \ |
|
1141 type tmp_x = x; \ |
|
1142 type tmp_y = y; \ |
|
1143 dest = ((tmp_x > tmp_y) ? tmp_x - tmp_y : tmp_y - tmp_x); \ |
|
1144 } while(0) |
|
1145 |
|
1146 uint64_t HELPER(neon_abdl_u16)(uint32_t a, uint32_t b) |
|
1147 { |
|
1148 uint64_t tmp; |
|
1149 uint64_t result; |
|
1150 DO_ABD(result, a, b, uint8_t); |
|
1151 DO_ABD(tmp, a >> 8, b >> 8, uint8_t); |
|
1152 result |= tmp << 16; |
|
1153 DO_ABD(tmp, a >> 16, b >> 16, uint8_t); |
|
1154 result |= tmp << 32; |
|
1155 DO_ABD(tmp, a >> 24, b >> 24, uint8_t); |
|
1156 result |= tmp << 48; |
|
1157 return result; |
|
1158 } |
|
1159 |
|
1160 uint64_t HELPER(neon_abdl_s16)(uint32_t a, uint32_t b) |
|
1161 { |
|
1162 uint64_t tmp; |
|
1163 uint64_t result; |
|
1164 DO_ABD(result, a, b, int8_t); |
|
1165 DO_ABD(tmp, a >> 8, b >> 8, int8_t); |
|
1166 result |= tmp << 16; |
|
1167 DO_ABD(tmp, a >> 16, b >> 16, int8_t); |
|
1168 result |= tmp << 32; |
|
1169 DO_ABD(tmp, a >> 24, b >> 24, int8_t); |
|
1170 result |= tmp << 48; |
|
1171 return result; |
|
1172 } |
|
1173 |
|
1174 uint64_t HELPER(neon_abdl_u32)(uint32_t a, uint32_t b) |
|
1175 { |
|
1176 uint64_t tmp; |
|
1177 uint64_t result; |
|
1178 DO_ABD(result, a, b, uint16_t); |
|
1179 DO_ABD(tmp, a >> 16, b >> 16, uint16_t); |
|
1180 return result | (tmp << 32); |
|
1181 } |
|
1182 |
|
1183 uint64_t HELPER(neon_abdl_s32)(uint32_t a, uint32_t b) |
|
1184 { |
|
1185 uint64_t tmp; |
|
1186 uint64_t result; |
|
1187 DO_ABD(result, a, b, int16_t); |
|
1188 DO_ABD(tmp, a >> 16, b >> 16, int16_t); |
|
1189 return result | (tmp << 32); |
|
1190 } |
|
1191 |
|
1192 uint64_t HELPER(neon_abdl_u64)(uint32_t a, uint32_t b) |
|
1193 { |
|
1194 uint64_t result; |
|
1195 DO_ABD(result, a, b, uint32_t); |
|
1196 return result; |
|
1197 } |
|
1198 |
|
1199 uint64_t HELPER(neon_abdl_s64)(uint32_t a, uint32_t b) |
|
1200 { |
|
1201 uint64_t result; |
|
1202 DO_ABD(result, a, b, int32_t); |
|
1203 return result; |
|
1204 } |
|
1205 #undef DO_ABD |
|
1206 |
|
1207 /* Widening multiply. Named type is the source type. */ |
|
1208 #define DO_MULL(dest, x, y, type1, type2) do { \ |
|
1209 type1 tmp_x = x; \ |
|
1210 type1 tmp_y = y; \ |
|
1211 dest = (type2)((type2)tmp_x * (type2)tmp_y); \ |
|
1212 } while(0) |
|
1213 |
|
1214 uint64_t HELPER(neon_mull_u8)(uint32_t a, uint32_t b) |
|
1215 { |
|
1216 uint64_t tmp; |
|
1217 uint64_t result; |
|
1218 |
|
1219 DO_MULL(result, a, b, uint8_t, uint16_t); |
|
1220 DO_MULL(tmp, a >> 8, b >> 8, uint8_t, uint16_t); |
|
1221 result |= tmp << 16; |
|
1222 DO_MULL(tmp, a >> 16, b >> 16, uint8_t, uint16_t); |
|
1223 result |= tmp << 32; |
|
1224 DO_MULL(tmp, a >> 24, b >> 24, uint8_t, uint16_t); |
|
1225 result |= tmp << 48; |
|
1226 return result; |
|
1227 } |
|
1228 |
|
1229 uint64_t HELPER(neon_mull_s8)(uint32_t a, uint32_t b) |
|
1230 { |
|
1231 uint64_t tmp; |
|
1232 uint64_t result; |
|
1233 |
|
1234 DO_MULL(result, a, b, int8_t, uint16_t); |
|
1235 DO_MULL(tmp, a >> 8, b >> 8, int8_t, uint16_t); |
|
1236 result |= tmp << 16; |
|
1237 DO_MULL(tmp, a >> 16, b >> 16, int8_t, uint16_t); |
|
1238 result |= tmp << 32; |
|
1239 DO_MULL(tmp, a >> 24, b >> 24, int8_t, uint16_t); |
|
1240 result |= tmp << 48; |
|
1241 return result; |
|
1242 } |
|
1243 |
|
1244 uint64_t HELPER(neon_mull_u16)(uint32_t a, uint32_t b) |
|
1245 { |
|
1246 uint64_t tmp; |
|
1247 uint64_t result; |
|
1248 |
|
1249 DO_MULL(result, a, b, uint16_t, uint32_t); |
|
1250 DO_MULL(tmp, a >> 16, b >> 16, uint16_t, uint32_t); |
|
1251 return result | (tmp << 32); |
|
1252 } |
|
1253 |
|
1254 uint64_t HELPER(neon_mull_s16)(uint32_t a, uint32_t b) |
|
1255 { |
|
1256 uint64_t tmp; |
|
1257 uint64_t result; |
|
1258 |
|
1259 DO_MULL(result, a, b, int16_t, uint32_t); |
|
1260 DO_MULL(tmp, a >> 16, b >> 16, int16_t, uint32_t); |
|
1261 return result | (tmp << 32); |
|
1262 } |
|
1263 |
|
1264 uint64_t HELPER(neon_negl_u16)(uint64_t x) |
|
1265 { |
|
1266 uint16_t tmp; |
|
1267 uint64_t result; |
|
1268 result = (uint16_t)-x; |
|
1269 tmp = -(x >> 16); |
|
1270 result |= (uint64_t)tmp << 16; |
|
1271 tmp = -(x >> 32); |
|
1272 result |= (uint64_t)tmp << 32; |
|
1273 tmp = -(x >> 48); |
|
1274 result |= (uint64_t)tmp << 48; |
|
1275 return result; |
|
1276 } |
|
1277 |
|
|
1279 uint64_t HELPER(neon_negl_u32)(uint64_t x) |
|
1280 { |
|
1281 uint32_t low = -x; |
|
1282 uint32_t high = -(x >> 32); |
|
1283 return low | ((uint64_t)high << 32); |
|
1284 } |
|
1285 |
|
1286 /* FIXME: There should be a native op for this. */ |
|
1287 uint64_t HELPER(neon_negl_u64)(uint64_t x) |
|
1288 { |
|
1289 return -x; |
|
1290 } |
|
1291 |
|
/* Saturating sign manipulation. */
|
1293 /* ??? Make these use NEON_VOP1 */ |
|
1294 #define DO_QABS8(x) do { \ |
|
1295 if (x == (int8_t)0x80) { \ |
|
1296 x = 0x7f; \ |
|
1297 SET_QC(); \ |
|
1298 } else if (x < 0) { \ |
|
1299 x = -x; \ |
|
1300 }} while (0) |
|
1301 uint32_t HELPER(neon_qabs_s8)(CPUState *env, uint32_t x) |
|
1302 { |
|
1303 neon_s8 vec; |
|
1304 NEON_UNPACK(neon_s8, vec, x); |
|
1305 DO_QABS8(vec.v1); |
|
1306 DO_QABS8(vec.v2); |
|
1307 DO_QABS8(vec.v3); |
|
1308 DO_QABS8(vec.v4); |
|
1309 NEON_PACK(neon_s8, x, vec); |
|
1310 return x; |
|
1311 } |
|
1312 #undef DO_QABS8 |
|
1313 |
|
1314 #define DO_QNEG8(x) do { \ |
|
1315 if (x == (int8_t)0x80) { \ |
|
1316 x = 0x7f; \ |
|
1317 SET_QC(); \ |
|
1318 } else { \ |
|
1319 x = -x; \ |
|
1320 }} while (0) |
|
1321 uint32_t HELPER(neon_qneg_s8)(CPUState *env, uint32_t x) |
|
1322 { |
|
1323 neon_s8 vec; |
|
1324 NEON_UNPACK(neon_s8, vec, x); |
|
1325 DO_QNEG8(vec.v1); |
|
1326 DO_QNEG8(vec.v2); |
|
1327 DO_QNEG8(vec.v3); |
|
1328 DO_QNEG8(vec.v4); |
|
1329 NEON_PACK(neon_s8, x, vec); |
|
1330 return x; |
|
1331 } |
|
1332 #undef DO_QNEG8 |
|
1333 |
|
1334 #define DO_QABS16(x) do { \ |
|
1335 if (x == (int16_t)0x8000) { \ |
|
1336 x = 0x7fff; \ |
|
1337 SET_QC(); \ |
|
1338 } else if (x < 0) { \ |
|
1339 x = -x; \ |
|
1340 }} while (0) |
|
1341 uint32_t HELPER(neon_qabs_s16)(CPUState *env, uint32_t x) |
|
1342 { |
|
1343 neon_s16 vec; |
|
1344 NEON_UNPACK(neon_s16, vec, x); |
|
1345 DO_QABS16(vec.v1); |
|
1346 DO_QABS16(vec.v2); |
|
1347 NEON_PACK(neon_s16, x, vec); |
|
1348 return x; |
|
1349 } |
|
1350 #undef DO_QABS16 |
|
1351 |
|
1352 #define DO_QNEG16(x) do { \ |
|
1353 if (x == (int16_t)0x8000) { \ |
|
1354 x = 0x7fff; \ |
|
1355 SET_QC(); \ |
|
1356 } else { \ |
|
1357 x = -x; \ |
|
1358 }} while (0) |
|
1359 uint32_t HELPER(neon_qneg_s16)(CPUState *env, uint32_t x) |
|
1360 { |
|
1361 neon_s16 vec; |
|
1362 NEON_UNPACK(neon_s16, vec, x); |
|
1363 DO_QNEG16(vec.v1); |
|
1364 DO_QNEG16(vec.v2); |
|
1365 NEON_PACK(neon_s16, x, vec); |
|
1366 return x; |
|
1367 } |
|
1368 #undef DO_QNEG16 |
|
1369 |
|
1370 uint32_t HELPER(neon_qabs_s32)(CPUState *env, uint32_t x) |
|
1371 { |
|
1372 if (x == SIGNBIT) { |
|
1373 SET_QC(); |
|
1374 x = ~SIGNBIT; |
|
1375 } else if ((int32_t)x < 0) { |
|
1376 x = -x; |
|
1377 } |
|
1378 return x; |
|
1379 } |
|
1380 |
|
1381 uint32_t HELPER(neon_qneg_s32)(CPUState *env, uint32_t x) |
|
1382 { |
|
1383 if (x == SIGNBIT) { |
|
1384 SET_QC(); |
|
1385 x = ~SIGNBIT; |
|
1386 } else { |
|
1387 x = -x; |
|
1388 } |
|
1389 return x; |
|
1390 } |
|
1391 |
|
1392 /* NEON Float helpers. */ |
|
1393 uint32_t HELPER(neon_min_f32)(uint32_t a, uint32_t b) |
|
1394 { |
|
1395 float32 f0 = vfp_itos(a); |
|
1396 float32 f1 = vfp_itos(b); |
|
1397 return (float32_compare_quiet(f0, f1, NFS) == -1) ? a : b; |
|
1398 } |
|
1399 |
|
1400 uint32_t HELPER(neon_max_f32)(uint32_t a, uint32_t b) |
|
1401 { |
|
1402 float32 f0 = vfp_itos(a); |
|
1403 float32 f1 = vfp_itos(b); |
|
1404 return (float32_compare_quiet(f0, f1, NFS) == 1) ? a : b; |
|
1405 } |
|
1406 |
|
1407 uint32_t HELPER(neon_abd_f32)(uint32_t a, uint32_t b) |
|
1408 { |
|
1409 float32 f0 = vfp_itos(a); |
|
1410 float32 f1 = vfp_itos(b); |
|
1411 return vfp_stoi((float32_compare_quiet(f0, f1, NFS) == 1) |
|
1412 ? float32_sub(f0, f1, NFS) |
|
1413 : float32_sub(f1, f0, NFS)); |
|
1414 } |
|
1415 |
|
1416 uint32_t HELPER(neon_add_f32)(uint32_t a, uint32_t b) |
|
1417 { |
|
1418 return vfp_stoi(float32_add(vfp_itos(a), vfp_itos(b), NFS)); |
|
1419 } |
|
1420 |
|
1421 uint32_t HELPER(neon_sub_f32)(uint32_t a, uint32_t b) |
|
1422 { |
|
1423 return vfp_stoi(float32_sub(vfp_itos(a), vfp_itos(b), NFS)); |
|
1424 } |
|
1425 |
|
1426 uint32_t HELPER(neon_mul_f32)(uint32_t a, uint32_t b) |
|
1427 { |
|
1428 return vfp_stoi(float32_mul(vfp_itos(a), vfp_itos(b), NFS)); |
|
1429 } |
|
1430 |
|
1431 /* Floating point comparisons produce an integer result. */ |
|
1432 #define NEON_VOP_FCMP(name, cmp) \ |
|
1433 uint32_t HELPER(neon_##name)(uint32_t a, uint32_t b) \ |
|
1434 { \ |
|
1435 if (float32_compare_quiet(vfp_itos(a), vfp_itos(b), NFS) cmp 0) \ |
|
1436 return ~0; \ |
|
1437 else \ |
|
1438 return 0; \ |
|
1439 } |
|
1440 |
|
1441 NEON_VOP_FCMP(ceq_f32, ==) |
|
1442 NEON_VOP_FCMP(cge_f32, >=) |
|
1443 NEON_VOP_FCMP(cgt_f32, >) |
|
1444 |
|
1445 uint32_t HELPER(neon_acge_f32)(uint32_t a, uint32_t b) |
|
1446 { |
|
1447 float32 f0 = float32_abs(vfp_itos(a)); |
|
1448 float32 f1 = float32_abs(vfp_itos(b)); |
|
1449 return (float32_compare_quiet(f0, f1,NFS) >= 0) ? ~0 : 0; |
|
1450 } |
|
1451 |
|
1452 uint32_t HELPER(neon_acgt_f32)(uint32_t a, uint32_t b) |
|
1453 { |
|
1454 float32 f0 = float32_abs(vfp_itos(a)); |
|
1455 float32 f1 = float32_abs(vfp_itos(b)); |
|
1456 return (float32_compare_quiet(f0, f1, NFS) > 0) ? ~0 : 0; |
|
1457 } |