|
1 /* |
|
2 * MMX/3DNow!/SSE/SSE2/SSE3/SSSE3/SSE4/PNI support |
|
3 * |
|
4 * Copyright (c) 2005 Fabrice Bellard |
|
5 * Copyright (c) 2008 Intel Corporation <andrew.zaborowski@intel.com> |
|
6 * |
|
7 * This library is free software; you can redistribute it and/or |
|
8 * modify it under the terms of the GNU Lesser General Public |
|
9 * License as published by the Free Software Foundation; either |
|
10 * version 2 of the License, or (at your option) any later version. |
|
11 * |
|
12 * This library is distributed in the hope that it will be useful, |
|
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
|
15 * Lesser General Public License for more details. |
|
16 * |
|
17 * You should have received a copy of the GNU Lesser General Public |
|
18 * License along with this library; if not, write to the Free Software |
|
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
|
20 */ |
|
/*
 * Per-inclusion configuration, selected by SHIFT.
 *   SHIFT == 0 : helpers operate on MMXReg and get the _mmx suffix.
 *   otherwise  : helpers operate on XMMReg and get the _xmm suffix.
 * B/W/L/Q are the byte/word/long/quad element accessors for the chosen
 * register type (an MMX register exposes its single quadword as `q`).
 * XMM_ONLY(...) keeps its argument only in the XMM configuration.
 */
#if SHIFT != 0
#define Reg XMMReg
#define SUFFIX _xmm
#define XMM_ONLY(x...) x
#define B(n) XMM_B(n)
#define W(n) XMM_W(n)
#define L(n) XMM_L(n)
#define Q(n) XMM_Q(n)
#else
#define Reg MMXReg
#define SUFFIX _mmx
#define XMM_ONLY(x...)
#define B(n) MMX_B(n)
#define W(n) MMX_W(n)
#define L(n) MMX_L(n)
#define Q(n) q
#endif
|
38 |
|
39 void glue(helper_psrlw, SUFFIX)(Reg *d, Reg *s) |
|
40 { |
|
41 int shift; |
|
42 |
|
43 if (s->Q(0) > 15) { |
|
44 d->Q(0) = 0; |
|
45 #if SHIFT == 1 |
|
46 d->Q(1) = 0; |
|
47 #endif |
|
48 } else { |
|
49 shift = s->B(0); |
|
50 d->W(0) >>= shift; |
|
51 d->W(1) >>= shift; |
|
52 d->W(2) >>= shift; |
|
53 d->W(3) >>= shift; |
|
54 #if SHIFT == 1 |
|
55 d->W(4) >>= shift; |
|
56 d->W(5) >>= shift; |
|
57 d->W(6) >>= shift; |
|
58 d->W(7) >>= shift; |
|
59 #endif |
|
60 } |
|
61 } |
|
62 |
|
63 void glue(helper_psraw, SUFFIX)(Reg *d, Reg *s) |
|
64 { |
|
65 int shift; |
|
66 |
|
67 if (s->Q(0) > 15) { |
|
68 shift = 15; |
|
69 } else { |
|
70 shift = s->B(0); |
|
71 } |
|
72 d->W(0) = (int16_t)d->W(0) >> shift; |
|
73 d->W(1) = (int16_t)d->W(1) >> shift; |
|
74 d->W(2) = (int16_t)d->W(2) >> shift; |
|
75 d->W(3) = (int16_t)d->W(3) >> shift; |
|
76 #if SHIFT == 1 |
|
77 d->W(4) = (int16_t)d->W(4) >> shift; |
|
78 d->W(5) = (int16_t)d->W(5) >> shift; |
|
79 d->W(6) = (int16_t)d->W(6) >> shift; |
|
80 d->W(7) = (int16_t)d->W(7) >> shift; |
|
81 #endif |
|
82 } |
|
83 |
|
84 void glue(helper_psllw, SUFFIX)(Reg *d, Reg *s) |
|
85 { |
|
86 int shift; |
|
87 |
|
88 if (s->Q(0) > 15) { |
|
89 d->Q(0) = 0; |
|
90 #if SHIFT == 1 |
|
91 d->Q(1) = 0; |
|
92 #endif |
|
93 } else { |
|
94 shift = s->B(0); |
|
95 d->W(0) <<= shift; |
|
96 d->W(1) <<= shift; |
|
97 d->W(2) <<= shift; |
|
98 d->W(3) <<= shift; |
|
99 #if SHIFT == 1 |
|
100 d->W(4) <<= shift; |
|
101 d->W(5) <<= shift; |
|
102 d->W(6) <<= shift; |
|
103 d->W(7) <<= shift; |
|
104 #endif |
|
105 } |
|
106 } |
|
107 |
|
108 void glue(helper_psrld, SUFFIX)(Reg *d, Reg *s) |
|
109 { |
|
110 int shift; |
|
111 |
|
112 if (s->Q(0) > 31) { |
|
113 d->Q(0) = 0; |
|
114 #if SHIFT == 1 |
|
115 d->Q(1) = 0; |
|
116 #endif |
|
117 } else { |
|
118 shift = s->B(0); |
|
119 d->L(0) >>= shift; |
|
120 d->L(1) >>= shift; |
|
121 #if SHIFT == 1 |
|
122 d->L(2) >>= shift; |
|
123 d->L(3) >>= shift; |
|
124 #endif |
|
125 } |
|
126 } |
|
127 |
|
128 void glue(helper_psrad, SUFFIX)(Reg *d, Reg *s) |
|
129 { |
|
130 int shift; |
|
131 |
|
132 if (s->Q(0) > 31) { |
|
133 shift = 31; |
|
134 } else { |
|
135 shift = s->B(0); |
|
136 } |
|
137 d->L(0) = (int32_t)d->L(0) >> shift; |
|
138 d->L(1) = (int32_t)d->L(1) >> shift; |
|
139 #if SHIFT == 1 |
|
140 d->L(2) = (int32_t)d->L(2) >> shift; |
|
141 d->L(3) = (int32_t)d->L(3) >> shift; |
|
142 #endif |
|
143 } |
|
144 |
|
145 void glue(helper_pslld, SUFFIX)(Reg *d, Reg *s) |
|
146 { |
|
147 int shift; |
|
148 |
|
149 if (s->Q(0) > 31) { |
|
150 d->Q(0) = 0; |
|
151 #if SHIFT == 1 |
|
152 d->Q(1) = 0; |
|
153 #endif |
|
154 } else { |
|
155 shift = s->B(0); |
|
156 d->L(0) <<= shift; |
|
157 d->L(1) <<= shift; |
|
158 #if SHIFT == 1 |
|
159 d->L(2) <<= shift; |
|
160 d->L(3) <<= shift; |
|
161 #endif |
|
162 } |
|
163 } |
|
164 |
|
165 void glue(helper_psrlq, SUFFIX)(Reg *d, Reg *s) |
|
166 { |
|
167 int shift; |
|
168 |
|
169 if (s->Q(0) > 63) { |
|
170 d->Q(0) = 0; |
|
171 #if SHIFT == 1 |
|
172 d->Q(1) = 0; |
|
173 #endif |
|
174 } else { |
|
175 shift = s->B(0); |
|
176 d->Q(0) >>= shift; |
|
177 #if SHIFT == 1 |
|
178 d->Q(1) >>= shift; |
|
179 #endif |
|
180 } |
|
181 } |
|
182 |
|
183 void glue(helper_psllq, SUFFIX)(Reg *d, Reg *s) |
|
184 { |
|
185 int shift; |
|
186 |
|
187 if (s->Q(0) > 63) { |
|
188 d->Q(0) = 0; |
|
189 #if SHIFT == 1 |
|
190 d->Q(1) = 0; |
|
191 #endif |
|
192 } else { |
|
193 shift = s->B(0); |
|
194 d->Q(0) <<= shift; |
|
195 #if SHIFT == 1 |
|
196 d->Q(1) <<= shift; |
|
197 #endif |
|
198 } |
|
199 } |
|
200 |
|
201 #if SHIFT == 1 |
|
202 void glue(helper_psrldq, SUFFIX)(Reg *d, Reg *s) |
|
203 { |
|
204 int shift, i; |
|
205 |
|
206 shift = s->L(0); |
|
207 if (shift > 16) |
|
208 shift = 16; |
|
209 for(i = 0; i < 16 - shift; i++) |
|
210 d->B(i) = d->B(i + shift); |
|
211 for(i = 16 - shift; i < 16; i++) |
|
212 d->B(i) = 0; |
|
213 } |
|
214 |
|
215 void glue(helper_pslldq, SUFFIX)(Reg *d, Reg *s) |
|
216 { |
|
217 int shift, i; |
|
218 |
|
219 shift = s->L(0); |
|
220 if (shift > 16) |
|
221 shift = 16; |
|
222 for(i = 15; i >= shift; i--) |
|
223 d->B(i) = d->B(i - shift); |
|
224 for(i = 0; i < shift; i++) |
|
225 d->B(i) = 0; |
|
226 } |
|
227 #endif |
|
228 |
|
/*
 * SSE_HELPER_B(name, F): define the helper `name` + SUFFIX that applies the
 * binary operation F lane by lane to the byte elements of the registers:
 * d.B(i) = F(d.B(i), s.B(i)) for every byte of the register.
 */
#define SSE_HELPER_B(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (8 << SHIFT); lane++) {\
        d->B(lane) = F(d->B(lane), s->B(lane));\
    }\
}
|
251 |
|
/*
 * SSE_HELPER_W(name, F): define the helper `name` + SUFFIX that applies the
 * binary operation F lane by lane to the 16-bit word elements:
 * d.W(i) = F(d.W(i), s.W(i)) for every word of the register.
 */
#define SSE_HELPER_W(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (4 << SHIFT); lane++) {\
        d->W(lane) = F(d->W(lane), s->W(lane));\
    }\
}
|
266 |
|
/*
 * SSE_HELPER_L(name, F): define the helper `name` + SUFFIX that applies the
 * binary operation F lane by lane to the 32-bit elements:
 * d.L(i) = F(d.L(i), s.L(i)) for every long of the register.
 */
#define SSE_HELPER_L(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (2 << SHIFT); lane++) {\
        d->L(lane) = F(d->L(lane), s->L(lane));\
    }\
}
|
277 |
|
/*
 * SSE_HELPER_Q(name, F): define the helper `name` + SUFFIX that applies the
 * binary operation F to each quadword: d.Q(i) = F(d.Q(i), s.Q(i)).
 * The second quadword is only processed in the XMM configuration.
 */
#define SSE_HELPER_Q(name, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    d->Q(0) = F(d->Q(0), s->Q(0));\
    XMM_ONLY(d->Q(1) = F(d->Q(1), s->Q(1));)\
}
|
286 |
|
287 #if SHIFT == 0 |
|
/* Saturate x to the unsigned 8-bit range [0, 255]. */
static inline int satub(int x)
{
    const int lo = 0, hi = 255;

    return (x < lo) ? lo : ((x > hi) ? hi : x);
}
|
297 |
|
/* Saturate x to the unsigned 16-bit range [0, 65535]. */
static inline int satuw(int x)
{
    const int lo = 0, hi = 65535;

    return (x < lo) ? lo : ((x > hi) ? hi : x);
}
|
307 |
|
/* Saturate x to the signed 8-bit range [-128, 127]. */
static inline int satsb(int x)
{
    const int lo = -128, hi = 127;

    return (x < lo) ? lo : ((x > hi) ? hi : x);
}
|
317 |
|
/* Saturate x to the signed 16-bit range [-32768, 32767]. */
static inline int satsw(int x)
{
    const int lo = -32768, hi = 32767;

    return (x < lo) ? lo : ((x > hi) ? hi : x);
}
|
327 |
|
/*
 * Per-element integer operations plugged into the SSE_HELPER_* generators.
 * Each takes the destination element `a` and the source element `b` and
 * yields the new destination element.  Every replacement list is fully
 * parenthesized so the macros stay correct inside larger expressions.
 */

/* Wrapping and saturating addition. */
#define FADD(a, b) ((a) + (b))
#define FADDUB(a, b) satub((a) + (b))
#define FADDUW(a, b) satuw((a) + (b))
#define FADDSB(a, b) satsb((int8_t)(a) + (int8_t)(b))
#define FADDSW(a, b) satsw((int16_t)(a) + (int16_t)(b))

/* Wrapping and saturating subtraction. */
#define FSUB(a, b) ((a) - (b))
#define FSUBUB(a, b) satub((a) - (b))
#define FSUBUW(a, b) satuw((a) - (b))
#define FSUBSB(a, b) satsb((int8_t)(a) - (int8_t)(b))
#define FSUBSW(a, b) satsw((int16_t)(a) - (int16_t)(b))

/* Minimum / maximum (unsigned bytes, signed words). */
#define FMINUB(a, b) (((a) < (b)) ? (a) : (b))
#define FMINSW(a, b) (((int16_t)(a) < (int16_t)(b)) ? (a) : (b))
#define FMAXUB(a, b) (((a) > (b)) ? (a) : (b))
#define FMAXSW(a, b) (((int16_t)(a) > (int16_t)(b)) ? (a) : (b))

/* Bitwise logic; FANDN complements the destination operand. */
#define FAND(a, b) ((a) & (b))
#define FANDN(a, b) ((~(a)) & (b))
#define FOR(a, b) ((a) | (b))
#define FXOR(a, b) ((a) ^ (b))

/* Comparisons produce an all-ones (-1) or all-zeros element. */
#define FCMPGTB(a, b) ((int8_t)(a) > (int8_t)(b) ? -1 : 0)
#define FCMPGTW(a, b) ((int16_t)(a) > (int16_t)(b) ? -1 : 0)
#define FCMPGTL(a, b) ((int32_t)(a) > (int32_t)(b) ? -1 : 0)
#define FCMPEQ(a, b) ((a) == (b) ? -1 : 0)

/*
 * 16-bit multiplies.  The unsigned forms multiply as uint32_t: two promoted
 * 16-bit values multiplied as int can exceed INT_MAX, which is undefined.
 */
#define FMULLW(a, b) ((uint32_t)(a) * (uint32_t)(b))
#define FMULHRW(a, b) (((int16_t)(a) * (int16_t)(b) + 0x8000) >> 16)
#define FMULHUW(a, b) (((uint32_t)(a) * (uint32_t)(b)) >> 16)
#define FMULHW(a, b) (((int16_t)(a) * (int16_t)(b)) >> 16)

/* Average rounded up. */
#define FAVG(a, b) (((a) + (b) + 1) >> 1)
|
360 #endif |
|
361 |
|
362 SSE_HELPER_B(helper_paddb, FADD) |
|
363 SSE_HELPER_W(helper_paddw, FADD) |
|
364 SSE_HELPER_L(helper_paddl, FADD) |
|
365 SSE_HELPER_Q(helper_paddq, FADD) |
|
366 |
|
367 SSE_HELPER_B(helper_psubb, FSUB) |
|
368 SSE_HELPER_W(helper_psubw, FSUB) |
|
369 SSE_HELPER_L(helper_psubl, FSUB) |
|
370 SSE_HELPER_Q(helper_psubq, FSUB) |
|
371 |
|
372 SSE_HELPER_B(helper_paddusb, FADDUB) |
|
373 SSE_HELPER_B(helper_paddsb, FADDSB) |
|
374 SSE_HELPER_B(helper_psubusb, FSUBUB) |
|
375 SSE_HELPER_B(helper_psubsb, FSUBSB) |
|
376 |
|
377 SSE_HELPER_W(helper_paddusw, FADDUW) |
|
378 SSE_HELPER_W(helper_paddsw, FADDSW) |
|
379 SSE_HELPER_W(helper_psubusw, FSUBUW) |
|
380 SSE_HELPER_W(helper_psubsw, FSUBSW) |
|
381 |
|
382 SSE_HELPER_B(helper_pminub, FMINUB) |
|
383 SSE_HELPER_B(helper_pmaxub, FMAXUB) |
|
384 |
|
385 SSE_HELPER_W(helper_pminsw, FMINSW) |
|
386 SSE_HELPER_W(helper_pmaxsw, FMAXSW) |
|
387 |
|
388 SSE_HELPER_Q(helper_pand, FAND) |
|
389 SSE_HELPER_Q(helper_pandn, FANDN) |
|
390 SSE_HELPER_Q(helper_por, FOR) |
|
391 SSE_HELPER_Q(helper_pxor, FXOR) |
|
392 |
|
393 SSE_HELPER_B(helper_pcmpgtb, FCMPGTB) |
|
394 SSE_HELPER_W(helper_pcmpgtw, FCMPGTW) |
|
395 SSE_HELPER_L(helper_pcmpgtl, FCMPGTL) |
|
396 |
|
397 SSE_HELPER_B(helper_pcmpeqb, FCMPEQ) |
|
398 SSE_HELPER_W(helper_pcmpeqw, FCMPEQ) |
|
399 SSE_HELPER_L(helper_pcmpeql, FCMPEQ) |
|
400 |
|
401 SSE_HELPER_W(helper_pmullw, FMULLW) |
|
402 #if SHIFT == 0 |
|
403 SSE_HELPER_W(helper_pmulhrw, FMULHRW) |
|
404 #endif |
|
405 SSE_HELPER_W(helper_pmulhuw, FMULHUW) |
|
406 SSE_HELPER_W(helper_pmulhw, FMULHW) |
|
407 |
|
408 SSE_HELPER_B(helper_pavgb, FAVG) |
|
409 SSE_HELPER_W(helper_pavgw, FAVG) |
|
410 |
|
411 void glue(helper_pmuludq, SUFFIX) (Reg *d, Reg *s) |
|
412 { |
|
413 d->Q(0) = (uint64_t)s->L(0) * (uint64_t)d->L(0); |
|
414 #if SHIFT == 1 |
|
415 d->Q(1) = (uint64_t)s->L(2) * (uint64_t)d->L(2); |
|
416 #endif |
|
417 } |
|
418 |
|
419 void glue(helper_pmaddwd, SUFFIX) (Reg *d, Reg *s) |
|
420 { |
|
421 int i; |
|
422 |
|
423 for(i = 0; i < (2 << SHIFT); i++) { |
|
424 d->L(i) = (int16_t)s->W(2*i) * (int16_t)d->W(2*i) + |
|
425 (int16_t)s->W(2*i+1) * (int16_t)d->W(2*i+1); |
|
426 } |
|
427 } |
|
428 |
|
429 #if SHIFT == 0 |
|
/* Absolute value of a (negates a when it is below zero). */
static inline int abs1(int a)
{
    return (a < 0) ? -a : a;
}
|
437 #endif |
|
438 void glue(helper_psadbw, SUFFIX) (Reg *d, Reg *s) |
|
439 { |
|
440 unsigned int val; |
|
441 |
|
442 val = 0; |
|
443 val += abs1(d->B(0) - s->B(0)); |
|
444 val += abs1(d->B(1) - s->B(1)); |
|
445 val += abs1(d->B(2) - s->B(2)); |
|
446 val += abs1(d->B(3) - s->B(3)); |
|
447 val += abs1(d->B(4) - s->B(4)); |
|
448 val += abs1(d->B(5) - s->B(5)); |
|
449 val += abs1(d->B(6) - s->B(6)); |
|
450 val += abs1(d->B(7) - s->B(7)); |
|
451 d->Q(0) = val; |
|
452 #if SHIFT == 1 |
|
453 val = 0; |
|
454 val += abs1(d->B(8) - s->B(8)); |
|
455 val += abs1(d->B(9) - s->B(9)); |
|
456 val += abs1(d->B(10) - s->B(10)); |
|
457 val += abs1(d->B(11) - s->B(11)); |
|
458 val += abs1(d->B(12) - s->B(12)); |
|
459 val += abs1(d->B(13) - s->B(13)); |
|
460 val += abs1(d->B(14) - s->B(14)); |
|
461 val += abs1(d->B(15) - s->B(15)); |
|
462 d->Q(1) = val; |
|
463 #endif |
|
464 } |
|
465 |
|
466 void glue(helper_maskmov, SUFFIX) (Reg *d, Reg *s, target_ulong a0) |
|
467 { |
|
468 int i; |
|
469 for(i = 0; i < (8 << SHIFT); i++) { |
|
470 if (s->B(i) & 0x80) |
|
471 stb(a0 + i, d->B(i)); |
|
472 } |
|
473 } |
|
474 |
|
475 void glue(helper_movl_mm_T0, SUFFIX) (Reg *d, uint32_t val) |
|
476 { |
|
477 d->L(0) = val; |
|
478 d->L(1) = 0; |
|
479 #if SHIFT == 1 |
|
480 d->Q(1) = 0; |
|
481 #endif |
|
482 } |
|
483 |
|
484 #ifdef TARGET_X86_64 |
|
485 void glue(helper_movq_mm_T0, SUFFIX) (Reg *d, uint64_t val) |
|
486 { |
|
487 d->Q(0) = val; |
|
488 #if SHIFT == 1 |
|
489 d->Q(1) = 0; |
|
490 #endif |
|
491 } |
|
492 #endif |
|
493 |
|
494 #if SHIFT == 0 |
|
495 void glue(helper_pshufw, SUFFIX) (Reg *d, Reg *s, int order) |
|
496 { |
|
497 Reg r; |
|
498 r.W(0) = s->W(order & 3); |
|
499 r.W(1) = s->W((order >> 2) & 3); |
|
500 r.W(2) = s->W((order >> 4) & 3); |
|
501 r.W(3) = s->W((order >> 6) & 3); |
|
502 *d = r; |
|
503 } |
|
504 #else |
|
505 void helper_shufps(Reg *d, Reg *s, int order) |
|
506 { |
|
507 Reg r; |
|
508 r.L(0) = d->L(order & 3); |
|
509 r.L(1) = d->L((order >> 2) & 3); |
|
510 r.L(2) = s->L((order >> 4) & 3); |
|
511 r.L(3) = s->L((order >> 6) & 3); |
|
512 *d = r; |
|
513 } |
|
514 |
|
515 void helper_shufpd(Reg *d, Reg *s, int order) |
|
516 { |
|
517 Reg r; |
|
518 r.Q(0) = d->Q(order & 1); |
|
519 r.Q(1) = s->Q((order >> 1) & 1); |
|
520 *d = r; |
|
521 } |
|
522 |
|
523 void glue(helper_pshufd, SUFFIX) (Reg *d, Reg *s, int order) |
|
524 { |
|
525 Reg r; |
|
526 r.L(0) = s->L(order & 3); |
|
527 r.L(1) = s->L((order >> 2) & 3); |
|
528 r.L(2) = s->L((order >> 4) & 3); |
|
529 r.L(3) = s->L((order >> 6) & 3); |
|
530 *d = r; |
|
531 } |
|
532 |
|
533 void glue(helper_pshuflw, SUFFIX) (Reg *d, Reg *s, int order) |
|
534 { |
|
535 Reg r; |
|
536 r.W(0) = s->W(order & 3); |
|
537 r.W(1) = s->W((order >> 2) & 3); |
|
538 r.W(2) = s->W((order >> 4) & 3); |
|
539 r.W(3) = s->W((order >> 6) & 3); |
|
540 r.Q(1) = s->Q(1); |
|
541 *d = r; |
|
542 } |
|
543 |
|
544 void glue(helper_pshufhw, SUFFIX) (Reg *d, Reg *s, int order) |
|
545 { |
|
546 Reg r; |
|
547 r.Q(0) = s->Q(0); |
|
548 r.W(4) = s->W(4 + (order & 3)); |
|
549 r.W(5) = s->W(4 + ((order >> 2) & 3)); |
|
550 r.W(6) = s->W(4 + ((order >> 4) & 3)); |
|
551 r.W(7) = s->W(4 + ((order >> 6) & 3)); |
|
552 *d = r; |
|
553 } |
|
554 #endif |
|
555 |
|
556 #if SHIFT == 1 |
|
557 /* FPU ops */ |
|
558 /* XXX: not accurate */ |
|
559 |
|
/*
 * SSE_HELPER_S(name, F): generate the four floating-point variants of an
 * operation from one per-element macro F(size, a, b):
 *   helper_<name>ss / helper_<name>sd - element 0 only (single / double)
 *   helper_<name>ps / helper_<name>pd - every element (single / double)
 * F receives the element size in bits (32 or 64), the destination element
 * and the source element; its result replaces the destination element.
 */
#define SSE_HELPER_S(name, F)\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}\
\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    d->XMM_S(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    d->XMM_S(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    d->XMM_S(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    d->XMM_S(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    d->XMM_D(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    d->XMM_D(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
}
|
583 |
|
/*
 * Per-element floating-point operations for SSE_HELPER_S.  `size` is 32 or
 * 64 and is pasted into the float<size>_* routine name; those routines are
 * passed env->sse_status.  FPU_MIN/FPU_MAX compare with the plain C
 * operators and yield `b` whenever the comparison is false.  FPU_SQRT
 * ignores `a` and operates on the source element `b`.
 * Replacement lists are parenthesized so they are safe in any expression.
 */
#define FPU_ADD(size, a, b) float ## size ## _add(a, b, &env->sse_status)
#define FPU_SUB(size, a, b) float ## size ## _sub(a, b, &env->sse_status)
#define FPU_MUL(size, a, b) float ## size ## _mul(a, b, &env->sse_status)
#define FPU_DIV(size, a, b) float ## size ## _div(a, b, &env->sse_status)
#define FPU_MIN(size, a, b) ((a) < (b) ? (a) : (b))
#define FPU_MAX(size, a, b) ((a) > (b) ? (a) : (b))
#define FPU_SQRT(size, a, b) float ## size ## _sqrt(b, &env->sse_status)
|
591 |
|
592 SSE_HELPER_S(add, FPU_ADD) |
|
593 SSE_HELPER_S(sub, FPU_SUB) |
|
594 SSE_HELPER_S(mul, FPU_MUL) |
|
595 SSE_HELPER_S(div, FPU_DIV) |
|
596 SSE_HELPER_S(min, FPU_MIN) |
|
597 SSE_HELPER_S(max, FPU_MAX) |
|
598 SSE_HELPER_S(sqrt, FPU_SQRT) |
|
599 |
|
600 |
|
601 /* float to float conversions */ |
|
602 void helper_cvtps2pd(Reg *d, Reg *s) |
|
603 { |
|
604 float32 s0, s1; |
|
605 s0 = s->XMM_S(0); |
|
606 s1 = s->XMM_S(1); |
|
607 d->XMM_D(0) = float32_to_float64(s0, &env->sse_status); |
|
608 d->XMM_D(1) = float32_to_float64(s1, &env->sse_status); |
|
609 } |
|
610 |
|
611 void helper_cvtpd2ps(Reg *d, Reg *s) |
|
612 { |
|
613 d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status); |
|
614 d->XMM_S(1) = float64_to_float32(s->XMM_D(1), &env->sse_status); |
|
615 d->Q(1) = 0; |
|
616 } |
|
617 |
|
618 void helper_cvtss2sd(Reg *d, Reg *s) |
|
619 { |
|
620 d->XMM_D(0) = float32_to_float64(s->XMM_S(0), &env->sse_status); |
|
621 } |
|
622 |
|
623 void helper_cvtsd2ss(Reg *d, Reg *s) |
|
624 { |
|
625 d->XMM_S(0) = float64_to_float32(s->XMM_D(0), &env->sse_status); |
|
626 } |
|
627 |
|
628 /* integer to float */ |
|
629 void helper_cvtdq2ps(Reg *d, Reg *s) |
|
630 { |
|
631 d->XMM_S(0) = int32_to_float32(s->XMM_L(0), &env->sse_status); |
|
632 d->XMM_S(1) = int32_to_float32(s->XMM_L(1), &env->sse_status); |
|
633 d->XMM_S(2) = int32_to_float32(s->XMM_L(2), &env->sse_status); |
|
634 d->XMM_S(3) = int32_to_float32(s->XMM_L(3), &env->sse_status); |
|
635 } |
|
636 |
|
637 void helper_cvtdq2pd(Reg *d, Reg *s) |
|
638 { |
|
639 int32_t l0, l1; |
|
640 l0 = (int32_t)s->XMM_L(0); |
|
641 l1 = (int32_t)s->XMM_L(1); |
|
642 d->XMM_D(0) = int32_to_float64(l0, &env->sse_status); |
|
643 d->XMM_D(1) = int32_to_float64(l1, &env->sse_status); |
|
644 } |
|
645 |
|
646 void helper_cvtpi2ps(XMMReg *d, MMXReg *s) |
|
647 { |
|
648 d->XMM_S(0) = int32_to_float32(s->MMX_L(0), &env->sse_status); |
|
649 d->XMM_S(1) = int32_to_float32(s->MMX_L(1), &env->sse_status); |
|
650 } |
|
651 |
|
652 void helper_cvtpi2pd(XMMReg *d, MMXReg *s) |
|
653 { |
|
654 d->XMM_D(0) = int32_to_float64(s->MMX_L(0), &env->sse_status); |
|
655 d->XMM_D(1) = int32_to_float64(s->MMX_L(1), &env->sse_status); |
|
656 } |
|
657 |
|
658 void helper_cvtsi2ss(XMMReg *d, uint32_t val) |
|
659 { |
|
660 d->XMM_S(0) = int32_to_float32(val, &env->sse_status); |
|
661 } |
|
662 |
|
663 void helper_cvtsi2sd(XMMReg *d, uint32_t val) |
|
664 { |
|
665 d->XMM_D(0) = int32_to_float64(val, &env->sse_status); |
|
666 } |
|
667 |
|
668 #ifdef TARGET_X86_64 |
|
669 void helper_cvtsq2ss(XMMReg *d, uint64_t val) |
|
670 { |
|
671 d->XMM_S(0) = int64_to_float32(val, &env->sse_status); |
|
672 } |
|
673 |
|
674 void helper_cvtsq2sd(XMMReg *d, uint64_t val) |
|
675 { |
|
676 d->XMM_D(0) = int64_to_float64(val, &env->sse_status); |
|
677 } |
|
678 #endif |
|
679 |
|
680 /* float to integer */ |
|
681 void helper_cvtps2dq(XMMReg *d, XMMReg *s) |
|
682 { |
|
683 d->XMM_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status); |
|
684 d->XMM_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status); |
|
685 d->XMM_L(2) = float32_to_int32(s->XMM_S(2), &env->sse_status); |
|
686 d->XMM_L(3) = float32_to_int32(s->XMM_S(3), &env->sse_status); |
|
687 } |
|
688 |
|
689 void helper_cvtpd2dq(XMMReg *d, XMMReg *s) |
|
690 { |
|
691 d->XMM_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status); |
|
692 d->XMM_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status); |
|
693 d->XMM_Q(1) = 0; |
|
694 } |
|
695 |
|
696 void helper_cvtps2pi(MMXReg *d, XMMReg *s) |
|
697 { |
|
698 d->MMX_L(0) = float32_to_int32(s->XMM_S(0), &env->sse_status); |
|
699 d->MMX_L(1) = float32_to_int32(s->XMM_S(1), &env->sse_status); |
|
700 } |
|
701 |
|
702 void helper_cvtpd2pi(MMXReg *d, XMMReg *s) |
|
703 { |
|
704 d->MMX_L(0) = float64_to_int32(s->XMM_D(0), &env->sse_status); |
|
705 d->MMX_L(1) = float64_to_int32(s->XMM_D(1), &env->sse_status); |
|
706 } |
|
707 |
|
708 int32_t helper_cvtss2si(XMMReg *s) |
|
709 { |
|
710 return float32_to_int32(s->XMM_S(0), &env->sse_status); |
|
711 } |
|
712 |
|
713 int32_t helper_cvtsd2si(XMMReg *s) |
|
714 { |
|
715 return float64_to_int32(s->XMM_D(0), &env->sse_status); |
|
716 } |
|
717 |
|
718 #ifdef TARGET_X86_64 |
|
719 int64_t helper_cvtss2sq(XMMReg *s) |
|
720 { |
|
721 return float32_to_int64(s->XMM_S(0), &env->sse_status); |
|
722 } |
|
723 |
|
724 int64_t helper_cvtsd2sq(XMMReg *s) |
|
725 { |
|
726 return float64_to_int64(s->XMM_D(0), &env->sse_status); |
|
727 } |
|
728 #endif |
|
729 |
|
730 /* float to integer truncated */ |
|
731 void helper_cvttps2dq(XMMReg *d, XMMReg *s) |
|
732 { |
|
733 d->XMM_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status); |
|
734 d->XMM_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status); |
|
735 d->XMM_L(2) = float32_to_int32_round_to_zero(s->XMM_S(2), &env->sse_status); |
|
736 d->XMM_L(3) = float32_to_int32_round_to_zero(s->XMM_S(3), &env->sse_status); |
|
737 } |
|
738 |
|
739 void helper_cvttpd2dq(XMMReg *d, XMMReg *s) |
|
740 { |
|
741 d->XMM_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status); |
|
742 d->XMM_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status); |
|
743 d->XMM_Q(1) = 0; |
|
744 } |
|
745 |
|
746 void helper_cvttps2pi(MMXReg *d, XMMReg *s) |
|
747 { |
|
748 d->MMX_L(0) = float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status); |
|
749 d->MMX_L(1) = float32_to_int32_round_to_zero(s->XMM_S(1), &env->sse_status); |
|
750 } |
|
751 |
|
752 void helper_cvttpd2pi(MMXReg *d, XMMReg *s) |
|
753 { |
|
754 d->MMX_L(0) = float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status); |
|
755 d->MMX_L(1) = float64_to_int32_round_to_zero(s->XMM_D(1), &env->sse_status); |
|
756 } |
|
757 |
|
758 int32_t helper_cvttss2si(XMMReg *s) |
|
759 { |
|
760 return float32_to_int32_round_to_zero(s->XMM_S(0), &env->sse_status); |
|
761 } |
|
762 |
|
763 int32_t helper_cvttsd2si(XMMReg *s) |
|
764 { |
|
765 return float64_to_int32_round_to_zero(s->XMM_D(0), &env->sse_status); |
|
766 } |
|
767 |
|
768 #ifdef TARGET_X86_64 |
|
769 int64_t helper_cvttss2sq(XMMReg *s) |
|
770 { |
|
771 return float32_to_int64_round_to_zero(s->XMM_S(0), &env->sse_status); |
|
772 } |
|
773 |
|
774 int64_t helper_cvttsd2sq(XMMReg *s) |
|
775 { |
|
776 return float64_to_int64_round_to_zero(s->XMM_D(0), &env->sse_status); |
|
777 } |
|
778 #endif |
|
779 |
|
780 void helper_rsqrtps(XMMReg *d, XMMReg *s) |
|
781 { |
|
782 d->XMM_S(0) = approx_rsqrt(s->XMM_S(0)); |
|
783 d->XMM_S(1) = approx_rsqrt(s->XMM_S(1)); |
|
784 d->XMM_S(2) = approx_rsqrt(s->XMM_S(2)); |
|
785 d->XMM_S(3) = approx_rsqrt(s->XMM_S(3)); |
|
786 } |
|
787 |
|
788 void helper_rsqrtss(XMMReg *d, XMMReg *s) |
|
789 { |
|
790 d->XMM_S(0) = approx_rsqrt(s->XMM_S(0)); |
|
791 } |
|
792 |
|
793 void helper_rcpps(XMMReg *d, XMMReg *s) |
|
794 { |
|
795 d->XMM_S(0) = approx_rcp(s->XMM_S(0)); |
|
796 d->XMM_S(1) = approx_rcp(s->XMM_S(1)); |
|
797 d->XMM_S(2) = approx_rcp(s->XMM_S(2)); |
|
798 d->XMM_S(3) = approx_rcp(s->XMM_S(3)); |
|
799 } |
|
800 |
|
801 void helper_rcpss(XMMReg *d, XMMReg *s) |
|
802 { |
|
803 d->XMM_S(0) = approx_rcp(s->XMM_S(0)); |
|
804 } |
|
805 |
|
806 void helper_haddps(XMMReg *d, XMMReg *s) |
|
807 { |
|
808 XMMReg r; |
|
809 r.XMM_S(0) = d->XMM_S(0) + d->XMM_S(1); |
|
810 r.XMM_S(1) = d->XMM_S(2) + d->XMM_S(3); |
|
811 r.XMM_S(2) = s->XMM_S(0) + s->XMM_S(1); |
|
812 r.XMM_S(3) = s->XMM_S(2) + s->XMM_S(3); |
|
813 *d = r; |
|
814 } |
|
815 |
|
816 void helper_haddpd(XMMReg *d, XMMReg *s) |
|
817 { |
|
818 XMMReg r; |
|
819 r.XMM_D(0) = d->XMM_D(0) + d->XMM_D(1); |
|
820 r.XMM_D(1) = s->XMM_D(0) + s->XMM_D(1); |
|
821 *d = r; |
|
822 } |
|
823 |
|
824 void helper_hsubps(XMMReg *d, XMMReg *s) |
|
825 { |
|
826 XMMReg r; |
|
827 r.XMM_S(0) = d->XMM_S(0) - d->XMM_S(1); |
|
828 r.XMM_S(1) = d->XMM_S(2) - d->XMM_S(3); |
|
829 r.XMM_S(2) = s->XMM_S(0) - s->XMM_S(1); |
|
830 r.XMM_S(3) = s->XMM_S(2) - s->XMM_S(3); |
|
831 *d = r; |
|
832 } |
|
833 |
|
834 void helper_hsubpd(XMMReg *d, XMMReg *s) |
|
835 { |
|
836 XMMReg r; |
|
837 r.XMM_D(0) = d->XMM_D(0) - d->XMM_D(1); |
|
838 r.XMM_D(1) = s->XMM_D(0) - s->XMM_D(1); |
|
839 *d = r; |
|
840 } |
|
841 |
|
842 void helper_addsubps(XMMReg *d, XMMReg *s) |
|
843 { |
|
844 d->XMM_S(0) = d->XMM_S(0) - s->XMM_S(0); |
|
845 d->XMM_S(1) = d->XMM_S(1) + s->XMM_S(1); |
|
846 d->XMM_S(2) = d->XMM_S(2) - s->XMM_S(2); |
|
847 d->XMM_S(3) = d->XMM_S(3) + s->XMM_S(3); |
|
848 } |
|
849 |
|
850 void helper_addsubpd(XMMReg *d, XMMReg *s) |
|
851 { |
|
852 d->XMM_D(0) = d->XMM_D(0) - s->XMM_D(0); |
|
853 d->XMM_D(1) = d->XMM_D(1) + s->XMM_D(1); |
|
854 } |
|
855 |
|
856 /* XXX: unordered */ |
|
/*
 * SSE_HELPER_CMP(name, F): generate the four floating-point compare
 * variants from one per-element predicate macro F(size, a, b):
 *   helper_<name>ss / helper_<name>sd - element 0 only (single / double)
 *   helper_<name>ps / helper_<name>pd - every element (single / double)
 * The value F yields is stored as an integer mask of the same width as the
 * compared element (XMM_L for singles, XMM_Q for doubles).
 */
#define SSE_HELPER_CMP(name, F)\
void helper_ ## name ## ss (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
}\
\
void helper_ ## name ## sd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
}\
\
void helper_ ## name ## ps (Reg *d, Reg *s)\
{\
    d->XMM_L(0) = F(32, d->XMM_S(0), s->XMM_S(0));\
    d->XMM_L(1) = F(32, d->XMM_S(1), s->XMM_S(1));\
    d->XMM_L(2) = F(32, d->XMM_S(2), s->XMM_S(2));\
    d->XMM_L(3) = F(32, d->XMM_S(3), s->XMM_S(3));\
}\
\
void helper_ ## name ## pd (Reg *d, Reg *s)\
{\
    d->XMM_Q(0) = F(64, d->XMM_D(0), s->XMM_D(0));\
    d->XMM_Q(1) = F(64, d->XMM_D(1), s->XMM_D(1));\
}
|
880 |
|
881 #define FPU_CMPEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? -1 : 0 |
|
882 #define FPU_CMPLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? -1 : 0 |
|
883 #define FPU_CMPLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? -1 : 0 |
|
884 #define FPU_CMPUNORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? - 1 : 0 |
|
885 #define FPU_CMPNEQ(size, a, b) float ## size ## _eq(a, b, &env->sse_status) ? 0 : -1 |
|
886 #define FPU_CMPNLT(size, a, b) float ## size ## _lt(a, b, &env->sse_status) ? 0 : -1 |
|
887 #define FPU_CMPNLE(size, a, b) float ## size ## _le(a, b, &env->sse_status) ? 0 : -1 |
|
888 #define FPU_CMPORD(size, a, b) float ## size ## _unordered(a, b, &env->sse_status) ? 0 : -1 |
|
889 |
|
890 SSE_HELPER_CMP(cmpeq, FPU_CMPEQ) |
|
891 SSE_HELPER_CMP(cmplt, FPU_CMPLT) |
|
892 SSE_HELPER_CMP(cmple, FPU_CMPLE) |
|
893 SSE_HELPER_CMP(cmpunord, FPU_CMPUNORD) |
|
894 SSE_HELPER_CMP(cmpneq, FPU_CMPNEQ) |
|
895 SSE_HELPER_CMP(cmpnlt, FPU_CMPNLT) |
|
896 SSE_HELPER_CMP(cmpnle, FPU_CMPNLE) |
|
897 SSE_HELPER_CMP(cmpord, FPU_CMPORD) |
|
898 |
|
899 const int comis_eflags[4] = {CC_C, CC_Z, 0, CC_Z | CC_P | CC_C}; |
|
900 |
|
901 void helper_ucomiss(Reg *d, Reg *s) |
|
902 { |
|
903 int ret; |
|
904 float32 s0, s1; |
|
905 |
|
906 s0 = d->XMM_S(0); |
|
907 s1 = s->XMM_S(0); |
|
908 ret = float32_compare_quiet(s0, s1, &env->sse_status); |
|
909 CC_SRC = comis_eflags[ret + 1]; |
|
910 } |
|
911 |
|
912 void helper_comiss(Reg *d, Reg *s) |
|
913 { |
|
914 int ret; |
|
915 float32 s0, s1; |
|
916 |
|
917 s0 = d->XMM_S(0); |
|
918 s1 = s->XMM_S(0); |
|
919 ret = float32_compare(s0, s1, &env->sse_status); |
|
920 CC_SRC = comis_eflags[ret + 1]; |
|
921 } |
|
922 |
|
923 void helper_ucomisd(Reg *d, Reg *s) |
|
924 { |
|
925 int ret; |
|
926 float64 d0, d1; |
|
927 |
|
928 d0 = d->XMM_D(0); |
|
929 d1 = s->XMM_D(0); |
|
930 ret = float64_compare_quiet(d0, d1, &env->sse_status); |
|
931 CC_SRC = comis_eflags[ret + 1]; |
|
932 } |
|
933 |
|
934 void helper_comisd(Reg *d, Reg *s) |
|
935 { |
|
936 int ret; |
|
937 float64 d0, d1; |
|
938 |
|
939 d0 = d->XMM_D(0); |
|
940 d1 = s->XMM_D(0); |
|
941 ret = float64_compare(d0, d1, &env->sse_status); |
|
942 CC_SRC = comis_eflags[ret + 1]; |
|
943 } |
|
944 |
|
945 uint32_t helper_movmskps(Reg *s) |
|
946 { |
|
947 int b0, b1, b2, b3; |
|
948 b0 = s->XMM_L(0) >> 31; |
|
949 b1 = s->XMM_L(1) >> 31; |
|
950 b2 = s->XMM_L(2) >> 31; |
|
951 b3 = s->XMM_L(3) >> 31; |
|
952 return b0 | (b1 << 1) | (b2 << 2) | (b3 << 3); |
|
953 } |
|
954 |
|
955 uint32_t helper_movmskpd(Reg *s) |
|
956 { |
|
957 int b0, b1; |
|
958 b0 = s->XMM_L(1) >> 31; |
|
959 b1 = s->XMM_L(3) >> 31; |
|
960 return b0 | (b1 << 1); |
|
961 } |
|
962 |
|
963 #endif |
|
964 |
|
965 uint32_t glue(helper_pmovmskb, SUFFIX)(Reg *s) |
|
966 { |
|
967 uint32_t val; |
|
968 val = 0; |
|
969 val |= (s->B(0) >> 7); |
|
970 val |= (s->B(1) >> 6) & 0x02; |
|
971 val |= (s->B(2) >> 5) & 0x04; |
|
972 val |= (s->B(3) >> 4) & 0x08; |
|
973 val |= (s->B(4) >> 3) & 0x10; |
|
974 val |= (s->B(5) >> 2) & 0x20; |
|
975 val |= (s->B(6) >> 1) & 0x40; |
|
976 val |= (s->B(7)) & 0x80; |
|
977 #if SHIFT == 1 |
|
978 val |= (s->B(8) << 1) & 0x0100; |
|
979 val |= (s->B(9) << 2) & 0x0200; |
|
980 val |= (s->B(10) << 3) & 0x0400; |
|
981 val |= (s->B(11) << 4) & 0x0800; |
|
982 val |= (s->B(12) << 5) & 0x1000; |
|
983 val |= (s->B(13) << 6) & 0x2000; |
|
984 val |= (s->B(14) << 7) & 0x4000; |
|
985 val |= (s->B(15) << 8) & 0x8000; |
|
986 #endif |
|
987 return val; |
|
988 } |
|
989 |
|
990 void glue(helper_packsswb, SUFFIX) (Reg *d, Reg *s) |
|
991 { |
|
992 Reg r; |
|
993 |
|
994 r.B(0) = satsb((int16_t)d->W(0)); |
|
995 r.B(1) = satsb((int16_t)d->W(1)); |
|
996 r.B(2) = satsb((int16_t)d->W(2)); |
|
997 r.B(3) = satsb((int16_t)d->W(3)); |
|
998 #if SHIFT == 1 |
|
999 r.B(4) = satsb((int16_t)d->W(4)); |
|
1000 r.B(5) = satsb((int16_t)d->W(5)); |
|
1001 r.B(6) = satsb((int16_t)d->W(6)); |
|
1002 r.B(7) = satsb((int16_t)d->W(7)); |
|
1003 #endif |
|
1004 r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0)); |
|
1005 r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1)); |
|
1006 r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2)); |
|
1007 r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3)); |
|
1008 #if SHIFT == 1 |
|
1009 r.B(12) = satsb((int16_t)s->W(4)); |
|
1010 r.B(13) = satsb((int16_t)s->W(5)); |
|
1011 r.B(14) = satsb((int16_t)s->W(6)); |
|
1012 r.B(15) = satsb((int16_t)s->W(7)); |
|
1013 #endif |
|
1014 *d = r; |
|
1015 } |
|
1016 |
|
1017 void glue(helper_packuswb, SUFFIX) (Reg *d, Reg *s) |
|
1018 { |
|
1019 Reg r; |
|
1020 |
|
1021 r.B(0) = satub((int16_t)d->W(0)); |
|
1022 r.B(1) = satub((int16_t)d->W(1)); |
|
1023 r.B(2) = satub((int16_t)d->W(2)); |
|
1024 r.B(3) = satub((int16_t)d->W(3)); |
|
1025 #if SHIFT == 1 |
|
1026 r.B(4) = satub((int16_t)d->W(4)); |
|
1027 r.B(5) = satub((int16_t)d->W(5)); |
|
1028 r.B(6) = satub((int16_t)d->W(6)); |
|
1029 r.B(7) = satub((int16_t)d->W(7)); |
|
1030 #endif |
|
1031 r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0)); |
|
1032 r.B((4 << SHIFT) + 1) = satub((int16_t)s->W(1)); |
|
1033 r.B((4 << SHIFT) + 2) = satub((int16_t)s->W(2)); |
|
1034 r.B((4 << SHIFT) + 3) = satub((int16_t)s->W(3)); |
|
1035 #if SHIFT == 1 |
|
1036 r.B(12) = satub((int16_t)s->W(4)); |
|
1037 r.B(13) = satub((int16_t)s->W(5)); |
|
1038 r.B(14) = satub((int16_t)s->W(6)); |
|
1039 r.B(15) = satub((int16_t)s->W(7)); |
|
1040 #endif |
|
1041 *d = r; |
|
1042 } |
|
1043 |
|
1044 void glue(helper_packssdw, SUFFIX) (Reg *d, Reg *s) |
|
1045 { |
|
1046 Reg r; |
|
1047 |
|
1048 r.W(0) = satsw(d->L(0)); |
|
1049 r.W(1) = satsw(d->L(1)); |
|
1050 #if SHIFT == 1 |
|
1051 r.W(2) = satsw(d->L(2)); |
|
1052 r.W(3) = satsw(d->L(3)); |
|
1053 #endif |
|
1054 r.W((2 << SHIFT) + 0) = satsw(s->L(0)); |
|
1055 r.W((2 << SHIFT) + 1) = satsw(s->L(1)); |
|
1056 #if SHIFT == 1 |
|
1057 r.W(6) = satsw(s->L(2)); |
|
1058 r.W(7) = satsw(s->L(3)); |
|
1059 #endif |
|
1060 *d = r; |
|
1061 } |
|
1062 |
|
1063 #define UNPCK_OP(base_name, base) \ |
|
1064 \ |
|
1065 void glue(helper_punpck ## base_name ## bw, SUFFIX) (Reg *d, Reg *s) \ |
|
1066 { \ |
|
1067 Reg r; \ |
|
1068 \ |
|
1069 r.B(0) = d->B((base << (SHIFT + 2)) + 0); \ |
|
1070 r.B(1) = s->B((base << (SHIFT + 2)) + 0); \ |
|
1071 r.B(2) = d->B((base << (SHIFT + 2)) + 1); \ |
|
1072 r.B(3) = s->B((base << (SHIFT + 2)) + 1); \ |
|
1073 r.B(4) = d->B((base << (SHIFT + 2)) + 2); \ |
|
1074 r.B(5) = s->B((base << (SHIFT + 2)) + 2); \ |
|
1075 r.B(6) = d->B((base << (SHIFT + 2)) + 3); \ |
|
1076 r.B(7) = s->B((base << (SHIFT + 2)) + 3); \ |
|
1077 XMM_ONLY( \ |
|
1078 r.B(8) = d->B((base << (SHIFT + 2)) + 4); \ |
|
1079 r.B(9) = s->B((base << (SHIFT + 2)) + 4); \ |
|
1080 r.B(10) = d->B((base << (SHIFT + 2)) + 5); \ |
|
1081 r.B(11) = s->B((base << (SHIFT + 2)) + 5); \ |
|
1082 r.B(12) = d->B((base << (SHIFT + 2)) + 6); \ |
|
1083 r.B(13) = s->B((base << (SHIFT + 2)) + 6); \ |
|
1084 r.B(14) = d->B((base << (SHIFT + 2)) + 7); \ |
|
1085 r.B(15) = s->B((base << (SHIFT + 2)) + 7); \ |
|
1086 ) \ |
|
1087 *d = r; \ |
|
1088 } \ |
|
1089 \ |
|
1090 void glue(helper_punpck ## base_name ## wd, SUFFIX) (Reg *d, Reg *s) \ |
|
1091 { \ |
|
1092 Reg r; \ |
|
1093 \ |
|
1094 r.W(0) = d->W((base << (SHIFT + 1)) + 0); \ |
|
1095 r.W(1) = s->W((base << (SHIFT + 1)) + 0); \ |
|
1096 r.W(2) = d->W((base << (SHIFT + 1)) + 1); \ |
|
1097 r.W(3) = s->W((base << (SHIFT + 1)) + 1); \ |
|
1098 XMM_ONLY( \ |
|
1099 r.W(4) = d->W((base << (SHIFT + 1)) + 2); \ |
|
1100 r.W(5) = s->W((base << (SHIFT + 1)) + 2); \ |
|
1101 r.W(6) = d->W((base << (SHIFT + 1)) + 3); \ |
|
1102 r.W(7) = s->W((base << (SHIFT + 1)) + 3); \ |
|
1103 ) \ |
|
1104 *d = r; \ |
|
1105 } \ |
|
1106 \ |
|
1107 void glue(helper_punpck ## base_name ## dq, SUFFIX) (Reg *d, Reg *s) \ |
|
1108 { \ |
|
1109 Reg r; \ |
|
1110 \ |
|
1111 r.L(0) = d->L((base << SHIFT) + 0); \ |
|
1112 r.L(1) = s->L((base << SHIFT) + 0); \ |
|
1113 XMM_ONLY( \ |
|
1114 r.L(2) = d->L((base << SHIFT) + 1); \ |
|
1115 r.L(3) = s->L((base << SHIFT) + 1); \ |
|
1116 ) \ |
|
1117 *d = r; \ |
|
1118 } \ |
|
1119 \ |
|
1120 XMM_ONLY( \ |
|
1121 void glue(helper_punpck ## base_name ## qdq, SUFFIX) (Reg *d, Reg *s) \ |
|
1122 { \ |
|
1123 Reg r; \ |
|
1124 \ |
|
1125 r.Q(0) = d->Q(base); \ |
|
1126 r.Q(1) = s->Q(base); \ |
|
1127 *d = r; \ |
|
1128 } \ |
|
1129 ) |
|
1130 |
|
1131 UNPCK_OP(l, 0) |
|
1132 UNPCK_OP(h, 1) |
|
1133 |
|
1134 /* 3DNow! float ops */ |
|
1135 #if SHIFT == 0 |
|
1136 void helper_pi2fd(MMXReg *d, MMXReg *s) |
|
1137 { |
|
1138 d->MMX_S(0) = int32_to_float32(s->MMX_L(0), &env->mmx_status); |
|
1139 d->MMX_S(1) = int32_to_float32(s->MMX_L(1), &env->mmx_status); |
|
1140 } |
|
1141 |
|
1142 void helper_pi2fw(MMXReg *d, MMXReg *s) |
|
1143 { |
|
1144 d->MMX_S(0) = int32_to_float32((int16_t)s->MMX_W(0), &env->mmx_status); |
|
1145 d->MMX_S(1) = int32_to_float32((int16_t)s->MMX_W(2), &env->mmx_status); |
|
1146 } |
|
1147 |
|
1148 void helper_pf2id(MMXReg *d, MMXReg *s) |
|
1149 { |
|
1150 d->MMX_L(0) = float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status); |
|
1151 d->MMX_L(1) = float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status); |
|
1152 } |
|
1153 |
|
1154 void helper_pf2iw(MMXReg *d, MMXReg *s) |
|
1155 { |
|
1156 d->MMX_L(0) = satsw(float32_to_int32_round_to_zero(s->MMX_S(0), &env->mmx_status)); |
|
1157 d->MMX_L(1) = satsw(float32_to_int32_round_to_zero(s->MMX_S(1), &env->mmx_status)); |
|
1158 } |
|
1159 |
|
1160 void helper_pfacc(MMXReg *d, MMXReg *s) |
|
1161 { |
|
1162 MMXReg r; |
|
1163 r.MMX_S(0) = float32_add(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); |
|
1164 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); |
|
1165 *d = r; |
|
1166 } |
|
1167 |
|
1168 void helper_pfadd(MMXReg *d, MMXReg *s) |
|
1169 { |
|
1170 d->MMX_S(0) = float32_add(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); |
|
1171 d->MMX_S(1) = float32_add(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); |
|
1172 } |
|
1173 |
|
1174 void helper_pfcmpeq(MMXReg *d, MMXReg *s) |
|
1175 { |
|
1176 d->MMX_L(0) = float32_eq(d->MMX_S(0), s->MMX_S(0), &env->mmx_status) ? -1 : 0; |
|
1177 d->MMX_L(1) = float32_eq(d->MMX_S(1), s->MMX_S(1), &env->mmx_status) ? -1 : 0; |
|
1178 } |
|
1179 |
|
1180 void helper_pfcmpge(MMXReg *d, MMXReg *s) |
|
1181 { |
|
1182 d->MMX_L(0) = float32_le(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0; |
|
1183 d->MMX_L(1) = float32_le(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0; |
|
1184 } |
|
1185 |
|
1186 void helper_pfcmpgt(MMXReg *d, MMXReg *s) |
|
1187 { |
|
1188 d->MMX_L(0) = float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status) ? -1 : 0; |
|
1189 d->MMX_L(1) = float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status) ? -1 : 0; |
|
1190 } |
|
1191 |
|
1192 void helper_pfmax(MMXReg *d, MMXReg *s) |
|
1193 { |
|
1194 if (float32_lt(d->MMX_S(0), s->MMX_S(0), &env->mmx_status)) |
|
1195 d->MMX_S(0) = s->MMX_S(0); |
|
1196 if (float32_lt(d->MMX_S(1), s->MMX_S(1), &env->mmx_status)) |
|
1197 d->MMX_S(1) = s->MMX_S(1); |
|
1198 } |
|
1199 |
|
1200 void helper_pfmin(MMXReg *d, MMXReg *s) |
|
1201 { |
|
1202 if (float32_lt(s->MMX_S(0), d->MMX_S(0), &env->mmx_status)) |
|
1203 d->MMX_S(0) = s->MMX_S(0); |
|
1204 if (float32_lt(s->MMX_S(1), d->MMX_S(1), &env->mmx_status)) |
|
1205 d->MMX_S(1) = s->MMX_S(1); |
|
1206 } |
|
1207 |
|
1208 void helper_pfmul(MMXReg *d, MMXReg *s) |
|
1209 { |
|
1210 d->MMX_S(0) = float32_mul(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); |
|
1211 d->MMX_S(1) = float32_mul(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); |
|
1212 } |
|
1213 |
|
1214 void helper_pfnacc(MMXReg *d, MMXReg *s) |
|
1215 { |
|
1216 MMXReg r; |
|
1217 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); |
|
1218 r.MMX_S(1) = float32_sub(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); |
|
1219 *d = r; |
|
1220 } |
|
1221 |
|
1222 void helper_pfpnacc(MMXReg *d, MMXReg *s) |
|
1223 { |
|
1224 MMXReg r; |
|
1225 r.MMX_S(0) = float32_sub(d->MMX_S(0), d->MMX_S(1), &env->mmx_status); |
|
1226 r.MMX_S(1) = float32_add(s->MMX_S(0), s->MMX_S(1), &env->mmx_status); |
|
1227 *d = r; |
|
1228 } |
|
1229 |
|
1230 void helper_pfrcp(MMXReg *d, MMXReg *s) |
|
1231 { |
|
1232 d->MMX_S(0) = approx_rcp(s->MMX_S(0)); |
|
1233 d->MMX_S(1) = d->MMX_S(0); |
|
1234 } |
|
1235 |
|
1236 void helper_pfrsqrt(MMXReg *d, MMXReg *s) |
|
1237 { |
|
1238 d->MMX_L(1) = s->MMX_L(0) & 0x7fffffff; |
|
1239 d->MMX_S(1) = approx_rsqrt(d->MMX_S(1)); |
|
1240 d->MMX_L(1) |= s->MMX_L(0) & 0x80000000; |
|
1241 d->MMX_L(0) = d->MMX_L(1); |
|
1242 } |
|
1243 |
|
1244 void helper_pfsub(MMXReg *d, MMXReg *s) |
|
1245 { |
|
1246 d->MMX_S(0) = float32_sub(d->MMX_S(0), s->MMX_S(0), &env->mmx_status); |
|
1247 d->MMX_S(1) = float32_sub(d->MMX_S(1), s->MMX_S(1), &env->mmx_status); |
|
1248 } |
|
1249 |
|
1250 void helper_pfsubr(MMXReg *d, MMXReg *s) |
|
1251 { |
|
1252 d->MMX_S(0) = float32_sub(s->MMX_S(0), d->MMX_S(0), &env->mmx_status); |
|
1253 d->MMX_S(1) = float32_sub(s->MMX_S(1), d->MMX_S(1), &env->mmx_status); |
|
1254 } |
|
1255 |
|
1256 void helper_pswapd(MMXReg *d, MMXReg *s) |
|
1257 { |
|
1258 MMXReg r; |
|
1259 r.MMX_L(0) = s->MMX_L(1); |
|
1260 r.MMX_L(1) = s->MMX_L(0); |
|
1261 *d = r; |
|
1262 } |
|
1263 #endif |
|
1264 |
|
1265 /* SSSE3 op helpers */ |
|
1266 void glue(helper_pshufb, SUFFIX) (Reg *d, Reg *s) |
|
1267 { |
|
1268 int i; |
|
1269 Reg r; |
|
1270 |
|
1271 for (i = 0; i < (8 << SHIFT); i++) |
|
1272 r.B(i) = (s->B(i) & 0x80) ? 0 : (d->B(s->B(i) & ((8 << SHIFT) - 1))); |
|
1273 |
|
1274 *d = r; |
|
1275 } |
|
1276 |
|
1277 void glue(helper_phaddw, SUFFIX) (Reg *d, Reg *s) |
|
1278 { |
|
1279 d->W(0) = (int16_t)d->W(0) + (int16_t)d->W(1); |
|
1280 d->W(1) = (int16_t)d->W(2) + (int16_t)d->W(3); |
|
1281 XMM_ONLY(d->W(2) = (int16_t)d->W(4) + (int16_t)d->W(5)); |
|
1282 XMM_ONLY(d->W(3) = (int16_t)d->W(6) + (int16_t)d->W(7)); |
|
1283 d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) + (int16_t)s->W(1); |
|
1284 d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) + (int16_t)s->W(3); |
|
1285 XMM_ONLY(d->W(6) = (int16_t)s->W(4) + (int16_t)s->W(5)); |
|
1286 XMM_ONLY(d->W(7) = (int16_t)s->W(6) + (int16_t)s->W(7)); |
|
1287 } |
|
1288 |
|
1289 void glue(helper_phaddd, SUFFIX) (Reg *d, Reg *s) |
|
1290 { |
|
1291 d->L(0) = (int32_t)d->L(0) + (int32_t)d->L(1); |
|
1292 XMM_ONLY(d->L(1) = (int32_t)d->L(2) + (int32_t)d->L(3)); |
|
1293 d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) + (int32_t)s->L(1); |
|
1294 XMM_ONLY(d->L(3) = (int32_t)s->L(2) + (int32_t)s->L(3)); |
|
1295 } |
|
1296 |
|
1297 void glue(helper_phaddsw, SUFFIX) (Reg *d, Reg *s) |
|
1298 { |
|
1299 d->W(0) = satsw((int16_t)d->W(0) + (int16_t)d->W(1)); |
|
1300 d->W(1) = satsw((int16_t)d->W(2) + (int16_t)d->W(3)); |
|
1301 XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) + (int16_t)d->W(5))); |
|
1302 XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) + (int16_t)d->W(7))); |
|
1303 d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) + (int16_t)s->W(1)); |
|
1304 d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) + (int16_t)s->W(3)); |
|
1305 XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) + (int16_t)s->W(5))); |
|
1306 XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) + (int16_t)s->W(7))); |
|
1307 } |
|
1308 |
|
1309 void glue(helper_pmaddubsw, SUFFIX) (Reg *d, Reg *s) |
|
1310 { |
|
1311 d->W(0) = satsw((int8_t)s->B( 0) * (uint8_t)d->B( 0) + |
|
1312 (int8_t)s->B( 1) * (uint8_t)d->B( 1)); |
|
1313 d->W(1) = satsw((int8_t)s->B( 2) * (uint8_t)d->B( 2) + |
|
1314 (int8_t)s->B( 3) * (uint8_t)d->B( 3)); |
|
1315 d->W(2) = satsw((int8_t)s->B( 4) * (uint8_t)d->B( 4) + |
|
1316 (int8_t)s->B( 5) * (uint8_t)d->B( 5)); |
|
1317 d->W(3) = satsw((int8_t)s->B( 6) * (uint8_t)d->B( 6) + |
|
1318 (int8_t)s->B( 7) * (uint8_t)d->B( 7)); |
|
1319 #if SHIFT == 1 |
|
1320 d->W(4) = satsw((int8_t)s->B( 8) * (uint8_t)d->B( 8) + |
|
1321 (int8_t)s->B( 9) * (uint8_t)d->B( 9)); |
|
1322 d->W(5) = satsw((int8_t)s->B(10) * (uint8_t)d->B(10) + |
|
1323 (int8_t)s->B(11) * (uint8_t)d->B(11)); |
|
1324 d->W(6) = satsw((int8_t)s->B(12) * (uint8_t)d->B(12) + |
|
1325 (int8_t)s->B(13) * (uint8_t)d->B(13)); |
|
1326 d->W(7) = satsw((int8_t)s->B(14) * (uint8_t)d->B(14) + |
|
1327 (int8_t)s->B(15) * (uint8_t)d->B(15)); |
|
1328 #endif |
|
1329 } |
|
1330 |
|
1331 void glue(helper_phsubw, SUFFIX) (Reg *d, Reg *s) |
|
1332 { |
|
1333 d->W(0) = (int16_t)d->W(0) - (int16_t)d->W(1); |
|
1334 d->W(1) = (int16_t)d->W(2) - (int16_t)d->W(3); |
|
1335 XMM_ONLY(d->W(2) = (int16_t)d->W(4) - (int16_t)d->W(5)); |
|
1336 XMM_ONLY(d->W(3) = (int16_t)d->W(6) - (int16_t)d->W(7)); |
|
1337 d->W((2 << SHIFT) + 0) = (int16_t)s->W(0) - (int16_t)s->W(1); |
|
1338 d->W((2 << SHIFT) + 1) = (int16_t)s->W(2) - (int16_t)s->W(3); |
|
1339 XMM_ONLY(d->W(6) = (int16_t)s->W(4) - (int16_t)s->W(5)); |
|
1340 XMM_ONLY(d->W(7) = (int16_t)s->W(6) - (int16_t)s->W(7)); |
|
1341 } |
|
1342 |
|
1343 void glue(helper_phsubd, SUFFIX) (Reg *d, Reg *s) |
|
1344 { |
|
1345 d->L(0) = (int32_t)d->L(0) - (int32_t)d->L(1); |
|
1346 XMM_ONLY(d->L(1) = (int32_t)d->L(2) - (int32_t)d->L(3)); |
|
1347 d->L((1 << SHIFT) + 0) = (int32_t)s->L(0) - (int32_t)s->L(1); |
|
1348 XMM_ONLY(d->L(3) = (int32_t)s->L(2) - (int32_t)s->L(3)); |
|
1349 } |
|
1350 |
|
1351 void glue(helper_phsubsw, SUFFIX) (Reg *d, Reg *s) |
|
1352 { |
|
1353 d->W(0) = satsw((int16_t)d->W(0) - (int16_t)d->W(1)); |
|
1354 d->W(1) = satsw((int16_t)d->W(2) - (int16_t)d->W(3)); |
|
1355 XMM_ONLY(d->W(2) = satsw((int16_t)d->W(4) - (int16_t)d->W(5))); |
|
1356 XMM_ONLY(d->W(3) = satsw((int16_t)d->W(6) - (int16_t)d->W(7))); |
|
1357 d->W((2 << SHIFT) + 0) = satsw((int16_t)s->W(0) - (int16_t)s->W(1)); |
|
1358 d->W((2 << SHIFT) + 1) = satsw((int16_t)s->W(2) - (int16_t)s->W(3)); |
|
1359 XMM_ONLY(d->W(6) = satsw((int16_t)s->W(4) - (int16_t)s->W(5))); |
|
1360 XMM_ONLY(d->W(7) = satsw((int16_t)s->W(6) - (int16_t)s->W(7))); |
|
1361 } |
|
1362 |
|
1363 #define FABSB(_, x) x > INT8_MAX ? -(int8_t ) x : x |
|
1364 #define FABSW(_, x) x > INT16_MAX ? -(int16_t) x : x |
|
1365 #define FABSL(_, x) x > INT32_MAX ? -(int32_t) x : x |
|
1366 SSE_HELPER_B(helper_pabsb, FABSB) |
|
1367 SSE_HELPER_W(helper_pabsw, FABSW) |
|
1368 SSE_HELPER_L(helper_pabsd, FABSL) |
|
1369 |
|
1370 #define FMULHRSW(d, s) ((int16_t) d * (int16_t) s + 0x4000) >> 15 |
|
1371 SSE_HELPER_W(helper_pmulhrsw, FMULHRSW) |
|
1372 |
|
1373 #define FSIGNB(d, s) s <= INT8_MAX ? s ? d : 0 : -(int8_t ) d |
|
1374 #define FSIGNW(d, s) s <= INT16_MAX ? s ? d : 0 : -(int16_t) d |
|
1375 #define FSIGNL(d, s) s <= INT32_MAX ? s ? d : 0 : -(int32_t) d |
|
1376 SSE_HELPER_B(helper_psignb, FSIGNB) |
|
1377 SSE_HELPER_W(helper_psignw, FSIGNW) |
|
1378 SSE_HELPER_L(helper_psignd, FSIGNL) |
|
1379 |
|
1380 void glue(helper_palignr, SUFFIX) (Reg *d, Reg *s, int32_t shift) |
|
1381 { |
|
1382 Reg r; |
|
1383 |
|
1384 /* XXX could be checked during translation */ |
|
1385 if (shift >= (16 << SHIFT)) { |
|
1386 r.Q(0) = 0; |
|
1387 XMM_ONLY(r.Q(1) = 0); |
|
1388 } else { |
|
1389 shift <<= 3; |
|
1390 #define SHR(v, i) (i < 64 && i > -64 ? i > 0 ? v >> (i) : (v << -(i)) : 0) |
|
1391 #if SHIFT == 0 |
|
1392 r.Q(0) = SHR(s->Q(0), shift - 0) | |
|
1393 SHR(d->Q(0), shift - 64); |
|
1394 #else |
|
1395 r.Q(0) = SHR(s->Q(0), shift - 0) | |
|
1396 SHR(s->Q(1), shift - 64) | |
|
1397 SHR(d->Q(0), shift - 128) | |
|
1398 SHR(d->Q(1), shift - 192); |
|
1399 r.Q(1) = SHR(s->Q(0), shift + 64) | |
|
1400 SHR(s->Q(1), shift - 0) | |
|
1401 SHR(d->Q(0), shift - 64) | |
|
1402 SHR(d->Q(1), shift - 128); |
|
1403 #endif |
|
1404 #undef SHR |
|
1405 } |
|
1406 |
|
1407 *d = r; |
|
1408 } |
|
1409 |
|
/* Shorthand for the first XMM register of the current CPU state. */
#define XMM0 (env->xmm_regs[0])
|
1411 |
|
1412 #if SHIFT == 1 |
|
/*
 * SSE_HELPER_V(name, elem, num, F) defines a helper that, for each of
 * the `num` elements selected by the accessor `elem`, stores
 * F(d element, s element, XMM0 element) back into d.
 */
#define SSE_HELPER_V(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s)\
{\
    int lane;\
\
    for (lane = 0; lane < (num); lane++) {\
        d->elem(lane) = F(d->elem(lane), s->elem(lane), XMM0.elem(lane));\
    }\
}
|
1439 |
|
/*
 * SSE_HELPER_I(name, elem, num, F) defines a helper that, for each of
 * the `num` elements selected by the accessor `elem`, stores
 * F(d element, s element, bit) back into d, where `bit` is bit number
 * <element index> of the immediate.
 */
#define SSE_HELPER_I(name, elem, num, F)\
void glue(name, SUFFIX) (Reg *d, Reg *s, uint32_t imm)\
{\
    int lane;\
\
    for (lane = 0; lane < (num); lane++) {\
        d->elem(lane) = F(d->elem(lane), s->elem(lane),\
                          ((imm >> lane) & 1));\
    }\
}
|
1466 |
|
1467 /* SSE4.1 op helpers */ |
|
1468 #define FBLENDVB(d, s, m) (m & 0x80) ? s : d |
|
1469 #define FBLENDVPS(d, s, m) (m & 0x80000000) ? s : d |
|
1470 #define FBLENDVPD(d, s, m) (m & 0x8000000000000000LL) ? s : d |
|
1471 SSE_HELPER_V(helper_pblendvb, B, 16, FBLENDVB) |
|
1472 SSE_HELPER_V(helper_blendvps, L, 4, FBLENDVPS) |
|
1473 SSE_HELPER_V(helper_blendvpd, Q, 2, FBLENDVPD) |
|
1474 |
|
1475 void glue(helper_ptest, SUFFIX) (Reg *d, Reg *s) |
|
1476 { |
|
1477 uint64_t zf = (s->Q(0) & d->Q(0)) | (s->Q(1) & d->Q(1)); |
|
1478 uint64_t cf = (s->Q(0) & ~d->Q(0)) | (s->Q(1) & ~d->Q(1)); |
|
1479 |
|
1480 CC_SRC = (zf ? 0 : CC_Z) | (cf ? 0 : CC_C); |
|
1481 } |
|
1482 |
|
1483 #define SSE_HELPER_F(name, elem, num, F)\ |
|
1484 void glue(name, SUFFIX) (Reg *d, Reg *s)\ |
|
1485 {\ |
|
1486 d->elem(0) = F(0);\ |
|
1487 d->elem(1) = F(1);\ |
|
1488 if (num > 2) {\ |
|
1489 d->elem(2) = F(2);\ |
|
1490 d->elem(3) = F(3);\ |
|
1491 if (num > 4) {\ |
|
1492 d->elem(4) = F(4);\ |
|
1493 d->elem(5) = F(5);\ |
|
1494 d->elem(6) = F(6);\ |
|
1495 d->elem(7) = F(7);\ |
|
1496 }\ |
|
1497 }\ |
|
1498 } |
|
1499 |
|
1500 SSE_HELPER_F(helper_pmovsxbw, W, 8, (int8_t) s->B) |
|
1501 SSE_HELPER_F(helper_pmovsxbd, L, 4, (int8_t) s->B) |
|
1502 SSE_HELPER_F(helper_pmovsxbq, Q, 2, (int8_t) s->B) |
|
1503 SSE_HELPER_F(helper_pmovsxwd, L, 4, (int16_t) s->W) |
|
1504 SSE_HELPER_F(helper_pmovsxwq, Q, 2, (int16_t) s->W) |
|
1505 SSE_HELPER_F(helper_pmovsxdq, Q, 2, (int32_t) s->L) |
|
1506 SSE_HELPER_F(helper_pmovzxbw, W, 8, s->B) |
|
1507 SSE_HELPER_F(helper_pmovzxbd, L, 4, s->B) |
|
1508 SSE_HELPER_F(helper_pmovzxbq, Q, 2, s->B) |
|
1509 SSE_HELPER_F(helper_pmovzxwd, L, 4, s->W) |
|
1510 SSE_HELPER_F(helper_pmovzxwq, Q, 2, s->W) |
|
1511 SSE_HELPER_F(helper_pmovzxdq, Q, 2, s->L) |
|
1512 |
|
1513 void glue(helper_pmuldq, SUFFIX) (Reg *d, Reg *s) |
|
1514 { |
|
1515 d->Q(0) = (int64_t) (int32_t) d->L(0) * (int32_t) s->L(0); |
|
1516 d->Q(1) = (int64_t) (int32_t) d->L(2) * (int32_t) s->L(2); |
|
1517 } |
|
1518 |
|
1519 #define FCMPEQQ(d, s) d == s ? -1 : 0 |
|
1520 SSE_HELPER_Q(helper_pcmpeqq, FCMPEQQ) |
|
1521 |
|
1522 void glue(helper_packusdw, SUFFIX) (Reg *d, Reg *s) |
|
1523 { |
|
1524 d->W(0) = satuw((int32_t) d->L(0)); |
|
1525 d->W(1) = satuw((int32_t) d->L(1)); |
|
1526 d->W(2) = satuw((int32_t) d->L(2)); |
|
1527 d->W(3) = satuw((int32_t) d->L(3)); |
|
1528 d->W(4) = satuw((int32_t) s->L(0)); |
|
1529 d->W(5) = satuw((int32_t) s->L(1)); |
|
1530 d->W(6) = satuw((int32_t) s->L(2)); |
|
1531 d->W(7) = satuw((int32_t) s->L(3)); |
|
1532 } |
|
1533 |
|
1534 #define FMINSB(d, s) MIN((int8_t) d, (int8_t) s) |
|
1535 #define FMINSD(d, s) MIN((int32_t) d, (int32_t) s) |
|
1536 #define FMAXSB(d, s) MAX((int8_t) d, (int8_t) s) |
|
1537 #define FMAXSD(d, s) MAX((int32_t) d, (int32_t) s) |
|
1538 SSE_HELPER_B(helper_pminsb, FMINSB) |
|
1539 SSE_HELPER_L(helper_pminsd, FMINSD) |
|
1540 SSE_HELPER_W(helper_pminuw, MIN) |
|
1541 SSE_HELPER_L(helper_pminud, MIN) |
|
1542 SSE_HELPER_B(helper_pmaxsb, FMAXSB) |
|
1543 SSE_HELPER_L(helper_pmaxsd, FMAXSD) |
|
1544 SSE_HELPER_W(helper_pmaxuw, MAX) |
|
1545 SSE_HELPER_L(helper_pmaxud, MAX) |
|
1546 |
|
1547 #define FMULLD(d, s) (int32_t) d * (int32_t) s |
|
1548 SSE_HELPER_L(helper_pmulld, FMULLD) |
|
1549 |
|
1550 void glue(helper_phminposuw, SUFFIX) (Reg *d, Reg *s) |
|
1551 { |
|
1552 int idx = 0; |
|
1553 |
|
1554 if (s->W(1) < s->W(idx)) |
|
1555 idx = 1; |
|
1556 if (s->W(2) < s->W(idx)) |
|
1557 idx = 2; |
|
1558 if (s->W(3) < s->W(idx)) |
|
1559 idx = 3; |
|
1560 if (s->W(4) < s->W(idx)) |
|
1561 idx = 4; |
|
1562 if (s->W(5) < s->W(idx)) |
|
1563 idx = 5; |
|
1564 if (s->W(6) < s->W(idx)) |
|
1565 idx = 6; |
|
1566 if (s->W(7) < s->W(idx)) |
|
1567 idx = 7; |
|
1568 |
|
1569 d->Q(1) = 0; |
|
1570 d->L(1) = 0; |
|
1571 d->W(1) = idx; |
|
1572 d->W(0) = s->W(idx); |
|
1573 } |
|
1574 |
|
1575 void glue(helper_roundps, SUFFIX) (Reg *d, Reg *s, uint32_t mode) |
|
1576 { |
|
1577 signed char prev_rounding_mode; |
|
1578 |
|
1579 prev_rounding_mode = env->sse_status.float_rounding_mode; |
|
1580 if (!(mode & (1 << 2))) |
|
1581 switch (mode & 3) { |
|
1582 case 0: |
|
1583 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); |
|
1584 break; |
|
1585 case 1: |
|
1586 set_float_rounding_mode(float_round_down, &env->sse_status); |
|
1587 break; |
|
1588 case 2: |
|
1589 set_float_rounding_mode(float_round_up, &env->sse_status); |
|
1590 break; |
|
1591 case 3: |
|
1592 set_float_rounding_mode(float_round_to_zero, &env->sse_status); |
|
1593 break; |
|
1594 } |
|
1595 |
|
1596 d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); |
|
1597 d->L(1) = float64_round_to_int(s->L(1), &env->sse_status); |
|
1598 d->L(2) = float64_round_to_int(s->L(2), &env->sse_status); |
|
1599 d->L(3) = float64_round_to_int(s->L(3), &env->sse_status); |
|
1600 |
|
1601 #if 0 /* TODO */ |
|
1602 if (mode & (1 << 3)) |
|
1603 set_float_exception_flags( |
|
1604 get_float_exception_flags(&env->sse_status) & |
|
1605 ~float_flag_inexact, |
|
1606 &env->sse_status); |
|
1607 #endif |
|
1608 env->sse_status.float_rounding_mode = prev_rounding_mode; |
|
1609 } |
|
1610 |
|
1611 void glue(helper_roundpd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) |
|
1612 { |
|
1613 signed char prev_rounding_mode; |
|
1614 |
|
1615 prev_rounding_mode = env->sse_status.float_rounding_mode; |
|
1616 if (!(mode & (1 << 2))) |
|
1617 switch (mode & 3) { |
|
1618 case 0: |
|
1619 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); |
|
1620 break; |
|
1621 case 1: |
|
1622 set_float_rounding_mode(float_round_down, &env->sse_status); |
|
1623 break; |
|
1624 case 2: |
|
1625 set_float_rounding_mode(float_round_up, &env->sse_status); |
|
1626 break; |
|
1627 case 3: |
|
1628 set_float_rounding_mode(float_round_to_zero, &env->sse_status); |
|
1629 break; |
|
1630 } |
|
1631 |
|
1632 d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); |
|
1633 d->Q(1) = float64_round_to_int(s->Q(1), &env->sse_status); |
|
1634 |
|
1635 #if 0 /* TODO */ |
|
1636 if (mode & (1 << 3)) |
|
1637 set_float_exception_flags( |
|
1638 get_float_exception_flags(&env->sse_status) & |
|
1639 ~float_flag_inexact, |
|
1640 &env->sse_status); |
|
1641 #endif |
|
1642 env->sse_status.float_rounding_mode = prev_rounding_mode; |
|
1643 } |
|
1644 |
|
1645 void glue(helper_roundss, SUFFIX) (Reg *d, Reg *s, uint32_t mode) |
|
1646 { |
|
1647 signed char prev_rounding_mode; |
|
1648 |
|
1649 prev_rounding_mode = env->sse_status.float_rounding_mode; |
|
1650 if (!(mode & (1 << 2))) |
|
1651 switch (mode & 3) { |
|
1652 case 0: |
|
1653 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); |
|
1654 break; |
|
1655 case 1: |
|
1656 set_float_rounding_mode(float_round_down, &env->sse_status); |
|
1657 break; |
|
1658 case 2: |
|
1659 set_float_rounding_mode(float_round_up, &env->sse_status); |
|
1660 break; |
|
1661 case 3: |
|
1662 set_float_rounding_mode(float_round_to_zero, &env->sse_status); |
|
1663 break; |
|
1664 } |
|
1665 |
|
1666 d->L(0) = float64_round_to_int(s->L(0), &env->sse_status); |
|
1667 |
|
1668 #if 0 /* TODO */ |
|
1669 if (mode & (1 << 3)) |
|
1670 set_float_exception_flags( |
|
1671 get_float_exception_flags(&env->sse_status) & |
|
1672 ~float_flag_inexact, |
|
1673 &env->sse_status); |
|
1674 #endif |
|
1675 env->sse_status.float_rounding_mode = prev_rounding_mode; |
|
1676 } |
|
1677 |
|
1678 void glue(helper_roundsd, SUFFIX) (Reg *d, Reg *s, uint32_t mode) |
|
1679 { |
|
1680 signed char prev_rounding_mode; |
|
1681 |
|
1682 prev_rounding_mode = env->sse_status.float_rounding_mode; |
|
1683 if (!(mode & (1 << 2))) |
|
1684 switch (mode & 3) { |
|
1685 case 0: |
|
1686 set_float_rounding_mode(float_round_nearest_even, &env->sse_status); |
|
1687 break; |
|
1688 case 1: |
|
1689 set_float_rounding_mode(float_round_down, &env->sse_status); |
|
1690 break; |
|
1691 case 2: |
|
1692 set_float_rounding_mode(float_round_up, &env->sse_status); |
|
1693 break; |
|
1694 case 3: |
|
1695 set_float_rounding_mode(float_round_to_zero, &env->sse_status); |
|
1696 break; |
|
1697 } |
|
1698 |
|
1699 d->Q(0) = float64_round_to_int(s->Q(0), &env->sse_status); |
|
1700 |
|
1701 #if 0 /* TODO */ |
|
1702 if (mode & (1 << 3)) |
|
1703 set_float_exception_flags( |
|
1704 get_float_exception_flags(&env->sse_status) & |
|
1705 ~float_flag_inexact, |
|
1706 &env->sse_status); |
|
1707 #endif |
|
1708 env->sse_status.float_rounding_mode = prev_rounding_mode; |
|
1709 } |
|
1710 |
|
1711 #define FBLENDP(d, s, m) m ? s : d |
|
1712 SSE_HELPER_I(helper_blendps, L, 4, FBLENDP) |
|
1713 SSE_HELPER_I(helper_blendpd, Q, 2, FBLENDP) |
|
1714 SSE_HELPER_I(helper_pblendw, W, 8, FBLENDP) |
|
1715 |
|
1716 void glue(helper_dpps, SUFFIX) (Reg *d, Reg *s, uint32_t mask) |
|
1717 { |
|
1718 float32 iresult = 0 /*float32_zero*/; |
|
1719 |
|
1720 if (mask & (1 << 4)) |
|
1721 iresult = float32_add(iresult, |
|
1722 float32_mul(d->L(0), s->L(0), &env->sse_status), |
|
1723 &env->sse_status); |
|
1724 if (mask & (1 << 5)) |
|
1725 iresult = float32_add(iresult, |
|
1726 float32_mul(d->L(1), s->L(1), &env->sse_status), |
|
1727 &env->sse_status); |
|
1728 if (mask & (1 << 6)) |
|
1729 iresult = float32_add(iresult, |
|
1730 float32_mul(d->L(2), s->L(2), &env->sse_status), |
|
1731 &env->sse_status); |
|
1732 if (mask & (1 << 7)) |
|
1733 iresult = float32_add(iresult, |
|
1734 float32_mul(d->L(3), s->L(3), &env->sse_status), |
|
1735 &env->sse_status); |
|
1736 d->L(0) = (mask & (1 << 0)) ? iresult : 0 /*float32_zero*/; |
|
1737 d->L(1) = (mask & (1 << 1)) ? iresult : 0 /*float32_zero*/; |
|
1738 d->L(2) = (mask & (1 << 2)) ? iresult : 0 /*float32_zero*/; |
|
1739 d->L(3) = (mask & (1 << 3)) ? iresult : 0 /*float32_zero*/; |
|
1740 } |
|
1741 |
|
1742 void glue(helper_dppd, SUFFIX) (Reg *d, Reg *s, uint32_t mask) |
|
1743 { |
|
1744 float64 iresult = 0 /*float64_zero*/; |
|
1745 |
|
1746 if (mask & (1 << 4)) |
|
1747 iresult = float64_add(iresult, |
|
1748 float64_mul(d->Q(0), s->Q(0), &env->sse_status), |
|
1749 &env->sse_status); |
|
1750 if (mask & (1 << 5)) |
|
1751 iresult = float64_add(iresult, |
|
1752 float64_mul(d->Q(1), s->Q(1), &env->sse_status), |
|
1753 &env->sse_status); |
|
1754 d->Q(0) = (mask & (1 << 0)) ? iresult : 0 /*float64_zero*/; |
|
1755 d->Q(1) = (mask & (1 << 1)) ? iresult : 0 /*float64_zero*/; |
|
1756 } |
|
1757 |
|
1758 void glue(helper_mpsadbw, SUFFIX) (Reg *d, Reg *s, uint32_t offset) |
|
1759 { |
|
1760 int s0 = (offset & 3) << 2; |
|
1761 int d0 = (offset & 4) << 0; |
|
1762 int i; |
|
1763 Reg r; |
|
1764 |
|
1765 for (i = 0; i < 8; i++, d0++) { |
|
1766 r.W(i) = 0; |
|
1767 r.W(i) += abs1(d->B(d0 + 0) - s->B(s0 + 0)); |
|
1768 r.W(i) += abs1(d->B(d0 + 1) - s->B(s0 + 1)); |
|
1769 r.W(i) += abs1(d->B(d0 + 2) - s->B(s0 + 2)); |
|
1770 r.W(i) += abs1(d->B(d0 + 3) - s->B(s0 + 3)); |
|
1771 } |
|
1772 |
|
1773 *d = r; |
|
1774 } |
|
1775 |
|
1776 /* SSE4.2 op helpers */ |
|
1777 /* it's unclear whether signed or unsigned */ |
|
1778 #define FCMPGTQ(d, s) d > s ? -1 : 0 |
|
1779 SSE_HELPER_Q(helper_pcmpgtq, FCMPGTQ) |
|
1780 |
|
1781 static inline int pcmp_elen(int reg, uint32_t ctrl) |
|
1782 { |
|
1783 int val; |
|
1784 |
|
1785 /* Presence of REX.W is indicated by a bit higher than 7 set */ |
|
1786 if (ctrl >> 8) |
|
1787 val = abs1((int64_t) env->regs[reg]); |
|
1788 else |
|
1789 val = abs1((int32_t) env->regs[reg]); |
|
1790 |
|
1791 if (ctrl & 1) { |
|
1792 if (val > 8) |
|
1793 return 8; |
|
1794 } else |
|
1795 if (val > 16) |
|
1796 return 16; |
|
1797 |
|
1798 return val; |
|
1799 } |
|
1800 |
|
1801 static inline int pcmp_ilen(Reg *r, uint8_t ctrl) |
|
1802 { |
|
1803 int val = 0; |
|
1804 |
|
1805 if (ctrl & 1) { |
|
1806 while (val < 8 && r->W(val)) |
|
1807 val++; |
|
1808 } else |
|
1809 while (val < 16 && r->B(val)) |
|
1810 val++; |
|
1811 |
|
1812 return val; |
|
1813 } |
|
1814 |
|
1815 static inline int pcmp_val(Reg *r, uint8_t ctrl, int i) |
|
1816 { |
|
1817 switch ((ctrl >> 0) & 3) { |
|
1818 case 0: |
|
1819 return r->B(i); |
|
1820 case 1: |
|
1821 return r->W(i); |
|
1822 case 2: |
|
1823 return (int8_t) r->B(i); |
|
1824 case 3: |
|
1825 default: |
|
1826 return (int16_t) r->W(i); |
|
1827 } |
|
1828 } |
|
1829 |
|
1830 static inline unsigned pcmpxstrx(Reg *d, Reg *s, |
|
1831 int8_t ctrl, int valids, int validd) |
|
1832 { |
|
1833 unsigned int res = 0; |
|
1834 int v; |
|
1835 int j, i; |
|
1836 int upper = (ctrl & 1) ? 7 : 15; |
|
1837 |
|
1838 valids--; |
|
1839 validd--; |
|
1840 |
|
1841 CC_SRC = (valids < upper ? CC_Z : 0) | (validd < upper ? CC_S : 0); |
|
1842 |
|
1843 switch ((ctrl >> 2) & 3) { |
|
1844 case 0: |
|
1845 for (j = valids; j >= 0; j--) { |
|
1846 res <<= 1; |
|
1847 v = pcmp_val(s, ctrl, j); |
|
1848 for (i = validd; i >= 0; i--) |
|
1849 res |= (v == pcmp_val(d, ctrl, i)); |
|
1850 } |
|
1851 break; |
|
1852 case 1: |
|
1853 for (j = valids; j >= 0; j--) { |
|
1854 res <<= 1; |
|
1855 v = pcmp_val(s, ctrl, j); |
|
1856 for (i = ((validd - 1) | 1); i >= 0; i -= 2) |
|
1857 res |= (pcmp_val(d, ctrl, i - 0) <= v && |
|
1858 pcmp_val(d, ctrl, i - 1) >= v); |
|
1859 } |
|
1860 break; |
|
1861 case 2: |
|
1862 res = (2 << (upper - MAX(valids, validd))) - 1; |
|
1863 res <<= MAX(valids, validd) - MIN(valids, validd); |
|
1864 for (i = MIN(valids, validd); i >= 0; i--) { |
|
1865 res <<= 1; |
|
1866 v = pcmp_val(s, ctrl, i); |
|
1867 res |= (v == pcmp_val(d, ctrl, i)); |
|
1868 } |
|
1869 break; |
|
1870 case 3: |
|
1871 for (j = valids - validd; j >= 0; j--) { |
|
1872 res <<= 1; |
|
1873 res |= 1; |
|
1874 for (i = MIN(upper - j, validd); i >= 0; i--) |
|
1875 res &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i)); |
|
1876 } |
|
1877 break; |
|
1878 } |
|
1879 |
|
1880 switch ((ctrl >> 4) & 3) { |
|
1881 case 1: |
|
1882 res ^= (2 << upper) - 1; |
|
1883 break; |
|
1884 case 3: |
|
1885 res ^= (2 << valids) - 1; |
|
1886 break; |
|
1887 } |
|
1888 |
|
1889 if (res) |
|
1890 CC_SRC |= CC_C; |
|
1891 if (res & 1) |
|
1892 CC_SRC |= CC_O; |
|
1893 |
|
1894 return res; |
|
1895 } |
|
1896 |
|
/*
 * Return the 1-based position of the most significant set bit of val
 * (1 for bit 0, 32 for bit 31 with a 32-bit unsigned int).  Returns 1
 * when val is 0.
 */
static inline int rffs1(unsigned int val)
{
    int pos = 1;
    int step = sizeof(val) * 4;

    /* Binary search: halve the window each round. */
    while (step) {
        if (val >> step) {
            val >>= step;
            pos += step;
        }
        step /= 2;
    }

    return pos;
}
|
1909 |
|
/*
 * Return the 1-based position of the least significant set bit of val
 * (1 for bit 0, 32 for bit 31 with a 32-bit unsigned int).  Returns 1
 * when val is 0, matching rffs1(); callers are expected to pass a
 * non-zero value.
 *
 * The previous implementation shifted left whenever the low half still
 * contained a set bit, which yielded e.g. 32 for val == 1.
 */
static inline int ffs1(unsigned int val)
{
    int ret = 1, hi;

    if (!val)
        return 1;

    /* Binary search: skip a block of `hi` low bits when it is all zero. */
    for (hi = sizeof(val) * 4; hi; hi /= 2)
        if (!(val & ((1u << hi) - 1))) {
            val >>= hi;
            ret += hi;
        }

    return ret;
}
|
1922 |
|
1923 void glue(helper_pcmpestri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) |
|
1924 { |
|
1925 unsigned int res = pcmpxstrx(d, s, ctrl, |
|
1926 pcmp_elen(R_EDX, ctrl), |
|
1927 pcmp_elen(R_EAX, ctrl)); |
|
1928 |
|
1929 if (res) |
|
1930 env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; |
|
1931 else |
|
1932 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); |
|
1933 } |
|
1934 |
|
1935 void glue(helper_pcmpestrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) |
|
1936 { |
|
1937 int i; |
|
1938 unsigned int res = pcmpxstrx(d, s, ctrl, |
|
1939 pcmp_elen(R_EDX, ctrl), |
|
1940 pcmp_elen(R_EAX, ctrl)); |
|
1941 |
|
1942 if ((ctrl >> 6) & 1) { |
|
1943 if (ctrl & 1) |
|
1944 for (i = 0; i <= 8; i--, res >>= 1) |
|
1945 d->W(i) = (res & 1) ? ~0 : 0; |
|
1946 else |
|
1947 for (i = 0; i <= 16; i--, res >>= 1) |
|
1948 d->B(i) = (res & 1) ? ~0 : 0; |
|
1949 } else { |
|
1950 d->Q(1) = 0; |
|
1951 d->Q(0) = res; |
|
1952 } |
|
1953 } |
|
1954 |
|
1955 void glue(helper_pcmpistri, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) |
|
1956 { |
|
1957 unsigned int res = pcmpxstrx(d, s, ctrl, |
|
1958 pcmp_ilen(s, ctrl), |
|
1959 pcmp_ilen(d, ctrl)); |
|
1960 |
|
1961 if (res) |
|
1962 env->regs[R_ECX] = ((ctrl & (1 << 6)) ? rffs1 : ffs1)(res) - 1; |
|
1963 else |
|
1964 env->regs[R_ECX] = 16 >> (ctrl & (1 << 0)); |
|
1965 } |
|
1966 |
|
1967 void glue(helper_pcmpistrm, SUFFIX) (Reg *d, Reg *s, uint32_t ctrl) |
|
1968 { |
|
1969 int i; |
|
1970 unsigned int res = pcmpxstrx(d, s, ctrl, |
|
1971 pcmp_ilen(s, ctrl), |
|
1972 pcmp_ilen(d, ctrl)); |
|
1973 |
|
1974 if ((ctrl >> 6) & 1) { |
|
1975 if (ctrl & 1) |
|
1976 for (i = 0; i <= 8; i--, res >>= 1) |
|
1977 d->W(i) = (res & 1) ? ~0 : 0; |
|
1978 else |
|
1979 for (i = 0; i <= 16; i--, res >>= 1) |
|
1980 d->B(i) = (res & 1) ? ~0 : 0; |
|
1981 } else { |
|
1982 d->Q(1) = 0; |
|
1983 d->Q(0) = res; |
|
1984 } |
|
1985 } |
|
1986 |
|
1987 #define CRCPOLY 0x1edc6f41 |
|
1988 #define CRCPOLY_BITREV 0x82f63b78 |
|
1989 target_ulong helper_crc32(uint32_t crc1, target_ulong msg, uint32_t len) |
|
1990 { |
|
1991 target_ulong crc = (msg & ((target_ulong) -1 >> |
|
1992 (TARGET_LONG_BITS - len))) ^ crc1; |
|
1993 |
|
1994 while (len--) |
|
1995 crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_BITREV : 0); |
|
1996 |
|
1997 return crc; |
|
1998 } |
|
1999 |
|
2000 #define POPMASK(i) ((target_ulong) -1 / ((1LL << (1 << i)) + 1)) |
|
2001 #define POPCOUNT(n, i) (n & POPMASK(i)) + ((n >> (1 << i)) & POPMASK(i)) |
|
2002 target_ulong helper_popcnt(target_ulong n, uint32_t type) |
|
2003 { |
|
2004 CC_SRC = n ? 0 : CC_Z; |
|
2005 |
|
2006 n = POPCOUNT(n, 0); |
|
2007 n = POPCOUNT(n, 1); |
|
2008 n = POPCOUNT(n, 2); |
|
2009 n = POPCOUNT(n, 3); |
|
2010 if (type == 1) |
|
2011 return n & 0xff; |
|
2012 |
|
2013 n = POPCOUNT(n, 4); |
|
2014 #ifndef TARGET_X86_64 |
|
2015 return n; |
|
2016 #else |
|
2017 if (type == 2) |
|
2018 return n & 0xff; |
|
2019 |
|
2020 return POPCOUNT(n, 5); |
|
2021 #endif |
|
2022 } |
|
/* Closes a conditional opened earlier in the file (not visible from here;
 * NOTE(review): presumably the SHIFT == 1 only section -- confirm). */
2023 #endif |

2024 |

/* Remove the per-instantiation macros set up at the top of the file
 * (Reg, XMM_ONLY, B, W, L, Q, SUFFIX) together with SHIFT itself, so
 * that none of them leak past this file.  NOTE(review): presumably this
 * lets the includer define SHIFT again and include the file a second
 * time -- confirm against the including source. */
2025 #undef SHIFT |

2026 #undef XMM_ONLY |

2027 #undef Reg |

2028 #undef B |

2029 #undef W |

2030 #undef L |

2031 #undef Q |

2032 #undef SUFFIX |