|
1 |
|
2 /*============================================================================ |
|
3 |
|
4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic |
|
5 Package, Release 2b. |
|
6 |
|
7 Written by John R. Hauser. This work was made possible in part by the |
|
8 International Computer Science Institute, located at Suite 600, 1947 Center |
|
9 Street, Berkeley, California 94704. Funding was partially provided by the |
|
10 National Science Foundation under grant MIP-9311980. The original version |
|
11 of this code was written as part of a project to build a fixed-point vector |
|
12 processor in collaboration with the University of California at Berkeley, |
|
13 overseen by Profs. Nelson Morgan and John Wawrzynek. More information |
|
14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/ |
|
15 arithmetic/SoftFloat.html'. |
|
16 |
|
17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort has |
|
18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES |
|
19 RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS |
|
20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES, |
|
21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE |
|
22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE |
|
23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR |
|
24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE. |
|
25 |
|
26 Derivative works are acceptable, even for commercial purposes, so long as |
|
27 (1) the source code for the derivative work includes prominent notice that |
|
28 the work is derivative, and (2) the source code includes prominent notice with |
|
29 these four paragraphs for those parts of this code that are retained. |
|
30 |
|
31 =============================================================================*/ |
|
32 |
|
33 /* FIXME: Flush-To-Zero only effects results. Denormal inputs should also |
|
34 be flushed to zero. */ |
|
35 #include "softfloat.h" |
|
36 |
|
37 /*---------------------------------------------------------------------------- |
|
38 | Primitive arithmetic functions, including multi-word arithmetic, and |
|
39 | division and square root approximations. (Can be specialized to target if |
|
40 | desired.) |
|
41 *----------------------------------------------------------------------------*/ |
|
42 #include "softfloat-macros.h" |
|
43 |
|
44 /*---------------------------------------------------------------------------- |
|
45 | Functions and definitions to determine: (1) whether tininess for underflow |
|
46 | is detected before or after rounding by default, (2) what (if anything) |
|
47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished |
|
48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs |
|
49 | are propagated from function inputs to output. These details are target- |
|
50 | specific. |
|
51 *----------------------------------------------------------------------------*/ |
|
52 #include "softfloat-specialize.h" |
|
53 |
|
54 void set_float_rounding_mode(int val STATUS_PARAM) |
|
55 { |
|
56 STATUS(float_rounding_mode) = val; |
|
57 } |
|
58 |
|
59 void set_float_exception_flags(int val STATUS_PARAM) |
|
60 { |
|
61 STATUS(float_exception_flags) = val; |
|
62 } |
|
63 |
|
64 #ifdef FLOATX80 |
|
65 void set_floatx80_rounding_precision(int val STATUS_PARAM) |
|
66 { |
|
67 STATUS(floatx80_rounding_precision) = val; |
|
68 } |
|
69 #endif |
|
70 |
|
71 /*---------------------------------------------------------------------------- |
|
72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6 |
|
73 | and 7, and returns the properly rounded 32-bit integer corresponding to the |
|
74 | input. If `zSign' is 1, the input is negated before being converted to an |
|
75 | integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input |
|
76 | is simply rounded to an integer, with the inexact exception raised if the |
|
77 | input cannot be represented exactly as an integer. However, if the fixed- |
|
78 | point input is too large, the invalid exception is raised and the largest |
|
79 | positive or negative integer is returned. |
|
80 *----------------------------------------------------------------------------*/ |
|
81 |
|
82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM) |
|
83 { |
|
84 int8 roundingMode; |
|
85 flag roundNearestEven; |
|
86 int8 roundIncrement, roundBits; |
|
87 int32 z; |
|
88 |
|
89 roundingMode = STATUS(float_rounding_mode); |
|
90 roundNearestEven = ( roundingMode == float_round_nearest_even ); |
|
91 roundIncrement = 0x40; |
|
92 if ( ! roundNearestEven ) { |
|
93 if ( roundingMode == float_round_to_zero ) { |
|
94 roundIncrement = 0; |
|
95 } |
|
96 else { |
|
97 roundIncrement = 0x7F; |
|
98 if ( zSign ) { |
|
99 if ( roundingMode == float_round_up ) roundIncrement = 0; |
|
100 } |
|
101 else { |
|
102 if ( roundingMode == float_round_down ) roundIncrement = 0; |
|
103 } |
|
104 } |
|
105 } |
|
106 roundBits = absZ & 0x7F; |
|
107 absZ = ( absZ + roundIncrement )>>7; |
|
108 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); |
|
109 z = absZ; |
|
110 if ( zSign ) z = - z; |
|
111 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) { |
|
112 float_raise( float_flag_invalid STATUS_VAR); |
|
113 return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; |
|
114 } |
|
115 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
116 return z; |
|
117 |
|
118 } |
|
119 |
|
120 /*---------------------------------------------------------------------------- |
|
121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and |
|
122 | `absZ1', with binary point between bits 63 and 64 (between the input words), |
|
123 | and returns the properly rounded 64-bit integer corresponding to the input. |
|
124 | If `zSign' is 1, the input is negated before being converted to an integer. |
|
125 | Ordinarily, the fixed-point input is simply rounded to an integer, with |
|
126 | the inexact exception raised if the input cannot be represented exactly as |
|
127 | an integer. However, if the fixed-point input is too large, the invalid |
|
128 | exception is raised and the largest positive or negative integer is |
|
129 | returned. |
|
130 *----------------------------------------------------------------------------*/ |
|
131 |
|
132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM) |
|
133 { |
|
134 int8 roundingMode; |
|
135 flag roundNearestEven, increment; |
|
136 int64 z; |
|
137 |
|
138 roundingMode = STATUS(float_rounding_mode); |
|
139 roundNearestEven = ( roundingMode == float_round_nearest_even ); |
|
140 increment = ( (sbits64) absZ1 < 0 ); |
|
141 if ( ! roundNearestEven ) { |
|
142 if ( roundingMode == float_round_to_zero ) { |
|
143 increment = 0; |
|
144 } |
|
145 else { |
|
146 if ( zSign ) { |
|
147 increment = ( roundingMode == float_round_down ) && absZ1; |
|
148 } |
|
149 else { |
|
150 increment = ( roundingMode == float_round_up ) && absZ1; |
|
151 } |
|
152 } |
|
153 } |
|
154 if ( increment ) { |
|
155 ++absZ0; |
|
156 if ( absZ0 == 0 ) goto overflow; |
|
157 absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven ); |
|
158 } |
|
159 z = absZ0; |
|
160 if ( zSign ) z = - z; |
|
161 if ( z && ( ( z < 0 ) ^ zSign ) ) { |
|
162 overflow: |
|
163 float_raise( float_flag_invalid STATUS_VAR); |
|
164 return |
|
165 zSign ? (sbits64) LIT64( 0x8000000000000000 ) |
|
166 : LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
167 } |
|
168 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
169 return z; |
|
170 |
|
171 } |
|
172 |
|
173 /*---------------------------------------------------------------------------- |
|
174 | Returns the fraction bits of the single-precision floating-point value `a'. |
|
175 *----------------------------------------------------------------------------*/ |
|
176 |
|
177 INLINE bits32 extractFloat32Frac( float32 a ) |
|
178 { |
|
179 |
|
180 return float32_val(a) & 0x007FFFFF; |
|
181 |
|
182 } |
|
183 |
|
184 /*---------------------------------------------------------------------------- |
|
185 | Returns the exponent bits of the single-precision floating-point value `a'. |
|
186 *----------------------------------------------------------------------------*/ |
|
187 |
|
188 INLINE int16 extractFloat32Exp( float32 a ) |
|
189 { |
|
190 |
|
191 return ( float32_val(a)>>23 ) & 0xFF; |
|
192 |
|
193 } |
|
194 |
|
195 /*---------------------------------------------------------------------------- |
|
196 | Returns the sign bit of the single-precision floating-point value `a'. |
|
197 *----------------------------------------------------------------------------*/ |
|
198 |
|
199 INLINE flag extractFloat32Sign( float32 a ) |
|
200 { |
|
201 |
|
202 return float32_val(a)>>31; |
|
203 |
|
204 } |
|
205 |
|
206 /*---------------------------------------------------------------------------- |
|
207 | Normalizes the subnormal single-precision floating-point value represented |
|
208 | by the denormalized significand `aSig'. The normalized exponent and |
|
209 | significand are stored at the locations pointed to by `zExpPtr' and |
|
210 | `zSigPtr', respectively. |
|
211 *----------------------------------------------------------------------------*/ |
|
212 |
|
213 static void |
|
214 normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr ) |
|
215 { |
|
216 int8 shiftCount; |
|
217 |
|
218 shiftCount = countLeadingZeros32( aSig ) - 8; |
|
219 *zSigPtr = aSig<<shiftCount; |
|
220 *zExpPtr = 1 - shiftCount; |
|
221 |
|
222 } |
|
223 |
|
224 /*---------------------------------------------------------------------------- |
|
225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a |
|
226 | single-precision floating-point value, returning the result. After being |
|
227 | shifted into the proper positions, the three fields are simply added |
|
228 | together to form the result. This means that any integer portion of `zSig' |
|
229 | will be added into the exponent. Since a properly normalized significand |
|
230 | will have an integer portion equal to 1, the `zExp' input should be 1 less |
|
231 | than the desired result exponent whenever `zSig' is a complete, normalized |
|
232 | significand. |
|
233 *----------------------------------------------------------------------------*/ |
|
234 |
|
235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig ) |
|
236 { |
|
237 |
|
238 return make_float32( |
|
239 ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig); |
|
240 |
|
241 } |
|
242 |
|
243 /*---------------------------------------------------------------------------- |
|
244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
|
245 | and significand `zSig', and returns the proper single-precision floating- |
|
246 | point value corresponding to the abstract input. Ordinarily, the abstract |
|
247 | value is simply rounded and packed into the single-precision format, with |
|
248 | the inexact exception raised if the abstract input cannot be represented |
|
249 | exactly. However, if the abstract value is too large, the overflow and |
|
250 | inexact exceptions are raised and an infinity or maximal finite value is |
|
251 | returned. If the abstract value is too small, the input value is rounded to |
|
252 | a subnormal number, and the underflow and inexact exceptions are raised if |
|
253 | the abstract input cannot be represented exactly as a subnormal single- |
|
254 | precision floating-point number. |
|
255 | The input significand `zSig' has its binary point between bits 30 |
|
256 | and 29, which is 7 bits to the left of the usual location. This shifted |
|
257 | significand must be normalized or smaller. If `zSig' is not normalized, |
|
258 | `zExp' must be 0; in that case, the result returned is a subnormal number, |
|
259 | and it must not require rounding. In the usual case that `zSig' is |
|
260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. |
|
261 | The handling of underflow and overflow follows the IEC/IEEE Standard for |
|
262 | Binary Floating-Point Arithmetic. |
|
263 *----------------------------------------------------------------------------*/ |
|
264 |
|
265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) |
|
266 { |
|
267 int8 roundingMode; |
|
268 flag roundNearestEven; |
|
269 int8 roundIncrement, roundBits; |
|
270 flag isTiny; |
|
271 |
|
272 roundingMode = STATUS(float_rounding_mode); |
|
273 roundNearestEven = ( roundingMode == float_round_nearest_even ); |
|
274 roundIncrement = 0x40; |
|
275 if ( ! roundNearestEven ) { |
|
276 if ( roundingMode == float_round_to_zero ) { |
|
277 roundIncrement = 0; |
|
278 } |
|
279 else { |
|
280 roundIncrement = 0x7F; |
|
281 if ( zSign ) { |
|
282 if ( roundingMode == float_round_up ) roundIncrement = 0; |
|
283 } |
|
284 else { |
|
285 if ( roundingMode == float_round_down ) roundIncrement = 0; |
|
286 } |
|
287 } |
|
288 } |
|
289 roundBits = zSig & 0x7F; |
|
290 if ( 0xFD <= (bits16) zExp ) { |
|
291 if ( ( 0xFD < zExp ) |
|
292 || ( ( zExp == 0xFD ) |
|
293 && ( (sbits32) ( zSig + roundIncrement ) < 0 ) ) |
|
294 ) { |
|
295 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); |
|
296 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 )); |
|
297 } |
|
298 if ( zExp < 0 ) { |
|
299 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); |
|
300 isTiny = |
|
301 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) |
|
302 || ( zExp < -1 ) |
|
303 || ( zSig + roundIncrement < 0x80000000 ); |
|
304 shift32RightJamming( zSig, - zExp, &zSig ); |
|
305 zExp = 0; |
|
306 roundBits = zSig & 0x7F; |
|
307 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); |
|
308 } |
|
309 } |
|
310 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
311 zSig = ( zSig + roundIncrement )>>7; |
|
312 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven ); |
|
313 if ( zSig == 0 ) zExp = 0; |
|
314 return packFloat32( zSign, zExp, zSig ); |
|
315 |
|
316 } |
|
317 |
|
318 /*---------------------------------------------------------------------------- |
|
319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
|
320 | and significand `zSig', and returns the proper single-precision floating- |
|
321 | point value corresponding to the abstract input. This routine is just like |
|
322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized. |
|
323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' |
|
324 | floating-point exponent. |
|
325 *----------------------------------------------------------------------------*/ |
|
326 |
|
327 static float32 |
|
328 normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM) |
|
329 { |
|
330 int8 shiftCount; |
|
331 |
|
332 shiftCount = countLeadingZeros32( zSig ) - 1; |
|
333 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); |
|
334 |
|
335 } |
|
336 |
|
337 /*---------------------------------------------------------------------------- |
|
338 | Returns the fraction bits of the double-precision floating-point value `a'. |
|
339 *----------------------------------------------------------------------------*/ |
|
340 |
|
341 INLINE bits64 extractFloat64Frac( float64 a ) |
|
342 { |
|
343 |
|
344 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF ); |
|
345 |
|
346 } |
|
347 |
|
348 /*---------------------------------------------------------------------------- |
|
349 | Returns the exponent bits of the double-precision floating-point value `a'. |
|
350 *----------------------------------------------------------------------------*/ |
|
351 |
|
352 INLINE int16 extractFloat64Exp( float64 a ) |
|
353 { |
|
354 |
|
355 return ( float64_val(a)>>52 ) & 0x7FF; |
|
356 |
|
357 } |
|
358 |
|
359 /*---------------------------------------------------------------------------- |
|
360 | Returns the sign bit of the double-precision floating-point value `a'. |
|
361 *----------------------------------------------------------------------------*/ |
|
362 |
|
363 INLINE flag extractFloat64Sign( float64 a ) |
|
364 { |
|
365 |
|
366 return float64_val(a)>>63; |
|
367 |
|
368 } |
|
369 |
|
370 /*---------------------------------------------------------------------------- |
|
371 | Normalizes the subnormal double-precision floating-point value represented |
|
372 | by the denormalized significand `aSig'. The normalized exponent and |
|
373 | significand are stored at the locations pointed to by `zExpPtr' and |
|
374 | `zSigPtr', respectively. |
|
375 *----------------------------------------------------------------------------*/ |
|
376 |
|
377 static void |
|
378 normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr ) |
|
379 { |
|
380 int8 shiftCount; |
|
381 |
|
382 shiftCount = countLeadingZeros64( aSig ) - 11; |
|
383 *zSigPtr = aSig<<shiftCount; |
|
384 *zExpPtr = 1 - shiftCount; |
|
385 |
|
386 } |
|
387 |
|
388 /*---------------------------------------------------------------------------- |
|
389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a |
|
390 | double-precision floating-point value, returning the result. After being |
|
391 | shifted into the proper positions, the three fields are simply added |
|
392 | together to form the result. This means that any integer portion of `zSig' |
|
393 | will be added into the exponent. Since a properly normalized significand |
|
394 | will have an integer portion equal to 1, the `zExp' input should be 1 less |
|
395 | than the desired result exponent whenever `zSig' is a complete, normalized |
|
396 | significand. |
|
397 *----------------------------------------------------------------------------*/ |
|
398 |
|
399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig ) |
|
400 { |
|
401 |
|
402 return make_float64( |
|
403 ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig); |
|
404 |
|
405 } |
|
406 |
|
407 /*---------------------------------------------------------------------------- |
|
408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
|
409 | and significand `zSig', and returns the proper double-precision floating- |
|
410 | point value corresponding to the abstract input. Ordinarily, the abstract |
|
411 | value is simply rounded and packed into the double-precision format, with |
|
412 | the inexact exception raised if the abstract input cannot be represented |
|
413 | exactly. However, if the abstract value is too large, the overflow and |
|
414 | inexact exceptions are raised and an infinity or maximal finite value is |
|
415 | returned. If the abstract value is too small, the input value is rounded |
|
416 | to a subnormal number, and the underflow and inexact exceptions are raised |
|
417 | if the abstract input cannot be represented exactly as a subnormal double- |
|
418 | precision floating-point number. |
|
419 | The input significand `zSig' has its binary point between bits 62 |
|
420 | and 61, which is 10 bits to the left of the usual location. This shifted |
|
421 | significand must be normalized or smaller. If `zSig' is not normalized, |
|
422 | `zExp' must be 0; in that case, the result returned is a subnormal number, |
|
423 | and it must not require rounding. In the usual case that `zSig' is |
|
424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent. |
|
425 | The handling of underflow and overflow follows the IEC/IEEE Standard for |
|
426 | Binary Floating-Point Arithmetic. |
|
427 *----------------------------------------------------------------------------*/ |
|
428 |
|
429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) |
|
430 { |
|
431 int8 roundingMode; |
|
432 flag roundNearestEven; |
|
433 int16 roundIncrement, roundBits; |
|
434 flag isTiny; |
|
435 |
|
436 roundingMode = STATUS(float_rounding_mode); |
|
437 roundNearestEven = ( roundingMode == float_round_nearest_even ); |
|
438 roundIncrement = 0x200; |
|
439 if ( ! roundNearestEven ) { |
|
440 if ( roundingMode == float_round_to_zero ) { |
|
441 roundIncrement = 0; |
|
442 } |
|
443 else { |
|
444 roundIncrement = 0x3FF; |
|
445 if ( zSign ) { |
|
446 if ( roundingMode == float_round_up ) roundIncrement = 0; |
|
447 } |
|
448 else { |
|
449 if ( roundingMode == float_round_down ) roundIncrement = 0; |
|
450 } |
|
451 } |
|
452 } |
|
453 roundBits = zSig & 0x3FF; |
|
454 if ( 0x7FD <= (bits16) zExp ) { |
|
455 if ( ( 0x7FD < zExp ) |
|
456 || ( ( zExp == 0x7FD ) |
|
457 && ( (sbits64) ( zSig + roundIncrement ) < 0 ) ) |
|
458 ) { |
|
459 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); |
|
460 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 )); |
|
461 } |
|
462 if ( zExp < 0 ) { |
|
463 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); |
|
464 isTiny = |
|
465 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) |
|
466 || ( zExp < -1 ) |
|
467 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) ); |
|
468 shift64RightJamming( zSig, - zExp, &zSig ); |
|
469 zExp = 0; |
|
470 roundBits = zSig & 0x3FF; |
|
471 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); |
|
472 } |
|
473 } |
|
474 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
475 zSig = ( zSig + roundIncrement )>>10; |
|
476 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven ); |
|
477 if ( zSig == 0 ) zExp = 0; |
|
478 return packFloat64( zSign, zExp, zSig ); |
|
479 |
|
480 } |
|
481 |
|
482 /*---------------------------------------------------------------------------- |
|
483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
|
484 | and significand `zSig', and returns the proper double-precision floating- |
|
485 | point value corresponding to the abstract input. This routine is just like |
|
486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized. |
|
487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true'' |
|
488 | floating-point exponent. |
|
489 *----------------------------------------------------------------------------*/ |
|
490 |
|
491 static float64 |
|
492 normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM) |
|
493 { |
|
494 int8 shiftCount; |
|
495 |
|
496 shiftCount = countLeadingZeros64( zSig ) - 1; |
|
497 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR); |
|
498 |
|
499 } |
|
500 |
|
501 #ifdef FLOATX80 |
|
502 |
|
503 /*---------------------------------------------------------------------------- |
|
504 | Returns the fraction bits of the extended double-precision floating-point |
|
505 | value `a'. |
|
506 *----------------------------------------------------------------------------*/ |
|
507 |
|
508 INLINE bits64 extractFloatx80Frac( floatx80 a ) |
|
509 { |
|
510 |
|
511 return a.low; |
|
512 |
|
513 } |
|
514 |
|
515 /*---------------------------------------------------------------------------- |
|
516 | Returns the exponent bits of the extended double-precision floating-point |
|
517 | value `a'. |
|
518 *----------------------------------------------------------------------------*/ |
|
519 |
|
520 INLINE int32 extractFloatx80Exp( floatx80 a ) |
|
521 { |
|
522 |
|
523 return a.high & 0x7FFF; |
|
524 |
|
525 } |
|
526 |
|
527 /*---------------------------------------------------------------------------- |
|
528 | Returns the sign bit of the extended double-precision floating-point value |
|
529 | `a'. |
|
530 *----------------------------------------------------------------------------*/ |
|
531 |
|
532 INLINE flag extractFloatx80Sign( floatx80 a ) |
|
533 { |
|
534 |
|
535 return a.high>>15; |
|
536 |
|
537 } |
|
538 |
|
539 /*---------------------------------------------------------------------------- |
|
540 | Normalizes the subnormal extended double-precision floating-point value |
|
541 | represented by the denormalized significand `aSig'. The normalized exponent |
|
542 | and significand are stored at the locations pointed to by `zExpPtr' and |
|
543 | `zSigPtr', respectively. |
|
544 *----------------------------------------------------------------------------*/ |
|
545 |
|
546 static void |
|
547 normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr ) |
|
548 { |
|
549 int8 shiftCount; |
|
550 |
|
551 shiftCount = countLeadingZeros64( aSig ); |
|
552 *zSigPtr = aSig<<shiftCount; |
|
553 *zExpPtr = 1 - shiftCount; |
|
554 |
|
555 } |
|
556 |
|
557 /*---------------------------------------------------------------------------- |
|
558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an |
|
559 | extended double-precision floating-point value, returning the result. |
|
560 *----------------------------------------------------------------------------*/ |
|
561 |
|
562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig ) |
|
563 { |
|
564 floatx80 z; |
|
565 |
|
566 z.low = zSig; |
|
567 z.high = ( ( (bits16) zSign )<<15 ) + zExp; |
|
568 return z; |
|
569 |
|
570 } |
|
571 |
|
572 /*---------------------------------------------------------------------------- |
|
573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
|
574 | and extended significand formed by the concatenation of `zSig0' and `zSig1', |
|
575 | and returns the proper extended double-precision floating-point value |
|
576 | corresponding to the abstract input. Ordinarily, the abstract value is |
|
577 | rounded and packed into the extended double-precision format, with the |
|
578 | inexact exception raised if the abstract input cannot be represented |
|
579 | exactly. However, if the abstract value is too large, the overflow and |
|
580 | inexact exceptions are raised and an infinity or maximal finite value is |
|
581 | returned. If the abstract value is too small, the input value is rounded to |
|
582 | a subnormal number, and the underflow and inexact exceptions are raised if |
|
583 | the abstract input cannot be represented exactly as a subnormal extended |
|
584 | double-precision floating-point number. |
|
585 | If `roundingPrecision' is 32 or 64, the result is rounded to the same |
|
586 | number of bits as single or double precision, respectively. Otherwise, the |
|
587 | result is rounded to the full precision of the extended double-precision |
|
588 | format. |
|
589 | The input significand must be normalized or smaller. If the input |
|
590 | significand is not normalized, `zExp' must be 0; in that case, the result |
|
591 | returned is a subnormal number, and it must not require rounding. The |
|
592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary |
|
593 | Floating-Point Arithmetic. |
|
594 *----------------------------------------------------------------------------*/ |
|
595 |
|
596 static floatx80 |
|
597 roundAndPackFloatx80( |
|
598 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 |
|
599 STATUS_PARAM) |
|
600 { |
|
601 int8 roundingMode; |
|
602 flag roundNearestEven, increment, isTiny; |
|
603 int64 roundIncrement, roundMask, roundBits; |
|
604 |
|
605 roundingMode = STATUS(float_rounding_mode); |
|
606 roundNearestEven = ( roundingMode == float_round_nearest_even ); |
|
607 if ( roundingPrecision == 80 ) goto precision80; |
|
608 if ( roundingPrecision == 64 ) { |
|
609 roundIncrement = LIT64( 0x0000000000000400 ); |
|
610 roundMask = LIT64( 0x00000000000007FF ); |
|
611 } |
|
612 else if ( roundingPrecision == 32 ) { |
|
613 roundIncrement = LIT64( 0x0000008000000000 ); |
|
614 roundMask = LIT64( 0x000000FFFFFFFFFF ); |
|
615 } |
|
616 else { |
|
617 goto precision80; |
|
618 } |
|
619 zSig0 |= ( zSig1 != 0 ); |
|
620 if ( ! roundNearestEven ) { |
|
621 if ( roundingMode == float_round_to_zero ) { |
|
622 roundIncrement = 0; |
|
623 } |
|
624 else { |
|
625 roundIncrement = roundMask; |
|
626 if ( zSign ) { |
|
627 if ( roundingMode == float_round_up ) roundIncrement = 0; |
|
628 } |
|
629 else { |
|
630 if ( roundingMode == float_round_down ) roundIncrement = 0; |
|
631 } |
|
632 } |
|
633 } |
|
634 roundBits = zSig0 & roundMask; |
|
635 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { |
|
636 if ( ( 0x7FFE < zExp ) |
|
637 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) ) |
|
638 ) { |
|
639 goto overflow; |
|
640 } |
|
641 if ( zExp <= 0 ) { |
|
642 if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 ); |
|
643 isTiny = |
|
644 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) |
|
645 || ( zExp < 0 ) |
|
646 || ( zSig0 <= zSig0 + roundIncrement ); |
|
647 shift64RightJamming( zSig0, 1 - zExp, &zSig0 ); |
|
648 zExp = 0; |
|
649 roundBits = zSig0 & roundMask; |
|
650 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR); |
|
651 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
652 zSig0 += roundIncrement; |
|
653 if ( (sbits64) zSig0 < 0 ) zExp = 1; |
|
654 roundIncrement = roundMask + 1; |
|
655 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { |
|
656 roundMask |= roundIncrement; |
|
657 } |
|
658 zSig0 &= ~ roundMask; |
|
659 return packFloatx80( zSign, zExp, zSig0 ); |
|
660 } |
|
661 } |
|
662 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
663 zSig0 += roundIncrement; |
|
664 if ( zSig0 < roundIncrement ) { |
|
665 ++zExp; |
|
666 zSig0 = LIT64( 0x8000000000000000 ); |
|
667 } |
|
668 roundIncrement = roundMask + 1; |
|
669 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) { |
|
670 roundMask |= roundIncrement; |
|
671 } |
|
672 zSig0 &= ~ roundMask; |
|
673 if ( zSig0 == 0 ) zExp = 0; |
|
674 return packFloatx80( zSign, zExp, zSig0 ); |
|
675 precision80: |
|
676 increment = ( (sbits64) zSig1 < 0 ); |
|
677 if ( ! roundNearestEven ) { |
|
678 if ( roundingMode == float_round_to_zero ) { |
|
679 increment = 0; |
|
680 } |
|
681 else { |
|
682 if ( zSign ) { |
|
683 increment = ( roundingMode == float_round_down ) && zSig1; |
|
684 } |
|
685 else { |
|
686 increment = ( roundingMode == float_round_up ) && zSig1; |
|
687 } |
|
688 } |
|
689 } |
|
690 if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) { |
|
691 if ( ( 0x7FFE < zExp ) |
|
692 || ( ( zExp == 0x7FFE ) |
|
693 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) ) |
|
694 && increment |
|
695 ) |
|
696 ) { |
|
697 roundMask = 0; |
|
698 overflow: |
|
699 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); |
|
700 if ( ( roundingMode == float_round_to_zero ) |
|
701 || ( zSign && ( roundingMode == float_round_up ) ) |
|
702 || ( ! zSign && ( roundingMode == float_round_down ) ) |
|
703 ) { |
|
704 return packFloatx80( zSign, 0x7FFE, ~ roundMask ); |
|
705 } |
|
706 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
707 } |
|
708 if ( zExp <= 0 ) { |
|
709 isTiny = |
|
710 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) |
|
711 || ( zExp < 0 ) |
|
712 || ! increment |
|
713 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) ); |
|
714 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 ); |
|
715 zExp = 0; |
|
716 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR); |
|
717 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
718 if ( roundNearestEven ) { |
|
719 increment = ( (sbits64) zSig1 < 0 ); |
|
720 } |
|
721 else { |
|
722 if ( zSign ) { |
|
723 increment = ( roundingMode == float_round_down ) && zSig1; |
|
724 } |
|
725 else { |
|
726 increment = ( roundingMode == float_round_up ) && zSig1; |
|
727 } |
|
728 } |
|
729 if ( increment ) { |
|
730 ++zSig0; |
|
731 zSig0 &= |
|
732 ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); |
|
733 if ( (sbits64) zSig0 < 0 ) zExp = 1; |
|
734 } |
|
735 return packFloatx80( zSign, zExp, zSig0 ); |
|
736 } |
|
737 } |
|
738 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
739 if ( increment ) { |
|
740 ++zSig0; |
|
741 if ( zSig0 == 0 ) { |
|
742 ++zExp; |
|
743 zSig0 = LIT64( 0x8000000000000000 ); |
|
744 } |
|
745 else { |
|
746 zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven ); |
|
747 } |
|
748 } |
|
749 else { |
|
750 if ( zSig0 == 0 ) zExp = 0; |
|
751 } |
|
752 return packFloatx80( zSign, zExp, zSig0 ); |
|
753 |
|
754 } |
|
755 |
|
756 /*---------------------------------------------------------------------------- |
|
757 | Takes an abstract floating-point value having sign `zSign', exponent |
|
758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1', |
|
759 | and returns the proper extended double-precision floating-point value |
|
760 | corresponding to the abstract input. This routine is just like |
|
761 | `roundAndPackFloatx80' except that the input significand does not have to be |
|
762 | normalized. |
|
763 *----------------------------------------------------------------------------*/ |
|
764 |
|
765 static floatx80 |
|
766 normalizeRoundAndPackFloatx80( |
|
767 int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 |
|
768 STATUS_PARAM) |
|
769 { |
|
770 int8 shiftCount; |
|
771 |
|
772 if ( zSig0 == 0 ) { |
|
773 zSig0 = zSig1; |
|
774 zSig1 = 0; |
|
775 zExp -= 64; |
|
776 } |
|
777 shiftCount = countLeadingZeros64( zSig0 ); |
|
778 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); |
|
779 zExp -= shiftCount; |
|
780 return |
|
781 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR); |
|
782 |
|
783 } |
|
784 |
|
785 #endif |
|
786 |
|
787 #ifdef FLOAT128 |
|
788 |
|
789 /*---------------------------------------------------------------------------- |
|
790 | Returns the least-significant 64 fraction bits of the quadruple-precision |
|
791 | floating-point value `a'. |
|
792 *----------------------------------------------------------------------------*/ |
|
793 |
|
794 INLINE bits64 extractFloat128Frac1( float128 a ) |
|
795 { |
|
796 |
|
797 return a.low; |
|
798 |
|
799 } |
|
800 |
|
801 /*---------------------------------------------------------------------------- |
|
802 | Returns the most-significant 48 fraction bits of the quadruple-precision |
|
803 | floating-point value `a'. |
|
804 *----------------------------------------------------------------------------*/ |
|
805 |
|
806 INLINE bits64 extractFloat128Frac0( float128 a ) |
|
807 { |
|
808 |
|
809 return a.high & LIT64( 0x0000FFFFFFFFFFFF ); |
|
810 |
|
811 } |
|
812 |
|
813 /*---------------------------------------------------------------------------- |
|
814 | Returns the exponent bits of the quadruple-precision floating-point value |
|
815 | `a'. |
|
816 *----------------------------------------------------------------------------*/ |
|
817 |
|
818 INLINE int32 extractFloat128Exp( float128 a ) |
|
819 { |
|
820 |
|
821 return ( a.high>>48 ) & 0x7FFF; |
|
822 |
|
823 } |
|
824 |
|
825 /*---------------------------------------------------------------------------- |
|
826 | Returns the sign bit of the quadruple-precision floating-point value `a'. |
|
827 *----------------------------------------------------------------------------*/ |
|
828 |
|
829 INLINE flag extractFloat128Sign( float128 a ) |
|
830 { |
|
831 |
|
832 return a.high>>63; |
|
833 |
|
834 } |
|
835 |
|
836 /*---------------------------------------------------------------------------- |
|
837 | Normalizes the subnormal quadruple-precision floating-point value |
|
838 | represented by the denormalized significand formed by the concatenation of |
|
839 | `aSig0' and `aSig1'. The normalized exponent is stored at the location |
|
840 | pointed to by `zExpPtr'. The most significant 49 bits of the normalized |
|
841 | significand are stored at the location pointed to by `zSig0Ptr', and the |
|
842 | least significant 64 bits of the normalized significand are stored at the |
|
843 | location pointed to by `zSig1Ptr'. |
|
844 *----------------------------------------------------------------------------*/ |
|
845 |
|
846 static void |
|
847 normalizeFloat128Subnormal( |
|
848 bits64 aSig0, |
|
849 bits64 aSig1, |
|
850 int32 *zExpPtr, |
|
851 bits64 *zSig0Ptr, |
|
852 bits64 *zSig1Ptr |
|
853 ) |
|
854 { |
|
855 int8 shiftCount; |
|
856 |
|
857 if ( aSig0 == 0 ) { |
|
858 shiftCount = countLeadingZeros64( aSig1 ) - 15; |
|
859 if ( shiftCount < 0 ) { |
|
860 *zSig0Ptr = aSig1>>( - shiftCount ); |
|
861 *zSig1Ptr = aSig1<<( shiftCount & 63 ); |
|
862 } |
|
863 else { |
|
864 *zSig0Ptr = aSig1<<shiftCount; |
|
865 *zSig1Ptr = 0; |
|
866 } |
|
867 *zExpPtr = - shiftCount - 63; |
|
868 } |
|
869 else { |
|
870 shiftCount = countLeadingZeros64( aSig0 ) - 15; |
|
871 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr ); |
|
872 *zExpPtr = 1 - shiftCount; |
|
873 } |
|
874 |
|
875 } |
|
876 |
|
877 /*---------------------------------------------------------------------------- |
|
878 | Packs the sign `zSign', the exponent `zExp', and the significand formed |
|
879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision |
|
880 | floating-point value, returning the result. After being shifted into the |
|
881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply |
|
882 | added together to form the most significant 32 bits of the result. This |
|
883 | means that any integer portion of `zSig0' will be added into the exponent. |
|
884 | Since a properly normalized significand will have an integer portion equal |
|
885 | to 1, the `zExp' input should be 1 less than the desired result exponent |
|
886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized |
|
887 | significand. |
|
888 *----------------------------------------------------------------------------*/ |
|
889 |
|
890 INLINE float128 |
|
891 packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 ) |
|
892 { |
|
893 float128 z; |
|
894 |
|
895 z.low = zSig1; |
|
896 z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0; |
|
897 return z; |
|
898 |
|
899 } |
|
900 |
|
901 /*---------------------------------------------------------------------------- |
|
902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
|
903 | and extended significand formed by the concatenation of `zSig0', `zSig1', |
|
904 | and `zSig2', and returns the proper quadruple-precision floating-point value |
|
905 | corresponding to the abstract input. Ordinarily, the abstract value is |
|
906 | simply rounded and packed into the quadruple-precision format, with the |
|
907 | inexact exception raised if the abstract input cannot be represented |
|
908 | exactly. However, if the abstract value is too large, the overflow and |
|
909 | inexact exceptions are raised and an infinity or maximal finite value is |
|
910 | returned. If the abstract value is too small, the input value is rounded to |
|
911 | a subnormal number, and the underflow and inexact exceptions are raised if |
|
912 | the abstract input cannot be represented exactly as a subnormal quadruple- |
|
913 | precision floating-point number. |
|
914 | The input significand must be normalized or smaller. If the input |
|
915 | significand is not normalized, `zExp' must be 0; in that case, the result |
|
916 | returned is a subnormal number, and it must not require rounding. In the |
|
917 | usual case that the input significand is normalized, `zExp' must be 1 less |
|
918 | than the ``true'' floating-point exponent. The handling of underflow and |
|
919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
920 *----------------------------------------------------------------------------*/ |
|
921 |
|
922 static float128 |
|
923 roundAndPackFloat128( |
|
924 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM) |
|
925 { |
|
926 int8 roundingMode; |
|
927 flag roundNearestEven, increment, isTiny; |
|
928 |
|
929 roundingMode = STATUS(float_rounding_mode); |
|
930 roundNearestEven = ( roundingMode == float_round_nearest_even ); |
|
931 increment = ( (sbits64) zSig2 < 0 ); |
|
932 if ( ! roundNearestEven ) { |
|
933 if ( roundingMode == float_round_to_zero ) { |
|
934 increment = 0; |
|
935 } |
|
936 else { |
|
937 if ( zSign ) { |
|
938 increment = ( roundingMode == float_round_down ) && zSig2; |
|
939 } |
|
940 else { |
|
941 increment = ( roundingMode == float_round_up ) && zSig2; |
|
942 } |
|
943 } |
|
944 } |
|
945 if ( 0x7FFD <= (bits32) zExp ) { |
|
946 if ( ( 0x7FFD < zExp ) |
|
947 || ( ( zExp == 0x7FFD ) |
|
948 && eq128( |
|
949 LIT64( 0x0001FFFFFFFFFFFF ), |
|
950 LIT64( 0xFFFFFFFFFFFFFFFF ), |
|
951 zSig0, |
|
952 zSig1 |
|
953 ) |
|
954 && increment |
|
955 ) |
|
956 ) { |
|
957 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); |
|
958 if ( ( roundingMode == float_round_to_zero ) |
|
959 || ( zSign && ( roundingMode == float_round_up ) ) |
|
960 || ( ! zSign && ( roundingMode == float_round_down ) ) |
|
961 ) { |
|
962 return |
|
963 packFloat128( |
|
964 zSign, |
|
965 0x7FFE, |
|
966 LIT64( 0x0000FFFFFFFFFFFF ), |
|
967 LIT64( 0xFFFFFFFFFFFFFFFF ) |
|
968 ); |
|
969 } |
|
970 return packFloat128( zSign, 0x7FFF, 0, 0 ); |
|
971 } |
|
972 if ( zExp < 0 ) { |
|
973 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); |
|
974 isTiny = |
|
975 ( STATUS(float_detect_tininess) == float_tininess_before_rounding ) |
|
976 || ( zExp < -1 ) |
|
977 || ! increment |
|
978 || lt128( |
|
979 zSig0, |
|
980 zSig1, |
|
981 LIT64( 0x0001FFFFFFFFFFFF ), |
|
982 LIT64( 0xFFFFFFFFFFFFFFFF ) |
|
983 ); |
|
984 shift128ExtraRightJamming( |
|
985 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 ); |
|
986 zExp = 0; |
|
987 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR); |
|
988 if ( roundNearestEven ) { |
|
989 increment = ( (sbits64) zSig2 < 0 ); |
|
990 } |
|
991 else { |
|
992 if ( zSign ) { |
|
993 increment = ( roundingMode == float_round_down ) && zSig2; |
|
994 } |
|
995 else { |
|
996 increment = ( roundingMode == float_round_up ) && zSig2; |
|
997 } |
|
998 } |
|
999 } |
|
1000 } |
|
1001 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
1002 if ( increment ) { |
|
1003 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 ); |
|
1004 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven ); |
|
1005 } |
|
1006 else { |
|
1007 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0; |
|
1008 } |
|
1009 return packFloat128( zSign, zExp, zSig0, zSig1 ); |
|
1010 |
|
1011 } |
|
1012 |
|
1013 /*---------------------------------------------------------------------------- |
|
1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp', |
|
1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and |
|
1016 | returns the proper quadruple-precision floating-point value corresponding |
|
1017 | to the abstract input. This routine is just like `roundAndPackFloat128' |
|
1018 | except that the input significand has fewer bits and does not have to be |
|
1019 | normalized. In all cases, `zExp' must be 1 less than the ``true'' floating- |
|
1020 | point exponent. |
|
1021 *----------------------------------------------------------------------------*/ |
|
1022 |
|
1023 static float128 |
|
1024 normalizeRoundAndPackFloat128( |
|
1025 flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM) |
|
1026 { |
|
1027 int8 shiftCount; |
|
1028 bits64 zSig2; |
|
1029 |
|
1030 if ( zSig0 == 0 ) { |
|
1031 zSig0 = zSig1; |
|
1032 zSig1 = 0; |
|
1033 zExp -= 64; |
|
1034 } |
|
1035 shiftCount = countLeadingZeros64( zSig0 ) - 15; |
|
1036 if ( 0 <= shiftCount ) { |
|
1037 zSig2 = 0; |
|
1038 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); |
|
1039 } |
|
1040 else { |
|
1041 shift128ExtraRightJamming( |
|
1042 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 ); |
|
1043 } |
|
1044 zExp -= shiftCount; |
|
1045 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR); |
|
1046 |
|
1047 } |
|
1048 |
|
1049 #endif |
|
1050 |
|
1051 /*---------------------------------------------------------------------------- |
|
1052 | Returns the result of converting the 32-bit two's complement integer `a' |
|
1053 | to the single-precision floating-point format. The conversion is performed |
|
1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1055 *----------------------------------------------------------------------------*/ |
|
1056 |
|
1057 float32 int32_to_float32( int32 a STATUS_PARAM ) |
|
1058 { |
|
1059 flag zSign; |
|
1060 |
|
1061 if ( a == 0 ) return float32_zero; |
|
1062 if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 ); |
|
1063 zSign = ( a < 0 ); |
|
1064 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR ); |
|
1065 |
|
1066 } |
|
1067 |
|
1068 /*---------------------------------------------------------------------------- |
|
1069 | Returns the result of converting the 32-bit two's complement integer `a' |
|
1070 | to the double-precision floating-point format. The conversion is performed |
|
1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1072 *----------------------------------------------------------------------------*/ |
|
1073 |
|
1074 float64 int32_to_float64( int32 a STATUS_PARAM ) |
|
1075 { |
|
1076 flag zSign; |
|
1077 uint32 absA; |
|
1078 int8 shiftCount; |
|
1079 bits64 zSig; |
|
1080 |
|
1081 if ( a == 0 ) return float64_zero; |
|
1082 zSign = ( a < 0 ); |
|
1083 absA = zSign ? - a : a; |
|
1084 shiftCount = countLeadingZeros32( absA ) + 21; |
|
1085 zSig = absA; |
|
1086 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount ); |
|
1087 |
|
1088 } |
|
1089 |
|
1090 #ifdef FLOATX80 |
|
1091 |
|
1092 /*---------------------------------------------------------------------------- |
|
1093 | Returns the result of converting the 32-bit two's complement integer `a' |
|
1094 | to the extended double-precision floating-point format. The conversion |
|
1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1096 | Arithmetic. |
|
1097 *----------------------------------------------------------------------------*/ |
|
1098 |
|
1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM ) |
|
1100 { |
|
1101 flag zSign; |
|
1102 uint32 absA; |
|
1103 int8 shiftCount; |
|
1104 bits64 zSig; |
|
1105 |
|
1106 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); |
|
1107 zSign = ( a < 0 ); |
|
1108 absA = zSign ? - a : a; |
|
1109 shiftCount = countLeadingZeros32( absA ) + 32; |
|
1110 zSig = absA; |
|
1111 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount ); |
|
1112 |
|
1113 } |
|
1114 |
|
1115 #endif |
|
1116 |
|
1117 #ifdef FLOAT128 |
|
1118 |
|
1119 /*---------------------------------------------------------------------------- |
|
1120 | Returns the result of converting the 32-bit two's complement integer `a' to |
|
1121 | the quadruple-precision floating-point format. The conversion is performed |
|
1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1123 *----------------------------------------------------------------------------*/ |
|
1124 |
|
1125 float128 int32_to_float128( int32 a STATUS_PARAM ) |
|
1126 { |
|
1127 flag zSign; |
|
1128 uint32 absA; |
|
1129 int8 shiftCount; |
|
1130 bits64 zSig0; |
|
1131 |
|
1132 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); |
|
1133 zSign = ( a < 0 ); |
|
1134 absA = zSign ? - a : a; |
|
1135 shiftCount = countLeadingZeros32( absA ) + 17; |
|
1136 zSig0 = absA; |
|
1137 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 ); |
|
1138 |
|
1139 } |
|
1140 |
|
1141 #endif |
|
1142 |
|
1143 /*---------------------------------------------------------------------------- |
|
1144 | Returns the result of converting the 64-bit two's complement integer `a' |
|
1145 | to the single-precision floating-point format. The conversion is performed |
|
1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1147 *----------------------------------------------------------------------------*/ |
|
1148 |
|
1149 float32 int64_to_float32( int64 a STATUS_PARAM ) |
|
1150 { |
|
1151 flag zSign; |
|
1152 uint64 absA; |
|
1153 int8 shiftCount; |
|
1154 |
|
1155 if ( a == 0 ) return float32_zero; |
|
1156 zSign = ( a < 0 ); |
|
1157 absA = zSign ? - a : a; |
|
1158 shiftCount = countLeadingZeros64( absA ) - 40; |
|
1159 if ( 0 <= shiftCount ) { |
|
1160 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount ); |
|
1161 } |
|
1162 else { |
|
1163 shiftCount += 7; |
|
1164 if ( shiftCount < 0 ) { |
|
1165 shift64RightJamming( absA, - shiftCount, &absA ); |
|
1166 } |
|
1167 else { |
|
1168 absA <<= shiftCount; |
|
1169 } |
|
1170 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR ); |
|
1171 } |
|
1172 |
|
1173 } |
|
1174 |
|
1175 float32 uint64_to_float32( uint64 a STATUS_PARAM ) |
|
1176 { |
|
1177 int8 shiftCount; |
|
1178 |
|
1179 if ( a == 0 ) return float32_zero; |
|
1180 shiftCount = countLeadingZeros64( a ) - 40; |
|
1181 if ( 0 <= shiftCount ) { |
|
1182 return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount ); |
|
1183 } |
|
1184 else { |
|
1185 shiftCount += 7; |
|
1186 if ( shiftCount < 0 ) { |
|
1187 shift64RightJamming( a, - shiftCount, &a ); |
|
1188 } |
|
1189 else { |
|
1190 a <<= shiftCount; |
|
1191 } |
|
1192 return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR ); |
|
1193 } |
|
1194 } |
|
1195 |
|
1196 /*---------------------------------------------------------------------------- |
|
1197 | Returns the result of converting the 64-bit two's complement integer `a' |
|
1198 | to the double-precision floating-point format. The conversion is performed |
|
1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1200 *----------------------------------------------------------------------------*/ |
|
1201 |
|
1202 float64 int64_to_float64( int64 a STATUS_PARAM ) |
|
1203 { |
|
1204 flag zSign; |
|
1205 |
|
1206 if ( a == 0 ) return float64_zero; |
|
1207 if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) { |
|
1208 return packFloat64( 1, 0x43E, 0 ); |
|
1209 } |
|
1210 zSign = ( a < 0 ); |
|
1211 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR ); |
|
1212 |
|
1213 } |
|
1214 |
|
1215 float64 uint64_to_float64( uint64 a STATUS_PARAM ) |
|
1216 { |
|
1217 if ( a == 0 ) return float64_zero; |
|
1218 return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR ); |
|
1219 |
|
1220 } |
|
1221 |
|
1222 #ifdef FLOATX80 |
|
1223 |
|
1224 /*---------------------------------------------------------------------------- |
|
1225 | Returns the result of converting the 64-bit two's complement integer `a' |
|
1226 | to the extended double-precision floating-point format. The conversion |
|
1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1228 | Arithmetic. |
|
1229 *----------------------------------------------------------------------------*/ |
|
1230 |
|
1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM ) |
|
1232 { |
|
1233 flag zSign; |
|
1234 uint64 absA; |
|
1235 int8 shiftCount; |
|
1236 |
|
1237 if ( a == 0 ) return packFloatx80( 0, 0, 0 ); |
|
1238 zSign = ( a < 0 ); |
|
1239 absA = zSign ? - a : a; |
|
1240 shiftCount = countLeadingZeros64( absA ); |
|
1241 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount ); |
|
1242 |
|
1243 } |
|
1244 |
|
1245 #endif |
|
1246 |
|
1247 #ifdef FLOAT128 |
|
1248 |
|
1249 /*---------------------------------------------------------------------------- |
|
1250 | Returns the result of converting the 64-bit two's complement integer `a' to |
|
1251 | the quadruple-precision floating-point format. The conversion is performed |
|
1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1253 *----------------------------------------------------------------------------*/ |
|
1254 |
|
1255 float128 int64_to_float128( int64 a STATUS_PARAM ) |
|
1256 { |
|
1257 flag zSign; |
|
1258 uint64 absA; |
|
1259 int8 shiftCount; |
|
1260 int32 zExp; |
|
1261 bits64 zSig0, zSig1; |
|
1262 |
|
1263 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 ); |
|
1264 zSign = ( a < 0 ); |
|
1265 absA = zSign ? - a : a; |
|
1266 shiftCount = countLeadingZeros64( absA ) + 49; |
|
1267 zExp = 0x406E - shiftCount; |
|
1268 if ( 64 <= shiftCount ) { |
|
1269 zSig1 = 0; |
|
1270 zSig0 = absA; |
|
1271 shiftCount -= 64; |
|
1272 } |
|
1273 else { |
|
1274 zSig1 = absA; |
|
1275 zSig0 = 0; |
|
1276 } |
|
1277 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 ); |
|
1278 return packFloat128( zSign, zExp, zSig0, zSig1 ); |
|
1279 |
|
1280 } |
|
1281 |
|
1282 #endif |
|
1283 |
|
1284 /*---------------------------------------------------------------------------- |
|
1285 | Returns the result of converting the single-precision floating-point value |
|
1286 | `a' to the 32-bit two's complement integer format. The conversion is |
|
1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1288 | Arithmetic---which means in particular that the conversion is rounded |
|
1289 | according to the current rounding mode. If `a' is a NaN, the largest |
|
1290 | positive integer is returned. Otherwise, if the conversion overflows, the |
|
1291 | largest integer with the same sign as `a' is returned. |
|
1292 *----------------------------------------------------------------------------*/ |
|
1293 |
|
1294 int32 float32_to_int32( float32 a STATUS_PARAM ) |
|
1295 { |
|
1296 flag aSign; |
|
1297 int16 aExp, shiftCount; |
|
1298 bits32 aSig; |
|
1299 bits64 aSig64; |
|
1300 |
|
1301 aSig = extractFloat32Frac( a ); |
|
1302 aExp = extractFloat32Exp( a ); |
|
1303 aSign = extractFloat32Sign( a ); |
|
1304 if ( ( aExp == 0xFF ) && aSig ) aSign = 0; |
|
1305 if ( aExp ) aSig |= 0x00800000; |
|
1306 shiftCount = 0xAF - aExp; |
|
1307 aSig64 = aSig; |
|
1308 aSig64 <<= 32; |
|
1309 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 ); |
|
1310 return roundAndPackInt32( aSign, aSig64 STATUS_VAR ); |
|
1311 |
|
1312 } |
|
1313 |
|
1314 /*---------------------------------------------------------------------------- |
|
1315 | Returns the result of converting the single-precision floating-point value |
|
1316 | `a' to the 32-bit two's complement integer format. The conversion is |
|
1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1318 | Arithmetic, except that the conversion is always rounded toward zero. |
|
1319 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if |
|
1320 | the conversion overflows, the largest integer with the same sign as `a' is |
|
1321 | returned. |
|
1322 *----------------------------------------------------------------------------*/ |
|
1323 |
|
1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM ) |
|
1325 { |
|
1326 flag aSign; |
|
1327 int16 aExp, shiftCount; |
|
1328 bits32 aSig; |
|
1329 int32 z; |
|
1330 |
|
1331 aSig = extractFloat32Frac( a ); |
|
1332 aExp = extractFloat32Exp( a ); |
|
1333 aSign = extractFloat32Sign( a ); |
|
1334 shiftCount = aExp - 0x9E; |
|
1335 if ( 0 <= shiftCount ) { |
|
1336 if ( float32_val(a) != 0xCF000000 ) { |
|
1337 float_raise( float_flag_invalid STATUS_VAR); |
|
1338 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF; |
|
1339 } |
|
1340 return (sbits32) 0x80000000; |
|
1341 } |
|
1342 else if ( aExp <= 0x7E ) { |
|
1343 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
1344 return 0; |
|
1345 } |
|
1346 aSig = ( aSig | 0x00800000 )<<8; |
|
1347 z = aSig>>( - shiftCount ); |
|
1348 if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) { |
|
1349 STATUS(float_exception_flags) |= float_flag_inexact; |
|
1350 } |
|
1351 if ( aSign ) z = - z; |
|
1352 return z; |
|
1353 |
|
1354 } |
|
1355 |
|
1356 /*---------------------------------------------------------------------------- |
|
1357 | Returns the result of converting the single-precision floating-point value |
|
1358 | `a' to the 64-bit two's complement integer format. The conversion is |
|
1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1360 | Arithmetic---which means in particular that the conversion is rounded |
|
1361 | according to the current rounding mode. If `a' is a NaN, the largest |
|
1362 | positive integer is returned. Otherwise, if the conversion overflows, the |
|
1363 | largest integer with the same sign as `a' is returned. |
|
1364 *----------------------------------------------------------------------------*/ |
|
1365 |
|
1366 int64 float32_to_int64( float32 a STATUS_PARAM ) |
|
1367 { |
|
1368 flag aSign; |
|
1369 int16 aExp, shiftCount; |
|
1370 bits32 aSig; |
|
1371 bits64 aSig64, aSigExtra; |
|
1372 |
|
1373 aSig = extractFloat32Frac( a ); |
|
1374 aExp = extractFloat32Exp( a ); |
|
1375 aSign = extractFloat32Sign( a ); |
|
1376 shiftCount = 0xBE - aExp; |
|
1377 if ( shiftCount < 0 ) { |
|
1378 float_raise( float_flag_invalid STATUS_VAR); |
|
1379 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { |
|
1380 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
1381 } |
|
1382 return (sbits64) LIT64( 0x8000000000000000 ); |
|
1383 } |
|
1384 if ( aExp ) aSig |= 0x00800000; |
|
1385 aSig64 = aSig; |
|
1386 aSig64 <<= 40; |
|
1387 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra ); |
|
1388 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR ); |
|
1389 |
|
1390 } |
|
1391 |
|
1392 /*---------------------------------------------------------------------------- |
|
1393 | Returns the result of converting the single-precision floating-point value |
|
1394 | `a' to the 64-bit two's complement integer format. The conversion is |
|
1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1396 | Arithmetic, except that the conversion is always rounded toward zero. If |
|
1397 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the |
|
1398 | conversion overflows, the largest integer with the same sign as `a' is |
|
1399 | returned. |
|
1400 *----------------------------------------------------------------------------*/ |
|
1401 |
|
1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM ) |
|
1403 { |
|
1404 flag aSign; |
|
1405 int16 aExp, shiftCount; |
|
1406 bits32 aSig; |
|
1407 bits64 aSig64; |
|
1408 int64 z; |
|
1409 |
|
1410 aSig = extractFloat32Frac( a ); |
|
1411 aExp = extractFloat32Exp( a ); |
|
1412 aSign = extractFloat32Sign( a ); |
|
1413 shiftCount = aExp - 0xBE; |
|
1414 if ( 0 <= shiftCount ) { |
|
1415 if ( float32_val(a) != 0xDF000000 ) { |
|
1416 float_raise( float_flag_invalid STATUS_VAR); |
|
1417 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) { |
|
1418 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
1419 } |
|
1420 } |
|
1421 return (sbits64) LIT64( 0x8000000000000000 ); |
|
1422 } |
|
1423 else if ( aExp <= 0x7E ) { |
|
1424 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
1425 return 0; |
|
1426 } |
|
1427 aSig64 = aSig | 0x00800000; |
|
1428 aSig64 <<= 40; |
|
1429 z = aSig64>>( - shiftCount ); |
|
1430 if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) { |
|
1431 STATUS(float_exception_flags) |= float_flag_inexact; |
|
1432 } |
|
1433 if ( aSign ) z = - z; |
|
1434 return z; |
|
1435 |
|
1436 } |
|
1437 |
|
1438 /*---------------------------------------------------------------------------- |
|
1439 | Returns the result of converting the single-precision floating-point value |
|
1440 | `a' to the double-precision floating-point format. The conversion is |
|
1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1442 | Arithmetic. |
|
1443 *----------------------------------------------------------------------------*/ |
|
1444 |
|
1445 float64 float32_to_float64( float32 a STATUS_PARAM ) |
|
1446 { |
|
1447 flag aSign; |
|
1448 int16 aExp; |
|
1449 bits32 aSig; |
|
1450 |
|
1451 aSig = extractFloat32Frac( a ); |
|
1452 aExp = extractFloat32Exp( a ); |
|
1453 aSign = extractFloat32Sign( a ); |
|
1454 if ( aExp == 0xFF ) { |
|
1455 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR )); |
|
1456 return packFloat64( aSign, 0x7FF, 0 ); |
|
1457 } |
|
1458 if ( aExp == 0 ) { |
|
1459 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 ); |
|
1460 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
|
1461 --aExp; |
|
1462 } |
|
1463 return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 ); |
|
1464 |
|
1465 } |
|
1466 |
|
1467 #ifdef FLOATX80 |
|
1468 |
|
1469 /*---------------------------------------------------------------------------- |
|
1470 | Returns the result of converting the single-precision floating-point value |
|
1471 | `a' to the extended double-precision floating-point format. The conversion |
|
1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1473 | Arithmetic. |
|
1474 *----------------------------------------------------------------------------*/ |
|
1475 |
|
1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM ) |
|
1477 { |
|
1478 flag aSign; |
|
1479 int16 aExp; |
|
1480 bits32 aSig; |
|
1481 |
|
1482 aSig = extractFloat32Frac( a ); |
|
1483 aExp = extractFloat32Exp( a ); |
|
1484 aSign = extractFloat32Sign( a ); |
|
1485 if ( aExp == 0xFF ) { |
|
1486 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) ); |
|
1487 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
1488 } |
|
1489 if ( aExp == 0 ) { |
|
1490 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); |
|
1491 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
|
1492 } |
|
1493 aSig |= 0x00800000; |
|
1494 return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 ); |
|
1495 |
|
1496 } |
|
1497 |
|
1498 #endif |
|
1499 |
|
1500 #ifdef FLOAT128 |
|
1501 |
|
1502 /*---------------------------------------------------------------------------- |
|
1503 | Returns the result of converting the single-precision floating-point value |
|
1504 | `a' to the double-precision floating-point format. The conversion is |
|
1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
1506 | Arithmetic. |
|
1507 *----------------------------------------------------------------------------*/ |
|
1508 |
|
1509 float128 float32_to_float128( float32 a STATUS_PARAM ) |
|
1510 { |
|
1511 flag aSign; |
|
1512 int16 aExp; |
|
1513 bits32 aSig; |
|
1514 |
|
1515 aSig = extractFloat32Frac( a ); |
|
1516 aExp = extractFloat32Exp( a ); |
|
1517 aSign = extractFloat32Sign( a ); |
|
1518 if ( aExp == 0xFF ) { |
|
1519 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) ); |
|
1520 return packFloat128( aSign, 0x7FFF, 0, 0 ); |
|
1521 } |
|
1522 if ( aExp == 0 ) { |
|
1523 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); |
|
1524 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
|
1525 --aExp; |
|
1526 } |
|
1527 return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 ); |
|
1528 |
|
1529 } |
|
1530 |
|
1531 #endif |
|
1532 |
|
1533 /*---------------------------------------------------------------------------- |
|
1534 | Rounds the single-precision floating-point value `a' to an integer, and |
|
1535 | returns the result as a single-precision floating-point value. The |
|
1536 | operation is performed according to the IEC/IEEE Standard for Binary |
|
1537 | Floating-Point Arithmetic. |
|
1538 *----------------------------------------------------------------------------*/ |
|
1539 |
|
1540 float32 float32_round_to_int( float32 a STATUS_PARAM) |
|
1541 { |
|
1542 flag aSign; |
|
1543 int16 aExp; |
|
1544 bits32 lastBitMask, roundBitsMask; |
|
1545 int8 roundingMode; |
|
1546 bits32 z; |
|
1547 |
|
1548 aExp = extractFloat32Exp( a ); |
|
1549 if ( 0x96 <= aExp ) { |
|
1550 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) { |
|
1551 return propagateFloat32NaN( a, a STATUS_VAR ); |
|
1552 } |
|
1553 return a; |
|
1554 } |
|
1555 if ( aExp <= 0x7E ) { |
|
1556 if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a; |
|
1557 STATUS(float_exception_flags) |= float_flag_inexact; |
|
1558 aSign = extractFloat32Sign( a ); |
|
1559 switch ( STATUS(float_rounding_mode) ) { |
|
1560 case float_round_nearest_even: |
|
1561 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) { |
|
1562 return packFloat32( aSign, 0x7F, 0 ); |
|
1563 } |
|
1564 break; |
|
1565 case float_round_down: |
|
1566 return make_float32(aSign ? 0xBF800000 : 0); |
|
1567 case float_round_up: |
|
1568 return make_float32(aSign ? 0x80000000 : 0x3F800000); |
|
1569 } |
|
1570 return packFloat32( aSign, 0, 0 ); |
|
1571 } |
|
1572 lastBitMask = 1; |
|
1573 lastBitMask <<= 0x96 - aExp; |
|
1574 roundBitsMask = lastBitMask - 1; |
|
1575 z = float32_val(a); |
|
1576 roundingMode = STATUS(float_rounding_mode); |
|
1577 if ( roundingMode == float_round_nearest_even ) { |
|
1578 z += lastBitMask>>1; |
|
1579 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; |
|
1580 } |
|
1581 else if ( roundingMode != float_round_to_zero ) { |
|
1582 if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) { |
|
1583 z += roundBitsMask; |
|
1584 } |
|
1585 } |
|
1586 z &= ~ roundBitsMask; |
|
1587 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
1588 return make_float32(z); |
|
1589 |
|
1590 } |
|
1591 |
|
1592 /*---------------------------------------------------------------------------- |
|
1593 | Returns the result of adding the absolute values of the single-precision |
|
1594 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated |
|
1595 | before being returned. `zSign' is ignored if the result is a NaN. |
|
1596 | The addition is performed according to the IEC/IEEE Standard for Binary |
|
1597 | Floating-Point Arithmetic. |
|
1598 *----------------------------------------------------------------------------*/ |
|
1599 |
|
1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) |
|
1601 { |
|
1602 int16 aExp, bExp, zExp; |
|
1603 bits32 aSig, bSig, zSig; |
|
1604 int16 expDiff; |
|
1605 |
|
1606 aSig = extractFloat32Frac( a ); |
|
1607 aExp = extractFloat32Exp( a ); |
|
1608 bSig = extractFloat32Frac( b ); |
|
1609 bExp = extractFloat32Exp( b ); |
|
1610 expDiff = aExp - bExp; |
|
1611 aSig <<= 6; |
|
1612 bSig <<= 6; |
|
1613 if ( 0 < expDiff ) { |
|
1614 if ( aExp == 0xFF ) { |
|
1615 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1616 return a; |
|
1617 } |
|
1618 if ( bExp == 0 ) { |
|
1619 --expDiff; |
|
1620 } |
|
1621 else { |
|
1622 bSig |= 0x20000000; |
|
1623 } |
|
1624 shift32RightJamming( bSig, expDiff, &bSig ); |
|
1625 zExp = aExp; |
|
1626 } |
|
1627 else if ( expDiff < 0 ) { |
|
1628 if ( bExp == 0xFF ) { |
|
1629 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1630 return packFloat32( zSign, 0xFF, 0 ); |
|
1631 } |
|
1632 if ( aExp == 0 ) { |
|
1633 ++expDiff; |
|
1634 } |
|
1635 else { |
|
1636 aSig |= 0x20000000; |
|
1637 } |
|
1638 shift32RightJamming( aSig, - expDiff, &aSig ); |
|
1639 zExp = bExp; |
|
1640 } |
|
1641 else { |
|
1642 if ( aExp == 0xFF ) { |
|
1643 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1644 return a; |
|
1645 } |
|
1646 if ( aExp == 0 ) { |
|
1647 if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 ); |
|
1648 return packFloat32( zSign, 0, ( aSig + bSig )>>6 ); |
|
1649 } |
|
1650 zSig = 0x40000000 + aSig + bSig; |
|
1651 zExp = aExp; |
|
1652 goto roundAndPack; |
|
1653 } |
|
1654 aSig |= 0x20000000; |
|
1655 zSig = ( aSig + bSig )<<1; |
|
1656 --zExp; |
|
1657 if ( (sbits32) zSig < 0 ) { |
|
1658 zSig = aSig + bSig; |
|
1659 ++zExp; |
|
1660 } |
|
1661 roundAndPack: |
|
1662 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); |
|
1663 |
|
1664 } |
|
1665 |
|
1666 /*---------------------------------------------------------------------------- |
|
1667 | Returns the result of subtracting the absolute values of the single- |
|
1668 | precision floating-point values `a' and `b'. If `zSign' is 1, the |
|
1669 | difference is negated before being returned. `zSign' is ignored if the |
|
1670 | result is a NaN. The subtraction is performed according to the IEC/IEEE |
|
1671 | Standard for Binary Floating-Point Arithmetic. |
|
1672 *----------------------------------------------------------------------------*/ |
|
1673 |
|
1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM) |
|
1675 { |
|
1676 int16 aExp, bExp, zExp; |
|
1677 bits32 aSig, bSig, zSig; |
|
1678 int16 expDiff; |
|
1679 |
|
1680 aSig = extractFloat32Frac( a ); |
|
1681 aExp = extractFloat32Exp( a ); |
|
1682 bSig = extractFloat32Frac( b ); |
|
1683 bExp = extractFloat32Exp( b ); |
|
1684 expDiff = aExp - bExp; |
|
1685 aSig <<= 7; |
|
1686 bSig <<= 7; |
|
1687 if ( 0 < expDiff ) goto aExpBigger; |
|
1688 if ( expDiff < 0 ) goto bExpBigger; |
|
1689 if ( aExp == 0xFF ) { |
|
1690 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1691 float_raise( float_flag_invalid STATUS_VAR); |
|
1692 return float32_default_nan; |
|
1693 } |
|
1694 if ( aExp == 0 ) { |
|
1695 aExp = 1; |
|
1696 bExp = 1; |
|
1697 } |
|
1698 if ( bSig < aSig ) goto aBigger; |
|
1699 if ( aSig < bSig ) goto bBigger; |
|
1700 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); |
|
1701 bExpBigger: |
|
1702 if ( bExp == 0xFF ) { |
|
1703 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1704 return packFloat32( zSign ^ 1, 0xFF, 0 ); |
|
1705 } |
|
1706 if ( aExp == 0 ) { |
|
1707 ++expDiff; |
|
1708 } |
|
1709 else { |
|
1710 aSig |= 0x40000000; |
|
1711 } |
|
1712 shift32RightJamming( aSig, - expDiff, &aSig ); |
|
1713 bSig |= 0x40000000; |
|
1714 bBigger: |
|
1715 zSig = bSig - aSig; |
|
1716 zExp = bExp; |
|
1717 zSign ^= 1; |
|
1718 goto normalizeRoundAndPack; |
|
1719 aExpBigger: |
|
1720 if ( aExp == 0xFF ) { |
|
1721 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1722 return a; |
|
1723 } |
|
1724 if ( bExp == 0 ) { |
|
1725 --expDiff; |
|
1726 } |
|
1727 else { |
|
1728 bSig |= 0x40000000; |
|
1729 } |
|
1730 shift32RightJamming( bSig, expDiff, &bSig ); |
|
1731 aSig |= 0x40000000; |
|
1732 aBigger: |
|
1733 zSig = aSig - bSig; |
|
1734 zExp = aExp; |
|
1735 normalizeRoundAndPack: |
|
1736 --zExp; |
|
1737 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); |
|
1738 |
|
1739 } |
|
1740 |
|
1741 /*---------------------------------------------------------------------------- |
|
1742 | Returns the result of adding the single-precision floating-point values `a' |
|
1743 | and `b'. The operation is performed according to the IEC/IEEE Standard for |
|
1744 | Binary Floating-Point Arithmetic. |
|
1745 *----------------------------------------------------------------------------*/ |
|
1746 |
|
1747 float32 float32_add( float32 a, float32 b STATUS_PARAM ) |
|
1748 { |
|
1749 flag aSign, bSign; |
|
1750 |
|
1751 aSign = extractFloat32Sign( a ); |
|
1752 bSign = extractFloat32Sign( b ); |
|
1753 if ( aSign == bSign ) { |
|
1754 return addFloat32Sigs( a, b, aSign STATUS_VAR); |
|
1755 } |
|
1756 else { |
|
1757 return subFloat32Sigs( a, b, aSign STATUS_VAR ); |
|
1758 } |
|
1759 |
|
1760 } |
|
1761 |
|
1762 /*---------------------------------------------------------------------------- |
|
1763 | Returns the result of subtracting the single-precision floating-point values |
|
1764 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard |
|
1765 | for Binary Floating-Point Arithmetic. |
|
1766 *----------------------------------------------------------------------------*/ |
|
1767 |
|
1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM ) |
|
1769 { |
|
1770 flag aSign, bSign; |
|
1771 |
|
1772 aSign = extractFloat32Sign( a ); |
|
1773 bSign = extractFloat32Sign( b ); |
|
1774 if ( aSign == bSign ) { |
|
1775 return subFloat32Sigs( a, b, aSign STATUS_VAR ); |
|
1776 } |
|
1777 else { |
|
1778 return addFloat32Sigs( a, b, aSign STATUS_VAR ); |
|
1779 } |
|
1780 |
|
1781 } |
|
1782 |
|
1783 /*---------------------------------------------------------------------------- |
|
1784 | Returns the result of multiplying the single-precision floating-point values |
|
1785 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard |
|
1786 | for Binary Floating-Point Arithmetic. |
|
1787 *----------------------------------------------------------------------------*/ |
|
1788 |
|
1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM ) |
|
1790 { |
|
1791 flag aSign, bSign, zSign; |
|
1792 int16 aExp, bExp, zExp; |
|
1793 bits32 aSig, bSig; |
|
1794 bits64 zSig64; |
|
1795 bits32 zSig; |
|
1796 |
|
1797 aSig = extractFloat32Frac( a ); |
|
1798 aExp = extractFloat32Exp( a ); |
|
1799 aSign = extractFloat32Sign( a ); |
|
1800 bSig = extractFloat32Frac( b ); |
|
1801 bExp = extractFloat32Exp( b ); |
|
1802 bSign = extractFloat32Sign( b ); |
|
1803 zSign = aSign ^ bSign; |
|
1804 if ( aExp == 0xFF ) { |
|
1805 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { |
|
1806 return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1807 } |
|
1808 if ( ( bExp | bSig ) == 0 ) { |
|
1809 float_raise( float_flag_invalid STATUS_VAR); |
|
1810 return float32_default_nan; |
|
1811 } |
|
1812 return packFloat32( zSign, 0xFF, 0 ); |
|
1813 } |
|
1814 if ( bExp == 0xFF ) { |
|
1815 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1816 if ( ( aExp | aSig ) == 0 ) { |
|
1817 float_raise( float_flag_invalid STATUS_VAR); |
|
1818 return float32_default_nan; |
|
1819 } |
|
1820 return packFloat32( zSign, 0xFF, 0 ); |
|
1821 } |
|
1822 if ( aExp == 0 ) { |
|
1823 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); |
|
1824 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
|
1825 } |
|
1826 if ( bExp == 0 ) { |
|
1827 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 ); |
|
1828 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); |
|
1829 } |
|
1830 zExp = aExp + bExp - 0x7F; |
|
1831 aSig = ( aSig | 0x00800000 )<<7; |
|
1832 bSig = ( bSig | 0x00800000 )<<8; |
|
1833 shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 ); |
|
1834 zSig = zSig64; |
|
1835 if ( 0 <= (sbits32) ( zSig<<1 ) ) { |
|
1836 zSig <<= 1; |
|
1837 --zExp; |
|
1838 } |
|
1839 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); |
|
1840 |
|
1841 } |
|
1842 |
|
1843 /*---------------------------------------------------------------------------- |
|
1844 | Returns the result of dividing the single-precision floating-point value `a' |
|
1845 | by the corresponding value `b'. The operation is performed according to the |
|
1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1847 *----------------------------------------------------------------------------*/ |
|
1848 |
|
1849 float32 float32_div( float32 a, float32 b STATUS_PARAM ) |
|
1850 { |
|
1851 flag aSign, bSign, zSign; |
|
1852 int16 aExp, bExp, zExp; |
|
1853 bits32 aSig, bSig, zSig; |
|
1854 |
|
1855 aSig = extractFloat32Frac( a ); |
|
1856 aExp = extractFloat32Exp( a ); |
|
1857 aSign = extractFloat32Sign( a ); |
|
1858 bSig = extractFloat32Frac( b ); |
|
1859 bExp = extractFloat32Exp( b ); |
|
1860 bSign = extractFloat32Sign( b ); |
|
1861 zSign = aSign ^ bSign; |
|
1862 if ( aExp == 0xFF ) { |
|
1863 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1864 if ( bExp == 0xFF ) { |
|
1865 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1866 float_raise( float_flag_invalid STATUS_VAR); |
|
1867 return float32_default_nan; |
|
1868 } |
|
1869 return packFloat32( zSign, 0xFF, 0 ); |
|
1870 } |
|
1871 if ( bExp == 0xFF ) { |
|
1872 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1873 return packFloat32( zSign, 0, 0 ); |
|
1874 } |
|
1875 if ( bExp == 0 ) { |
|
1876 if ( bSig == 0 ) { |
|
1877 if ( ( aExp | aSig ) == 0 ) { |
|
1878 float_raise( float_flag_invalid STATUS_VAR); |
|
1879 return float32_default_nan; |
|
1880 } |
|
1881 float_raise( float_flag_divbyzero STATUS_VAR); |
|
1882 return packFloat32( zSign, 0xFF, 0 ); |
|
1883 } |
|
1884 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); |
|
1885 } |
|
1886 if ( aExp == 0 ) { |
|
1887 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 ); |
|
1888 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
|
1889 } |
|
1890 zExp = aExp - bExp + 0x7D; |
|
1891 aSig = ( aSig | 0x00800000 )<<7; |
|
1892 bSig = ( bSig | 0x00800000 )<<8; |
|
1893 if ( bSig <= ( aSig + aSig ) ) { |
|
1894 aSig >>= 1; |
|
1895 ++zExp; |
|
1896 } |
|
1897 zSig = ( ( (bits64) aSig )<<32 ) / bSig; |
|
1898 if ( ( zSig & 0x3F ) == 0 ) { |
|
1899 zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 ); |
|
1900 } |
|
1901 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR ); |
|
1902 |
|
1903 } |
|
1904 |
|
1905 /*---------------------------------------------------------------------------- |
|
1906 | Returns the remainder of the single-precision floating-point value `a' |
|
1907 | with respect to the corresponding value `b'. The operation is performed |
|
1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
1909 *----------------------------------------------------------------------------*/ |
|
1910 |
|
1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM ) |
|
1912 { |
|
1913 flag aSign, bSign, zSign; |
|
1914 int16 aExp, bExp, expDiff; |
|
1915 bits32 aSig, bSig; |
|
1916 bits32 q; |
|
1917 bits64 aSig64, bSig64, q64; |
|
1918 bits32 alternateASig; |
|
1919 sbits32 sigMean; |
|
1920 |
|
1921 aSig = extractFloat32Frac( a ); |
|
1922 aExp = extractFloat32Exp( a ); |
|
1923 aSign = extractFloat32Sign( a ); |
|
1924 bSig = extractFloat32Frac( b ); |
|
1925 bExp = extractFloat32Exp( b ); |
|
1926 bSign = extractFloat32Sign( b ); |
|
1927 if ( aExp == 0xFF ) { |
|
1928 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) { |
|
1929 return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1930 } |
|
1931 float_raise( float_flag_invalid STATUS_VAR); |
|
1932 return float32_default_nan; |
|
1933 } |
|
1934 if ( bExp == 0xFF ) { |
|
1935 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR ); |
|
1936 return a; |
|
1937 } |
|
1938 if ( bExp == 0 ) { |
|
1939 if ( bSig == 0 ) { |
|
1940 float_raise( float_flag_invalid STATUS_VAR); |
|
1941 return float32_default_nan; |
|
1942 } |
|
1943 normalizeFloat32Subnormal( bSig, &bExp, &bSig ); |
|
1944 } |
|
1945 if ( aExp == 0 ) { |
|
1946 if ( aSig == 0 ) return a; |
|
1947 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
|
1948 } |
|
1949 expDiff = aExp - bExp; |
|
1950 aSig |= 0x00800000; |
|
1951 bSig |= 0x00800000; |
|
1952 if ( expDiff < 32 ) { |
|
1953 aSig <<= 8; |
|
1954 bSig <<= 8; |
|
1955 if ( expDiff < 0 ) { |
|
1956 if ( expDiff < -1 ) return a; |
|
1957 aSig >>= 1; |
|
1958 } |
|
1959 q = ( bSig <= aSig ); |
|
1960 if ( q ) aSig -= bSig; |
|
1961 if ( 0 < expDiff ) { |
|
1962 q = ( ( (bits64) aSig )<<32 ) / bSig; |
|
1963 q >>= 32 - expDiff; |
|
1964 bSig >>= 2; |
|
1965 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; |
|
1966 } |
|
1967 else { |
|
1968 aSig >>= 2; |
|
1969 bSig >>= 2; |
|
1970 } |
|
1971 } |
|
1972 else { |
|
1973 if ( bSig <= aSig ) aSig -= bSig; |
|
1974 aSig64 = ( (bits64) aSig )<<40; |
|
1975 bSig64 = ( (bits64) bSig )<<40; |
|
1976 expDiff -= 64; |
|
1977 while ( 0 < expDiff ) { |
|
1978 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); |
|
1979 q64 = ( 2 < q64 ) ? q64 - 2 : 0; |
|
1980 aSig64 = - ( ( bSig * q64 )<<38 ); |
|
1981 expDiff -= 62; |
|
1982 } |
|
1983 expDiff += 64; |
|
1984 q64 = estimateDiv128To64( aSig64, 0, bSig64 ); |
|
1985 q64 = ( 2 < q64 ) ? q64 - 2 : 0; |
|
1986 q = q64>>( 64 - expDiff ); |
|
1987 bSig <<= 6; |
|
1988 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q; |
|
1989 } |
|
1990 do { |
|
1991 alternateASig = aSig; |
|
1992 ++q; |
|
1993 aSig -= bSig; |
|
1994 } while ( 0 <= (sbits32) aSig ); |
|
1995 sigMean = aSig + alternateASig; |
|
1996 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { |
|
1997 aSig = alternateASig; |
|
1998 } |
|
1999 zSign = ( (sbits32) aSig < 0 ); |
|
2000 if ( zSign ) aSig = - aSig; |
|
2001 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR ); |
|
2002 |
|
2003 } |
|
2004 |
|
2005 /*---------------------------------------------------------------------------- |
|
2006 | Returns the square root of the single-precision floating-point value `a'. |
|
2007 | The operation is performed according to the IEC/IEEE Standard for Binary |
|
2008 | Floating-Point Arithmetic. |
|
2009 *----------------------------------------------------------------------------*/ |
|
2010 |
|
2011 float32 float32_sqrt( float32 a STATUS_PARAM ) |
|
2012 { |
|
2013 flag aSign; |
|
2014 int16 aExp, zExp; |
|
2015 bits32 aSig, zSig; |
|
2016 bits64 rem, term; |
|
2017 |
|
2018 aSig = extractFloat32Frac( a ); |
|
2019 aExp = extractFloat32Exp( a ); |
|
2020 aSign = extractFloat32Sign( a ); |
|
2021 if ( aExp == 0xFF ) { |
|
2022 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR ); |
|
2023 if ( ! aSign ) return a; |
|
2024 float_raise( float_flag_invalid STATUS_VAR); |
|
2025 return float32_default_nan; |
|
2026 } |
|
2027 if ( aSign ) { |
|
2028 if ( ( aExp | aSig ) == 0 ) return a; |
|
2029 float_raise( float_flag_invalid STATUS_VAR); |
|
2030 return float32_default_nan; |
|
2031 } |
|
2032 if ( aExp == 0 ) { |
|
2033 if ( aSig == 0 ) return float32_zero; |
|
2034 normalizeFloat32Subnormal( aSig, &aExp, &aSig ); |
|
2035 } |
|
2036 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E; |
|
2037 aSig = ( aSig | 0x00800000 )<<8; |
|
2038 zSig = estimateSqrt32( aExp, aSig ) + 2; |
|
2039 if ( ( zSig & 0x7F ) <= 5 ) { |
|
2040 if ( zSig < 2 ) { |
|
2041 zSig = 0x7FFFFFFF; |
|
2042 goto roundAndPack; |
|
2043 } |
|
2044 aSig >>= aExp & 1; |
|
2045 term = ( (bits64) zSig ) * zSig; |
|
2046 rem = ( ( (bits64) aSig )<<32 ) - term; |
|
2047 while ( (sbits64) rem < 0 ) { |
|
2048 --zSig; |
|
2049 rem += ( ( (bits64) zSig )<<1 ) | 1; |
|
2050 } |
|
2051 zSig |= ( rem != 0 ); |
|
2052 } |
|
2053 shift32RightJamming( zSig, 1, &zSig ); |
|
2054 roundAndPack: |
|
2055 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR ); |
|
2056 |
|
2057 } |
|
2058 |
|
2059 /*---------------------------------------------------------------------------- |
|
2060 | Returns 1 if the single-precision floating-point value `a' is equal to |
|
2061 | the corresponding value `b', and 0 otherwise. The comparison is performed |
|
2062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
2063 *----------------------------------------------------------------------------*/ |
|
2064 |
|
2065 int float32_eq( float32 a, float32 b STATUS_PARAM ) |
|
2066 { |
|
2067 |
|
2068 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
|
2069 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) |
|
2070 ) { |
|
2071 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { |
|
2072 float_raise( float_flag_invalid STATUS_VAR); |
|
2073 } |
|
2074 return 0; |
|
2075 } |
|
2076 return ( float32_val(a) == float32_val(b) ) || |
|
2077 ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 ); |
|
2078 |
|
2079 } |
|
2080 |
|
2081 /*---------------------------------------------------------------------------- |
|
2082 | Returns 1 if the single-precision floating-point value `a' is less than |
|
2083 | or equal to the corresponding value `b', and 0 otherwise. The comparison |
|
2084 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2085 | Arithmetic. |
|
2086 *----------------------------------------------------------------------------*/ |
|
2087 |
|
2088 int float32_le( float32 a, float32 b STATUS_PARAM ) |
|
2089 { |
|
2090 flag aSign, bSign; |
|
2091 bits32 av, bv; |
|
2092 |
|
2093 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
|
2094 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) |
|
2095 ) { |
|
2096 float_raise( float_flag_invalid STATUS_VAR); |
|
2097 return 0; |
|
2098 } |
|
2099 aSign = extractFloat32Sign( a ); |
|
2100 bSign = extractFloat32Sign( b ); |
|
2101 av = float32_val(a); |
|
2102 bv = float32_val(b); |
|
2103 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); |
|
2104 return ( av == bv ) || ( aSign ^ ( av < bv ) ); |
|
2105 |
|
2106 } |
|
2107 |
|
2108 /*---------------------------------------------------------------------------- |
|
2109 | Returns 1 if the single-precision floating-point value `a' is less than |
|
2110 | the corresponding value `b', and 0 otherwise. The comparison is performed |
|
2111 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
2112 *----------------------------------------------------------------------------*/ |
|
2113 |
|
2114 int float32_lt( float32 a, float32 b STATUS_PARAM ) |
|
2115 { |
|
2116 flag aSign, bSign; |
|
2117 bits32 av, bv; |
|
2118 |
|
2119 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
|
2120 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) |
|
2121 ) { |
|
2122 float_raise( float_flag_invalid STATUS_VAR); |
|
2123 return 0; |
|
2124 } |
|
2125 aSign = extractFloat32Sign( a ); |
|
2126 bSign = extractFloat32Sign( b ); |
|
2127 av = float32_val(a); |
|
2128 bv = float32_val(b); |
|
2129 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); |
|
2130 return ( av != bv ) && ( aSign ^ ( av < bv ) ); |
|
2131 |
|
2132 } |
|
2133 |
|
2134 /*---------------------------------------------------------------------------- |
|
2135 | Returns 1 if the single-precision floating-point value `a' is equal to |
|
2136 | the corresponding value `b', and 0 otherwise. The invalid exception is |
|
2137 | raised if either operand is a NaN. Otherwise, the comparison is performed |
|
2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
2139 *----------------------------------------------------------------------------*/ |
|
2140 |
|
2141 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM ) |
|
2142 { |
|
2143 bits32 av, bv; |
|
2144 |
|
2145 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
|
2146 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) |
|
2147 ) { |
|
2148 float_raise( float_flag_invalid STATUS_VAR); |
|
2149 return 0; |
|
2150 } |
|
2151 av = float32_val(a); |
|
2152 bv = float32_val(b); |
|
2153 return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 ); |
|
2154 |
|
2155 } |
|
2156 |
|
2157 /*---------------------------------------------------------------------------- |
|
2158 | Returns 1 if the single-precision floating-point value `a' is less than or |
|
2159 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not |
|
2160 | cause an exception. Otherwise, the comparison is performed according to the |
|
2161 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
2162 *----------------------------------------------------------------------------*/ |
|
2163 |
|
2164 int float32_le_quiet( float32 a, float32 b STATUS_PARAM ) |
|
2165 { |
|
2166 flag aSign, bSign; |
|
2167 bits32 av, bv; |
|
2168 |
|
2169 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
|
2170 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) |
|
2171 ) { |
|
2172 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { |
|
2173 float_raise( float_flag_invalid STATUS_VAR); |
|
2174 } |
|
2175 return 0; |
|
2176 } |
|
2177 aSign = extractFloat32Sign( a ); |
|
2178 bSign = extractFloat32Sign( b ); |
|
2179 av = float32_val(a); |
|
2180 bv = float32_val(b); |
|
2181 if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 ); |
|
2182 return ( av == bv ) || ( aSign ^ ( av < bv ) ); |
|
2183 |
|
2184 } |
|
2185 |
|
2186 /*---------------------------------------------------------------------------- |
|
2187 | Returns 1 if the single-precision floating-point value `a' is less than |
|
2188 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an |
|
2189 | exception. Otherwise, the comparison is performed according to the IEC/IEEE |
|
2190 | Standard for Binary Floating-Point Arithmetic. |
|
2191 *----------------------------------------------------------------------------*/ |
|
2192 |
|
2193 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM ) |
|
2194 { |
|
2195 flag aSign, bSign; |
|
2196 bits32 av, bv; |
|
2197 |
|
2198 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) ) |
|
2199 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) ) |
|
2200 ) { |
|
2201 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) { |
|
2202 float_raise( float_flag_invalid STATUS_VAR); |
|
2203 } |
|
2204 return 0; |
|
2205 } |
|
2206 aSign = extractFloat32Sign( a ); |
|
2207 bSign = extractFloat32Sign( b ); |
|
2208 av = float32_val(a); |
|
2209 bv = float32_val(b); |
|
2210 if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 ); |
|
2211 return ( av != bv ) && ( aSign ^ ( av < bv ) ); |
|
2212 |
|
2213 } |
|
2214 |
|
2215 /*---------------------------------------------------------------------------- |
|
2216 | Returns the result of converting the double-precision floating-point value |
|
2217 | `a' to the 32-bit two's complement integer format. The conversion is |
|
2218 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2219 | Arithmetic---which means in particular that the conversion is rounded |
|
2220 | according to the current rounding mode. If `a' is a NaN, the largest |
|
2221 | positive integer is returned. Otherwise, if the conversion overflows, the |
|
2222 | largest integer with the same sign as `a' is returned. |
|
2223 *----------------------------------------------------------------------------*/ |
|
2224 |
|
2225 int32 float64_to_int32( float64 a STATUS_PARAM ) |
|
2226 { |
|
2227 flag aSign; |
|
2228 int16 aExp, shiftCount; |
|
2229 bits64 aSig; |
|
2230 |
|
2231 aSig = extractFloat64Frac( a ); |
|
2232 aExp = extractFloat64Exp( a ); |
|
2233 aSign = extractFloat64Sign( a ); |
|
2234 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; |
|
2235 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); |
|
2236 shiftCount = 0x42C - aExp; |
|
2237 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig ); |
|
2238 return roundAndPackInt32( aSign, aSig STATUS_VAR ); |
|
2239 |
|
2240 } |
|
2241 |
|
2242 /*---------------------------------------------------------------------------- |
|
2243 | Returns the result of converting the double-precision floating-point value |
|
2244 | `a' to the 32-bit two's complement integer format. The conversion is |
|
2245 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2246 | Arithmetic, except that the conversion is always rounded toward zero. |
|
2247 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if |
|
2248 | the conversion overflows, the largest integer with the same sign as `a' is |
|
2249 | returned. |
|
2250 *----------------------------------------------------------------------------*/ |
|
2251 |
|
2252 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM ) |
|
2253 { |
|
2254 flag aSign; |
|
2255 int16 aExp, shiftCount; |
|
2256 bits64 aSig, savedASig; |
|
2257 int32 z; |
|
2258 |
|
2259 aSig = extractFloat64Frac( a ); |
|
2260 aExp = extractFloat64Exp( a ); |
|
2261 aSign = extractFloat64Sign( a ); |
|
2262 if ( 0x41E < aExp ) { |
|
2263 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0; |
|
2264 goto invalid; |
|
2265 } |
|
2266 else if ( aExp < 0x3FF ) { |
|
2267 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
2268 return 0; |
|
2269 } |
|
2270 aSig |= LIT64( 0x0010000000000000 ); |
|
2271 shiftCount = 0x433 - aExp; |
|
2272 savedASig = aSig; |
|
2273 aSig >>= shiftCount; |
|
2274 z = aSig; |
|
2275 if ( aSign ) z = - z; |
|
2276 if ( ( z < 0 ) ^ aSign ) { |
|
2277 invalid: |
|
2278 float_raise( float_flag_invalid STATUS_VAR); |
|
2279 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; |
|
2280 } |
|
2281 if ( ( aSig<<shiftCount ) != savedASig ) { |
|
2282 STATUS(float_exception_flags) |= float_flag_inexact; |
|
2283 } |
|
2284 return z; |
|
2285 |
|
2286 } |
|
2287 |
|
2288 /*---------------------------------------------------------------------------- |
|
2289 | Returns the result of converting the double-precision floating-point value |
|
2290 | `a' to the 64-bit two's complement integer format. The conversion is |
|
2291 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2292 | Arithmetic---which means in particular that the conversion is rounded |
|
2293 | according to the current rounding mode. If `a' is a NaN, the largest |
|
2294 | positive integer is returned. Otherwise, if the conversion overflows, the |
|
2295 | largest integer with the same sign as `a' is returned. |
|
2296 *----------------------------------------------------------------------------*/ |
|
2297 |
|
2298 int64 float64_to_int64( float64 a STATUS_PARAM ) |
|
2299 { |
|
2300 flag aSign; |
|
2301 int16 aExp, shiftCount; |
|
2302 bits64 aSig, aSigExtra; |
|
2303 |
|
2304 aSig = extractFloat64Frac( a ); |
|
2305 aExp = extractFloat64Exp( a ); |
|
2306 aSign = extractFloat64Sign( a ); |
|
2307 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); |
|
2308 shiftCount = 0x433 - aExp; |
|
2309 if ( shiftCount <= 0 ) { |
|
2310 if ( 0x43E < aExp ) { |
|
2311 float_raise( float_flag_invalid STATUS_VAR); |
|
2312 if ( ! aSign |
|
2313 || ( ( aExp == 0x7FF ) |
|
2314 && ( aSig != LIT64( 0x0010000000000000 ) ) ) |
|
2315 ) { |
|
2316 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
2317 } |
|
2318 return (sbits64) LIT64( 0x8000000000000000 ); |
|
2319 } |
|
2320 aSigExtra = 0; |
|
2321 aSig <<= - shiftCount; |
|
2322 } |
|
2323 else { |
|
2324 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); |
|
2325 } |
|
2326 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); |
|
2327 |
|
2328 } |
|
2329 |
|
2330 /*---------------------------------------------------------------------------- |
|
2331 | Returns the result of converting the double-precision floating-point value |
|
2332 | `a' to the 64-bit two's complement integer format. The conversion is |
|
2333 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2334 | Arithmetic, except that the conversion is always rounded toward zero. |
|
2335 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if |
|
2336 | the conversion overflows, the largest integer with the same sign as `a' is |
|
2337 | returned. |
|
2338 *----------------------------------------------------------------------------*/ |
|
2339 |
|
2340 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM ) |
|
2341 { |
|
2342 flag aSign; |
|
2343 int16 aExp, shiftCount; |
|
2344 bits64 aSig; |
|
2345 int64 z; |
|
2346 |
|
2347 aSig = extractFloat64Frac( a ); |
|
2348 aExp = extractFloat64Exp( a ); |
|
2349 aSign = extractFloat64Sign( a ); |
|
2350 if ( aExp ) aSig |= LIT64( 0x0010000000000000 ); |
|
2351 shiftCount = aExp - 0x433; |
|
2352 if ( 0 <= shiftCount ) { |
|
2353 if ( 0x43E <= aExp ) { |
|
2354 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) { |
|
2355 float_raise( float_flag_invalid STATUS_VAR); |
|
2356 if ( ! aSign |
|
2357 || ( ( aExp == 0x7FF ) |
|
2358 && ( aSig != LIT64( 0x0010000000000000 ) ) ) |
|
2359 ) { |
|
2360 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
2361 } |
|
2362 } |
|
2363 return (sbits64) LIT64( 0x8000000000000000 ); |
|
2364 } |
|
2365 z = aSig<<shiftCount; |
|
2366 } |
|
2367 else { |
|
2368 if ( aExp < 0x3FE ) { |
|
2369 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
2370 return 0; |
|
2371 } |
|
2372 z = aSig>>( - shiftCount ); |
|
2373 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { |
|
2374 STATUS(float_exception_flags) |= float_flag_inexact; |
|
2375 } |
|
2376 } |
|
2377 if ( aSign ) z = - z; |
|
2378 return z; |
|
2379 |
|
2380 } |
|
2381 |
|
2382 /*---------------------------------------------------------------------------- |
|
2383 | Returns the result of converting the double-precision floating-point value |
|
2384 | `a' to the single-precision floating-point format. The conversion is |
|
2385 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2386 | Arithmetic. |
|
2387 *----------------------------------------------------------------------------*/ |
|
2388 |
|
2389 float32 float64_to_float32( float64 a STATUS_PARAM ) |
|
2390 { |
|
2391 flag aSign; |
|
2392 int16 aExp; |
|
2393 bits64 aSig; |
|
2394 bits32 zSig; |
|
2395 |
|
2396 aSig = extractFloat64Frac( a ); |
|
2397 aExp = extractFloat64Exp( a ); |
|
2398 aSign = extractFloat64Sign( a ); |
|
2399 if ( aExp == 0x7FF ) { |
|
2400 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) ); |
|
2401 return packFloat32( aSign, 0xFF, 0 ); |
|
2402 } |
|
2403 shift64RightJamming( aSig, 22, &aSig ); |
|
2404 zSig = aSig; |
|
2405 if ( aExp || zSig ) { |
|
2406 zSig |= 0x40000000; |
|
2407 aExp -= 0x381; |
|
2408 } |
|
2409 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); |
|
2410 |
|
2411 } |
|
2412 |
|
2413 |
|
2414 /*---------------------------------------------------------------------------- |
|
2415 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a |
|
2416 | half-precision floating-point value, returning the result. After being |
|
2417 | shifted into the proper positions, the three fields are simply added |
|
2418 | together to form the result. This means that any integer portion of `zSig' |
|
2419 | will be added into the exponent. Since a properly normalized significand |
|
2420 | will have an integer portion equal to 1, the `zExp' input should be 1 less |
|
2421 | than the desired result exponent whenever `zSig' is a complete, normalized |
|
2422 | significand. |
|
2423 *----------------------------------------------------------------------------*/ |
|
2424 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig) |
|
2425 { |
|
2426 return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig; |
|
2427 } |
|
2428 |
|
2429 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM ) |
|
2430 { |
|
2431 flag aSign; |
|
2432 int16 aExp; |
|
2433 bits32 aSig; |
|
2434 |
|
2435 aSign = a >> 15; |
|
2436 aExp = (a >> 10) & 0x1f; |
|
2437 aSig = a & 0x3ff; |
|
2438 |
|
2439 if (aExp == 0x1f && ieee) { |
|
2440 if (aSig) { |
|
2441 /* Make sure correct exceptions are raised. */ |
|
2442 float32ToCommonNaN(a STATUS_VAR); |
|
2443 aSig |= 0x200; |
|
2444 } |
|
2445 return packFloat32(aSign, 0xff, aSig << 13); |
|
2446 } |
|
2447 if (aExp == 0) { |
|
2448 int8 shiftCount; |
|
2449 |
|
2450 if (aSig == 0) { |
|
2451 return packFloat32(aSign, 0, 0); |
|
2452 } |
|
2453 |
|
2454 shiftCount = countLeadingZeros32( aSig ) - 21; |
|
2455 aSig = aSig << shiftCount; |
|
2456 aExp = -shiftCount; |
|
2457 } |
|
2458 return packFloat32( aSign, aExp + 0x70, aSig << 13); |
|
2459 } |
|
2460 |
|
2461 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM) |
|
2462 { |
|
2463 flag aSign; |
|
2464 int16 aExp; |
|
2465 bits32 aSig; |
|
2466 bits32 mask; |
|
2467 bits32 increment; |
|
2468 int8 roundingMode; |
|
2469 |
|
2470 aSig = extractFloat32Frac( a ); |
|
2471 aExp = extractFloat32Exp( a ); |
|
2472 aSign = extractFloat32Sign( a ); |
|
2473 if ( aExp == 0xFF ) { |
|
2474 if (aSig) { |
|
2475 /* Make sure correct exceptions are raised. */ |
|
2476 float32ToCommonNaN(a STATUS_VAR); |
|
2477 aSig |= 0x00400000; |
|
2478 } |
|
2479 return packFloat16(aSign, 0x1f, aSig >> 13); |
|
2480 } |
|
2481 if (aExp == 0 && aSign == 0) { |
|
2482 return packFloat16(aSign, 0, 0); |
|
2483 } |
|
2484 /* Decimal point between bits 22 and 23. */ |
|
2485 aSig |= 0x00800000; |
|
2486 aExp -= 0x7f; |
|
2487 if (aExp < -14) { |
|
2488 mask = 0x007fffff; |
|
2489 if (aExp < -25) { |
|
2490 aExp = -26; |
|
2491 } else if (aExp != -25) { |
|
2492 mask >>= 24 + aExp; |
|
2493 } |
|
2494 } else { |
|
2495 mask = 0x00001fff; |
|
2496 } |
|
2497 if (aSig & mask) { |
|
2498 float_raise( float_flag_underflow STATUS_VAR ); |
|
2499 roundingMode = STATUS(float_rounding_mode); |
|
2500 switch (roundingMode) { |
|
2501 case float_round_nearest_even: |
|
2502 increment = (mask + 1) >> 1; |
|
2503 if ((aSig & mask) == increment) { |
|
2504 increment = aSig & (increment << 1); |
|
2505 } |
|
2506 break; |
|
2507 case float_round_up: |
|
2508 increment = aSign ? 0 : mask; |
|
2509 break; |
|
2510 case float_round_down: |
|
2511 increment = aSign ? mask : 0; |
|
2512 break; |
|
2513 default: /* round_to_zero */ |
|
2514 increment = 0; |
|
2515 break; |
|
2516 } |
|
2517 aSig += increment; |
|
2518 if (aSig >= 0x01000000) { |
|
2519 aSig >>= 1; |
|
2520 aExp++; |
|
2521 } |
|
2522 } else if (aExp < -14 |
|
2523 && STATUS(float_detect_tininess) == float_tininess_before_rounding) { |
|
2524 float_raise( float_flag_underflow STATUS_VAR); |
|
2525 } |
|
2526 |
|
2527 if (ieee) { |
|
2528 if (aExp > 15) { |
|
2529 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); |
|
2530 return packFloat16(aSign, 0x1f, 0); |
|
2531 } |
|
2532 } else { |
|
2533 if (aExp > 16) { |
|
2534 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR); |
|
2535 return packFloat16(aSign, 0x1f, 0x3ff); |
|
2536 } |
|
2537 } |
|
2538 if (aExp < -24) { |
|
2539 return packFloat16(aSign, 0, 0); |
|
2540 } |
|
2541 if (aExp < -14) { |
|
2542 aSig >>= -14 - aExp; |
|
2543 aExp = -14; |
|
2544 } |
|
2545 return packFloat16(aSign, aExp + 14, aSig >> 13); |
|
2546 } |
|
2547 |
|
2548 #ifdef FLOATX80 |
|
2549 |
|
2550 /*---------------------------------------------------------------------------- |
|
2551 | Returns the result of converting the double-precision floating-point value |
|
2552 | `a' to the extended double-precision floating-point format. The conversion |
|
2553 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2554 | Arithmetic. |
|
2555 *----------------------------------------------------------------------------*/ |
|
2556 |
|
2557 floatx80 float64_to_floatx80( float64 a STATUS_PARAM ) |
|
2558 { |
|
2559 flag aSign; |
|
2560 int16 aExp; |
|
2561 bits64 aSig; |
|
2562 |
|
2563 aSig = extractFloat64Frac( a ); |
|
2564 aExp = extractFloat64Exp( a ); |
|
2565 aSign = extractFloat64Sign( a ); |
|
2566 if ( aExp == 0x7FF ) { |
|
2567 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) ); |
|
2568 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
2569 } |
|
2570 if ( aExp == 0 ) { |
|
2571 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 ); |
|
2572 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
|
2573 } |
|
2574 return |
|
2575 packFloatx80( |
|
2576 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 ); |
|
2577 |
|
2578 } |
|
2579 |
|
2580 #endif |
|
2581 |
|
2582 #ifdef FLOAT128 |
|
2583 |
|
2584 /*---------------------------------------------------------------------------- |
|
2585 | Returns the result of converting the double-precision floating-point value |
|
2586 | `a' to the quadruple-precision floating-point format. The conversion is |
|
2587 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
2588 | Arithmetic. |
|
2589 *----------------------------------------------------------------------------*/ |
|
2590 |
|
2591 float128 float64_to_float128( float64 a STATUS_PARAM ) |
|
2592 { |
|
2593 flag aSign; |
|
2594 int16 aExp; |
|
2595 bits64 aSig, zSig0, zSig1; |
|
2596 |
|
2597 aSig = extractFloat64Frac( a ); |
|
2598 aExp = extractFloat64Exp( a ); |
|
2599 aSign = extractFloat64Sign( a ); |
|
2600 if ( aExp == 0x7FF ) { |
|
2601 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) ); |
|
2602 return packFloat128( aSign, 0x7FFF, 0, 0 ); |
|
2603 } |
|
2604 if ( aExp == 0 ) { |
|
2605 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 ); |
|
2606 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
|
2607 --aExp; |
|
2608 } |
|
2609 shift128Right( aSig, 0, 4, &zSig0, &zSig1 ); |
|
2610 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 ); |
|
2611 |
|
2612 } |
|
2613 |
|
2614 #endif |
|
2615 |
|
2616 /*---------------------------------------------------------------------------- |
|
2617 | Rounds the double-precision floating-point value `a' to an integer, and |
|
2618 | returns the result as a double-precision floating-point value. The |
|
2619 | operation is performed according to the IEC/IEEE Standard for Binary |
|
2620 | Floating-Point Arithmetic. |
|
2621 *----------------------------------------------------------------------------*/ |
|
2622 |
|
2623 float64 float64_round_to_int( float64 a STATUS_PARAM ) |
|
2624 { |
|
2625 flag aSign; |
|
2626 int16 aExp; |
|
2627 bits64 lastBitMask, roundBitsMask; |
|
2628 int8 roundingMode; |
|
2629 bits64 z; |
|
2630 |
|
2631 aExp = extractFloat64Exp( a ); |
|
2632 if ( 0x433 <= aExp ) { |
|
2633 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) { |
|
2634 return propagateFloat64NaN( a, a STATUS_VAR ); |
|
2635 } |
|
2636 return a; |
|
2637 } |
|
2638 if ( aExp < 0x3FF ) { |
|
2639 if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a; |
|
2640 STATUS(float_exception_flags) |= float_flag_inexact; |
|
2641 aSign = extractFloat64Sign( a ); |
|
2642 switch ( STATUS(float_rounding_mode) ) { |
|
2643 case float_round_nearest_even: |
|
2644 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) { |
|
2645 return packFloat64( aSign, 0x3FF, 0 ); |
|
2646 } |
|
2647 break; |
|
2648 case float_round_down: |
|
2649 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0); |
|
2650 case float_round_up: |
|
2651 return make_float64( |
|
2652 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 )); |
|
2653 } |
|
2654 return packFloat64( aSign, 0, 0 ); |
|
2655 } |
|
2656 lastBitMask = 1; |
|
2657 lastBitMask <<= 0x433 - aExp; |
|
2658 roundBitsMask = lastBitMask - 1; |
|
2659 z = float64_val(a); |
|
2660 roundingMode = STATUS(float_rounding_mode); |
|
2661 if ( roundingMode == float_round_nearest_even ) { |
|
2662 z += lastBitMask>>1; |
|
2663 if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask; |
|
2664 } |
|
2665 else if ( roundingMode != float_round_to_zero ) { |
|
2666 if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) { |
|
2667 z += roundBitsMask; |
|
2668 } |
|
2669 } |
|
2670 z &= ~ roundBitsMask; |
|
2671 if ( z != float64_val(a) ) |
|
2672 STATUS(float_exception_flags) |= float_flag_inexact; |
|
2673 return make_float64(z); |
|
2674 |
|
2675 } |
|
2676 |
|
2677 float64 float64_trunc_to_int( float64 a STATUS_PARAM) |
|
2678 { |
|
2679 int oldmode; |
|
2680 float64 res; |
|
2681 oldmode = STATUS(float_rounding_mode); |
|
2682 STATUS(float_rounding_mode) = float_round_to_zero; |
|
2683 res = float64_round_to_int(a STATUS_VAR); |
|
2684 STATUS(float_rounding_mode) = oldmode; |
|
2685 return res; |
|
2686 } |
|
2687 |
|
2688 /*---------------------------------------------------------------------------- |
|
2689 | Returns the result of adding the absolute values of the double-precision |
|
2690 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated |
|
2691 | before being returned. `zSign' is ignored if the result is a NaN. |
|
2692 | The addition is performed according to the IEC/IEEE Standard for Binary |
|
2693 | Floating-Point Arithmetic. |
|
2694 *----------------------------------------------------------------------------*/ |
|
2695 |
|
2696 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) |
|
2697 { |
|
2698 int16 aExp, bExp, zExp; |
|
2699 bits64 aSig, bSig, zSig; |
|
2700 int16 expDiff; |
|
2701 |
|
2702 aSig = extractFloat64Frac( a ); |
|
2703 aExp = extractFloat64Exp( a ); |
|
2704 bSig = extractFloat64Frac( b ); |
|
2705 bExp = extractFloat64Exp( b ); |
|
2706 expDiff = aExp - bExp; |
|
2707 aSig <<= 9; |
|
2708 bSig <<= 9; |
|
2709 if ( 0 < expDiff ) { |
|
2710 if ( aExp == 0x7FF ) { |
|
2711 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2712 return a; |
|
2713 } |
|
2714 if ( bExp == 0 ) { |
|
2715 --expDiff; |
|
2716 } |
|
2717 else { |
|
2718 bSig |= LIT64( 0x2000000000000000 ); |
|
2719 } |
|
2720 shift64RightJamming( bSig, expDiff, &bSig ); |
|
2721 zExp = aExp; |
|
2722 } |
|
2723 else if ( expDiff < 0 ) { |
|
2724 if ( bExp == 0x7FF ) { |
|
2725 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2726 return packFloat64( zSign, 0x7FF, 0 ); |
|
2727 } |
|
2728 if ( aExp == 0 ) { |
|
2729 ++expDiff; |
|
2730 } |
|
2731 else { |
|
2732 aSig |= LIT64( 0x2000000000000000 ); |
|
2733 } |
|
2734 shift64RightJamming( aSig, - expDiff, &aSig ); |
|
2735 zExp = bExp; |
|
2736 } |
|
2737 else { |
|
2738 if ( aExp == 0x7FF ) { |
|
2739 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2740 return a; |
|
2741 } |
|
2742 if ( aExp == 0 ) { |
|
2743 if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 ); |
|
2744 return packFloat64( zSign, 0, ( aSig + bSig )>>9 ); |
|
2745 } |
|
2746 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig; |
|
2747 zExp = aExp; |
|
2748 goto roundAndPack; |
|
2749 } |
|
2750 aSig |= LIT64( 0x2000000000000000 ); |
|
2751 zSig = ( aSig + bSig )<<1; |
|
2752 --zExp; |
|
2753 if ( (sbits64) zSig < 0 ) { |
|
2754 zSig = aSig + bSig; |
|
2755 ++zExp; |
|
2756 } |
|
2757 roundAndPack: |
|
2758 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); |
|
2759 |
|
2760 } |
|
2761 |
|
2762 /*---------------------------------------------------------------------------- |
|
2763 | Returns the result of subtracting the absolute values of the double- |
|
2764 | precision floating-point values `a' and `b'. If `zSign' is 1, the |
|
2765 | difference is negated before being returned. `zSign' is ignored if the |
|
2766 | result is a NaN. The subtraction is performed according to the IEC/IEEE |
|
2767 | Standard for Binary Floating-Point Arithmetic. |
|
2768 *----------------------------------------------------------------------------*/ |
|
2769 |
|
2770 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM ) |
|
2771 { |
|
2772 int16 aExp, bExp, zExp; |
|
2773 bits64 aSig, bSig, zSig; |
|
2774 int16 expDiff; |
|
2775 |
|
2776 aSig = extractFloat64Frac( a ); |
|
2777 aExp = extractFloat64Exp( a ); |
|
2778 bSig = extractFloat64Frac( b ); |
|
2779 bExp = extractFloat64Exp( b ); |
|
2780 expDiff = aExp - bExp; |
|
2781 aSig <<= 10; |
|
2782 bSig <<= 10; |
|
2783 if ( 0 < expDiff ) goto aExpBigger; |
|
2784 if ( expDiff < 0 ) goto bExpBigger; |
|
2785 if ( aExp == 0x7FF ) { |
|
2786 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2787 float_raise( float_flag_invalid STATUS_VAR); |
|
2788 return float64_default_nan; |
|
2789 } |
|
2790 if ( aExp == 0 ) { |
|
2791 aExp = 1; |
|
2792 bExp = 1; |
|
2793 } |
|
2794 if ( bSig < aSig ) goto aBigger; |
|
2795 if ( aSig < bSig ) goto bBigger; |
|
2796 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); |
|
2797 bExpBigger: |
|
2798 if ( bExp == 0x7FF ) { |
|
2799 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2800 return packFloat64( zSign ^ 1, 0x7FF, 0 ); |
|
2801 } |
|
2802 if ( aExp == 0 ) { |
|
2803 ++expDiff; |
|
2804 } |
|
2805 else { |
|
2806 aSig |= LIT64( 0x4000000000000000 ); |
|
2807 } |
|
2808 shift64RightJamming( aSig, - expDiff, &aSig ); |
|
2809 bSig |= LIT64( 0x4000000000000000 ); |
|
2810 bBigger: |
|
2811 zSig = bSig - aSig; |
|
2812 zExp = bExp; |
|
2813 zSign ^= 1; |
|
2814 goto normalizeRoundAndPack; |
|
2815 aExpBigger: |
|
2816 if ( aExp == 0x7FF ) { |
|
2817 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2818 return a; |
|
2819 } |
|
2820 if ( bExp == 0 ) { |
|
2821 --expDiff; |
|
2822 } |
|
2823 else { |
|
2824 bSig |= LIT64( 0x4000000000000000 ); |
|
2825 } |
|
2826 shift64RightJamming( bSig, expDiff, &bSig ); |
|
2827 aSig |= LIT64( 0x4000000000000000 ); |
|
2828 aBigger: |
|
2829 zSig = aSig - bSig; |
|
2830 zExp = aExp; |
|
2831 normalizeRoundAndPack: |
|
2832 --zExp; |
|
2833 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); |
|
2834 |
|
2835 } |
|
2836 |
|
2837 /*---------------------------------------------------------------------------- |
|
2838 | Returns the result of adding the double-precision floating-point values `a' |
|
2839 | and `b'. The operation is performed according to the IEC/IEEE Standard for |
|
2840 | Binary Floating-Point Arithmetic. |
|
2841 *----------------------------------------------------------------------------*/ |
|
2842 |
|
2843 float64 float64_add( float64 a, float64 b STATUS_PARAM ) |
|
2844 { |
|
2845 flag aSign, bSign; |
|
2846 |
|
2847 aSign = extractFloat64Sign( a ); |
|
2848 bSign = extractFloat64Sign( b ); |
|
2849 if ( aSign == bSign ) { |
|
2850 return addFloat64Sigs( a, b, aSign STATUS_VAR ); |
|
2851 } |
|
2852 else { |
|
2853 return subFloat64Sigs( a, b, aSign STATUS_VAR ); |
|
2854 } |
|
2855 |
|
2856 } |
|
2857 |
|
2858 /*---------------------------------------------------------------------------- |
|
2859 | Returns the result of subtracting the double-precision floating-point values |
|
2860 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard |
|
2861 | for Binary Floating-Point Arithmetic. |
|
2862 *----------------------------------------------------------------------------*/ |
|
2863 |
|
2864 float64 float64_sub( float64 a, float64 b STATUS_PARAM ) |
|
2865 { |
|
2866 flag aSign, bSign; |
|
2867 |
|
2868 aSign = extractFloat64Sign( a ); |
|
2869 bSign = extractFloat64Sign( b ); |
|
2870 if ( aSign == bSign ) { |
|
2871 return subFloat64Sigs( a, b, aSign STATUS_VAR ); |
|
2872 } |
|
2873 else { |
|
2874 return addFloat64Sigs( a, b, aSign STATUS_VAR ); |
|
2875 } |
|
2876 |
|
2877 } |
|
2878 |
|
2879 /*---------------------------------------------------------------------------- |
|
2880 | Returns the result of multiplying the double-precision floating-point values |
|
2881 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard |
|
2882 | for Binary Floating-Point Arithmetic. |
|
2883 *----------------------------------------------------------------------------*/ |
|
2884 |
|
2885 float64 float64_mul( float64 a, float64 b STATUS_PARAM ) |
|
2886 { |
|
2887 flag aSign, bSign, zSign; |
|
2888 int16 aExp, bExp, zExp; |
|
2889 bits64 aSig, bSig, zSig0, zSig1; |
|
2890 |
|
2891 aSig = extractFloat64Frac( a ); |
|
2892 aExp = extractFloat64Exp( a ); |
|
2893 aSign = extractFloat64Sign( a ); |
|
2894 bSig = extractFloat64Frac( b ); |
|
2895 bExp = extractFloat64Exp( b ); |
|
2896 bSign = extractFloat64Sign( b ); |
|
2897 zSign = aSign ^ bSign; |
|
2898 if ( aExp == 0x7FF ) { |
|
2899 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { |
|
2900 return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2901 } |
|
2902 if ( ( bExp | bSig ) == 0 ) { |
|
2903 float_raise( float_flag_invalid STATUS_VAR); |
|
2904 return float64_default_nan; |
|
2905 } |
|
2906 return packFloat64( zSign, 0x7FF, 0 ); |
|
2907 } |
|
2908 if ( bExp == 0x7FF ) { |
|
2909 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2910 if ( ( aExp | aSig ) == 0 ) { |
|
2911 float_raise( float_flag_invalid STATUS_VAR); |
|
2912 return float64_default_nan; |
|
2913 } |
|
2914 return packFloat64( zSign, 0x7FF, 0 ); |
|
2915 } |
|
2916 if ( aExp == 0 ) { |
|
2917 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); |
|
2918 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
|
2919 } |
|
2920 if ( bExp == 0 ) { |
|
2921 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 ); |
|
2922 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
|
2923 } |
|
2924 zExp = aExp + bExp - 0x3FF; |
|
2925 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; |
|
2926 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
|
2927 mul64To128( aSig, bSig, &zSig0, &zSig1 ); |
|
2928 zSig0 |= ( zSig1 != 0 ); |
|
2929 if ( 0 <= (sbits64) ( zSig0<<1 ) ) { |
|
2930 zSig0 <<= 1; |
|
2931 --zExp; |
|
2932 } |
|
2933 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR ); |
|
2934 |
|
2935 } |
|
2936 |
|
2937 /*---------------------------------------------------------------------------- |
|
2938 | Returns the result of dividing the double-precision floating-point value `a' |
|
2939 | by the corresponding value `b'. The operation is performed according to |
|
2940 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
2941 *----------------------------------------------------------------------------*/ |
|
2942 |
|
2943 float64 float64_div( float64 a, float64 b STATUS_PARAM ) |
|
2944 { |
|
2945 flag aSign, bSign, zSign; |
|
2946 int16 aExp, bExp, zExp; |
|
2947 bits64 aSig, bSig, zSig; |
|
2948 bits64 rem0, rem1; |
|
2949 bits64 term0, term1; |
|
2950 |
|
2951 aSig = extractFloat64Frac( a ); |
|
2952 aExp = extractFloat64Exp( a ); |
|
2953 aSign = extractFloat64Sign( a ); |
|
2954 bSig = extractFloat64Frac( b ); |
|
2955 bExp = extractFloat64Exp( b ); |
|
2956 bSign = extractFloat64Sign( b ); |
|
2957 zSign = aSign ^ bSign; |
|
2958 if ( aExp == 0x7FF ) { |
|
2959 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2960 if ( bExp == 0x7FF ) { |
|
2961 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2962 float_raise( float_flag_invalid STATUS_VAR); |
|
2963 return float64_default_nan; |
|
2964 } |
|
2965 return packFloat64( zSign, 0x7FF, 0 ); |
|
2966 } |
|
2967 if ( bExp == 0x7FF ) { |
|
2968 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
2969 return packFloat64( zSign, 0, 0 ); |
|
2970 } |
|
2971 if ( bExp == 0 ) { |
|
2972 if ( bSig == 0 ) { |
|
2973 if ( ( aExp | aSig ) == 0 ) { |
|
2974 float_raise( float_flag_invalid STATUS_VAR); |
|
2975 return float64_default_nan; |
|
2976 } |
|
2977 float_raise( float_flag_divbyzero STATUS_VAR); |
|
2978 return packFloat64( zSign, 0x7FF, 0 ); |
|
2979 } |
|
2980 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
|
2981 } |
|
2982 if ( aExp == 0 ) { |
|
2983 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 ); |
|
2984 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
|
2985 } |
|
2986 zExp = aExp - bExp + 0x3FD; |
|
2987 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10; |
|
2988 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
|
2989 if ( bSig <= ( aSig + aSig ) ) { |
|
2990 aSig >>= 1; |
|
2991 ++zExp; |
|
2992 } |
|
2993 zSig = estimateDiv128To64( aSig, 0, bSig ); |
|
2994 if ( ( zSig & 0x1FF ) <= 2 ) { |
|
2995 mul64To128( bSig, zSig, &term0, &term1 ); |
|
2996 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); |
|
2997 while ( (sbits64) rem0 < 0 ) { |
|
2998 --zSig; |
|
2999 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); |
|
3000 } |
|
3001 zSig |= ( rem1 != 0 ); |
|
3002 } |
|
3003 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR ); |
|
3004 |
|
3005 } |
|
3006 |
|
3007 /*---------------------------------------------------------------------------- |
|
3008 | Returns the remainder of the double-precision floating-point value `a' |
|
3009 | with respect to the corresponding value `b'. The operation is performed |
|
3010 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3011 *----------------------------------------------------------------------------*/ |
|
3012 |
|
3013 float64 float64_rem( float64 a, float64 b STATUS_PARAM ) |
|
3014 { |
|
3015 flag aSign, bSign, zSign; |
|
3016 int16 aExp, bExp, expDiff; |
|
3017 bits64 aSig, bSig; |
|
3018 bits64 q, alternateASig; |
|
3019 sbits64 sigMean; |
|
3020 |
|
3021 aSig = extractFloat64Frac( a ); |
|
3022 aExp = extractFloat64Exp( a ); |
|
3023 aSign = extractFloat64Sign( a ); |
|
3024 bSig = extractFloat64Frac( b ); |
|
3025 bExp = extractFloat64Exp( b ); |
|
3026 bSign = extractFloat64Sign( b ); |
|
3027 if ( aExp == 0x7FF ) { |
|
3028 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) { |
|
3029 return propagateFloat64NaN( a, b STATUS_VAR ); |
|
3030 } |
|
3031 float_raise( float_flag_invalid STATUS_VAR); |
|
3032 return float64_default_nan; |
|
3033 } |
|
3034 if ( bExp == 0x7FF ) { |
|
3035 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR ); |
|
3036 return a; |
|
3037 } |
|
3038 if ( bExp == 0 ) { |
|
3039 if ( bSig == 0 ) { |
|
3040 float_raise( float_flag_invalid STATUS_VAR); |
|
3041 return float64_default_nan; |
|
3042 } |
|
3043 normalizeFloat64Subnormal( bSig, &bExp, &bSig ); |
|
3044 } |
|
3045 if ( aExp == 0 ) { |
|
3046 if ( aSig == 0 ) return a; |
|
3047 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
|
3048 } |
|
3049 expDiff = aExp - bExp; |
|
3050 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11; |
|
3051 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11; |
|
3052 if ( expDiff < 0 ) { |
|
3053 if ( expDiff < -1 ) return a; |
|
3054 aSig >>= 1; |
|
3055 } |
|
3056 q = ( bSig <= aSig ); |
|
3057 if ( q ) aSig -= bSig; |
|
3058 expDiff -= 64; |
|
3059 while ( 0 < expDiff ) { |
|
3060 q = estimateDiv128To64( aSig, 0, bSig ); |
|
3061 q = ( 2 < q ) ? q - 2 : 0; |
|
3062 aSig = - ( ( bSig>>2 ) * q ); |
|
3063 expDiff -= 62; |
|
3064 } |
|
3065 expDiff += 64; |
|
3066 if ( 0 < expDiff ) { |
|
3067 q = estimateDiv128To64( aSig, 0, bSig ); |
|
3068 q = ( 2 < q ) ? q - 2 : 0; |
|
3069 q >>= 64 - expDiff; |
|
3070 bSig >>= 2; |
|
3071 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q; |
|
3072 } |
|
3073 else { |
|
3074 aSig >>= 2; |
|
3075 bSig >>= 2; |
|
3076 } |
|
3077 do { |
|
3078 alternateASig = aSig; |
|
3079 ++q; |
|
3080 aSig -= bSig; |
|
3081 } while ( 0 <= (sbits64) aSig ); |
|
3082 sigMean = aSig + alternateASig; |
|
3083 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) { |
|
3084 aSig = alternateASig; |
|
3085 } |
|
3086 zSign = ( (sbits64) aSig < 0 ); |
|
3087 if ( zSign ) aSig = - aSig; |
|
3088 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR ); |
|
3089 |
|
3090 } |
|
3091 |
|
3092 /*---------------------------------------------------------------------------- |
|
3093 | Returns the square root of the double-precision floating-point value `a'. |
|
3094 | The operation is performed according to the IEC/IEEE Standard for Binary |
|
3095 | Floating-Point Arithmetic. |
|
3096 *----------------------------------------------------------------------------*/ |
|
3097 |
|
3098 float64 float64_sqrt( float64 a STATUS_PARAM ) |
|
3099 { |
|
3100 flag aSign; |
|
3101 int16 aExp, zExp; |
|
3102 bits64 aSig, zSig, doubleZSig; |
|
3103 bits64 rem0, rem1, term0, term1; |
|
3104 |
|
3105 aSig = extractFloat64Frac( a ); |
|
3106 aExp = extractFloat64Exp( a ); |
|
3107 aSign = extractFloat64Sign( a ); |
|
3108 if ( aExp == 0x7FF ) { |
|
3109 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR ); |
|
3110 if ( ! aSign ) return a; |
|
3111 float_raise( float_flag_invalid STATUS_VAR); |
|
3112 return float64_default_nan; |
|
3113 } |
|
3114 if ( aSign ) { |
|
3115 if ( ( aExp | aSig ) == 0 ) return a; |
|
3116 float_raise( float_flag_invalid STATUS_VAR); |
|
3117 return float64_default_nan; |
|
3118 } |
|
3119 if ( aExp == 0 ) { |
|
3120 if ( aSig == 0 ) return float64_zero; |
|
3121 normalizeFloat64Subnormal( aSig, &aExp, &aSig ); |
|
3122 } |
|
3123 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE; |
|
3124 aSig |= LIT64( 0x0010000000000000 ); |
|
3125 zSig = estimateSqrt32( aExp, aSig>>21 ); |
|
3126 aSig <<= 9 - ( aExp & 1 ); |
|
3127 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 ); |
|
3128 if ( ( zSig & 0x1FF ) <= 5 ) { |
|
3129 doubleZSig = zSig<<1; |
|
3130 mul64To128( zSig, zSig, &term0, &term1 ); |
|
3131 sub128( aSig, 0, term0, term1, &rem0, &rem1 ); |
|
3132 while ( (sbits64) rem0 < 0 ) { |
|
3133 --zSig; |
|
3134 doubleZSig -= 2; |
|
3135 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 ); |
|
3136 } |
|
3137 zSig |= ( ( rem0 | rem1 ) != 0 ); |
|
3138 } |
|
3139 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR ); |
|
3140 |
|
3141 } |
|
3142 |
|
3143 /*---------------------------------------------------------------------------- |
|
3144 | Returns 1 if the double-precision floating-point value `a' is equal to the |
|
3145 | corresponding value `b', and 0 otherwise. The comparison is performed |
|
3146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3147 *----------------------------------------------------------------------------*/ |
|
3148 |
|
3149 int float64_eq( float64 a, float64 b STATUS_PARAM ) |
|
3150 { |
|
3151 bits64 av, bv; |
|
3152 |
|
3153 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
|
3154 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
|
3155 ) { |
|
3156 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { |
|
3157 float_raise( float_flag_invalid STATUS_VAR); |
|
3158 } |
|
3159 return 0; |
|
3160 } |
|
3161 av = float64_val(a); |
|
3162 bv = float64_val(b); |
|
3163 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); |
|
3164 |
|
3165 } |
|
3166 |
|
3167 /*---------------------------------------------------------------------------- |
|
3168 | Returns 1 if the double-precision floating-point value `a' is less than or |
|
3169 | equal to the corresponding value `b', and 0 otherwise. The comparison is |
|
3170 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
3171 | Arithmetic. |
|
3172 *----------------------------------------------------------------------------*/ |
|
3173 |
|
3174 int float64_le( float64 a, float64 b STATUS_PARAM ) |
|
3175 { |
|
3176 flag aSign, bSign; |
|
3177 bits64 av, bv; |
|
3178 |
|
3179 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
|
3180 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
|
3181 ) { |
|
3182 float_raise( float_flag_invalid STATUS_VAR); |
|
3183 return 0; |
|
3184 } |
|
3185 aSign = extractFloat64Sign( a ); |
|
3186 bSign = extractFloat64Sign( b ); |
|
3187 av = float64_val(a); |
|
3188 bv = float64_val(b); |
|
3189 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); |
|
3190 return ( av == bv ) || ( aSign ^ ( av < bv ) ); |
|
3191 |
|
3192 } |
|
3193 |
|
3194 /*---------------------------------------------------------------------------- |
|
3195 | Returns 1 if the double-precision floating-point value `a' is less than |
|
3196 | the corresponding value `b', and 0 otherwise. The comparison is performed |
|
3197 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3198 *----------------------------------------------------------------------------*/ |
|
3199 |
|
3200 int float64_lt( float64 a, float64 b STATUS_PARAM ) |
|
3201 { |
|
3202 flag aSign, bSign; |
|
3203 bits64 av, bv; |
|
3204 |
|
3205 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
|
3206 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
|
3207 ) { |
|
3208 float_raise( float_flag_invalid STATUS_VAR); |
|
3209 return 0; |
|
3210 } |
|
3211 aSign = extractFloat64Sign( a ); |
|
3212 bSign = extractFloat64Sign( b ); |
|
3213 av = float64_val(a); |
|
3214 bv = float64_val(b); |
|
3215 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); |
|
3216 return ( av != bv ) && ( aSign ^ ( av < bv ) ); |
|
3217 |
|
3218 } |
|
3219 |
|
3220 /*---------------------------------------------------------------------------- |
|
3221 | Returns 1 if the double-precision floating-point value `a' is equal to the |
|
3222 | corresponding value `b', and 0 otherwise. The invalid exception is raised |
|
3223 | if either operand is a NaN. Otherwise, the comparison is performed |
|
3224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3225 *----------------------------------------------------------------------------*/ |
|
3226 |
|
3227 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM ) |
|
3228 { |
|
3229 bits64 av, bv; |
|
3230 |
|
3231 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
|
3232 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
|
3233 ) { |
|
3234 float_raise( float_flag_invalid STATUS_VAR); |
|
3235 return 0; |
|
3236 } |
|
3237 av = float64_val(a); |
|
3238 bv = float64_val(b); |
|
3239 return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 ); |
|
3240 |
|
3241 } |
|
3242 |
|
3243 /*---------------------------------------------------------------------------- |
|
3244 | Returns 1 if the double-precision floating-point value `a' is less than or |
|
3245 | equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not |
|
3246 | cause an exception. Otherwise, the comparison is performed according to the |
|
3247 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3248 *----------------------------------------------------------------------------*/ |
|
3249 |
|
3250 int float64_le_quiet( float64 a, float64 b STATUS_PARAM ) |
|
3251 { |
|
3252 flag aSign, bSign; |
|
3253 bits64 av, bv; |
|
3254 |
|
3255 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
|
3256 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
|
3257 ) { |
|
3258 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { |
|
3259 float_raise( float_flag_invalid STATUS_VAR); |
|
3260 } |
|
3261 return 0; |
|
3262 } |
|
3263 aSign = extractFloat64Sign( a ); |
|
3264 bSign = extractFloat64Sign( b ); |
|
3265 av = float64_val(a); |
|
3266 bv = float64_val(b); |
|
3267 if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 ); |
|
3268 return ( av == bv ) || ( aSign ^ ( av < bv ) ); |
|
3269 |
|
3270 } |
|
3271 |
|
3272 /*---------------------------------------------------------------------------- |
|
3273 | Returns 1 if the double-precision floating-point value `a' is less than |
|
3274 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an |
|
3275 | exception. Otherwise, the comparison is performed according to the IEC/IEEE |
|
3276 | Standard for Binary Floating-Point Arithmetic. |
|
3277 *----------------------------------------------------------------------------*/ |
|
3278 |
|
3279 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM ) |
|
3280 { |
|
3281 flag aSign, bSign; |
|
3282 bits64 av, bv; |
|
3283 |
|
3284 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) ) |
|
3285 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) ) |
|
3286 ) { |
|
3287 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) { |
|
3288 float_raise( float_flag_invalid STATUS_VAR); |
|
3289 } |
|
3290 return 0; |
|
3291 } |
|
3292 aSign = extractFloat64Sign( a ); |
|
3293 bSign = extractFloat64Sign( b ); |
|
3294 av = float64_val(a); |
|
3295 bv = float64_val(b); |
|
3296 if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 ); |
|
3297 return ( av != bv ) && ( aSign ^ ( av < bv ) ); |
|
3298 |
|
3299 } |
|
3300 |
|
3301 #ifdef FLOATX80 |
|
3302 |
|
3303 /*---------------------------------------------------------------------------- |
|
3304 | Returns the result of converting the extended double-precision floating- |
|
3305 | point value `a' to the 32-bit two's complement integer format. The |
|
3306 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
3307 | Floating-Point Arithmetic---which means in particular that the conversion |
|
3308 | is rounded according to the current rounding mode. If `a' is a NaN, the |
|
3309 | largest positive integer is returned. Otherwise, if the conversion |
|
3310 | overflows, the largest integer with the same sign as `a' is returned. |
|
3311 *----------------------------------------------------------------------------*/ |
|
3312 |
|
3313 int32 floatx80_to_int32( floatx80 a STATUS_PARAM ) |
|
3314 { |
|
3315 flag aSign; |
|
3316 int32 aExp, shiftCount; |
|
3317 bits64 aSig; |
|
3318 |
|
3319 aSig = extractFloatx80Frac( a ); |
|
3320 aExp = extractFloatx80Exp( a ); |
|
3321 aSign = extractFloatx80Sign( a ); |
|
3322 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; |
|
3323 shiftCount = 0x4037 - aExp; |
|
3324 if ( shiftCount <= 0 ) shiftCount = 1; |
|
3325 shift64RightJamming( aSig, shiftCount, &aSig ); |
|
3326 return roundAndPackInt32( aSign, aSig STATUS_VAR ); |
|
3327 |
|
3328 } |
|
3329 |
|
3330 /*---------------------------------------------------------------------------- |
|
3331 | Returns the result of converting the extended double-precision floating- |
|
3332 | point value `a' to the 32-bit two's complement integer format. The |
|
3333 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
3334 | Floating-Point Arithmetic, except that the conversion is always rounded |
|
3335 | toward zero. If `a' is a NaN, the largest positive integer is returned. |
|
3336 | Otherwise, if the conversion overflows, the largest integer with the same |
|
3337 | sign as `a' is returned. |
|
3338 *----------------------------------------------------------------------------*/ |
|
3339 |
|
3340 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM ) |
|
3341 { |
|
3342 flag aSign; |
|
3343 int32 aExp, shiftCount; |
|
3344 bits64 aSig, savedASig; |
|
3345 int32 z; |
|
3346 |
|
3347 aSig = extractFloatx80Frac( a ); |
|
3348 aExp = extractFloatx80Exp( a ); |
|
3349 aSign = extractFloatx80Sign( a ); |
|
3350 if ( 0x401E < aExp ) { |
|
3351 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0; |
|
3352 goto invalid; |
|
3353 } |
|
3354 else if ( aExp < 0x3FFF ) { |
|
3355 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
3356 return 0; |
|
3357 } |
|
3358 shiftCount = 0x403E - aExp; |
|
3359 savedASig = aSig; |
|
3360 aSig >>= shiftCount; |
|
3361 z = aSig; |
|
3362 if ( aSign ) z = - z; |
|
3363 if ( ( z < 0 ) ^ aSign ) { |
|
3364 invalid: |
|
3365 float_raise( float_flag_invalid STATUS_VAR); |
|
3366 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; |
|
3367 } |
|
3368 if ( ( aSig<<shiftCount ) != savedASig ) { |
|
3369 STATUS(float_exception_flags) |= float_flag_inexact; |
|
3370 } |
|
3371 return z; |
|
3372 |
|
3373 } |
|
3374 |
|
3375 /*---------------------------------------------------------------------------- |
|
3376 | Returns the result of converting the extended double-precision floating- |
|
3377 | point value `a' to the 64-bit two's complement integer format. The |
|
3378 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
3379 | Floating-Point Arithmetic---which means in particular that the conversion |
|
3380 | is rounded according to the current rounding mode. If `a' is a NaN, |
|
3381 | the largest positive integer is returned. Otherwise, if the conversion |
|
3382 | overflows, the largest integer with the same sign as `a' is returned. |
|
3383 *----------------------------------------------------------------------------*/ |
|
3384 |
|
3385 int64 floatx80_to_int64( floatx80 a STATUS_PARAM ) |
|
3386 { |
|
3387 flag aSign; |
|
3388 int32 aExp, shiftCount; |
|
3389 bits64 aSig, aSigExtra; |
|
3390 |
|
3391 aSig = extractFloatx80Frac( a ); |
|
3392 aExp = extractFloatx80Exp( a ); |
|
3393 aSign = extractFloatx80Sign( a ); |
|
3394 shiftCount = 0x403E - aExp; |
|
3395 if ( shiftCount <= 0 ) { |
|
3396 if ( shiftCount ) { |
|
3397 float_raise( float_flag_invalid STATUS_VAR); |
|
3398 if ( ! aSign |
|
3399 || ( ( aExp == 0x7FFF ) |
|
3400 && ( aSig != LIT64( 0x8000000000000000 ) ) ) |
|
3401 ) { |
|
3402 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
3403 } |
|
3404 return (sbits64) LIT64( 0x8000000000000000 ); |
|
3405 } |
|
3406 aSigExtra = 0; |
|
3407 } |
|
3408 else { |
|
3409 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra ); |
|
3410 } |
|
3411 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR ); |
|
3412 |
|
3413 } |
|
3414 |
|
3415 /*---------------------------------------------------------------------------- |
|
3416 | Returns the result of converting the extended double-precision floating- |
|
3417 | point value `a' to the 64-bit two's complement integer format. The |
|
3418 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
3419 | Floating-Point Arithmetic, except that the conversion is always rounded |
|
3420 | toward zero. If `a' is a NaN, the largest positive integer is returned. |
|
3421 | Otherwise, if the conversion overflows, the largest integer with the same |
|
3422 | sign as `a' is returned. |
|
3423 *----------------------------------------------------------------------------*/ |
|
3424 |
|
3425 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM ) |
|
3426 { |
|
3427 flag aSign; |
|
3428 int32 aExp, shiftCount; |
|
3429 bits64 aSig; |
|
3430 int64 z; |
|
3431 |
|
3432 aSig = extractFloatx80Frac( a ); |
|
3433 aExp = extractFloatx80Exp( a ); |
|
3434 aSign = extractFloatx80Sign( a ); |
|
3435 shiftCount = aExp - 0x403E; |
|
3436 if ( 0 <= shiftCount ) { |
|
3437 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
3438 if ( ( a.high != 0xC03E ) || aSig ) { |
|
3439 float_raise( float_flag_invalid STATUS_VAR); |
|
3440 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) { |
|
3441 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
3442 } |
|
3443 } |
|
3444 return (sbits64) LIT64( 0x8000000000000000 ); |
|
3445 } |
|
3446 else if ( aExp < 0x3FFF ) { |
|
3447 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
3448 return 0; |
|
3449 } |
|
3450 z = aSig>>( - shiftCount ); |
|
3451 if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) { |
|
3452 STATUS(float_exception_flags) |= float_flag_inexact; |
|
3453 } |
|
3454 if ( aSign ) z = - z; |
|
3455 return z; |
|
3456 |
|
3457 } |
|
3458 |
|
3459 /*---------------------------------------------------------------------------- |
|
3460 | Returns the result of converting the extended double-precision floating- |
|
3461 | point value `a' to the single-precision floating-point format. The |
|
3462 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
3463 | Floating-Point Arithmetic. |
|
3464 *----------------------------------------------------------------------------*/ |
|
3465 |
|
3466 float32 floatx80_to_float32( floatx80 a STATUS_PARAM ) |
|
3467 { |
|
3468 flag aSign; |
|
3469 int32 aExp; |
|
3470 bits64 aSig; |
|
3471 |
|
3472 aSig = extractFloatx80Frac( a ); |
|
3473 aExp = extractFloatx80Exp( a ); |
|
3474 aSign = extractFloatx80Sign( a ); |
|
3475 if ( aExp == 0x7FFF ) { |
|
3476 if ( (bits64) ( aSig<<1 ) ) { |
|
3477 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) ); |
|
3478 } |
|
3479 return packFloat32( aSign, 0xFF, 0 ); |
|
3480 } |
|
3481 shift64RightJamming( aSig, 33, &aSig ); |
|
3482 if ( aExp || aSig ) aExp -= 0x3F81; |
|
3483 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); |
|
3484 |
|
3485 } |
|
3486 |
|
3487 /*---------------------------------------------------------------------------- |
|
3488 | Returns the result of converting the extended double-precision floating- |
|
3489 | point value `a' to the double-precision floating-point format. The |
|
3490 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
3491 | Floating-Point Arithmetic. |
|
3492 *----------------------------------------------------------------------------*/ |
|
3493 |
|
3494 float64 floatx80_to_float64( floatx80 a STATUS_PARAM ) |
|
3495 { |
|
3496 flag aSign; |
|
3497 int32 aExp; |
|
3498 bits64 aSig, zSig; |
|
3499 |
|
3500 aSig = extractFloatx80Frac( a ); |
|
3501 aExp = extractFloatx80Exp( a ); |
|
3502 aSign = extractFloatx80Sign( a ); |
|
3503 if ( aExp == 0x7FFF ) { |
|
3504 if ( (bits64) ( aSig<<1 ) ) { |
|
3505 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) ); |
|
3506 } |
|
3507 return packFloat64( aSign, 0x7FF, 0 ); |
|
3508 } |
|
3509 shift64RightJamming( aSig, 1, &zSig ); |
|
3510 if ( aExp || aSig ) aExp -= 0x3C01; |
|
3511 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR ); |
|
3512 |
|
3513 } |
|
3514 |
|
3515 #ifdef FLOAT128 |
|
3516 |
|
3517 /*---------------------------------------------------------------------------- |
|
3518 | Returns the result of converting the extended double-precision floating- |
|
3519 | point value `a' to the quadruple-precision floating-point format. The |
|
3520 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
3521 | Floating-Point Arithmetic. |
|
3522 *----------------------------------------------------------------------------*/ |
|
3523 |
|
3524 float128 floatx80_to_float128( floatx80 a STATUS_PARAM ) |
|
3525 { |
|
3526 flag aSign; |
|
3527 int16 aExp; |
|
3528 bits64 aSig, zSig0, zSig1; |
|
3529 |
|
3530 aSig = extractFloatx80Frac( a ); |
|
3531 aExp = extractFloatx80Exp( a ); |
|
3532 aSign = extractFloatx80Sign( a ); |
|
3533 if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) { |
|
3534 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) ); |
|
3535 } |
|
3536 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 ); |
|
3537 return packFloat128( aSign, aExp, zSig0, zSig1 ); |
|
3538 |
|
3539 } |
|
3540 |
|
3541 #endif |
|
3542 |
|
3543 /*---------------------------------------------------------------------------- |
|
3544 | Rounds the extended double-precision floating-point value `a' to an integer, |
|
3545 | and returns the result as an extended quadruple-precision floating-point |
|
3546 | value. The operation is performed according to the IEC/IEEE Standard for |
|
3547 | Binary Floating-Point Arithmetic. |
|
3548 *----------------------------------------------------------------------------*/ |
|
3549 |
|
3550 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM ) |
|
3551 { |
|
3552 flag aSign; |
|
3553 int32 aExp; |
|
3554 bits64 lastBitMask, roundBitsMask; |
|
3555 int8 roundingMode; |
|
3556 floatx80 z; |
|
3557 |
|
3558 aExp = extractFloatx80Exp( a ); |
|
3559 if ( 0x403E <= aExp ) { |
|
3560 if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) { |
|
3561 return propagateFloatx80NaN( a, a STATUS_VAR ); |
|
3562 } |
|
3563 return a; |
|
3564 } |
|
3565 if ( aExp < 0x3FFF ) { |
|
3566 if ( ( aExp == 0 ) |
|
3567 && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) { |
|
3568 return a; |
|
3569 } |
|
3570 STATUS(float_exception_flags) |= float_flag_inexact; |
|
3571 aSign = extractFloatx80Sign( a ); |
|
3572 switch ( STATUS(float_rounding_mode) ) { |
|
3573 case float_round_nearest_even: |
|
3574 if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 ) |
|
3575 ) { |
|
3576 return |
|
3577 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) ); |
|
3578 } |
|
3579 break; |
|
3580 case float_round_down: |
|
3581 return |
|
3582 aSign ? |
|
3583 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) ) |
|
3584 : packFloatx80( 0, 0, 0 ); |
|
3585 case float_round_up: |
|
3586 return |
|
3587 aSign ? packFloatx80( 1, 0, 0 ) |
|
3588 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) ); |
|
3589 } |
|
3590 return packFloatx80( aSign, 0, 0 ); |
|
3591 } |
|
3592 lastBitMask = 1; |
|
3593 lastBitMask <<= 0x403E - aExp; |
|
3594 roundBitsMask = lastBitMask - 1; |
|
3595 z = a; |
|
3596 roundingMode = STATUS(float_rounding_mode); |
|
3597 if ( roundingMode == float_round_nearest_even ) { |
|
3598 z.low += lastBitMask>>1; |
|
3599 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; |
|
3600 } |
|
3601 else if ( roundingMode != float_round_to_zero ) { |
|
3602 if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) { |
|
3603 z.low += roundBitsMask; |
|
3604 } |
|
3605 } |
|
3606 z.low &= ~ roundBitsMask; |
|
3607 if ( z.low == 0 ) { |
|
3608 ++z.high; |
|
3609 z.low = LIT64( 0x8000000000000000 ); |
|
3610 } |
|
3611 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
3612 return z; |
|
3613 |
|
3614 } |
|
3615 |
|
3616 /*---------------------------------------------------------------------------- |
|
3617 | Returns the result of adding the absolute values of the extended double- |
|
3618 | precision floating-point values `a' and `b'. If `zSign' is 1, the sum is |
|
3619 | negated before being returned. `zSign' is ignored if the result is a NaN. |
|
3620 | The addition is performed according to the IEC/IEEE Standard for Binary |
|
3621 | Floating-Point Arithmetic. |
|
3622 *----------------------------------------------------------------------------*/ |
|
3623 |
|
3624 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM) |
|
3625 { |
|
3626 int32 aExp, bExp, zExp; |
|
3627 bits64 aSig, bSig, zSig0, zSig1; |
|
3628 int32 expDiff; |
|
3629 |
|
3630 aSig = extractFloatx80Frac( a ); |
|
3631 aExp = extractFloatx80Exp( a ); |
|
3632 bSig = extractFloatx80Frac( b ); |
|
3633 bExp = extractFloatx80Exp( b ); |
|
3634 expDiff = aExp - bExp; |
|
3635 if ( 0 < expDiff ) { |
|
3636 if ( aExp == 0x7FFF ) { |
|
3637 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3638 return a; |
|
3639 } |
|
3640 if ( bExp == 0 ) --expDiff; |
|
3641 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); |
|
3642 zExp = aExp; |
|
3643 } |
|
3644 else if ( expDiff < 0 ) { |
|
3645 if ( bExp == 0x7FFF ) { |
|
3646 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3647 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
3648 } |
|
3649 if ( aExp == 0 ) ++expDiff; |
|
3650 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); |
|
3651 zExp = bExp; |
|
3652 } |
|
3653 else { |
|
3654 if ( aExp == 0x7FFF ) { |
|
3655 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { |
|
3656 return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3657 } |
|
3658 return a; |
|
3659 } |
|
3660 zSig1 = 0; |
|
3661 zSig0 = aSig + bSig; |
|
3662 if ( aExp == 0 ) { |
|
3663 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 ); |
|
3664 goto roundAndPack; |
|
3665 } |
|
3666 zExp = aExp; |
|
3667 goto shiftRight1; |
|
3668 } |
|
3669 zSig0 = aSig + bSig; |
|
3670 if ( (sbits64) zSig0 < 0 ) goto roundAndPack; |
|
3671 shiftRight1: |
|
3672 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 ); |
|
3673 zSig0 |= LIT64( 0x8000000000000000 ); |
|
3674 ++zExp; |
|
3675 roundAndPack: |
|
3676 return |
|
3677 roundAndPackFloatx80( |
|
3678 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); |
|
3679 |
|
3680 } |
|
3681 |
|
3682 /*---------------------------------------------------------------------------- |
|
3683 | Returns the result of subtracting the absolute values of the extended |
|
3684 | double-precision floating-point values `a' and `b'. If `zSign' is 1, the |
|
3685 | difference is negated before being returned. `zSign' is ignored if the |
|
3686 | result is a NaN. The subtraction is performed according to the IEC/IEEE |
|
3687 | Standard for Binary Floating-Point Arithmetic. |
|
3688 *----------------------------------------------------------------------------*/ |
|
3689 |
|
3690 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM ) |
|
3691 { |
|
3692 int32 aExp, bExp, zExp; |
|
3693 bits64 aSig, bSig, zSig0, zSig1; |
|
3694 int32 expDiff; |
|
3695 floatx80 z; |
|
3696 |
|
3697 aSig = extractFloatx80Frac( a ); |
|
3698 aExp = extractFloatx80Exp( a ); |
|
3699 bSig = extractFloatx80Frac( b ); |
|
3700 bExp = extractFloatx80Exp( b ); |
|
3701 expDiff = aExp - bExp; |
|
3702 if ( 0 < expDiff ) goto aExpBigger; |
|
3703 if ( expDiff < 0 ) goto bExpBigger; |
|
3704 if ( aExp == 0x7FFF ) { |
|
3705 if ( (bits64) ( ( aSig | bSig )<<1 ) ) { |
|
3706 return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3707 } |
|
3708 float_raise( float_flag_invalid STATUS_VAR); |
|
3709 z.low = floatx80_default_nan_low; |
|
3710 z.high = floatx80_default_nan_high; |
|
3711 return z; |
|
3712 } |
|
3713 if ( aExp == 0 ) { |
|
3714 aExp = 1; |
|
3715 bExp = 1; |
|
3716 } |
|
3717 zSig1 = 0; |
|
3718 if ( bSig < aSig ) goto aBigger; |
|
3719 if ( aSig < bSig ) goto bBigger; |
|
3720 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 ); |
|
3721 bExpBigger: |
|
3722 if ( bExp == 0x7FFF ) { |
|
3723 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3724 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
3725 } |
|
3726 if ( aExp == 0 ) ++expDiff; |
|
3727 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 ); |
|
3728 bBigger: |
|
3729 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 ); |
|
3730 zExp = bExp; |
|
3731 zSign ^= 1; |
|
3732 goto normalizeRoundAndPack; |
|
3733 aExpBigger: |
|
3734 if ( aExp == 0x7FFF ) { |
|
3735 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3736 return a; |
|
3737 } |
|
3738 if ( bExp == 0 ) --expDiff; |
|
3739 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 ); |
|
3740 aBigger: |
|
3741 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 ); |
|
3742 zExp = aExp; |
|
3743 normalizeRoundAndPack: |
|
3744 return |
|
3745 normalizeRoundAndPackFloatx80( |
|
3746 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); |
|
3747 |
|
3748 } |
|
3749 |
|
3750 /*---------------------------------------------------------------------------- |
|
3751 | Returns the result of adding the extended double-precision floating-point |
|
3752 | values `a' and `b'. The operation is performed according to the IEC/IEEE |
|
3753 | Standard for Binary Floating-Point Arithmetic. |
|
3754 *----------------------------------------------------------------------------*/ |
|
3755 |
|
3756 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM ) |
|
3757 { |
|
3758 flag aSign, bSign; |
|
3759 |
|
3760 aSign = extractFloatx80Sign( a ); |
|
3761 bSign = extractFloatx80Sign( b ); |
|
3762 if ( aSign == bSign ) { |
|
3763 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); |
|
3764 } |
|
3765 else { |
|
3766 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); |
|
3767 } |
|
3768 |
|
3769 } |
|
3770 |
|
3771 /*---------------------------------------------------------------------------- |
|
3772 | Returns the result of subtracting the extended double-precision floating- |
|
3773 | point values `a' and `b'. The operation is performed according to the |
|
3774 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3775 *----------------------------------------------------------------------------*/ |
|
3776 |
|
3777 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM ) |
|
3778 { |
|
3779 flag aSign, bSign; |
|
3780 |
|
3781 aSign = extractFloatx80Sign( a ); |
|
3782 bSign = extractFloatx80Sign( b ); |
|
3783 if ( aSign == bSign ) { |
|
3784 return subFloatx80Sigs( a, b, aSign STATUS_VAR ); |
|
3785 } |
|
3786 else { |
|
3787 return addFloatx80Sigs( a, b, aSign STATUS_VAR ); |
|
3788 } |
|
3789 |
|
3790 } |
|
3791 |
|
3792 /*---------------------------------------------------------------------------- |
|
3793 | Returns the result of multiplying the extended double-precision floating- |
|
3794 | point values `a' and `b'. The operation is performed according to the |
|
3795 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3796 *----------------------------------------------------------------------------*/ |
|
3797 |
|
3798 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM ) |
|
3799 { |
|
3800 flag aSign, bSign, zSign; |
|
3801 int32 aExp, bExp, zExp; |
|
3802 bits64 aSig, bSig, zSig0, zSig1; |
|
3803 floatx80 z; |
|
3804 |
|
3805 aSig = extractFloatx80Frac( a ); |
|
3806 aExp = extractFloatx80Exp( a ); |
|
3807 aSign = extractFloatx80Sign( a ); |
|
3808 bSig = extractFloatx80Frac( b ); |
|
3809 bExp = extractFloatx80Exp( b ); |
|
3810 bSign = extractFloatx80Sign( b ); |
|
3811 zSign = aSign ^ bSign; |
|
3812 if ( aExp == 0x7FFF ) { |
|
3813 if ( (bits64) ( aSig<<1 ) |
|
3814 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { |
|
3815 return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3816 } |
|
3817 if ( ( bExp | bSig ) == 0 ) goto invalid; |
|
3818 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
3819 } |
|
3820 if ( bExp == 0x7FFF ) { |
|
3821 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3822 if ( ( aExp | aSig ) == 0 ) { |
|
3823 invalid: |
|
3824 float_raise( float_flag_invalid STATUS_VAR); |
|
3825 z.low = floatx80_default_nan_low; |
|
3826 z.high = floatx80_default_nan_high; |
|
3827 return z; |
|
3828 } |
|
3829 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
3830 } |
|
3831 if ( aExp == 0 ) { |
|
3832 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
|
3833 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); |
|
3834 } |
|
3835 if ( bExp == 0 ) { |
|
3836 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
|
3837 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
|
3838 } |
|
3839 zExp = aExp + bExp - 0x3FFE; |
|
3840 mul64To128( aSig, bSig, &zSig0, &zSig1 ); |
|
3841 if ( 0 < (sbits64) zSig0 ) { |
|
3842 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 ); |
|
3843 --zExp; |
|
3844 } |
|
3845 return |
|
3846 roundAndPackFloatx80( |
|
3847 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); |
|
3848 |
|
3849 } |
|
3850 |
|
3851 /*---------------------------------------------------------------------------- |
|
3852 | Returns the result of dividing the extended double-precision floating-point |
|
3853 | value `a' by the corresponding value `b'. The operation is performed |
|
3854 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3855 *----------------------------------------------------------------------------*/ |
|
3856 |
|
3857 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM ) |
|
3858 { |
|
3859 flag aSign, bSign, zSign; |
|
3860 int32 aExp, bExp, zExp; |
|
3861 bits64 aSig, bSig, zSig0, zSig1; |
|
3862 bits64 rem0, rem1, rem2, term0, term1, term2; |
|
3863 floatx80 z; |
|
3864 |
|
3865 aSig = extractFloatx80Frac( a ); |
|
3866 aExp = extractFloatx80Exp( a ); |
|
3867 aSign = extractFloatx80Sign( a ); |
|
3868 bSig = extractFloatx80Frac( b ); |
|
3869 bExp = extractFloatx80Exp( b ); |
|
3870 bSign = extractFloatx80Sign( b ); |
|
3871 zSign = aSign ^ bSign; |
|
3872 if ( aExp == 0x7FFF ) { |
|
3873 if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3874 if ( bExp == 0x7FFF ) { |
|
3875 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3876 goto invalid; |
|
3877 } |
|
3878 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
3879 } |
|
3880 if ( bExp == 0x7FFF ) { |
|
3881 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3882 return packFloatx80( zSign, 0, 0 ); |
|
3883 } |
|
3884 if ( bExp == 0 ) { |
|
3885 if ( bSig == 0 ) { |
|
3886 if ( ( aExp | aSig ) == 0 ) { |
|
3887 invalid: |
|
3888 float_raise( float_flag_invalid STATUS_VAR); |
|
3889 z.low = floatx80_default_nan_low; |
|
3890 z.high = floatx80_default_nan_high; |
|
3891 return z; |
|
3892 } |
|
3893 float_raise( float_flag_divbyzero STATUS_VAR); |
|
3894 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
3895 } |
|
3896 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
|
3897 } |
|
3898 if ( aExp == 0 ) { |
|
3899 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 ); |
|
3900 normalizeFloatx80Subnormal( aSig, &aExp, &aSig ); |
|
3901 } |
|
3902 zExp = aExp - bExp + 0x3FFE; |
|
3903 rem1 = 0; |
|
3904 if ( bSig <= aSig ) { |
|
3905 shift128Right( aSig, 0, 1, &aSig, &rem1 ); |
|
3906 ++zExp; |
|
3907 } |
|
3908 zSig0 = estimateDiv128To64( aSig, rem1, bSig ); |
|
3909 mul64To128( bSig, zSig0, &term0, &term1 ); |
|
3910 sub128( aSig, rem1, term0, term1, &rem0, &rem1 ); |
|
3911 while ( (sbits64) rem0 < 0 ) { |
|
3912 --zSig0; |
|
3913 add128( rem0, rem1, 0, bSig, &rem0, &rem1 ); |
|
3914 } |
|
3915 zSig1 = estimateDiv128To64( rem1, 0, bSig ); |
|
3916 if ( (bits64) ( zSig1<<1 ) <= 8 ) { |
|
3917 mul64To128( bSig, zSig1, &term1, &term2 ); |
|
3918 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); |
|
3919 while ( (sbits64) rem1 < 0 ) { |
|
3920 --zSig1; |
|
3921 add128( rem1, rem2, 0, bSig, &rem1, &rem2 ); |
|
3922 } |
|
3923 zSig1 |= ( ( rem1 | rem2 ) != 0 ); |
|
3924 } |
|
3925 return |
|
3926 roundAndPackFloatx80( |
|
3927 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR ); |
|
3928 |
|
3929 } |
|
3930 |
|
3931 /*---------------------------------------------------------------------------- |
|
3932 | Returns the remainder of the extended double-precision floating-point value |
|
3933 | `a' with respect to the corresponding value `b'. The operation is performed |
|
3934 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
3935 *----------------------------------------------------------------------------*/ |
|
3936 |
|
3937 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM ) |
|
3938 { |
|
3939 flag aSign, bSign, zSign; |
|
3940 int32 aExp, bExp, expDiff; |
|
3941 bits64 aSig0, aSig1, bSig; |
|
3942 bits64 q, term0, term1, alternateASig0, alternateASig1; |
|
3943 floatx80 z; |
|
3944 |
|
3945 aSig0 = extractFloatx80Frac( a ); |
|
3946 aExp = extractFloatx80Exp( a ); |
|
3947 aSign = extractFloatx80Sign( a ); |
|
3948 bSig = extractFloatx80Frac( b ); |
|
3949 bExp = extractFloatx80Exp( b ); |
|
3950 bSign = extractFloatx80Sign( b ); |
|
3951 if ( aExp == 0x7FFF ) { |
|
3952 if ( (bits64) ( aSig0<<1 ) |
|
3953 || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) { |
|
3954 return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3955 } |
|
3956 goto invalid; |
|
3957 } |
|
3958 if ( bExp == 0x7FFF ) { |
|
3959 if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR ); |
|
3960 return a; |
|
3961 } |
|
3962 if ( bExp == 0 ) { |
|
3963 if ( bSig == 0 ) { |
|
3964 invalid: |
|
3965 float_raise( float_flag_invalid STATUS_VAR); |
|
3966 z.low = floatx80_default_nan_low; |
|
3967 z.high = floatx80_default_nan_high; |
|
3968 return z; |
|
3969 } |
|
3970 normalizeFloatx80Subnormal( bSig, &bExp, &bSig ); |
|
3971 } |
|
3972 if ( aExp == 0 ) { |
|
3973 if ( (bits64) ( aSig0<<1 ) == 0 ) return a; |
|
3974 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); |
|
3975 } |
|
3976 bSig |= LIT64( 0x8000000000000000 ); |
|
3977 zSign = aSign; |
|
3978 expDiff = aExp - bExp; |
|
3979 aSig1 = 0; |
|
3980 if ( expDiff < 0 ) { |
|
3981 if ( expDiff < -1 ) return a; |
|
3982 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 ); |
|
3983 expDiff = 0; |
|
3984 } |
|
3985 q = ( bSig <= aSig0 ); |
|
3986 if ( q ) aSig0 -= bSig; |
|
3987 expDiff -= 64; |
|
3988 while ( 0 < expDiff ) { |
|
3989 q = estimateDiv128To64( aSig0, aSig1, bSig ); |
|
3990 q = ( 2 < q ) ? q - 2 : 0; |
|
3991 mul64To128( bSig, q, &term0, &term1 ); |
|
3992 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
|
3993 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 ); |
|
3994 expDiff -= 62; |
|
3995 } |
|
3996 expDiff += 64; |
|
3997 if ( 0 < expDiff ) { |
|
3998 q = estimateDiv128To64( aSig0, aSig1, bSig ); |
|
3999 q = ( 2 < q ) ? q - 2 : 0; |
|
4000 q >>= 64 - expDiff; |
|
4001 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 ); |
|
4002 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
|
4003 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 ); |
|
4004 while ( le128( term0, term1, aSig0, aSig1 ) ) { |
|
4005 ++q; |
|
4006 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 ); |
|
4007 } |
|
4008 } |
|
4009 else { |
|
4010 term1 = 0; |
|
4011 term0 = bSig; |
|
4012 } |
|
4013 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 ); |
|
4014 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 ) |
|
4015 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 ) |
|
4016 && ( q & 1 ) ) |
|
4017 ) { |
|
4018 aSig0 = alternateASig0; |
|
4019 aSig1 = alternateASig1; |
|
4020 zSign = ! zSign; |
|
4021 } |
|
4022 return |
|
4023 normalizeRoundAndPackFloatx80( |
|
4024 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR ); |
|
4025 |
|
4026 } |
|
4027 |
|
4028 /*---------------------------------------------------------------------------- |
|
4029 | Returns the square root of the extended double-precision floating-point |
|
4030 | value `a'. The operation is performed according to the IEC/IEEE Standard |
|
4031 | for Binary Floating-Point Arithmetic. |
|
4032 *----------------------------------------------------------------------------*/ |
|
4033 |
|
4034 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM ) |
|
4035 { |
|
4036 flag aSign; |
|
4037 int32 aExp, zExp; |
|
4038 bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0; |
|
4039 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; |
|
4040 floatx80 z; |
|
4041 |
|
4042 aSig0 = extractFloatx80Frac( a ); |
|
4043 aExp = extractFloatx80Exp( a ); |
|
4044 aSign = extractFloatx80Sign( a ); |
|
4045 if ( aExp == 0x7FFF ) { |
|
4046 if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR ); |
|
4047 if ( ! aSign ) return a; |
|
4048 goto invalid; |
|
4049 } |
|
4050 if ( aSign ) { |
|
4051 if ( ( aExp | aSig0 ) == 0 ) return a; |
|
4052 invalid: |
|
4053 float_raise( float_flag_invalid STATUS_VAR); |
|
4054 z.low = floatx80_default_nan_low; |
|
4055 z.high = floatx80_default_nan_high; |
|
4056 return z; |
|
4057 } |
|
4058 if ( aExp == 0 ) { |
|
4059 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 ); |
|
4060 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 ); |
|
4061 } |
|
4062 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF; |
|
4063 zSig0 = estimateSqrt32( aExp, aSig0>>32 ); |
|
4064 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 ); |
|
4065 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); |
|
4066 doubleZSig0 = zSig0<<1; |
|
4067 mul64To128( zSig0, zSig0, &term0, &term1 ); |
|
4068 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); |
|
4069 while ( (sbits64) rem0 < 0 ) { |
|
4070 --zSig0; |
|
4071 doubleZSig0 -= 2; |
|
4072 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); |
|
4073 } |
|
4074 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); |
|
4075 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) { |
|
4076 if ( zSig1 == 0 ) zSig1 = 1; |
|
4077 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); |
|
4078 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); |
|
4079 mul64To128( zSig1, zSig1, &term2, &term3 ); |
|
4080 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); |
|
4081 while ( (sbits64) rem1 < 0 ) { |
|
4082 --zSig1; |
|
4083 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); |
|
4084 term3 |= 1; |
|
4085 term2 |= doubleZSig0; |
|
4086 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); |
|
4087 } |
|
4088 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); |
|
4089 } |
|
4090 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 ); |
|
4091 zSig0 |= doubleZSig0; |
|
4092 return |
|
4093 roundAndPackFloatx80( |
|
4094 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR ); |
|
4095 |
|
4096 } |
|
4097 |
|
4098 /*---------------------------------------------------------------------------- |
|
4099 | Returns 1 if the extended double-precision floating-point value `a' is |
|
4100 | equal to the corresponding value `b', and 0 otherwise. The comparison is |
|
4101 | performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4102 | Arithmetic. |
|
4103 *----------------------------------------------------------------------------*/ |
|
4104 |
|
4105 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM ) |
|
4106 { |
|
4107 |
|
4108 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
|
4109 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
|
4110 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
|
4111 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
|
4112 ) { |
|
4113 if ( floatx80_is_signaling_nan( a ) |
|
4114 || floatx80_is_signaling_nan( b ) ) { |
|
4115 float_raise( float_flag_invalid STATUS_VAR); |
|
4116 } |
|
4117 return 0; |
|
4118 } |
|
4119 return |
|
4120 ( a.low == b.low ) |
|
4121 && ( ( a.high == b.high ) |
|
4122 || ( ( a.low == 0 ) |
|
4123 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) |
|
4124 ); |
|
4125 |
|
4126 } |
|
4127 |
|
4128 /*---------------------------------------------------------------------------- |
|
4129 | Returns 1 if the extended double-precision floating-point value `a' is |
|
4130 | less than or equal to the corresponding value `b', and 0 otherwise. The |
|
4131 | comparison is performed according to the IEC/IEEE Standard for Binary |
|
4132 | Floating-Point Arithmetic. |
|
4133 *----------------------------------------------------------------------------*/ |
|
4134 |
|
4135 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM ) |
|
4136 { |
|
4137 flag aSign, bSign; |
|
4138 |
|
4139 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
|
4140 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
|
4141 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
|
4142 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
|
4143 ) { |
|
4144 float_raise( float_flag_invalid STATUS_VAR); |
|
4145 return 0; |
|
4146 } |
|
4147 aSign = extractFloatx80Sign( a ); |
|
4148 bSign = extractFloatx80Sign( b ); |
|
4149 if ( aSign != bSign ) { |
|
4150 return |
|
4151 aSign |
|
4152 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
4153 == 0 ); |
|
4154 } |
|
4155 return |
|
4156 aSign ? le128( b.high, b.low, a.high, a.low ) |
|
4157 : le128( a.high, a.low, b.high, b.low ); |
|
4158 |
|
4159 } |
|
4160 |
|
4161 /*---------------------------------------------------------------------------- |
|
4162 | Returns 1 if the extended double-precision floating-point value `a' is |
|
4163 | less than the corresponding value `b', and 0 otherwise. The comparison |
|
4164 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4165 | Arithmetic. |
|
4166 *----------------------------------------------------------------------------*/ |
|
4167 |
|
4168 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM ) |
|
4169 { |
|
4170 flag aSign, bSign; |
|
4171 |
|
4172 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
|
4173 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
|
4174 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
|
4175 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
|
4176 ) { |
|
4177 float_raise( float_flag_invalid STATUS_VAR); |
|
4178 return 0; |
|
4179 } |
|
4180 aSign = extractFloatx80Sign( a ); |
|
4181 bSign = extractFloatx80Sign( b ); |
|
4182 if ( aSign != bSign ) { |
|
4183 return |
|
4184 aSign |
|
4185 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
4186 != 0 ); |
|
4187 } |
|
4188 return |
|
4189 aSign ? lt128( b.high, b.low, a.high, a.low ) |
|
4190 : lt128( a.high, a.low, b.high, b.low ); |
|
4191 |
|
4192 } |
|
4193 |
|
4194 /*---------------------------------------------------------------------------- |
|
4195 | Returns 1 if the extended double-precision floating-point value `a' is equal |
|
4196 | to the corresponding value `b', and 0 otherwise. The invalid exception is |
|
4197 | raised if either operand is a NaN. Otherwise, the comparison is performed |
|
4198 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
4199 *----------------------------------------------------------------------------*/ |
|
4200 |
|
4201 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM ) |
|
4202 { |
|
4203 |
|
4204 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
|
4205 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
|
4206 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
|
4207 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
|
4208 ) { |
|
4209 float_raise( float_flag_invalid STATUS_VAR); |
|
4210 return 0; |
|
4211 } |
|
4212 return |
|
4213 ( a.low == b.low ) |
|
4214 && ( ( a.high == b.high ) |
|
4215 || ( ( a.low == 0 ) |
|
4216 && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) ) |
|
4217 ); |
|
4218 |
|
4219 } |
|
4220 |
|
4221 /*---------------------------------------------------------------------------- |
|
4222 | Returns 1 if the extended double-precision floating-point value `a' is less |
|
4223 | than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs |
|
4224 | do not cause an exception. Otherwise, the comparison is performed according |
|
4225 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
4226 *----------------------------------------------------------------------------*/ |
|
4227 |
|
4228 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM ) |
|
4229 { |
|
4230 flag aSign, bSign; |
|
4231 |
|
4232 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
|
4233 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
|
4234 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
|
4235 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
|
4236 ) { |
|
4237 if ( floatx80_is_signaling_nan( a ) |
|
4238 || floatx80_is_signaling_nan( b ) ) { |
|
4239 float_raise( float_flag_invalid STATUS_VAR); |
|
4240 } |
|
4241 return 0; |
|
4242 } |
|
4243 aSign = extractFloatx80Sign( a ); |
|
4244 bSign = extractFloatx80Sign( b ); |
|
4245 if ( aSign != bSign ) { |
|
4246 return |
|
4247 aSign |
|
4248 || ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
4249 == 0 ); |
|
4250 } |
|
4251 return |
|
4252 aSign ? le128( b.high, b.low, a.high, a.low ) |
|
4253 : le128( a.high, a.low, b.high, b.low ); |
|
4254 |
|
4255 } |
|
4256 |
|
4257 /*---------------------------------------------------------------------------- |
|
4258 | Returns 1 if the extended double-precision floating-point value `a' is less |
|
4259 | than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause |
|
4260 | an exception. Otherwise, the comparison is performed according to the |
|
4261 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
4262 *----------------------------------------------------------------------------*/ |
|
4263 |
|
4264 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM ) |
|
4265 { |
|
4266 flag aSign, bSign; |
|
4267 |
|
4268 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF ) |
|
4269 && (bits64) ( extractFloatx80Frac( a )<<1 ) ) |
|
4270 || ( ( extractFloatx80Exp( b ) == 0x7FFF ) |
|
4271 && (bits64) ( extractFloatx80Frac( b )<<1 ) ) |
|
4272 ) { |
|
4273 if ( floatx80_is_signaling_nan( a ) |
|
4274 || floatx80_is_signaling_nan( b ) ) { |
|
4275 float_raise( float_flag_invalid STATUS_VAR); |
|
4276 } |
|
4277 return 0; |
|
4278 } |
|
4279 aSign = extractFloatx80Sign( a ); |
|
4280 bSign = extractFloatx80Sign( b ); |
|
4281 if ( aSign != bSign ) { |
|
4282 return |
|
4283 aSign |
|
4284 && ( ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
4285 != 0 ); |
|
4286 } |
|
4287 return |
|
4288 aSign ? lt128( b.high, b.low, a.high, a.low ) |
|
4289 : lt128( a.high, a.low, b.high, b.low ); |
|
4290 |
|
4291 } |
|
4292 |
|
4293 #endif |
|
4294 |
|
4295 #ifdef FLOAT128 |
|
4296 |
|
4297 /*---------------------------------------------------------------------------- |
|
4298 | Returns the result of converting the quadruple-precision floating-point |
|
4299 | value `a' to the 32-bit two's complement integer format. The conversion |
|
4300 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4301 | Arithmetic---which means in particular that the conversion is rounded |
|
4302 | according to the current rounding mode. If `a' is a NaN, the largest |
|
4303 | positive integer is returned. Otherwise, if the conversion overflows, the |
|
4304 | largest integer with the same sign as `a' is returned. |
|
4305 *----------------------------------------------------------------------------*/ |
|
4306 |
|
4307 int32 float128_to_int32( float128 a STATUS_PARAM ) |
|
4308 { |
|
4309 flag aSign; |
|
4310 int32 aExp, shiftCount; |
|
4311 bits64 aSig0, aSig1; |
|
4312 |
|
4313 aSig1 = extractFloat128Frac1( a ); |
|
4314 aSig0 = extractFloat128Frac0( a ); |
|
4315 aExp = extractFloat128Exp( a ); |
|
4316 aSign = extractFloat128Sign( a ); |
|
4317 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0; |
|
4318 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); |
|
4319 aSig0 |= ( aSig1 != 0 ); |
|
4320 shiftCount = 0x4028 - aExp; |
|
4321 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 ); |
|
4322 return roundAndPackInt32( aSign, aSig0 STATUS_VAR ); |
|
4323 |
|
4324 } |
|
4325 |
|
4326 /*---------------------------------------------------------------------------- |
|
4327 | Returns the result of converting the quadruple-precision floating-point |
|
4328 | value `a' to the 32-bit two's complement integer format. The conversion |
|
4329 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4330 | Arithmetic, except that the conversion is always rounded toward zero. If |
|
4331 | `a' is a NaN, the largest positive integer is returned. Otherwise, if the |
|
4332 | conversion overflows, the largest integer with the same sign as `a' is |
|
4333 | returned. |
|
4334 *----------------------------------------------------------------------------*/ |
|
4335 |
|
4336 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM ) |
|
4337 { |
|
4338 flag aSign; |
|
4339 int32 aExp, shiftCount; |
|
4340 bits64 aSig0, aSig1, savedASig; |
|
4341 int32 z; |
|
4342 |
|
4343 aSig1 = extractFloat128Frac1( a ); |
|
4344 aSig0 = extractFloat128Frac0( a ); |
|
4345 aExp = extractFloat128Exp( a ); |
|
4346 aSign = extractFloat128Sign( a ); |
|
4347 aSig0 |= ( aSig1 != 0 ); |
|
4348 if ( 0x401E < aExp ) { |
|
4349 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0; |
|
4350 goto invalid; |
|
4351 } |
|
4352 else if ( aExp < 0x3FFF ) { |
|
4353 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
4354 return 0; |
|
4355 } |
|
4356 aSig0 |= LIT64( 0x0001000000000000 ); |
|
4357 shiftCount = 0x402F - aExp; |
|
4358 savedASig = aSig0; |
|
4359 aSig0 >>= shiftCount; |
|
4360 z = aSig0; |
|
4361 if ( aSign ) z = - z; |
|
4362 if ( ( z < 0 ) ^ aSign ) { |
|
4363 invalid: |
|
4364 float_raise( float_flag_invalid STATUS_VAR); |
|
4365 return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF; |
|
4366 } |
|
4367 if ( ( aSig0<<shiftCount ) != savedASig ) { |
|
4368 STATUS(float_exception_flags) |= float_flag_inexact; |
|
4369 } |
|
4370 return z; |
|
4371 |
|
4372 } |
|
4373 |
|
4374 /*---------------------------------------------------------------------------- |
|
4375 | Returns the result of converting the quadruple-precision floating-point |
|
4376 | value `a' to the 64-bit two's complement integer format. The conversion |
|
4377 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4378 | Arithmetic---which means in particular that the conversion is rounded |
|
4379 | according to the current rounding mode. If `a' is a NaN, the largest |
|
4380 | positive integer is returned. Otherwise, if the conversion overflows, the |
|
4381 | largest integer with the same sign as `a' is returned. |
|
4382 *----------------------------------------------------------------------------*/ |
|
4383 |
|
4384 int64 float128_to_int64( float128 a STATUS_PARAM ) |
|
4385 { |
|
4386 flag aSign; |
|
4387 int32 aExp, shiftCount; |
|
4388 bits64 aSig0, aSig1; |
|
4389 |
|
4390 aSig1 = extractFloat128Frac1( a ); |
|
4391 aSig0 = extractFloat128Frac0( a ); |
|
4392 aExp = extractFloat128Exp( a ); |
|
4393 aSign = extractFloat128Sign( a ); |
|
4394 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); |
|
4395 shiftCount = 0x402F - aExp; |
|
4396 if ( shiftCount <= 0 ) { |
|
4397 if ( 0x403E < aExp ) { |
|
4398 float_raise( float_flag_invalid STATUS_VAR); |
|
4399 if ( ! aSign |
|
4400 || ( ( aExp == 0x7FFF ) |
|
4401 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) ) |
|
4402 ) |
|
4403 ) { |
|
4404 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
4405 } |
|
4406 return (sbits64) LIT64( 0x8000000000000000 ); |
|
4407 } |
|
4408 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 ); |
|
4409 } |
|
4410 else { |
|
4411 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 ); |
|
4412 } |
|
4413 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR ); |
|
4414 |
|
4415 } |
|
4416 |
|
4417 /*---------------------------------------------------------------------------- |
|
4418 | Returns the result of converting the quadruple-precision floating-point |
|
4419 | value `a' to the 64-bit two's complement integer format. The conversion |
|
4420 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4421 | Arithmetic, except that the conversion is always rounded toward zero. |
|
4422 | If `a' is a NaN, the largest positive integer is returned. Otherwise, if |
|
4423 | the conversion overflows, the largest integer with the same sign as `a' is |
|
4424 | returned. |
|
4425 *----------------------------------------------------------------------------*/ |
|
4426 |
|
4427 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM ) |
|
4428 { |
|
4429 flag aSign; |
|
4430 int32 aExp, shiftCount; |
|
4431 bits64 aSig0, aSig1; |
|
4432 int64 z; |
|
4433 |
|
4434 aSig1 = extractFloat128Frac1( a ); |
|
4435 aSig0 = extractFloat128Frac0( a ); |
|
4436 aExp = extractFloat128Exp( a ); |
|
4437 aSign = extractFloat128Sign( a ); |
|
4438 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 ); |
|
4439 shiftCount = aExp - 0x402F; |
|
4440 if ( 0 < shiftCount ) { |
|
4441 if ( 0x403E <= aExp ) { |
|
4442 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF ); |
|
4443 if ( ( a.high == LIT64( 0xC03E000000000000 ) ) |
|
4444 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) { |
|
4445 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact; |
|
4446 } |
|
4447 else { |
|
4448 float_raise( float_flag_invalid STATUS_VAR); |
|
4449 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) { |
|
4450 return LIT64( 0x7FFFFFFFFFFFFFFF ); |
|
4451 } |
|
4452 } |
|
4453 return (sbits64) LIT64( 0x8000000000000000 ); |
|
4454 } |
|
4455 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) ); |
|
4456 if ( (bits64) ( aSig1<<shiftCount ) ) { |
|
4457 STATUS(float_exception_flags) |= float_flag_inexact; |
|
4458 } |
|
4459 } |
|
4460 else { |
|
4461 if ( aExp < 0x3FFF ) { |
|
4462 if ( aExp | aSig0 | aSig1 ) { |
|
4463 STATUS(float_exception_flags) |= float_flag_inexact; |
|
4464 } |
|
4465 return 0; |
|
4466 } |
|
4467 z = aSig0>>( - shiftCount ); |
|
4468 if ( aSig1 |
|
4469 || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) { |
|
4470 STATUS(float_exception_flags) |= float_flag_inexact; |
|
4471 } |
|
4472 } |
|
4473 if ( aSign ) z = - z; |
|
4474 return z; |
|
4475 |
|
4476 } |
|
4477 |
|
4478 /*---------------------------------------------------------------------------- |
|
4479 | Returns the result of converting the quadruple-precision floating-point |
|
4480 | value `a' to the single-precision floating-point format. The conversion |
|
4481 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4482 | Arithmetic. |
|
4483 *----------------------------------------------------------------------------*/ |
|
4484 |
|
4485 float32 float128_to_float32( float128 a STATUS_PARAM ) |
|
4486 { |
|
4487 flag aSign; |
|
4488 int32 aExp; |
|
4489 bits64 aSig0, aSig1; |
|
4490 bits32 zSig; |
|
4491 |
|
4492 aSig1 = extractFloat128Frac1( a ); |
|
4493 aSig0 = extractFloat128Frac0( a ); |
|
4494 aExp = extractFloat128Exp( a ); |
|
4495 aSign = extractFloat128Sign( a ); |
|
4496 if ( aExp == 0x7FFF ) { |
|
4497 if ( aSig0 | aSig1 ) { |
|
4498 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) ); |
|
4499 } |
|
4500 return packFloat32( aSign, 0xFF, 0 ); |
|
4501 } |
|
4502 aSig0 |= ( aSig1 != 0 ); |
|
4503 shift64RightJamming( aSig0, 18, &aSig0 ); |
|
4504 zSig = aSig0; |
|
4505 if ( aExp || zSig ) { |
|
4506 zSig |= 0x40000000; |
|
4507 aExp -= 0x3F81; |
|
4508 } |
|
4509 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR ); |
|
4510 |
|
4511 } |
|
4512 |
|
4513 /*---------------------------------------------------------------------------- |
|
4514 | Returns the result of converting the quadruple-precision floating-point |
|
4515 | value `a' to the double-precision floating-point format. The conversion |
|
4516 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
4517 | Arithmetic. |
|
4518 *----------------------------------------------------------------------------*/ |
|
4519 |
|
4520 float64 float128_to_float64( float128 a STATUS_PARAM ) |
|
4521 { |
|
4522 flag aSign; |
|
4523 int32 aExp; |
|
4524 bits64 aSig0, aSig1; |
|
4525 |
|
4526 aSig1 = extractFloat128Frac1( a ); |
|
4527 aSig0 = extractFloat128Frac0( a ); |
|
4528 aExp = extractFloat128Exp( a ); |
|
4529 aSign = extractFloat128Sign( a ); |
|
4530 if ( aExp == 0x7FFF ) { |
|
4531 if ( aSig0 | aSig1 ) { |
|
4532 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) ); |
|
4533 } |
|
4534 return packFloat64( aSign, 0x7FF, 0 ); |
|
4535 } |
|
4536 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); |
|
4537 aSig0 |= ( aSig1 != 0 ); |
|
4538 if ( aExp || aSig0 ) { |
|
4539 aSig0 |= LIT64( 0x4000000000000000 ); |
|
4540 aExp -= 0x3C01; |
|
4541 } |
|
4542 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR ); |
|
4543 |
|
4544 } |
|
4545 |
|
4546 #ifdef FLOATX80 |
|
4547 |
|
4548 /*---------------------------------------------------------------------------- |
|
4549 | Returns the result of converting the quadruple-precision floating-point |
|
4550 | value `a' to the extended double-precision floating-point format. The |
|
4551 | conversion is performed according to the IEC/IEEE Standard for Binary |
|
4552 | Floating-Point Arithmetic. |
|
4553 *----------------------------------------------------------------------------*/ |
|
4554 |
|
4555 floatx80 float128_to_floatx80( float128 a STATUS_PARAM ) |
|
4556 { |
|
4557 flag aSign; |
|
4558 int32 aExp; |
|
4559 bits64 aSig0, aSig1; |
|
4560 |
|
4561 aSig1 = extractFloat128Frac1( a ); |
|
4562 aSig0 = extractFloat128Frac0( a ); |
|
4563 aExp = extractFloat128Exp( a ); |
|
4564 aSign = extractFloat128Sign( a ); |
|
4565 if ( aExp == 0x7FFF ) { |
|
4566 if ( aSig0 | aSig1 ) { |
|
4567 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) ); |
|
4568 } |
|
4569 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) ); |
|
4570 } |
|
4571 if ( aExp == 0 ) { |
|
4572 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 ); |
|
4573 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); |
|
4574 } |
|
4575 else { |
|
4576 aSig0 |= LIT64( 0x0001000000000000 ); |
|
4577 } |
|
4578 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 ); |
|
4579 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR ); |
|
4580 |
|
4581 } |
|
4582 |
|
4583 #endif |
|
4584 |
|
4585 /*---------------------------------------------------------------------------- |
|
4586 | Rounds the quadruple-precision floating-point value `a' to an integer, and |
|
4587 | returns the result as a quadruple-precision floating-point value. The |
|
4588 | operation is performed according to the IEC/IEEE Standard for Binary |
|
4589 | Floating-Point Arithmetic. |
|
4590 *----------------------------------------------------------------------------*/ |
|
4591 |
|
4592 float128 float128_round_to_int( float128 a STATUS_PARAM ) |
|
4593 { |
|
4594 flag aSign; |
|
4595 int32 aExp; |
|
4596 bits64 lastBitMask, roundBitsMask; |
|
4597 int8 roundingMode; |
|
4598 float128 z; |
|
4599 |
|
4600 aExp = extractFloat128Exp( a ); |
|
4601 if ( 0x402F <= aExp ) { |
|
4602 if ( 0x406F <= aExp ) { |
|
4603 if ( ( aExp == 0x7FFF ) |
|
4604 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) |
|
4605 ) { |
|
4606 return propagateFloat128NaN( a, a STATUS_VAR ); |
|
4607 } |
|
4608 return a; |
|
4609 } |
|
4610 lastBitMask = 1; |
|
4611 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1; |
|
4612 roundBitsMask = lastBitMask - 1; |
|
4613 z = a; |
|
4614 roundingMode = STATUS(float_rounding_mode); |
|
4615 if ( roundingMode == float_round_nearest_even ) { |
|
4616 if ( lastBitMask ) { |
|
4617 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low ); |
|
4618 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask; |
|
4619 } |
|
4620 else { |
|
4621 if ( (sbits64) z.low < 0 ) { |
|
4622 ++z.high; |
|
4623 if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1; |
|
4624 } |
|
4625 } |
|
4626 } |
|
4627 else if ( roundingMode != float_round_to_zero ) { |
|
4628 if ( extractFloat128Sign( z ) |
|
4629 ^ ( roundingMode == float_round_up ) ) { |
|
4630 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low ); |
|
4631 } |
|
4632 } |
|
4633 z.low &= ~ roundBitsMask; |
|
4634 } |
|
4635 else { |
|
4636 if ( aExp < 0x3FFF ) { |
|
4637 if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a; |
|
4638 STATUS(float_exception_flags) |= float_flag_inexact; |
|
4639 aSign = extractFloat128Sign( a ); |
|
4640 switch ( STATUS(float_rounding_mode) ) { |
|
4641 case float_round_nearest_even: |
|
4642 if ( ( aExp == 0x3FFE ) |
|
4643 && ( extractFloat128Frac0( a ) |
|
4644 | extractFloat128Frac1( a ) ) |
|
4645 ) { |
|
4646 return packFloat128( aSign, 0x3FFF, 0, 0 ); |
|
4647 } |
|
4648 break; |
|
4649 case float_round_down: |
|
4650 return |
|
4651 aSign ? packFloat128( 1, 0x3FFF, 0, 0 ) |
|
4652 : packFloat128( 0, 0, 0, 0 ); |
|
4653 case float_round_up: |
|
4654 return |
|
4655 aSign ? packFloat128( 1, 0, 0, 0 ) |
|
4656 : packFloat128( 0, 0x3FFF, 0, 0 ); |
|
4657 } |
|
4658 return packFloat128( aSign, 0, 0, 0 ); |
|
4659 } |
|
4660 lastBitMask = 1; |
|
4661 lastBitMask <<= 0x402F - aExp; |
|
4662 roundBitsMask = lastBitMask - 1; |
|
4663 z.low = 0; |
|
4664 z.high = a.high; |
|
4665 roundingMode = STATUS(float_rounding_mode); |
|
4666 if ( roundingMode == float_round_nearest_even ) { |
|
4667 z.high += lastBitMask>>1; |
|
4668 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) { |
|
4669 z.high &= ~ lastBitMask; |
|
4670 } |
|
4671 } |
|
4672 else if ( roundingMode != float_round_to_zero ) { |
|
4673 if ( extractFloat128Sign( z ) |
|
4674 ^ ( roundingMode == float_round_up ) ) { |
|
4675 z.high |= ( a.low != 0 ); |
|
4676 z.high += roundBitsMask; |
|
4677 } |
|
4678 } |
|
4679 z.high &= ~ roundBitsMask; |
|
4680 } |
|
4681 if ( ( z.low != a.low ) || ( z.high != a.high ) ) { |
|
4682 STATUS(float_exception_flags) |= float_flag_inexact; |
|
4683 } |
|
4684 return z; |
|
4685 |
|
4686 } |
|
4687 |
|
4688 /*---------------------------------------------------------------------------- |
|
4689 | Returns the result of adding the absolute values of the quadruple-precision |
|
4690 | floating-point values `a' and `b'. If `zSign' is 1, the sum is negated |
|
4691 | before being returned. `zSign' is ignored if the result is a NaN. |
|
4692 | The addition is performed according to the IEC/IEEE Standard for Binary |
|
4693 | Floating-Point Arithmetic. |
|
4694 *----------------------------------------------------------------------------*/ |
|
4695 |
|
4696 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) |
|
4697 { |
|
4698 int32 aExp, bExp, zExp; |
|
4699 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; |
|
4700 int32 expDiff; |
|
4701 |
|
4702 aSig1 = extractFloat128Frac1( a ); |
|
4703 aSig0 = extractFloat128Frac0( a ); |
|
4704 aExp = extractFloat128Exp( a ); |
|
4705 bSig1 = extractFloat128Frac1( b ); |
|
4706 bSig0 = extractFloat128Frac0( b ); |
|
4707 bExp = extractFloat128Exp( b ); |
|
4708 expDiff = aExp - bExp; |
|
4709 if ( 0 < expDiff ) { |
|
4710 if ( aExp == 0x7FFF ) { |
|
4711 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4712 return a; |
|
4713 } |
|
4714 if ( bExp == 0 ) { |
|
4715 --expDiff; |
|
4716 } |
|
4717 else { |
|
4718 bSig0 |= LIT64( 0x0001000000000000 ); |
|
4719 } |
|
4720 shift128ExtraRightJamming( |
|
4721 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 ); |
|
4722 zExp = aExp; |
|
4723 } |
|
4724 else if ( expDiff < 0 ) { |
|
4725 if ( bExp == 0x7FFF ) { |
|
4726 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4727 return packFloat128( zSign, 0x7FFF, 0, 0 ); |
|
4728 } |
|
4729 if ( aExp == 0 ) { |
|
4730 ++expDiff; |
|
4731 } |
|
4732 else { |
|
4733 aSig0 |= LIT64( 0x0001000000000000 ); |
|
4734 } |
|
4735 shift128ExtraRightJamming( |
|
4736 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 ); |
|
4737 zExp = bExp; |
|
4738 } |
|
4739 else { |
|
4740 if ( aExp == 0x7FFF ) { |
|
4741 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { |
|
4742 return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4743 } |
|
4744 return a; |
|
4745 } |
|
4746 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); |
|
4747 if ( aExp == 0 ) { |
|
4748 if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 ); |
|
4749 return packFloat128( zSign, 0, zSig0, zSig1 ); |
|
4750 } |
|
4751 zSig2 = 0; |
|
4752 zSig0 |= LIT64( 0x0002000000000000 ); |
|
4753 zExp = aExp; |
|
4754 goto shiftRight1; |
|
4755 } |
|
4756 aSig0 |= LIT64( 0x0001000000000000 ); |
|
4757 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); |
|
4758 --zExp; |
|
4759 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack; |
|
4760 ++zExp; |
|
4761 shiftRight1: |
|
4762 shift128ExtraRightJamming( |
|
4763 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); |
|
4764 roundAndPack: |
|
4765 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); |
|
4766 |
|
4767 } |
|
4768 |
|
4769 /*---------------------------------------------------------------------------- |
|
4770 | Returns the result of subtracting the absolute values of the quadruple- |
|
4771 | precision floating-point values `a' and `b'. If `zSign' is 1, the |
|
4772 | difference is negated before being returned. `zSign' is ignored if the |
|
4773 | result is a NaN. The subtraction is performed according to the IEC/IEEE |
|
4774 | Standard for Binary Floating-Point Arithmetic. |
|
4775 *----------------------------------------------------------------------------*/ |
|
4776 |
|
4777 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM) |
|
4778 { |
|
4779 int32 aExp, bExp, zExp; |
|
4780 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1; |
|
4781 int32 expDiff; |
|
4782 float128 z; |
|
4783 |
|
4784 aSig1 = extractFloat128Frac1( a ); |
|
4785 aSig0 = extractFloat128Frac0( a ); |
|
4786 aExp = extractFloat128Exp( a ); |
|
4787 bSig1 = extractFloat128Frac1( b ); |
|
4788 bSig0 = extractFloat128Frac0( b ); |
|
4789 bExp = extractFloat128Exp( b ); |
|
4790 expDiff = aExp - bExp; |
|
4791 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 ); |
|
4792 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 ); |
|
4793 if ( 0 < expDiff ) goto aExpBigger; |
|
4794 if ( expDiff < 0 ) goto bExpBigger; |
|
4795 if ( aExp == 0x7FFF ) { |
|
4796 if ( aSig0 | aSig1 | bSig0 | bSig1 ) { |
|
4797 return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4798 } |
|
4799 float_raise( float_flag_invalid STATUS_VAR); |
|
4800 z.low = float128_default_nan_low; |
|
4801 z.high = float128_default_nan_high; |
|
4802 return z; |
|
4803 } |
|
4804 if ( aExp == 0 ) { |
|
4805 aExp = 1; |
|
4806 bExp = 1; |
|
4807 } |
|
4808 if ( bSig0 < aSig0 ) goto aBigger; |
|
4809 if ( aSig0 < bSig0 ) goto bBigger; |
|
4810 if ( bSig1 < aSig1 ) goto aBigger; |
|
4811 if ( aSig1 < bSig1 ) goto bBigger; |
|
4812 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 ); |
|
4813 bExpBigger: |
|
4814 if ( bExp == 0x7FFF ) { |
|
4815 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4816 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 ); |
|
4817 } |
|
4818 if ( aExp == 0 ) { |
|
4819 ++expDiff; |
|
4820 } |
|
4821 else { |
|
4822 aSig0 |= LIT64( 0x4000000000000000 ); |
|
4823 } |
|
4824 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); |
|
4825 bSig0 |= LIT64( 0x4000000000000000 ); |
|
4826 bBigger: |
|
4827 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 ); |
|
4828 zExp = bExp; |
|
4829 zSign ^= 1; |
|
4830 goto normalizeRoundAndPack; |
|
4831 aExpBigger: |
|
4832 if ( aExp == 0x7FFF ) { |
|
4833 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4834 return a; |
|
4835 } |
|
4836 if ( bExp == 0 ) { |
|
4837 --expDiff; |
|
4838 } |
|
4839 else { |
|
4840 bSig0 |= LIT64( 0x4000000000000000 ); |
|
4841 } |
|
4842 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 ); |
|
4843 aSig0 |= LIT64( 0x4000000000000000 ); |
|
4844 aBigger: |
|
4845 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 ); |
|
4846 zExp = aExp; |
|
4847 normalizeRoundAndPack: |
|
4848 --zExp; |
|
4849 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR ); |
|
4850 |
|
4851 } |
|
4852 |
|
4853 /*---------------------------------------------------------------------------- |
|
4854 | Returns the result of adding the quadruple-precision floating-point values |
|
4855 | `a' and `b'. The operation is performed according to the IEC/IEEE Standard |
|
4856 | for Binary Floating-Point Arithmetic. |
|
4857 *----------------------------------------------------------------------------*/ |
|
4858 |
|
4859 float128 float128_add( float128 a, float128 b STATUS_PARAM ) |
|
4860 { |
|
4861 flag aSign, bSign; |
|
4862 |
|
4863 aSign = extractFloat128Sign( a ); |
|
4864 bSign = extractFloat128Sign( b ); |
|
4865 if ( aSign == bSign ) { |
|
4866 return addFloat128Sigs( a, b, aSign STATUS_VAR ); |
|
4867 } |
|
4868 else { |
|
4869 return subFloat128Sigs( a, b, aSign STATUS_VAR ); |
|
4870 } |
|
4871 |
|
4872 } |
|
4873 |
|
4874 /*---------------------------------------------------------------------------- |
|
4875 | Returns the result of subtracting the quadruple-precision floating-point |
|
4876 | values `a' and `b'. The operation is performed according to the IEC/IEEE |
|
4877 | Standard for Binary Floating-Point Arithmetic. |
|
4878 *----------------------------------------------------------------------------*/ |
|
4879 |
|
4880 float128 float128_sub( float128 a, float128 b STATUS_PARAM ) |
|
4881 { |
|
4882 flag aSign, bSign; |
|
4883 |
|
4884 aSign = extractFloat128Sign( a ); |
|
4885 bSign = extractFloat128Sign( b ); |
|
4886 if ( aSign == bSign ) { |
|
4887 return subFloat128Sigs( a, b, aSign STATUS_VAR ); |
|
4888 } |
|
4889 else { |
|
4890 return addFloat128Sigs( a, b, aSign STATUS_VAR ); |
|
4891 } |
|
4892 |
|
4893 } |
|
4894 |
|
4895 /*---------------------------------------------------------------------------- |
|
4896 | Returns the result of multiplying the quadruple-precision floating-point |
|
4897 | values `a' and `b'. The operation is performed according to the IEC/IEEE |
|
4898 | Standard for Binary Floating-Point Arithmetic. |
|
4899 *----------------------------------------------------------------------------*/ |
|
4900 |
|
4901 float128 float128_mul( float128 a, float128 b STATUS_PARAM ) |
|
4902 { |
|
4903 flag aSign, bSign, zSign; |
|
4904 int32 aExp, bExp, zExp; |
|
4905 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3; |
|
4906 float128 z; |
|
4907 |
|
4908 aSig1 = extractFloat128Frac1( a ); |
|
4909 aSig0 = extractFloat128Frac0( a ); |
|
4910 aExp = extractFloat128Exp( a ); |
|
4911 aSign = extractFloat128Sign( a ); |
|
4912 bSig1 = extractFloat128Frac1( b ); |
|
4913 bSig0 = extractFloat128Frac0( b ); |
|
4914 bExp = extractFloat128Exp( b ); |
|
4915 bSign = extractFloat128Sign( b ); |
|
4916 zSign = aSign ^ bSign; |
|
4917 if ( aExp == 0x7FFF ) { |
|
4918 if ( ( aSig0 | aSig1 ) |
|
4919 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { |
|
4920 return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4921 } |
|
4922 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid; |
|
4923 return packFloat128( zSign, 0x7FFF, 0, 0 ); |
|
4924 } |
|
4925 if ( bExp == 0x7FFF ) { |
|
4926 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4927 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { |
|
4928 invalid: |
|
4929 float_raise( float_flag_invalid STATUS_VAR); |
|
4930 z.low = float128_default_nan_low; |
|
4931 z.high = float128_default_nan_high; |
|
4932 return z; |
|
4933 } |
|
4934 return packFloat128( zSign, 0x7FFF, 0, 0 ); |
|
4935 } |
|
4936 if ( aExp == 0 ) { |
|
4937 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); |
|
4938 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); |
|
4939 } |
|
4940 if ( bExp == 0 ) { |
|
4941 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); |
|
4942 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); |
|
4943 } |
|
4944 zExp = aExp + bExp - 0x4000; |
|
4945 aSig0 |= LIT64( 0x0001000000000000 ); |
|
4946 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 ); |
|
4947 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 ); |
|
4948 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 ); |
|
4949 zSig2 |= ( zSig3 != 0 ); |
|
4950 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) { |
|
4951 shift128ExtraRightJamming( |
|
4952 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 ); |
|
4953 ++zExp; |
|
4954 } |
|
4955 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); |
|
4956 |
|
4957 } |
|
4958 |
|
4959 /*---------------------------------------------------------------------------- |
|
4960 | Returns the result of dividing the quadruple-precision floating-point value |
|
4961 | `a' by the corresponding value `b'. The operation is performed according to |
|
4962 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
4963 *----------------------------------------------------------------------------*/ |
|
4964 |
|
4965 float128 float128_div( float128 a, float128 b STATUS_PARAM ) |
|
4966 { |
|
4967 flag aSign, bSign, zSign; |
|
4968 int32 aExp, bExp, zExp; |
|
4969 bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2; |
|
4970 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; |
|
4971 float128 z; |
|
4972 |
|
4973 aSig1 = extractFloat128Frac1( a ); |
|
4974 aSig0 = extractFloat128Frac0( a ); |
|
4975 aExp = extractFloat128Exp( a ); |
|
4976 aSign = extractFloat128Sign( a ); |
|
4977 bSig1 = extractFloat128Frac1( b ); |
|
4978 bSig0 = extractFloat128Frac0( b ); |
|
4979 bExp = extractFloat128Exp( b ); |
|
4980 bSign = extractFloat128Sign( b ); |
|
4981 zSign = aSign ^ bSign; |
|
4982 if ( aExp == 0x7FFF ) { |
|
4983 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4984 if ( bExp == 0x7FFF ) { |
|
4985 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4986 goto invalid; |
|
4987 } |
|
4988 return packFloat128( zSign, 0x7FFF, 0, 0 ); |
|
4989 } |
|
4990 if ( bExp == 0x7FFF ) { |
|
4991 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
4992 return packFloat128( zSign, 0, 0, 0 ); |
|
4993 } |
|
4994 if ( bExp == 0 ) { |
|
4995 if ( ( bSig0 | bSig1 ) == 0 ) { |
|
4996 if ( ( aExp | aSig0 | aSig1 ) == 0 ) { |
|
4997 invalid: |
|
4998 float_raise( float_flag_invalid STATUS_VAR); |
|
4999 z.low = float128_default_nan_low; |
|
5000 z.high = float128_default_nan_high; |
|
5001 return z; |
|
5002 } |
|
5003 float_raise( float_flag_divbyzero STATUS_VAR); |
|
5004 return packFloat128( zSign, 0x7FFF, 0, 0 ); |
|
5005 } |
|
5006 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); |
|
5007 } |
|
5008 if ( aExp == 0 ) { |
|
5009 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 ); |
|
5010 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); |
|
5011 } |
|
5012 zExp = aExp - bExp + 0x3FFD; |
|
5013 shortShift128Left( |
|
5014 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 ); |
|
5015 shortShift128Left( |
|
5016 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); |
|
5017 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) { |
|
5018 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 ); |
|
5019 ++zExp; |
|
5020 } |
|
5021 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 ); |
|
5022 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 ); |
|
5023 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 ); |
|
5024 while ( (sbits64) rem0 < 0 ) { |
|
5025 --zSig0; |
|
5026 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 ); |
|
5027 } |
|
5028 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 ); |
|
5029 if ( ( zSig1 & 0x3FFF ) <= 4 ) { |
|
5030 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 ); |
|
5031 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 ); |
|
5032 while ( (sbits64) rem1 < 0 ) { |
|
5033 --zSig1; |
|
5034 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 ); |
|
5035 } |
|
5036 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); |
|
5037 } |
|
5038 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 ); |
|
5039 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); |
|
5040 |
|
5041 } |
|
5042 |
|
5043 /*---------------------------------------------------------------------------- |
|
5044 | Returns the remainder of the quadruple-precision floating-point value `a' |
|
5045 | with respect to the corresponding value `b'. The operation is performed |
|
5046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
5047 *----------------------------------------------------------------------------*/ |
|
5048 |
|
5049 float128 float128_rem( float128 a, float128 b STATUS_PARAM ) |
|
5050 { |
|
5051 flag aSign, bSign, zSign; |
|
5052 int32 aExp, bExp, expDiff; |
|
5053 bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2; |
|
5054 bits64 allZero, alternateASig0, alternateASig1, sigMean1; |
|
5055 sbits64 sigMean0; |
|
5056 float128 z; |
|
5057 |
|
5058 aSig1 = extractFloat128Frac1( a ); |
|
5059 aSig0 = extractFloat128Frac0( a ); |
|
5060 aExp = extractFloat128Exp( a ); |
|
5061 aSign = extractFloat128Sign( a ); |
|
5062 bSig1 = extractFloat128Frac1( b ); |
|
5063 bSig0 = extractFloat128Frac0( b ); |
|
5064 bExp = extractFloat128Exp( b ); |
|
5065 bSign = extractFloat128Sign( b ); |
|
5066 if ( aExp == 0x7FFF ) { |
|
5067 if ( ( aSig0 | aSig1 ) |
|
5068 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) { |
|
5069 return propagateFloat128NaN( a, b STATUS_VAR ); |
|
5070 } |
|
5071 goto invalid; |
|
5072 } |
|
5073 if ( bExp == 0x7FFF ) { |
|
5074 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR ); |
|
5075 return a; |
|
5076 } |
|
5077 if ( bExp == 0 ) { |
|
5078 if ( ( bSig0 | bSig1 ) == 0 ) { |
|
5079 invalid: |
|
5080 float_raise( float_flag_invalid STATUS_VAR); |
|
5081 z.low = float128_default_nan_low; |
|
5082 z.high = float128_default_nan_high; |
|
5083 return z; |
|
5084 } |
|
5085 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 ); |
|
5086 } |
|
5087 if ( aExp == 0 ) { |
|
5088 if ( ( aSig0 | aSig1 ) == 0 ) return a; |
|
5089 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); |
|
5090 } |
|
5091 expDiff = aExp - bExp; |
|
5092 if ( expDiff < -1 ) return a; |
|
5093 shortShift128Left( |
|
5094 aSig0 | LIT64( 0x0001000000000000 ), |
|
5095 aSig1, |
|
5096 15 - ( expDiff < 0 ), |
|
5097 &aSig0, |
|
5098 &aSig1 |
|
5099 ); |
|
5100 shortShift128Left( |
|
5101 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 ); |
|
5102 q = le128( bSig0, bSig1, aSig0, aSig1 ); |
|
5103 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); |
|
5104 expDiff -= 64; |
|
5105 while ( 0 < expDiff ) { |
|
5106 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); |
|
5107 q = ( 4 < q ) ? q - 4 : 0; |
|
5108 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); |
|
5109 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero ); |
|
5110 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero ); |
|
5111 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 ); |
|
5112 expDiff -= 61; |
|
5113 } |
|
5114 if ( -64 < expDiff ) { |
|
5115 q = estimateDiv128To64( aSig0, aSig1, bSig0 ); |
|
5116 q = ( 4 < q ) ? q - 4 : 0; |
|
5117 q >>= - expDiff; |
|
5118 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); |
|
5119 expDiff += 52; |
|
5120 if ( expDiff < 0 ) { |
|
5121 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 ); |
|
5122 } |
|
5123 else { |
|
5124 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 ); |
|
5125 } |
|
5126 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 ); |
|
5127 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 ); |
|
5128 } |
|
5129 else { |
|
5130 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 ); |
|
5131 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 ); |
|
5132 } |
|
5133 do { |
|
5134 alternateASig0 = aSig0; |
|
5135 alternateASig1 = aSig1; |
|
5136 ++q; |
|
5137 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 ); |
|
5138 } while ( 0 <= (sbits64) aSig0 ); |
|
5139 add128( |
|
5140 aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 ); |
|
5141 if ( ( sigMean0 < 0 ) |
|
5142 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) { |
|
5143 aSig0 = alternateASig0; |
|
5144 aSig1 = alternateASig1; |
|
5145 } |
|
5146 zSign = ( (sbits64) aSig0 < 0 ); |
|
5147 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 ); |
|
5148 return |
|
5149 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR ); |
|
5150 |
|
5151 } |
|
5152 |
|
5153 /*---------------------------------------------------------------------------- |
|
5154 | Returns the square root of the quadruple-precision floating-point value `a'. |
|
5155 | The operation is performed according to the IEC/IEEE Standard for Binary |
|
5156 | Floating-Point Arithmetic. |
|
5157 *----------------------------------------------------------------------------*/ |
|
5158 |
|
5159 float128 float128_sqrt( float128 a STATUS_PARAM ) |
|
5160 { |
|
5161 flag aSign; |
|
5162 int32 aExp, zExp; |
|
5163 bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0; |
|
5164 bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3; |
|
5165 float128 z; |
|
5166 |
|
5167 aSig1 = extractFloat128Frac1( a ); |
|
5168 aSig0 = extractFloat128Frac0( a ); |
|
5169 aExp = extractFloat128Exp( a ); |
|
5170 aSign = extractFloat128Sign( a ); |
|
5171 if ( aExp == 0x7FFF ) { |
|
5172 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR ); |
|
5173 if ( ! aSign ) return a; |
|
5174 goto invalid; |
|
5175 } |
|
5176 if ( aSign ) { |
|
5177 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a; |
|
5178 invalid: |
|
5179 float_raise( float_flag_invalid STATUS_VAR); |
|
5180 z.low = float128_default_nan_low; |
|
5181 z.high = float128_default_nan_high; |
|
5182 return z; |
|
5183 } |
|
5184 if ( aExp == 0 ) { |
|
5185 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 ); |
|
5186 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 ); |
|
5187 } |
|
5188 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE; |
|
5189 aSig0 |= LIT64( 0x0001000000000000 ); |
|
5190 zSig0 = estimateSqrt32( aExp, aSig0>>17 ); |
|
5191 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 ); |
|
5192 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 ); |
|
5193 doubleZSig0 = zSig0<<1; |
|
5194 mul64To128( zSig0, zSig0, &term0, &term1 ); |
|
5195 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 ); |
|
5196 while ( (sbits64) rem0 < 0 ) { |
|
5197 --zSig0; |
|
5198 doubleZSig0 -= 2; |
|
5199 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 ); |
|
5200 } |
|
5201 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 ); |
|
5202 if ( ( zSig1 & 0x1FFF ) <= 5 ) { |
|
5203 if ( zSig1 == 0 ) zSig1 = 1; |
|
5204 mul64To128( doubleZSig0, zSig1, &term1, &term2 ); |
|
5205 sub128( rem1, 0, term1, term2, &rem1, &rem2 ); |
|
5206 mul64To128( zSig1, zSig1, &term2, &term3 ); |
|
5207 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 ); |
|
5208 while ( (sbits64) rem1 < 0 ) { |
|
5209 --zSig1; |
|
5210 shortShift128Left( 0, zSig1, 1, &term2, &term3 ); |
|
5211 term3 |= 1; |
|
5212 term2 |= doubleZSig0; |
|
5213 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 ); |
|
5214 } |
|
5215 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 ); |
|
5216 } |
|
5217 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 ); |
|
5218 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR ); |
|
5219 |
|
5220 } |
|
5221 |
|
5222 /*---------------------------------------------------------------------------- |
|
5223 | Returns 1 if the quadruple-precision floating-point value `a' is equal to |
|
5224 | the corresponding value `b', and 0 otherwise. The comparison is performed |
|
5225 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
5226 *----------------------------------------------------------------------------*/ |
|
5227 |
|
5228 int float128_eq( float128 a, float128 b STATUS_PARAM ) |
|
5229 { |
|
5230 |
|
5231 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) |
|
5232 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) |
|
5233 || ( ( extractFloat128Exp( b ) == 0x7FFF ) |
|
5234 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) |
|
5235 ) { |
|
5236 if ( float128_is_signaling_nan( a ) |
|
5237 || float128_is_signaling_nan( b ) ) { |
|
5238 float_raise( float_flag_invalid STATUS_VAR); |
|
5239 } |
|
5240 return 0; |
|
5241 } |
|
5242 return |
|
5243 ( a.low == b.low ) |
|
5244 && ( ( a.high == b.high ) |
|
5245 || ( ( a.low == 0 ) |
|
5246 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) |
|
5247 ); |
|
5248 |
|
5249 } |
|
5250 |
|
5251 /*---------------------------------------------------------------------------- |
|
5252 | Returns 1 if the quadruple-precision floating-point value `a' is less than |
|
5253 | or equal to the corresponding value `b', and 0 otherwise. The comparison |
|
5254 | is performed according to the IEC/IEEE Standard for Binary Floating-Point |
|
5255 | Arithmetic. |
|
5256 *----------------------------------------------------------------------------*/ |
|
5257 |
|
5258 int float128_le( float128 a, float128 b STATUS_PARAM ) |
|
5259 { |
|
5260 flag aSign, bSign; |
|
5261 |
|
5262 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) |
|
5263 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) |
|
5264 || ( ( extractFloat128Exp( b ) == 0x7FFF ) |
|
5265 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) |
|
5266 ) { |
|
5267 float_raise( float_flag_invalid STATUS_VAR); |
|
5268 return 0; |
|
5269 } |
|
5270 aSign = extractFloat128Sign( a ); |
|
5271 bSign = extractFloat128Sign( b ); |
|
5272 if ( aSign != bSign ) { |
|
5273 return |
|
5274 aSign |
|
5275 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
5276 == 0 ); |
|
5277 } |
|
5278 return |
|
5279 aSign ? le128( b.high, b.low, a.high, a.low ) |
|
5280 : le128( a.high, a.low, b.high, b.low ); |
|
5281 |
|
5282 } |
|
5283 |
|
5284 /*---------------------------------------------------------------------------- |
|
5285 | Returns 1 if the quadruple-precision floating-point value `a' is less than |
|
5286 | the corresponding value `b', and 0 otherwise. The comparison is performed |
|
5287 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
5288 *----------------------------------------------------------------------------*/ |
|
5289 |
|
5290 int float128_lt( float128 a, float128 b STATUS_PARAM ) |
|
5291 { |
|
5292 flag aSign, bSign; |
|
5293 |
|
5294 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) |
|
5295 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) |
|
5296 || ( ( extractFloat128Exp( b ) == 0x7FFF ) |
|
5297 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) |
|
5298 ) { |
|
5299 float_raise( float_flag_invalid STATUS_VAR); |
|
5300 return 0; |
|
5301 } |
|
5302 aSign = extractFloat128Sign( a ); |
|
5303 bSign = extractFloat128Sign( b ); |
|
5304 if ( aSign != bSign ) { |
|
5305 return |
|
5306 aSign |
|
5307 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
5308 != 0 ); |
|
5309 } |
|
5310 return |
|
5311 aSign ? lt128( b.high, b.low, a.high, a.low ) |
|
5312 : lt128( a.high, a.low, b.high, b.low ); |
|
5313 |
|
5314 } |
|
5315 |
|
5316 /*---------------------------------------------------------------------------- |
|
5317 | Returns 1 if the quadruple-precision floating-point value `a' is equal to |
|
5318 | the corresponding value `b', and 0 otherwise. The invalid exception is |
|
5319 | raised if either operand is a NaN. Otherwise, the comparison is performed |
|
5320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
5321 *----------------------------------------------------------------------------*/ |
|
5322 |
|
5323 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM ) |
|
5324 { |
|
5325 |
|
5326 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) |
|
5327 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) |
|
5328 || ( ( extractFloat128Exp( b ) == 0x7FFF ) |
|
5329 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) |
|
5330 ) { |
|
5331 float_raise( float_flag_invalid STATUS_VAR); |
|
5332 return 0; |
|
5333 } |
|
5334 return |
|
5335 ( a.low == b.low ) |
|
5336 && ( ( a.high == b.high ) |
|
5337 || ( ( a.low == 0 ) |
|
5338 && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) ) |
|
5339 ); |
|
5340 |
|
5341 } |
|
5342 |
|
5343 /*---------------------------------------------------------------------------- |
|
5344 | Returns 1 if the quadruple-precision floating-point value `a' is less than |
|
5345 | or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not |
|
5346 | cause an exception. Otherwise, the comparison is performed according to the |
|
5347 | IEC/IEEE Standard for Binary Floating-Point Arithmetic. |
|
5348 *----------------------------------------------------------------------------*/ |
|
5349 |
|
5350 int float128_le_quiet( float128 a, float128 b STATUS_PARAM ) |
|
5351 { |
|
5352 flag aSign, bSign; |
|
5353 |
|
5354 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) |
|
5355 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) |
|
5356 || ( ( extractFloat128Exp( b ) == 0x7FFF ) |
|
5357 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) |
|
5358 ) { |
|
5359 if ( float128_is_signaling_nan( a ) |
|
5360 || float128_is_signaling_nan( b ) ) { |
|
5361 float_raise( float_flag_invalid STATUS_VAR); |
|
5362 } |
|
5363 return 0; |
|
5364 } |
|
5365 aSign = extractFloat128Sign( a ); |
|
5366 bSign = extractFloat128Sign( b ); |
|
5367 if ( aSign != bSign ) { |
|
5368 return |
|
5369 aSign |
|
5370 || ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
5371 == 0 ); |
|
5372 } |
|
5373 return |
|
5374 aSign ? le128( b.high, b.low, a.high, a.low ) |
|
5375 : le128( a.high, a.low, b.high, b.low ); |
|
5376 |
|
5377 } |
|
5378 |
|
5379 /*---------------------------------------------------------------------------- |
|
5380 | Returns 1 if the quadruple-precision floating-point value `a' is less than |
|
5381 | the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an |
|
5382 | exception. Otherwise, the comparison is performed according to the IEC/IEEE |
|
5383 | Standard for Binary Floating-Point Arithmetic. |
|
5384 *----------------------------------------------------------------------------*/ |
|
5385 |
|
5386 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM ) |
|
5387 { |
|
5388 flag aSign, bSign; |
|
5389 |
|
5390 if ( ( ( extractFloat128Exp( a ) == 0x7FFF ) |
|
5391 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) |
|
5392 || ( ( extractFloat128Exp( b ) == 0x7FFF ) |
|
5393 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) ) |
|
5394 ) { |
|
5395 if ( float128_is_signaling_nan( a ) |
|
5396 || float128_is_signaling_nan( b ) ) { |
|
5397 float_raise( float_flag_invalid STATUS_VAR); |
|
5398 } |
|
5399 return 0; |
|
5400 } |
|
5401 aSign = extractFloat128Sign( a ); |
|
5402 bSign = extractFloat128Sign( b ); |
|
5403 if ( aSign != bSign ) { |
|
5404 return |
|
5405 aSign |
|
5406 && ( ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low ) |
|
5407 != 0 ); |
|
5408 } |
|
5409 return |
|
5410 aSign ? lt128( b.high, b.low, a.high, a.low ) |
|
5411 : lt128( a.high, a.low, b.high, b.low ); |
|
5412 |
|
5413 } |
|
5414 |
|
5415 #endif |
|
5416 |
|
5417 /* misc functions */ |
|
5418 float32 uint32_to_float32( unsigned int a STATUS_PARAM ) |
|
5419 { |
|
5420 return int64_to_float32(a STATUS_VAR); |
|
5421 } |
|
5422 |
|
5423 float64 uint32_to_float64( unsigned int a STATUS_PARAM ) |
|
5424 { |
|
5425 return int64_to_float64(a STATUS_VAR); |
|
5426 } |
|
5427 |
|
5428 unsigned int float32_to_uint32( float32 a STATUS_PARAM ) |
|
5429 { |
|
5430 int64_t v; |
|
5431 unsigned int res; |
|
5432 |
|
5433 v = float32_to_int64(a STATUS_VAR); |
|
5434 if (v < 0) { |
|
5435 res = 0; |
|
5436 float_raise( float_flag_invalid STATUS_VAR); |
|
5437 } else if (v > 0xffffffff) { |
|
5438 res = 0xffffffff; |
|
5439 float_raise( float_flag_invalid STATUS_VAR); |
|
5440 } else { |
|
5441 res = v; |
|
5442 } |
|
5443 return res; |
|
5444 } |
|
5445 |
|
5446 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM ) |
|
5447 { |
|
5448 int64_t v; |
|
5449 unsigned int res; |
|
5450 |
|
5451 v = float32_to_int64_round_to_zero(a STATUS_VAR); |
|
5452 if (v < 0) { |
|
5453 res = 0; |
|
5454 float_raise( float_flag_invalid STATUS_VAR); |
|
5455 } else if (v > 0xffffffff) { |
|
5456 res = 0xffffffff; |
|
5457 float_raise( float_flag_invalid STATUS_VAR); |
|
5458 } else { |
|
5459 res = v; |
|
5460 } |
|
5461 return res; |
|
5462 } |
|
5463 |
|
5464 unsigned int float64_to_uint32( float64 a STATUS_PARAM ) |
|
5465 { |
|
5466 int64_t v; |
|
5467 unsigned int res; |
|
5468 |
|
5469 v = float64_to_int64(a STATUS_VAR); |
|
5470 if (v < 0) { |
|
5471 res = 0; |
|
5472 float_raise( float_flag_invalid STATUS_VAR); |
|
5473 } else if (v > 0xffffffff) { |
|
5474 res = 0xffffffff; |
|
5475 float_raise( float_flag_invalid STATUS_VAR); |
|
5476 } else { |
|
5477 res = v; |
|
5478 } |
|
5479 return res; |
|
5480 } |
|
5481 |
|
5482 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM ) |
|
5483 { |
|
5484 int64_t v; |
|
5485 unsigned int res; |
|
5486 |
|
5487 v = float64_to_int64_round_to_zero(a STATUS_VAR); |
|
5488 if (v < 0) { |
|
5489 res = 0; |
|
5490 float_raise( float_flag_invalid STATUS_VAR); |
|
5491 } else if (v > 0xffffffff) { |
|
5492 res = 0xffffffff; |
|
5493 float_raise( float_flag_invalid STATUS_VAR); |
|
5494 } else { |
|
5495 res = v; |
|
5496 } |
|
5497 return res; |
|
5498 } |
|
5499 |
|
5500 /* FIXME: This looks broken. */ |
|
5501 uint64_t float64_to_uint64 (float64 a STATUS_PARAM) |
|
5502 { |
|
5503 int64_t v; |
|
5504 |
|
5505 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); |
|
5506 v += float64_val(a); |
|
5507 v = float64_to_int64(make_float64(v) STATUS_VAR); |
|
5508 |
|
5509 return v - INT64_MIN; |
|
5510 } |
|
5511 |
|
5512 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM) |
|
5513 { |
|
5514 int64_t v; |
|
5515 |
|
5516 v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR)); |
|
5517 v += float64_val(a); |
|
5518 v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR); |
|
5519 |
|
5520 return v - INT64_MIN; |
|
5521 } |
|
5522 |
|
5523 #define COMPARE(s, nan_exp) \ |
|
5524 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b, \ |
|
5525 int is_quiet STATUS_PARAM ) \ |
|
5526 { \ |
|
5527 flag aSign, bSign; \ |
|
5528 bits ## s av, bv; \ |
|
5529 \ |
|
5530 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \ |
|
5531 extractFloat ## s ## Frac( a ) ) || \ |
|
5532 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \ |
|
5533 extractFloat ## s ## Frac( b ) )) { \ |
|
5534 if (!is_quiet || \ |
|
5535 float ## s ## _is_signaling_nan( a ) || \ |
|
5536 float ## s ## _is_signaling_nan( b ) ) { \ |
|
5537 float_raise( float_flag_invalid STATUS_VAR); \ |
|
5538 } \ |
|
5539 return float_relation_unordered; \ |
|
5540 } \ |
|
5541 aSign = extractFloat ## s ## Sign( a ); \ |
|
5542 bSign = extractFloat ## s ## Sign( b ); \ |
|
5543 av = float ## s ## _val(a); \ |
|
5544 bv = float ## s ## _val(b); \ |
|
5545 if ( aSign != bSign ) { \ |
|
5546 if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) { \ |
|
5547 /* zero case */ \ |
|
5548 return float_relation_equal; \ |
|
5549 } else { \ |
|
5550 return 1 - (2 * aSign); \ |
|
5551 } \ |
|
5552 } else { \ |
|
5553 if (av == bv) { \ |
|
5554 return float_relation_equal; \ |
|
5555 } else { \ |
|
5556 return 1 - 2 * (aSign ^ ( av < bv )); \ |
|
5557 } \ |
|
5558 } \ |
|
5559 } \ |
|
5560 \ |
|
5561 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \ |
|
5562 { \ |
|
5563 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \ |
|
5564 } \ |
|
5565 \ |
|
5566 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \ |
|
5567 { \ |
|
5568 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \ |
|
5569 } |
|
5570 |
|
5571 COMPARE(32, 0xff) |
|
5572 COMPARE(64, 0x7ff) |
|
5573 |
|
5574 INLINE int float128_compare_internal( float128 a, float128 b, |
|
5575 int is_quiet STATUS_PARAM ) |
|
5576 { |
|
5577 flag aSign, bSign; |
|
5578 |
|
5579 if (( ( extractFloat128Exp( a ) == 0x7fff ) && |
|
5580 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) || |
|
5581 ( ( extractFloat128Exp( b ) == 0x7fff ) && |
|
5582 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) { |
|
5583 if (!is_quiet || |
|
5584 float128_is_signaling_nan( a ) || |
|
5585 float128_is_signaling_nan( b ) ) { |
|
5586 float_raise( float_flag_invalid STATUS_VAR); |
|
5587 } |
|
5588 return float_relation_unordered; |
|
5589 } |
|
5590 aSign = extractFloat128Sign( a ); |
|
5591 bSign = extractFloat128Sign( b ); |
|
5592 if ( aSign != bSign ) { |
|
5593 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) { |
|
5594 /* zero case */ |
|
5595 return float_relation_equal; |
|
5596 } else { |
|
5597 return 1 - (2 * aSign); |
|
5598 } |
|
5599 } else { |
|
5600 if (a.low == b.low && a.high == b.high) { |
|
5601 return float_relation_equal; |
|
5602 } else { |
|
5603 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) )); |
|
5604 } |
|
5605 } |
|
5606 } |
|
5607 |
|
5608 int float128_compare( float128 a, float128 b STATUS_PARAM ) |
|
5609 { |
|
5610 return float128_compare_internal(a, b, 0 STATUS_VAR); |
|
5611 } |
|
5612 |
|
5613 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM ) |
|
5614 { |
|
5615 return float128_compare_internal(a, b, 1 STATUS_VAR); |
|
5616 } |
|
5617 |
|
5618 /* Multiply A by 2 raised to the power N. */ |
|
5619 float32 float32_scalbn( float32 a, int n STATUS_PARAM ) |
|
5620 { |
|
5621 flag aSign; |
|
5622 int16 aExp; |
|
5623 bits32 aSig; |
|
5624 |
|
5625 aSig = extractFloat32Frac( a ); |
|
5626 aExp = extractFloat32Exp( a ); |
|
5627 aSign = extractFloat32Sign( a ); |
|
5628 |
|
5629 if ( aExp == 0xFF ) { |
|
5630 return a; |
|
5631 } |
|
5632 if ( aExp != 0 ) |
|
5633 aSig |= 0x00800000; |
|
5634 else if ( aSig == 0 ) |
|
5635 return a; |
|
5636 |
|
5637 aExp += n - 1; |
|
5638 aSig <<= 7; |
|
5639 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR ); |
|
5640 } |
|
5641 |
|
5642 float64 float64_scalbn( float64 a, int n STATUS_PARAM ) |
|
5643 { |
|
5644 flag aSign; |
|
5645 int16 aExp; |
|
5646 bits64 aSig; |
|
5647 |
|
5648 aSig = extractFloat64Frac( a ); |
|
5649 aExp = extractFloat64Exp( a ); |
|
5650 aSign = extractFloat64Sign( a ); |
|
5651 |
|
5652 if ( aExp == 0x7FF ) { |
|
5653 return a; |
|
5654 } |
|
5655 if ( aExp != 0 ) |
|
5656 aSig |= LIT64( 0x0010000000000000 ); |
|
5657 else if ( aSig == 0 ) |
|
5658 return a; |
|
5659 |
|
5660 aExp += n - 1; |
|
5661 aSig <<= 10; |
|
5662 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR ); |
|
5663 } |
|
5664 |
|
5665 #ifdef FLOATX80 |
|
5666 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM ) |
|
5667 { |
|
5668 flag aSign; |
|
5669 int16 aExp; |
|
5670 bits64 aSig; |
|
5671 |
|
5672 aSig = extractFloatx80Frac( a ); |
|
5673 aExp = extractFloatx80Exp( a ); |
|
5674 aSign = extractFloatx80Sign( a ); |
|
5675 |
|
5676 if ( aExp == 0x7FF ) { |
|
5677 return a; |
|
5678 } |
|
5679 if (aExp == 0 && aSig == 0) |
|
5680 return a; |
|
5681 |
|
5682 aExp += n; |
|
5683 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision), |
|
5684 aSign, aExp, aSig, 0 STATUS_VAR ); |
|
5685 } |
|
5686 #endif |
|
5687 |
|
5688 #ifdef FLOAT128 |
|
5689 float128 float128_scalbn( float128 a, int n STATUS_PARAM ) |
|
5690 { |
|
5691 flag aSign; |
|
5692 int32 aExp; |
|
5693 bits64 aSig0, aSig1; |
|
5694 |
|
5695 aSig1 = extractFloat128Frac1( a ); |
|
5696 aSig0 = extractFloat128Frac0( a ); |
|
5697 aExp = extractFloat128Exp( a ); |
|
5698 aSign = extractFloat128Sign( a ); |
|
5699 if ( aExp == 0x7FFF ) { |
|
5700 return a; |
|
5701 } |
|
5702 if ( aExp != 0 ) |
|
5703 aSig0 |= LIT64( 0x0001000000000000 ); |
|
5704 else if ( aSig0 == 0 && aSig1 == 0 ) |
|
5705 return a; |
|
5706 |
|
5707 aExp += n - 1; |
|
5708 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1 |
|
5709 STATUS_VAR ); |
|
5710 |
|
5711 } |
|
5712 #endif |