symbian-qemu-0.9.1-12/qemu-symbian-svp/fpu/softfloat.c
changeset 1 2fb8b9db1c86
equal deleted inserted replaced
0:ffa851df0825 1:2fb8b9db1c86
       
     1 
       
     2 /*============================================================================
       
     3 
       
     4 This C source file is part of the SoftFloat IEC/IEEE Floating-point Arithmetic
       
     5 Package, Release 2b.
       
     6 
       
     7 Written by John R. Hauser.  This work was made possible in part by the
       
     8 International Computer Science Institute, located at Suite 600, 1947 Center
       
     9 Street, Berkeley, California 94704.  Funding was partially provided by the
       
    10 National Science Foundation under grant MIP-9311980.  The original version
       
    11 of this code was written as part of a project to build a fixed-point vector
       
    12 processor in collaboration with the University of California at Berkeley,
       
    13 overseen by Profs. Nelson Morgan and John Wawrzynek.  More information
       
    14 is available through the Web page `http://www.cs.berkeley.edu/~jhauser/
       
    15 arithmetic/SoftFloat.html'.
       
    16 
       
    17 THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE.  Although reasonable effort has
       
    18 been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT TIMES
       
    19 RESULT IN INCORRECT BEHAVIOR.  USE OF THIS SOFTWARE IS RESTRICTED TO PERSONS
       
    20 AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ALL LOSSES,
       
    21 COSTS, OR OTHER PROBLEMS THEY INCUR DUE TO THE SOFTWARE, AND WHO FURTHERMORE
       
    22 EFFECTIVELY INDEMNIFY JOHN HAUSER AND THE INTERNATIONAL COMPUTER SCIENCE
       
    23 INSTITUTE (possibly via similar legal warning) AGAINST ALL LOSSES, COSTS, OR
       
    24 OTHER PROBLEMS INCURRED BY THEIR CUSTOMERS AND CLIENTS DUE TO THE SOFTWARE.
       
    25 
       
    26 Derivative works are acceptable, even for commercial purposes, so long as
       
    27 (1) the source code for the derivative work includes prominent notice that
       
    28 the work is derivative, and (2) the source code includes prominent notice with
       
    29 these four paragraphs for those parts of this code that are retained.
       
    30 
       
    31 =============================================================================*/
       
    32 
       
    33 /* FIXME: Flush-To-Zero only effects results.  Denormal inputs should also
       
    34    be flushed to zero.  */
       
    35 #include "softfloat.h"
       
    36 
       
    37 /*----------------------------------------------------------------------------
       
    38 | Primitive arithmetic functions, including multi-word arithmetic, and
       
    39 | division and square root approximations.  (Can be specialized to target if
       
    40 | desired.)
       
    41 *----------------------------------------------------------------------------*/
       
    42 #include "softfloat-macros.h"
       
    43 
       
    44 /*----------------------------------------------------------------------------
       
    45 | Functions and definitions to determine:  (1) whether tininess for underflow
       
    46 | is detected before or after rounding by default, (2) what (if anything)
       
    47 | happens when exceptions are raised, (3) how signaling NaNs are distinguished
       
    48 | from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
       
    49 | are propagated from function inputs to output.  These details are target-
       
    50 | specific.
       
    51 *----------------------------------------------------------------------------*/
       
    52 #include "softfloat-specialize.h"
       
    53 
       
    54 void set_float_rounding_mode(int val STATUS_PARAM)
       
    55 {
       
    56     STATUS(float_rounding_mode) = val;
       
    57 }
       
    58 
       
    59 void set_float_exception_flags(int val STATUS_PARAM)
       
    60 {
       
    61     STATUS(float_exception_flags) = val;
       
    62 }
       
    63 
       
    64 #ifdef FLOATX80
       
    65 void set_floatx80_rounding_precision(int val STATUS_PARAM)
       
    66 {
       
    67     STATUS(floatx80_rounding_precision) = val;
       
    68 }
       
    69 #endif
       
    70 
       
    71 /*----------------------------------------------------------------------------
       
    72 | Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
       
    73 | and 7, and returns the properly rounded 32-bit integer corresponding to the
       
    74 | input.  If `zSign' is 1, the input is negated before being converted to an
       
    75 | integer.  Bit 63 of `absZ' must be zero.  Ordinarily, the fixed-point input
       
    76 | is simply rounded to an integer, with the inexact exception raised if the
       
    77 | input cannot be represented exactly as an integer.  However, if the fixed-
       
    78 | point input is too large, the invalid exception is raised and the largest
       
    79 | positive or negative integer is returned.
       
    80 *----------------------------------------------------------------------------*/
       
    81 
       
    82 static int32 roundAndPackInt32( flag zSign, bits64 absZ STATUS_PARAM)
       
    83 {
       
    84     int8 roundingMode;
       
    85     flag roundNearestEven;
       
    86     int8 roundIncrement, roundBits;
       
    87     int32 z;
       
    88 
       
    89     roundingMode = STATUS(float_rounding_mode);
       
    90     roundNearestEven = ( roundingMode == float_round_nearest_even );
       
    91     roundIncrement = 0x40;
       
    92     if ( ! roundNearestEven ) {
       
    93         if ( roundingMode == float_round_to_zero ) {
       
    94             roundIncrement = 0;
       
    95         }
       
    96         else {
       
    97             roundIncrement = 0x7F;
       
    98             if ( zSign ) {
       
    99                 if ( roundingMode == float_round_up ) roundIncrement = 0;
       
   100             }
       
   101             else {
       
   102                 if ( roundingMode == float_round_down ) roundIncrement = 0;
       
   103             }
       
   104         }
       
   105     }
       
   106     roundBits = absZ & 0x7F;
       
   107     absZ = ( absZ + roundIncrement )>>7;
       
   108     absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
       
   109     z = absZ;
       
   110     if ( zSign ) z = - z;
       
   111     if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
       
   112         float_raise( float_flag_invalid STATUS_VAR);
       
   113         return zSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
       
   114     }
       
   115     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   116     return z;
       
   117 
       
   118 }
       
   119 
       
   120 /*----------------------------------------------------------------------------
       
   121 | Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
       
   122 | `absZ1', with binary point between bits 63 and 64 (between the input words),
       
   123 | and returns the properly rounded 64-bit integer corresponding to the input.
       
   124 | If `zSign' is 1, the input is negated before being converted to an integer.
       
   125 | Ordinarily, the fixed-point input is simply rounded to an integer, with
       
   126 | the inexact exception raised if the input cannot be represented exactly as
       
   127 | an integer.  However, if the fixed-point input is too large, the invalid
       
   128 | exception is raised and the largest positive or negative integer is
       
   129 | returned.
       
   130 *----------------------------------------------------------------------------*/
       
   131 
       
   132 static int64 roundAndPackInt64( flag zSign, bits64 absZ0, bits64 absZ1 STATUS_PARAM)
       
   133 {
       
   134     int8 roundingMode;
       
   135     flag roundNearestEven, increment;
       
   136     int64 z;
       
   137 
       
   138     roundingMode = STATUS(float_rounding_mode);
       
   139     roundNearestEven = ( roundingMode == float_round_nearest_even );
       
   140     increment = ( (sbits64) absZ1 < 0 );
       
   141     if ( ! roundNearestEven ) {
       
   142         if ( roundingMode == float_round_to_zero ) {
       
   143             increment = 0;
       
   144         }
       
   145         else {
       
   146             if ( zSign ) {
       
   147                 increment = ( roundingMode == float_round_down ) && absZ1;
       
   148             }
       
   149             else {
       
   150                 increment = ( roundingMode == float_round_up ) && absZ1;
       
   151             }
       
   152         }
       
   153     }
       
   154     if ( increment ) {
       
   155         ++absZ0;
       
   156         if ( absZ0 == 0 ) goto overflow;
       
   157         absZ0 &= ~ ( ( (bits64) ( absZ1<<1 ) == 0 ) & roundNearestEven );
       
   158     }
       
   159     z = absZ0;
       
   160     if ( zSign ) z = - z;
       
   161     if ( z && ( ( z < 0 ) ^ zSign ) ) {
       
   162  overflow:
       
   163         float_raise( float_flag_invalid STATUS_VAR);
       
   164         return
       
   165               zSign ? (sbits64) LIT64( 0x8000000000000000 )
       
   166             : LIT64( 0x7FFFFFFFFFFFFFFF );
       
   167     }
       
   168     if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   169     return z;
       
   170 
       
   171 }
       
   172 
       
   173 /*----------------------------------------------------------------------------
       
   174 | Returns the fraction bits of the single-precision floating-point value `a'.
       
   175 *----------------------------------------------------------------------------*/
       
   176 
       
   177 INLINE bits32 extractFloat32Frac( float32 a )
       
   178 {
       
   179 
       
   180     return float32_val(a) & 0x007FFFFF;
       
   181 
       
   182 }
       
   183 
       
   184 /*----------------------------------------------------------------------------
       
   185 | Returns the exponent bits of the single-precision floating-point value `a'.
       
   186 *----------------------------------------------------------------------------*/
       
   187 
       
   188 INLINE int16 extractFloat32Exp( float32 a )
       
   189 {
       
   190 
       
   191     return ( float32_val(a)>>23 ) & 0xFF;
       
   192 
       
   193 }
       
   194 
       
   195 /*----------------------------------------------------------------------------
       
   196 | Returns the sign bit of the single-precision floating-point value `a'.
       
   197 *----------------------------------------------------------------------------*/
       
   198 
       
   199 INLINE flag extractFloat32Sign( float32 a )
       
   200 {
       
   201 
       
   202     return float32_val(a)>>31;
       
   203 
       
   204 }
       
   205 
       
   206 /*----------------------------------------------------------------------------
       
   207 | Normalizes the subnormal single-precision floating-point value represented
       
   208 | by the denormalized significand `aSig'.  The normalized exponent and
       
   209 | significand are stored at the locations pointed to by `zExpPtr' and
       
   210 | `zSigPtr', respectively.
       
   211 *----------------------------------------------------------------------------*/
       
   212 
       
   213 static void
       
   214  normalizeFloat32Subnormal( bits32 aSig, int16 *zExpPtr, bits32 *zSigPtr )
       
   215 {
       
   216     int8 shiftCount;
       
   217 
       
   218     shiftCount = countLeadingZeros32( aSig ) - 8;
       
   219     *zSigPtr = aSig<<shiftCount;
       
   220     *zExpPtr = 1 - shiftCount;
       
   221 
       
   222 }
       
   223 
       
   224 /*----------------------------------------------------------------------------
       
   225 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
       
   226 | single-precision floating-point value, returning the result.  After being
       
   227 | shifted into the proper positions, the three fields are simply added
       
   228 | together to form the result.  This means that any integer portion of `zSig'
       
   229 | will be added into the exponent.  Since a properly normalized significand
       
   230 | will have an integer portion equal to 1, the `zExp' input should be 1 less
       
   231 | than the desired result exponent whenever `zSig' is a complete, normalized
       
   232 | significand.
       
   233 *----------------------------------------------------------------------------*/
       
   234 
       
   235 INLINE float32 packFloat32( flag zSign, int16 zExp, bits32 zSig )
       
   236 {
       
   237 
       
   238     return make_float32(
       
   239           ( ( (bits32) zSign )<<31 ) + ( ( (bits32) zExp )<<23 ) + zSig);
       
   240 
       
   241 }
       
   242 
       
   243 /*----------------------------------------------------------------------------
       
   244 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
       
   245 | and significand `zSig', and returns the proper single-precision floating-
       
   246 | point value corresponding to the abstract input.  Ordinarily, the abstract
       
   247 | value is simply rounded and packed into the single-precision format, with
       
   248 | the inexact exception raised if the abstract input cannot be represented
       
   249 | exactly.  However, if the abstract value is too large, the overflow and
       
   250 | inexact exceptions are raised and an infinity or maximal finite value is
       
   251 | returned.  If the abstract value is too small, the input value is rounded to
       
   252 | a subnormal number, and the underflow and inexact exceptions are raised if
       
   253 | the abstract input cannot be represented exactly as a subnormal single-
       
   254 | precision floating-point number.
       
   255 |     The input significand `zSig' has its binary point between bits 30
       
   256 | and 29, which is 7 bits to the left of the usual location.  This shifted
       
   257 | significand must be normalized or smaller.  If `zSig' is not normalized,
       
   258 | `zExp' must be 0; in that case, the result returned is a subnormal number,
       
   259 | and it must not require rounding.  In the usual case that `zSig' is
       
   260 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
       
   261 | The handling of underflow and overflow follows the IEC/IEEE Standard for
       
   262 | Binary Floating-Point Arithmetic.
       
   263 *----------------------------------------------------------------------------*/
       
   264 
       
   265 static float32 roundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
       
   266 {
       
   267     int8 roundingMode;
       
   268     flag roundNearestEven;
       
   269     int8 roundIncrement, roundBits;
       
   270     flag isTiny;
       
   271 
       
   272     roundingMode = STATUS(float_rounding_mode);
       
   273     roundNearestEven = ( roundingMode == float_round_nearest_even );
       
   274     roundIncrement = 0x40;
       
   275     if ( ! roundNearestEven ) {
       
   276         if ( roundingMode == float_round_to_zero ) {
       
   277             roundIncrement = 0;
       
   278         }
       
   279         else {
       
   280             roundIncrement = 0x7F;
       
   281             if ( zSign ) {
       
   282                 if ( roundingMode == float_round_up ) roundIncrement = 0;
       
   283             }
       
   284             else {
       
   285                 if ( roundingMode == float_round_down ) roundIncrement = 0;
       
   286             }
       
   287         }
       
   288     }
       
   289     roundBits = zSig & 0x7F;
       
   290     if ( 0xFD <= (bits16) zExp ) {
       
   291         if (    ( 0xFD < zExp )
       
   292              || (    ( zExp == 0xFD )
       
   293                   && ( (sbits32) ( zSig + roundIncrement ) < 0 ) )
       
   294            ) {
       
   295             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
       
   296             return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
       
   297         }
       
   298         if ( zExp < 0 ) {
       
   299             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
       
   300             isTiny =
       
   301                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
       
   302                 || ( zExp < -1 )
       
   303                 || ( zSig + roundIncrement < 0x80000000 );
       
   304             shift32RightJamming( zSig, - zExp, &zSig );
       
   305             zExp = 0;
       
   306             roundBits = zSig & 0x7F;
       
   307             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
       
   308         }
       
   309     }
       
   310     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   311     zSig = ( zSig + roundIncrement )>>7;
       
   312     zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
       
   313     if ( zSig == 0 ) zExp = 0;
       
   314     return packFloat32( zSign, zExp, zSig );
       
   315 
       
   316 }
       
   317 
       
   318 /*----------------------------------------------------------------------------
       
   319 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
       
   320 | and significand `zSig', and returns the proper single-precision floating-
       
   321 | point value corresponding to the abstract input.  This routine is just like
       
   322 | `roundAndPackFloat32' except that `zSig' does not have to be normalized.
       
   323 | Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
       
   324 | floating-point exponent.
       
   325 *----------------------------------------------------------------------------*/
       
   326 
       
   327 static float32
       
   328  normalizeRoundAndPackFloat32( flag zSign, int16 zExp, bits32 zSig STATUS_PARAM)
       
   329 {
       
   330     int8 shiftCount;
       
   331 
       
   332     shiftCount = countLeadingZeros32( zSig ) - 1;
       
   333     return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
       
   334 
       
   335 }
       
   336 
       
   337 /*----------------------------------------------------------------------------
       
   338 | Returns the fraction bits of the double-precision floating-point value `a'.
       
   339 *----------------------------------------------------------------------------*/
       
   340 
       
   341 INLINE bits64 extractFloat64Frac( float64 a )
       
   342 {
       
   343 
       
   344     return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
       
   345 
       
   346 }
       
   347 
       
   348 /*----------------------------------------------------------------------------
       
   349 | Returns the exponent bits of the double-precision floating-point value `a'.
       
   350 *----------------------------------------------------------------------------*/
       
   351 
       
   352 INLINE int16 extractFloat64Exp( float64 a )
       
   353 {
       
   354 
       
   355     return ( float64_val(a)>>52 ) & 0x7FF;
       
   356 
       
   357 }
       
   358 
       
   359 /*----------------------------------------------------------------------------
       
   360 | Returns the sign bit of the double-precision floating-point value `a'.
       
   361 *----------------------------------------------------------------------------*/
       
   362 
       
   363 INLINE flag extractFloat64Sign( float64 a )
       
   364 {
       
   365 
       
   366     return float64_val(a)>>63;
       
   367 
       
   368 }
       
   369 
       
   370 /*----------------------------------------------------------------------------
       
   371 | Normalizes the subnormal double-precision floating-point value represented
       
   372 | by the denormalized significand `aSig'.  The normalized exponent and
       
   373 | significand are stored at the locations pointed to by `zExpPtr' and
       
   374 | `zSigPtr', respectively.
       
   375 *----------------------------------------------------------------------------*/
       
   376 
       
   377 static void
       
   378  normalizeFloat64Subnormal( bits64 aSig, int16 *zExpPtr, bits64 *zSigPtr )
       
   379 {
       
   380     int8 shiftCount;
       
   381 
       
   382     shiftCount = countLeadingZeros64( aSig ) - 11;
       
   383     *zSigPtr = aSig<<shiftCount;
       
   384     *zExpPtr = 1 - shiftCount;
       
   385 
       
   386 }
       
   387 
       
   388 /*----------------------------------------------------------------------------
       
   389 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
       
   390 | double-precision floating-point value, returning the result.  After being
       
   391 | shifted into the proper positions, the three fields are simply added
       
   392 | together to form the result.  This means that any integer portion of `zSig'
       
   393 | will be added into the exponent.  Since a properly normalized significand
       
   394 | will have an integer portion equal to 1, the `zExp' input should be 1 less
       
   395 | than the desired result exponent whenever `zSig' is a complete, normalized
       
   396 | significand.
       
   397 *----------------------------------------------------------------------------*/
       
   398 
       
   399 INLINE float64 packFloat64( flag zSign, int16 zExp, bits64 zSig )
       
   400 {
       
   401 
       
   402     return make_float64(
       
   403         ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<52 ) + zSig);
       
   404 
       
   405 }
       
   406 
       
   407 /*----------------------------------------------------------------------------
       
   408 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
       
   409 | and significand `zSig', and returns the proper double-precision floating-
       
   410 | point value corresponding to the abstract input.  Ordinarily, the abstract
       
   411 | value is simply rounded and packed into the double-precision format, with
       
   412 | the inexact exception raised if the abstract input cannot be represented
       
   413 | exactly.  However, if the abstract value is too large, the overflow and
       
   414 | inexact exceptions are raised and an infinity or maximal finite value is
       
   415 | returned.  If the abstract value is too small, the input value is rounded
       
   416 | to a subnormal number, and the underflow and inexact exceptions are raised
       
   417 | if the abstract input cannot be represented exactly as a subnormal double-
       
   418 | precision floating-point number.
       
   419 |     The input significand `zSig' has its binary point between bits 62
       
   420 | and 61, which is 10 bits to the left of the usual location.  This shifted
       
   421 | significand must be normalized or smaller.  If `zSig' is not normalized,
       
   422 | `zExp' must be 0; in that case, the result returned is a subnormal number,
       
   423 | and it must not require rounding.  In the usual case that `zSig' is
       
   424 | normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
       
   425 | The handling of underflow and overflow follows the IEC/IEEE Standard for
       
   426 | Binary Floating-Point Arithmetic.
       
   427 *----------------------------------------------------------------------------*/
       
   428 
       
   429 static float64 roundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
       
   430 {
       
   431     int8 roundingMode;
       
   432     flag roundNearestEven;
       
   433     int16 roundIncrement, roundBits;
       
   434     flag isTiny;
       
   435 
       
   436     roundingMode = STATUS(float_rounding_mode);
       
   437     roundNearestEven = ( roundingMode == float_round_nearest_even );
       
   438     roundIncrement = 0x200;
       
   439     if ( ! roundNearestEven ) {
       
   440         if ( roundingMode == float_round_to_zero ) {
       
   441             roundIncrement = 0;
       
   442         }
       
   443         else {
       
   444             roundIncrement = 0x3FF;
       
   445             if ( zSign ) {
       
   446                 if ( roundingMode == float_round_up ) roundIncrement = 0;
       
   447             }
       
   448             else {
       
   449                 if ( roundingMode == float_round_down ) roundIncrement = 0;
       
   450             }
       
   451         }
       
   452     }
       
   453     roundBits = zSig & 0x3FF;
       
   454     if ( 0x7FD <= (bits16) zExp ) {
       
   455         if (    ( 0x7FD < zExp )
       
   456              || (    ( zExp == 0x7FD )
       
   457                   && ( (sbits64) ( zSig + roundIncrement ) < 0 ) )
       
   458            ) {
       
   459             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
       
   460             return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
       
   461         }
       
   462         if ( zExp < 0 ) {
       
   463             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
       
   464             isTiny =
       
   465                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
       
   466                 || ( zExp < -1 )
       
   467                 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
       
   468             shift64RightJamming( zSig, - zExp, &zSig );
       
   469             zExp = 0;
       
   470             roundBits = zSig & 0x3FF;
       
   471             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
       
   472         }
       
   473     }
       
   474     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   475     zSig = ( zSig + roundIncrement )>>10;
       
   476     zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
       
   477     if ( zSig == 0 ) zExp = 0;
       
   478     return packFloat64( zSign, zExp, zSig );
       
   479 
       
   480 }
       
   481 
       
   482 /*----------------------------------------------------------------------------
       
   483 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
       
   484 | and significand `zSig', and returns the proper double-precision floating-
       
   485 | point value corresponding to the abstract input.  This routine is just like
       
   486 | `roundAndPackFloat64' except that `zSig' does not have to be normalized.
       
   487 | Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
       
   488 | floating-point exponent.
       
   489 *----------------------------------------------------------------------------*/
       
   490 
       
   491 static float64
       
   492  normalizeRoundAndPackFloat64( flag zSign, int16 zExp, bits64 zSig STATUS_PARAM)
       
   493 {
       
   494     int8 shiftCount;
       
   495 
       
   496     shiftCount = countLeadingZeros64( zSig ) - 1;
       
   497     return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
       
   498 
       
   499 }
       
   500 
       
   501 #ifdef FLOATX80
       
   502 
       
   503 /*----------------------------------------------------------------------------
       
   504 | Returns the fraction bits of the extended double-precision floating-point
       
   505 | value `a'.
       
   506 *----------------------------------------------------------------------------*/
       
   507 
       
   508 INLINE bits64 extractFloatx80Frac( floatx80 a )
       
   509 {
       
   510 
       
   511     return a.low;
       
   512 
       
   513 }
       
   514 
       
   515 /*----------------------------------------------------------------------------
       
   516 | Returns the exponent bits of the extended double-precision floating-point
       
   517 | value `a'.
       
   518 *----------------------------------------------------------------------------*/
       
   519 
       
   520 INLINE int32 extractFloatx80Exp( floatx80 a )
       
   521 {
       
   522 
       
   523     return a.high & 0x7FFF;
       
   524 
       
   525 }
       
   526 
       
   527 /*----------------------------------------------------------------------------
       
   528 | Returns the sign bit of the extended double-precision floating-point value
       
   529 | `a'.
       
   530 *----------------------------------------------------------------------------*/
       
   531 
       
   532 INLINE flag extractFloatx80Sign( floatx80 a )
       
   533 {
       
   534 
       
   535     return a.high>>15;
       
   536 
       
   537 }
       
   538 
       
   539 /*----------------------------------------------------------------------------
       
   540 | Normalizes the subnormal extended double-precision floating-point value
       
   541 | represented by the denormalized significand `aSig'.  The normalized exponent
       
   542 | and significand are stored at the locations pointed to by `zExpPtr' and
       
   543 | `zSigPtr', respectively.
       
   544 *----------------------------------------------------------------------------*/
       
   545 
       
   546 static void
       
   547  normalizeFloatx80Subnormal( bits64 aSig, int32 *zExpPtr, bits64 *zSigPtr )
       
   548 {
       
   549     int8 shiftCount;
       
   550 
       
   551     shiftCount = countLeadingZeros64( aSig );
       
   552     *zSigPtr = aSig<<shiftCount;
       
   553     *zExpPtr = 1 - shiftCount;
       
   554 
       
   555 }
       
   556 
       
   557 /*----------------------------------------------------------------------------
       
   558 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
       
   559 | extended double-precision floating-point value, returning the result.
       
   560 *----------------------------------------------------------------------------*/
       
   561 
       
   562 INLINE floatx80 packFloatx80( flag zSign, int32 zExp, bits64 zSig )
       
   563 {
       
   564     floatx80 z;
       
   565 
       
   566     z.low = zSig;
       
   567     z.high = ( ( (bits16) zSign )<<15 ) + zExp;
       
   568     return z;
       
   569 
       
   570 }
       
   571 
       
   572 /*----------------------------------------------------------------------------
       
   573 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
       
   574 | and extended significand formed by the concatenation of `zSig0' and `zSig1',
       
   575 | and returns the proper extended double-precision floating-point value
       
   576 | corresponding to the abstract input.  Ordinarily, the abstract value is
       
   577 | rounded and packed into the extended double-precision format, with the
       
   578 | inexact exception raised if the abstract input cannot be represented
       
   579 | exactly.  However, if the abstract value is too large, the overflow and
       
   580 | inexact exceptions are raised and an infinity or maximal finite value is
       
   581 | returned.  If the abstract value is too small, the input value is rounded to
       
   582 | a subnormal number, and the underflow and inexact exceptions are raised if
       
   583 | the abstract input cannot be represented exactly as a subnormal extended
       
   584 | double-precision floating-point number.
       
   585 |     If `roundingPrecision' is 32 or 64, the result is rounded to the same
       
   586 | number of bits as single or double precision, respectively.  Otherwise, the
       
   587 | result is rounded to the full precision of the extended double-precision
       
   588 | format.
       
   589 |     The input significand must be normalized or smaller.  If the input
       
   590 | significand is not normalized, `zExp' must be 0; in that case, the result
       
   591 | returned is a subnormal number, and it must not require rounding.  The
       
   592 | handling of underflow and overflow follows the IEC/IEEE Standard for Binary
       
   593 | Floating-Point Arithmetic.
       
   594 *----------------------------------------------------------------------------*/
       
   595 
       
   596 static floatx80
       
   597  roundAndPackFloatx80(
       
   598      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
       
   599  STATUS_PARAM)
       
   600 {
       
   601     int8 roundingMode;
       
   602     flag roundNearestEven, increment, isTiny;
       
   603     int64 roundIncrement, roundMask, roundBits;
       
   604 
       
   605     roundingMode = STATUS(float_rounding_mode);
       
   606     roundNearestEven = ( roundingMode == float_round_nearest_even );
       
   607     if ( roundingPrecision == 80 ) goto precision80;
       
   608     if ( roundingPrecision == 64 ) {
       
   609         roundIncrement = LIT64( 0x0000000000000400 );
       
   610         roundMask = LIT64( 0x00000000000007FF );
       
   611     }
       
   612     else if ( roundingPrecision == 32 ) {
       
   613         roundIncrement = LIT64( 0x0000008000000000 );
       
   614         roundMask = LIT64( 0x000000FFFFFFFFFF );
       
   615     }
       
   616     else {
       
   617         goto precision80;
       
   618     }
       
   619     zSig0 |= ( zSig1 != 0 );
       
   620     if ( ! roundNearestEven ) {
       
   621         if ( roundingMode == float_round_to_zero ) {
       
   622             roundIncrement = 0;
       
   623         }
       
   624         else {
       
   625             roundIncrement = roundMask;
       
   626             if ( zSign ) {
       
   627                 if ( roundingMode == float_round_up ) roundIncrement = 0;
       
   628             }
       
   629             else {
       
   630                 if ( roundingMode == float_round_down ) roundIncrement = 0;
       
   631             }
       
   632         }
       
   633     }
       
   634     roundBits = zSig0 & roundMask;
       
   635     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
       
   636         if (    ( 0x7FFE < zExp )
       
   637              || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
       
   638            ) {
       
   639             goto overflow;
       
   640         }
       
   641         if ( zExp <= 0 ) {
       
   642             if ( STATUS(flush_to_zero) ) return packFloatx80( zSign, 0, 0 );
       
   643             isTiny =
       
   644                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
       
   645                 || ( zExp < 0 )
       
   646                 || ( zSig0 <= zSig0 + roundIncrement );
       
   647             shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
       
   648             zExp = 0;
       
   649             roundBits = zSig0 & roundMask;
       
   650             if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
       
   651             if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   652             zSig0 += roundIncrement;
       
   653             if ( (sbits64) zSig0 < 0 ) zExp = 1;
       
   654             roundIncrement = roundMask + 1;
       
   655             if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
       
   656                 roundMask |= roundIncrement;
       
   657             }
       
   658             zSig0 &= ~ roundMask;
       
   659             return packFloatx80( zSign, zExp, zSig0 );
       
   660         }
       
   661     }
       
   662     if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   663     zSig0 += roundIncrement;
       
   664     if ( zSig0 < roundIncrement ) {
       
   665         ++zExp;
       
   666         zSig0 = LIT64( 0x8000000000000000 );
       
   667     }
       
   668     roundIncrement = roundMask + 1;
       
   669     if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
       
   670         roundMask |= roundIncrement;
       
   671     }
       
   672     zSig0 &= ~ roundMask;
       
   673     if ( zSig0 == 0 ) zExp = 0;
       
   674     return packFloatx80( zSign, zExp, zSig0 );
       
   675  precision80:
       
   676     increment = ( (sbits64) zSig1 < 0 );
       
   677     if ( ! roundNearestEven ) {
       
   678         if ( roundingMode == float_round_to_zero ) {
       
   679             increment = 0;
       
   680         }
       
   681         else {
       
   682             if ( zSign ) {
       
   683                 increment = ( roundingMode == float_round_down ) && zSig1;
       
   684             }
       
   685             else {
       
   686                 increment = ( roundingMode == float_round_up ) && zSig1;
       
   687             }
       
   688         }
       
   689     }
       
   690     if ( 0x7FFD <= (bits32) ( zExp - 1 ) ) {
       
   691         if (    ( 0x7FFE < zExp )
       
   692              || (    ( zExp == 0x7FFE )
       
   693                   && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
       
   694                   && increment
       
   695                 )
       
   696            ) {
       
   697             roundMask = 0;
       
   698  overflow:
       
   699             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
       
   700             if (    ( roundingMode == float_round_to_zero )
       
   701                  || ( zSign && ( roundingMode == float_round_up ) )
       
   702                  || ( ! zSign && ( roundingMode == float_round_down ) )
       
   703                ) {
       
   704                 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
       
   705             }
       
   706             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
   707         }
       
   708         if ( zExp <= 0 ) {
       
   709             isTiny =
       
   710                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
       
   711                 || ( zExp < 0 )
       
   712                 || ! increment
       
   713                 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
       
   714             shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
       
   715             zExp = 0;
       
   716             if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
       
   717             if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   718             if ( roundNearestEven ) {
       
   719                 increment = ( (sbits64) zSig1 < 0 );
       
   720             }
       
   721             else {
       
   722                 if ( zSign ) {
       
   723                     increment = ( roundingMode == float_round_down ) && zSig1;
       
   724                 }
       
   725                 else {
       
   726                     increment = ( roundingMode == float_round_up ) && zSig1;
       
   727                 }
       
   728             }
       
   729             if ( increment ) {
       
   730                 ++zSig0;
       
   731                 zSig0 &=
       
   732                     ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
       
   733                 if ( (sbits64) zSig0 < 0 ) zExp = 1;
       
   734             }
       
   735             return packFloatx80( zSign, zExp, zSig0 );
       
   736         }
       
   737     }
       
   738     if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
       
   739     if ( increment ) {
       
   740         ++zSig0;
       
   741         if ( zSig0 == 0 ) {
       
   742             ++zExp;
       
   743             zSig0 = LIT64( 0x8000000000000000 );
       
   744         }
       
   745         else {
       
   746             zSig0 &= ~ ( ( (bits64) ( zSig1<<1 ) == 0 ) & roundNearestEven );
       
   747         }
       
   748     }
       
   749     else {
       
   750         if ( zSig0 == 0 ) zExp = 0;
       
   751     }
       
   752     return packFloatx80( zSign, zExp, zSig0 );
       
   753 
       
   754 }
       
   755 
       
   756 /*----------------------------------------------------------------------------
       
   757 | Takes an abstract floating-point value having sign `zSign', exponent
       
   758 | `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
       
   759 | and returns the proper extended double-precision floating-point value
       
   760 | corresponding to the abstract input.  This routine is just like
       
   761 | `roundAndPackFloatx80' except that the input significand does not have to be
       
   762 | normalized.
       
   763 *----------------------------------------------------------------------------*/
       
   764 
       
   765 static floatx80
       
   766  normalizeRoundAndPackFloatx80(
       
   767      int8 roundingPrecision, flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1
       
   768  STATUS_PARAM)
       
   769 {
       
   770     int8 shiftCount;
       
   771 
       
   772     if ( zSig0 == 0 ) {
       
   773         zSig0 = zSig1;
       
   774         zSig1 = 0;
       
   775         zExp -= 64;
       
   776     }
       
   777     shiftCount = countLeadingZeros64( zSig0 );
       
   778     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
       
   779     zExp -= shiftCount;
       
   780     return
       
   781         roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
       
   782 
       
   783 }
       
   784 
       
   785 #endif
       
   786 
       
   787 #ifdef FLOAT128
       
   788 
       
   789 /*----------------------------------------------------------------------------
       
   790 | Returns the least-significant 64 fraction bits of the quadruple-precision
       
   791 | floating-point value `a'.
       
   792 *----------------------------------------------------------------------------*/
       
   793 
       
   794 INLINE bits64 extractFloat128Frac1( float128 a )
       
   795 {
       
   796 
       
   797     return a.low;
       
   798 
       
   799 }
       
   800 
       
   801 /*----------------------------------------------------------------------------
       
   802 | Returns the most-significant 48 fraction bits of the quadruple-precision
       
   803 | floating-point value `a'.
       
   804 *----------------------------------------------------------------------------*/
       
   805 
       
   806 INLINE bits64 extractFloat128Frac0( float128 a )
       
   807 {
       
   808 
       
   809     return a.high & LIT64( 0x0000FFFFFFFFFFFF );
       
   810 
       
   811 }
       
   812 
       
   813 /*----------------------------------------------------------------------------
       
   814 | Returns the exponent bits of the quadruple-precision floating-point value
       
   815 | `a'.
       
   816 *----------------------------------------------------------------------------*/
       
   817 
       
   818 INLINE int32 extractFloat128Exp( float128 a )
       
   819 {
       
   820 
       
   821     return ( a.high>>48 ) & 0x7FFF;
       
   822 
       
   823 }
       
   824 
       
   825 /*----------------------------------------------------------------------------
       
   826 | Returns the sign bit of the quadruple-precision floating-point value `a'.
       
   827 *----------------------------------------------------------------------------*/
       
   828 
       
   829 INLINE flag extractFloat128Sign( float128 a )
       
   830 {
       
   831 
       
   832     return a.high>>63;
       
   833 
       
   834 }
       
   835 
       
   836 /*----------------------------------------------------------------------------
       
   837 | Normalizes the subnormal quadruple-precision floating-point value
       
   838 | represented by the denormalized significand formed by the concatenation of
       
   839 | `aSig0' and `aSig1'.  The normalized exponent is stored at the location
       
   840 | pointed to by `zExpPtr'.  The most significant 49 bits of the normalized
       
   841 | significand are stored at the location pointed to by `zSig0Ptr', and the
       
   842 | least significant 64 bits of the normalized significand are stored at the
       
   843 | location pointed to by `zSig1Ptr'.
       
   844 *----------------------------------------------------------------------------*/
       
   845 
       
   846 static void
       
   847  normalizeFloat128Subnormal(
       
   848      bits64 aSig0,
       
   849      bits64 aSig1,
       
   850      int32 *zExpPtr,
       
   851      bits64 *zSig0Ptr,
       
   852      bits64 *zSig1Ptr
       
   853  )
       
   854 {
       
   855     int8 shiftCount;
       
   856 
       
   857     if ( aSig0 == 0 ) {
       
   858         shiftCount = countLeadingZeros64( aSig1 ) - 15;
       
   859         if ( shiftCount < 0 ) {
       
   860             *zSig0Ptr = aSig1>>( - shiftCount );
       
   861             *zSig1Ptr = aSig1<<( shiftCount & 63 );
       
   862         }
       
   863         else {
       
   864             *zSig0Ptr = aSig1<<shiftCount;
       
   865             *zSig1Ptr = 0;
       
   866         }
       
   867         *zExpPtr = - shiftCount - 63;
       
   868     }
       
   869     else {
       
   870         shiftCount = countLeadingZeros64( aSig0 ) - 15;
       
   871         shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
       
   872         *zExpPtr = 1 - shiftCount;
       
   873     }
       
   874 
       
   875 }
       
   876 
       
   877 /*----------------------------------------------------------------------------
       
   878 | Packs the sign `zSign', the exponent `zExp', and the significand formed
       
   879 | by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
       
   880 | floating-point value, returning the result.  After being shifted into the
       
   881 | proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
       
   882 | added together to form the most significant 32 bits of the result.  This
       
   883 | means that any integer portion of `zSig0' will be added into the exponent.
       
   884 | Since a properly normalized significand will have an integer portion equal
       
   885 | to 1, the `zExp' input should be 1 less than the desired result exponent
       
   886 | whenever `zSig0' and `zSig1' concatenated form a complete, normalized
       
   887 | significand.
       
   888 *----------------------------------------------------------------------------*/
       
   889 
       
   890 INLINE float128
       
   891  packFloat128( flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 )
       
   892 {
       
   893     float128 z;
       
   894 
       
   895     z.low = zSig1;
       
   896     z.high = ( ( (bits64) zSign )<<63 ) + ( ( (bits64) zExp )<<48 ) + zSig0;
       
   897     return z;
       
   898 
       
   899 }
       
   900 
       
   901 /*----------------------------------------------------------------------------
       
   902 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
       
   903 | and extended significand formed by the concatenation of `zSig0', `zSig1',
       
   904 | and `zSig2', and returns the proper quadruple-precision floating-point value
       
   905 | corresponding to the abstract input.  Ordinarily, the abstract value is
       
   906 | simply rounded and packed into the quadruple-precision format, with the
       
   907 | inexact exception raised if the abstract input cannot be represented
       
   908 | exactly.  However, if the abstract value is too large, the overflow and
       
   909 | inexact exceptions are raised and an infinity or maximal finite value is
       
   910 | returned.  If the abstract value is too small, the input value is rounded to
       
   911 | a subnormal number, and the underflow and inexact exceptions are raised if
       
   912 | the abstract input cannot be represented exactly as a subnormal quadruple-
       
   913 | precision floating-point number.
       
   914 |     The input significand must be normalized or smaller.  If the input
       
   915 | significand is not normalized, `zExp' must be 0; in that case, the result
       
   916 | returned is a subnormal number, and it must not require rounding.  In the
       
   917 | usual case that the input significand is normalized, `zExp' must be 1 less
       
   918 | than the ``true'' floating-point exponent.  The handling of underflow and
       
   919 | overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
   920 *----------------------------------------------------------------------------*/
       
   921 
       
   922 static float128
       
   923  roundAndPackFloat128(
       
   924      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1, bits64 zSig2 STATUS_PARAM)
       
   925 {
       
   926     int8 roundingMode;
       
   927     flag roundNearestEven, increment, isTiny;
       
   928 
       
   929     roundingMode = STATUS(float_rounding_mode);
       
   930     roundNearestEven = ( roundingMode == float_round_nearest_even );
       
   931     increment = ( (sbits64) zSig2 < 0 );
       
   932     if ( ! roundNearestEven ) {
       
   933         if ( roundingMode == float_round_to_zero ) {
       
   934             increment = 0;
       
   935         }
       
   936         else {
       
   937             if ( zSign ) {
       
   938                 increment = ( roundingMode == float_round_down ) && zSig2;
       
   939             }
       
   940             else {
       
   941                 increment = ( roundingMode == float_round_up ) && zSig2;
       
   942             }
       
   943         }
       
   944     }
       
   945     if ( 0x7FFD <= (bits32) zExp ) {
       
   946         if (    ( 0x7FFD < zExp )
       
   947              || (    ( zExp == 0x7FFD )
       
   948                   && eq128(
       
   949                          LIT64( 0x0001FFFFFFFFFFFF ),
       
   950                          LIT64( 0xFFFFFFFFFFFFFFFF ),
       
   951                          zSig0,
       
   952                          zSig1
       
   953                      )
       
   954                   && increment
       
   955                 )
       
   956            ) {
       
   957             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
       
   958             if (    ( roundingMode == float_round_to_zero )
       
   959                  || ( zSign && ( roundingMode == float_round_up ) )
       
   960                  || ( ! zSign && ( roundingMode == float_round_down ) )
       
   961                ) {
       
   962                 return
       
   963                     packFloat128(
       
   964                         zSign,
       
   965                         0x7FFE,
       
   966                         LIT64( 0x0000FFFFFFFFFFFF ),
       
   967                         LIT64( 0xFFFFFFFFFFFFFFFF )
       
   968                     );
       
   969             }
       
   970             return packFloat128( zSign, 0x7FFF, 0, 0 );
       
   971         }
       
   972         if ( zExp < 0 ) {
       
   973             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
       
   974             isTiny =
       
   975                    ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
       
   976                 || ( zExp < -1 )
       
   977                 || ! increment
       
   978                 || lt128(
       
   979                        zSig0,
       
   980                        zSig1,
       
   981                        LIT64( 0x0001FFFFFFFFFFFF ),
       
   982                        LIT64( 0xFFFFFFFFFFFFFFFF )
       
   983                    );
       
   984             shift128ExtraRightJamming(
       
   985                 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
       
   986             zExp = 0;
       
   987             if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
       
   988             if ( roundNearestEven ) {
       
   989                 increment = ( (sbits64) zSig2 < 0 );
       
   990             }
       
   991             else {
       
   992                 if ( zSign ) {
       
   993                     increment = ( roundingMode == float_round_down ) && zSig2;
       
   994                 }
       
   995                 else {
       
   996                     increment = ( roundingMode == float_round_up ) && zSig2;
       
   997                 }
       
   998             }
       
   999         }
       
  1000     }
       
  1001     if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  1002     if ( increment ) {
       
  1003         add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
       
  1004         zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
       
  1005     }
       
  1006     else {
       
  1007         if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
       
  1008     }
       
  1009     return packFloat128( zSign, zExp, zSig0, zSig1 );
       
  1010 
       
  1011 }
       
  1012 
       
  1013 /*----------------------------------------------------------------------------
       
  1014 | Takes an abstract floating-point value having sign `zSign', exponent `zExp',
       
  1015 | and significand formed by the concatenation of `zSig0' and `zSig1', and
       
  1016 | returns the proper quadruple-precision floating-point value corresponding
       
  1017 | to the abstract input.  This routine is just like `roundAndPackFloat128'
       
  1018 | except that the input significand has fewer bits and does not have to be
       
  1019 | normalized.  In all cases, `zExp' must be 1 less than the ``true'' floating-
       
  1020 | point exponent.
       
  1021 *----------------------------------------------------------------------------*/
       
  1022 
       
  1023 static float128
       
  1024  normalizeRoundAndPackFloat128(
       
  1025      flag zSign, int32 zExp, bits64 zSig0, bits64 zSig1 STATUS_PARAM)
       
  1026 {
       
  1027     int8 shiftCount;
       
  1028     bits64 zSig2;
       
  1029 
       
  1030     if ( zSig0 == 0 ) {
       
  1031         zSig0 = zSig1;
       
  1032         zSig1 = 0;
       
  1033         zExp -= 64;
       
  1034     }
       
  1035     shiftCount = countLeadingZeros64( zSig0 ) - 15;
       
  1036     if ( 0 <= shiftCount ) {
       
  1037         zSig2 = 0;
       
  1038         shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
       
  1039     }
       
  1040     else {
       
  1041         shift128ExtraRightJamming(
       
  1042             zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
       
  1043     }
       
  1044     zExp -= shiftCount;
       
  1045     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
       
  1046 
       
  1047 }
       
  1048 
       
  1049 #endif
       
  1050 
       
  1051 /*----------------------------------------------------------------------------
       
  1052 | Returns the result of converting the 32-bit two's complement integer `a'
       
  1053 | to the single-precision floating-point format.  The conversion is performed
       
  1054 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1055 *----------------------------------------------------------------------------*/
       
  1056 
       
  1057 float32 int32_to_float32( int32 a STATUS_PARAM )
       
  1058 {
       
  1059     flag zSign;
       
  1060 
       
  1061     if ( a == 0 ) return float32_zero;
       
  1062     if ( a == (sbits32) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
       
  1063     zSign = ( a < 0 );
       
  1064     return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
       
  1065 
       
  1066 }
       
  1067 
       
  1068 /*----------------------------------------------------------------------------
       
  1069 | Returns the result of converting the 32-bit two's complement integer `a'
       
  1070 | to the double-precision floating-point format.  The conversion is performed
       
  1071 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1072 *----------------------------------------------------------------------------*/
       
  1073 
       
  1074 float64 int32_to_float64( int32 a STATUS_PARAM )
       
  1075 {
       
  1076     flag zSign;
       
  1077     uint32 absA;
       
  1078     int8 shiftCount;
       
  1079     bits64 zSig;
       
  1080 
       
  1081     if ( a == 0 ) return float64_zero;
       
  1082     zSign = ( a < 0 );
       
  1083     absA = zSign ? - a : a;
       
  1084     shiftCount = countLeadingZeros32( absA ) + 21;
       
  1085     zSig = absA;
       
  1086     return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
       
  1087 
       
  1088 }
       
  1089 
       
  1090 #ifdef FLOATX80
       
  1091 
       
  1092 /*----------------------------------------------------------------------------
       
  1093 | Returns the result of converting the 32-bit two's complement integer `a'
       
  1094 | to the extended double-precision floating-point format.  The conversion
       
  1095 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1096 | Arithmetic.
       
  1097 *----------------------------------------------------------------------------*/
       
  1098 
       
  1099 floatx80 int32_to_floatx80( int32 a STATUS_PARAM )
       
  1100 {
       
  1101     flag zSign;
       
  1102     uint32 absA;
       
  1103     int8 shiftCount;
       
  1104     bits64 zSig;
       
  1105 
       
  1106     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
       
  1107     zSign = ( a < 0 );
       
  1108     absA = zSign ? - a : a;
       
  1109     shiftCount = countLeadingZeros32( absA ) + 32;
       
  1110     zSig = absA;
       
  1111     return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
       
  1112 
       
  1113 }
       
  1114 
       
  1115 #endif
       
  1116 
       
  1117 #ifdef FLOAT128
       
  1118 
       
  1119 /*----------------------------------------------------------------------------
       
  1120 | Returns the result of converting the 32-bit two's complement integer `a' to
       
  1121 | the quadruple-precision floating-point format.  The conversion is performed
       
  1122 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1123 *----------------------------------------------------------------------------*/
       
  1124 
       
  1125 float128 int32_to_float128( int32 a STATUS_PARAM )
       
  1126 {
       
  1127     flag zSign;
       
  1128     uint32 absA;
       
  1129     int8 shiftCount;
       
  1130     bits64 zSig0;
       
  1131 
       
  1132     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
       
  1133     zSign = ( a < 0 );
       
  1134     absA = zSign ? - a : a;
       
  1135     shiftCount = countLeadingZeros32( absA ) + 17;
       
  1136     zSig0 = absA;
       
  1137     return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
       
  1138 
       
  1139 }
       
  1140 
       
  1141 #endif
       
  1142 
       
  1143 /*----------------------------------------------------------------------------
       
  1144 | Returns the result of converting the 64-bit two's complement integer `a'
       
  1145 | to the single-precision floating-point format.  The conversion is performed
       
  1146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1147 *----------------------------------------------------------------------------*/
       
  1148 
       
  1149 float32 int64_to_float32( int64 a STATUS_PARAM )
       
  1150 {
       
  1151     flag zSign;
       
  1152     uint64 absA;
       
  1153     int8 shiftCount;
       
  1154 
       
  1155     if ( a == 0 ) return float32_zero;
       
  1156     zSign = ( a < 0 );
       
  1157     absA = zSign ? - a : a;
       
  1158     shiftCount = countLeadingZeros64( absA ) - 40;
       
  1159     if ( 0 <= shiftCount ) {
       
  1160         return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
       
  1161     }
       
  1162     else {
       
  1163         shiftCount += 7;
       
  1164         if ( shiftCount < 0 ) {
       
  1165             shift64RightJamming( absA, - shiftCount, &absA );
       
  1166         }
       
  1167         else {
       
  1168             absA <<= shiftCount;
       
  1169         }
       
  1170         return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
       
  1171     }
       
  1172 
       
  1173 }
       
  1174 
       
  1175 float32 uint64_to_float32( uint64 a STATUS_PARAM )
       
  1176 {
       
  1177     int8 shiftCount;
       
  1178 
       
  1179     if ( a == 0 ) return float32_zero;
       
  1180     shiftCount = countLeadingZeros64( a ) - 40;
       
  1181     if ( 0 <= shiftCount ) {
       
  1182         return packFloat32( 1 > 0, 0x95 - shiftCount, a<<shiftCount );
       
  1183     }
       
  1184     else {
       
  1185         shiftCount += 7;
       
  1186         if ( shiftCount < 0 ) {
       
  1187             shift64RightJamming( a, - shiftCount, &a );
       
  1188         }
       
  1189         else {
       
  1190             a <<= shiftCount;
       
  1191         }
       
  1192         return roundAndPackFloat32( 1 > 0, 0x9C - shiftCount, a STATUS_VAR );
       
  1193     }
       
  1194 }
       
  1195 
       
  1196 /*----------------------------------------------------------------------------
       
  1197 | Returns the result of converting the 64-bit two's complement integer `a'
       
  1198 | to the double-precision floating-point format.  The conversion is performed
       
  1199 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1200 *----------------------------------------------------------------------------*/
       
  1201 
       
  1202 float64 int64_to_float64( int64 a STATUS_PARAM )
       
  1203 {
       
  1204     flag zSign;
       
  1205 
       
  1206     if ( a == 0 ) return float64_zero;
       
  1207     if ( a == (sbits64) LIT64( 0x8000000000000000 ) ) {
       
  1208         return packFloat64( 1, 0x43E, 0 );
       
  1209     }
       
  1210     zSign = ( a < 0 );
       
  1211     return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
       
  1212 
       
  1213 }
       
  1214 
       
  1215 float64 uint64_to_float64( uint64 a STATUS_PARAM )
       
  1216 {
       
  1217     if ( a == 0 ) return float64_zero;
       
  1218     return normalizeRoundAndPackFloat64( 0, 0x43C, a STATUS_VAR );
       
  1219 
       
  1220 }
       
  1221 
       
  1222 #ifdef FLOATX80
       
  1223 
       
  1224 /*----------------------------------------------------------------------------
       
  1225 | Returns the result of converting the 64-bit two's complement integer `a'
       
  1226 | to the extended double-precision floating-point format.  The conversion
       
  1227 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1228 | Arithmetic.
       
  1229 *----------------------------------------------------------------------------*/
       
  1230 
       
  1231 floatx80 int64_to_floatx80( int64 a STATUS_PARAM )
       
  1232 {
       
  1233     flag zSign;
       
  1234     uint64 absA;
       
  1235     int8 shiftCount;
       
  1236 
       
  1237     if ( a == 0 ) return packFloatx80( 0, 0, 0 );
       
  1238     zSign = ( a < 0 );
       
  1239     absA = zSign ? - a : a;
       
  1240     shiftCount = countLeadingZeros64( absA );
       
  1241     return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
       
  1242 
       
  1243 }
       
  1244 
       
  1245 #endif
       
  1246 
       
  1247 #ifdef FLOAT128
       
  1248 
       
  1249 /*----------------------------------------------------------------------------
       
  1250 | Returns the result of converting the 64-bit two's complement integer `a' to
       
  1251 | the quadruple-precision floating-point format.  The conversion is performed
       
  1252 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1253 *----------------------------------------------------------------------------*/
       
  1254 
       
  1255 float128 int64_to_float128( int64 a STATUS_PARAM )
       
  1256 {
       
  1257     flag zSign;
       
  1258     uint64 absA;
       
  1259     int8 shiftCount;
       
  1260     int32 zExp;
       
  1261     bits64 zSig0, zSig1;
       
  1262 
       
  1263     if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
       
  1264     zSign = ( a < 0 );
       
  1265     absA = zSign ? - a : a;
       
  1266     shiftCount = countLeadingZeros64( absA ) + 49;
       
  1267     zExp = 0x406E - shiftCount;
       
  1268     if ( 64 <= shiftCount ) {
       
  1269         zSig1 = 0;
       
  1270         zSig0 = absA;
       
  1271         shiftCount -= 64;
       
  1272     }
       
  1273     else {
       
  1274         zSig1 = absA;
       
  1275         zSig0 = 0;
       
  1276     }
       
  1277     shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
       
  1278     return packFloat128( zSign, zExp, zSig0, zSig1 );
       
  1279 
       
  1280 }
       
  1281 
       
  1282 #endif
       
  1283 
       
  1284 /*----------------------------------------------------------------------------
       
  1285 | Returns the result of converting the single-precision floating-point value
       
  1286 | `a' to the 32-bit two's complement integer format.  The conversion is
       
  1287 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1288 | Arithmetic---which means in particular that the conversion is rounded
       
  1289 | according to the current rounding mode.  If `a' is a NaN, the largest
       
  1290 | positive integer is returned.  Otherwise, if the conversion overflows, the
       
  1291 | largest integer with the same sign as `a' is returned.
       
  1292 *----------------------------------------------------------------------------*/
       
  1293 
       
  1294 int32 float32_to_int32( float32 a STATUS_PARAM )
       
  1295 {
       
  1296     flag aSign;
       
  1297     int16 aExp, shiftCount;
       
  1298     bits32 aSig;
       
  1299     bits64 aSig64;
       
  1300 
       
  1301     aSig = extractFloat32Frac( a );
       
  1302     aExp = extractFloat32Exp( a );
       
  1303     aSign = extractFloat32Sign( a );
       
  1304     if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
       
  1305     if ( aExp ) aSig |= 0x00800000;
       
  1306     shiftCount = 0xAF - aExp;
       
  1307     aSig64 = aSig;
       
  1308     aSig64 <<= 32;
       
  1309     if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
       
  1310     return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
       
  1311 
       
  1312 }
       
  1313 
       
  1314 /*----------------------------------------------------------------------------
       
  1315 | Returns the result of converting the single-precision floating-point value
       
  1316 | `a' to the 32-bit two's complement integer format.  The conversion is
       
  1317 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1318 | Arithmetic, except that the conversion is always rounded toward zero.
       
  1319 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
       
  1320 | the conversion overflows, the largest integer with the same sign as `a' is
       
  1321 | returned.
       
  1322 *----------------------------------------------------------------------------*/
       
  1323 
       
  1324 int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
       
  1325 {
       
  1326     flag aSign;
       
  1327     int16 aExp, shiftCount;
       
  1328     bits32 aSig;
       
  1329     int32 z;
       
  1330 
       
  1331     aSig = extractFloat32Frac( a );
       
  1332     aExp = extractFloat32Exp( a );
       
  1333     aSign = extractFloat32Sign( a );
       
  1334     shiftCount = aExp - 0x9E;
       
  1335     if ( 0 <= shiftCount ) {
       
  1336         if ( float32_val(a) != 0xCF000000 ) {
       
  1337             float_raise( float_flag_invalid STATUS_VAR);
       
  1338             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
       
  1339         }
       
  1340         return (sbits32) 0x80000000;
       
  1341     }
       
  1342     else if ( aExp <= 0x7E ) {
       
  1343         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  1344         return 0;
       
  1345     }
       
  1346     aSig = ( aSig | 0x00800000 )<<8;
       
  1347     z = aSig>>( - shiftCount );
       
  1348     if ( (bits32) ( aSig<<( shiftCount & 31 ) ) ) {
       
  1349         STATUS(float_exception_flags) |= float_flag_inexact;
       
  1350     }
       
  1351     if ( aSign ) z = - z;
       
  1352     return z;
       
  1353 
       
  1354 }
       
  1355 
       
  1356 /*----------------------------------------------------------------------------
       
  1357 | Returns the result of converting the single-precision floating-point value
       
  1358 | `a' to the 64-bit two's complement integer format.  The conversion is
       
  1359 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1360 | Arithmetic---which means in particular that the conversion is rounded
       
  1361 | according to the current rounding mode.  If `a' is a NaN, the largest
       
  1362 | positive integer is returned.  Otherwise, if the conversion overflows, the
       
  1363 | largest integer with the same sign as `a' is returned.
       
  1364 *----------------------------------------------------------------------------*/
       
  1365 
       
  1366 int64 float32_to_int64( float32 a STATUS_PARAM )
       
  1367 {
       
  1368     flag aSign;
       
  1369     int16 aExp, shiftCount;
       
  1370     bits32 aSig;
       
  1371     bits64 aSig64, aSigExtra;
       
  1372 
       
  1373     aSig = extractFloat32Frac( a );
       
  1374     aExp = extractFloat32Exp( a );
       
  1375     aSign = extractFloat32Sign( a );
       
  1376     shiftCount = 0xBE - aExp;
       
  1377     if ( shiftCount < 0 ) {
       
  1378         float_raise( float_flag_invalid STATUS_VAR);
       
  1379         if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
       
  1380             return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  1381         }
       
  1382         return (sbits64) LIT64( 0x8000000000000000 );
       
  1383     }
       
  1384     if ( aExp ) aSig |= 0x00800000;
       
  1385     aSig64 = aSig;
       
  1386     aSig64 <<= 40;
       
  1387     shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
       
  1388     return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
       
  1389 
       
  1390 }
       
  1391 
       
  1392 /*----------------------------------------------------------------------------
       
  1393 | Returns the result of converting the single-precision floating-point value
       
  1394 | `a' to the 64-bit two's complement integer format.  The conversion is
       
  1395 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1396 | Arithmetic, except that the conversion is always rounded toward zero.  If
       
  1397 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
       
  1398 | conversion overflows, the largest integer with the same sign as `a' is
       
  1399 | returned.
       
  1400 *----------------------------------------------------------------------------*/
       
  1401 
       
  1402 int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
       
  1403 {
       
  1404     flag aSign;
       
  1405     int16 aExp, shiftCount;
       
  1406     bits32 aSig;
       
  1407     bits64 aSig64;
       
  1408     int64 z;
       
  1409 
       
  1410     aSig = extractFloat32Frac( a );
       
  1411     aExp = extractFloat32Exp( a );
       
  1412     aSign = extractFloat32Sign( a );
       
  1413     shiftCount = aExp - 0xBE;
       
  1414     if ( 0 <= shiftCount ) {
       
  1415         if ( float32_val(a) != 0xDF000000 ) {
       
  1416             float_raise( float_flag_invalid STATUS_VAR);
       
  1417             if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
       
  1418                 return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  1419             }
       
  1420         }
       
  1421         return (sbits64) LIT64( 0x8000000000000000 );
       
  1422     }
       
  1423     else if ( aExp <= 0x7E ) {
       
  1424         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  1425         return 0;
       
  1426     }
       
  1427     aSig64 = aSig | 0x00800000;
       
  1428     aSig64 <<= 40;
       
  1429     z = aSig64>>( - shiftCount );
       
  1430     if ( (bits64) ( aSig64<<( shiftCount & 63 ) ) ) {
       
  1431         STATUS(float_exception_flags) |= float_flag_inexact;
       
  1432     }
       
  1433     if ( aSign ) z = - z;
       
  1434     return z;
       
  1435 
       
  1436 }
       
  1437 
       
  1438 /*----------------------------------------------------------------------------
       
  1439 | Returns the result of converting the single-precision floating-point value
       
  1440 | `a' to the double-precision floating-point format.  The conversion is
       
  1441 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1442 | Arithmetic.
       
  1443 *----------------------------------------------------------------------------*/
       
  1444 
       
  1445 float64 float32_to_float64( float32 a STATUS_PARAM )
       
  1446 {
       
  1447     flag aSign;
       
  1448     int16 aExp;
       
  1449     bits32 aSig;
       
  1450 
       
  1451     aSig = extractFloat32Frac( a );
       
  1452     aExp = extractFloat32Exp( a );
       
  1453     aSign = extractFloat32Sign( a );
       
  1454     if ( aExp == 0xFF ) {
       
  1455         if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ));
       
  1456         return packFloat64( aSign, 0x7FF, 0 );
       
  1457     }
       
  1458     if ( aExp == 0 ) {
       
  1459         if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
       
  1460         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
       
  1461         --aExp;
       
  1462     }
       
  1463     return packFloat64( aSign, aExp + 0x380, ( (bits64) aSig )<<29 );
       
  1464 
       
  1465 }
       
  1466 
       
  1467 #ifdef FLOATX80
       
  1468 
       
  1469 /*----------------------------------------------------------------------------
       
  1470 | Returns the result of converting the single-precision floating-point value
       
  1471 | `a' to the extended double-precision floating-point format.  The conversion
       
  1472 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1473 | Arithmetic.
       
  1474 *----------------------------------------------------------------------------*/
       
  1475 
       
  1476 floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
       
  1477 {
       
  1478     flag aSign;
       
  1479     int16 aExp;
       
  1480     bits32 aSig;
       
  1481 
       
  1482     aSig = extractFloat32Frac( a );
       
  1483     aExp = extractFloat32Exp( a );
       
  1484     aSign = extractFloat32Sign( a );
       
  1485     if ( aExp == 0xFF ) {
       
  1486         if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) );
       
  1487         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  1488     }
       
  1489     if ( aExp == 0 ) {
       
  1490         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
       
  1491         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
       
  1492     }
       
  1493     aSig |= 0x00800000;
       
  1494     return packFloatx80( aSign, aExp + 0x3F80, ( (bits64) aSig )<<40 );
       
  1495 
       
  1496 }
       
  1497 
       
  1498 #endif
       
  1499 
       
  1500 #ifdef FLOAT128
       
  1501 
       
  1502 /*----------------------------------------------------------------------------
       
  1503 | Returns the result of converting the single-precision floating-point value
       
  1504 | `a' to the double-precision floating-point format.  The conversion is
       
  1505 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  1506 | Arithmetic.
       
  1507 *----------------------------------------------------------------------------*/
       
  1508 
       
  1509 float128 float32_to_float128( float32 a STATUS_PARAM )
       
  1510 {
       
  1511     flag aSign;
       
  1512     int16 aExp;
       
  1513     bits32 aSig;
       
  1514 
       
  1515     aSig = extractFloat32Frac( a );
       
  1516     aExp = extractFloat32Exp( a );
       
  1517     aSign = extractFloat32Sign( a );
       
  1518     if ( aExp == 0xFF ) {
       
  1519         if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) );
       
  1520         return packFloat128( aSign, 0x7FFF, 0, 0 );
       
  1521     }
       
  1522     if ( aExp == 0 ) {
       
  1523         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
       
  1524         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
       
  1525         --aExp;
       
  1526     }
       
  1527     return packFloat128( aSign, aExp + 0x3F80, ( (bits64) aSig )<<25, 0 );
       
  1528 
       
  1529 }
       
  1530 
       
  1531 #endif
       
  1532 
       
  1533 /*----------------------------------------------------------------------------
       
  1534 | Rounds the single-precision floating-point value `a' to an integer, and
       
  1535 | returns the result as a single-precision floating-point value.  The
       
  1536 | operation is performed according to the IEC/IEEE Standard for Binary
       
  1537 | Floating-Point Arithmetic.
       
  1538 *----------------------------------------------------------------------------*/
       
  1539 
       
  1540 float32 float32_round_to_int( float32 a STATUS_PARAM)
       
  1541 {
       
  1542     flag aSign;
       
  1543     int16 aExp;
       
  1544     bits32 lastBitMask, roundBitsMask;
       
  1545     int8 roundingMode;
       
  1546     bits32 z;
       
  1547 
       
  1548     aExp = extractFloat32Exp( a );
       
  1549     if ( 0x96 <= aExp ) {
       
  1550         if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
       
  1551             return propagateFloat32NaN( a, a STATUS_VAR );
       
  1552         }
       
  1553         return a;
       
  1554     }
       
  1555     if ( aExp <= 0x7E ) {
       
  1556         if ( (bits32) ( float32_val(a)<<1 ) == 0 ) return a;
       
  1557         STATUS(float_exception_flags) |= float_flag_inexact;
       
  1558         aSign = extractFloat32Sign( a );
       
  1559         switch ( STATUS(float_rounding_mode) ) {
       
  1560          case float_round_nearest_even:
       
  1561             if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
       
  1562                 return packFloat32( aSign, 0x7F, 0 );
       
  1563             }
       
  1564             break;
       
  1565          case float_round_down:
       
  1566             return make_float32(aSign ? 0xBF800000 : 0);
       
  1567          case float_round_up:
       
  1568             return make_float32(aSign ? 0x80000000 : 0x3F800000);
       
  1569         }
       
  1570         return packFloat32( aSign, 0, 0 );
       
  1571     }
       
  1572     lastBitMask = 1;
       
  1573     lastBitMask <<= 0x96 - aExp;
       
  1574     roundBitsMask = lastBitMask - 1;
       
  1575     z = float32_val(a);
       
  1576     roundingMode = STATUS(float_rounding_mode);
       
  1577     if ( roundingMode == float_round_nearest_even ) {
       
  1578         z += lastBitMask>>1;
       
  1579         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
       
  1580     }
       
  1581     else if ( roundingMode != float_round_to_zero ) {
       
  1582         if ( extractFloat32Sign( make_float32(z) ) ^ ( roundingMode == float_round_up ) ) {
       
  1583             z += roundBitsMask;
       
  1584         }
       
  1585     }
       
  1586     z &= ~ roundBitsMask;
       
  1587     if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  1588     return make_float32(z);
       
  1589 
       
  1590 }
       
  1591 
       
  1592 /*----------------------------------------------------------------------------
       
  1593 | Returns the result of adding the absolute values of the single-precision
       
  1594 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
       
  1595 | before being returned.  `zSign' is ignored if the result is a NaN.
       
  1596 | The addition is performed according to the IEC/IEEE Standard for Binary
       
  1597 | Floating-Point Arithmetic.
       
  1598 *----------------------------------------------------------------------------*/
       
  1599 
       
  1600 static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
       
  1601 {
       
  1602     int16 aExp, bExp, zExp;
       
  1603     bits32 aSig, bSig, zSig;
       
  1604     int16 expDiff;
       
  1605 
       
  1606     aSig = extractFloat32Frac( a );
       
  1607     aExp = extractFloat32Exp( a );
       
  1608     bSig = extractFloat32Frac( b );
       
  1609     bExp = extractFloat32Exp( b );
       
  1610     expDiff = aExp - bExp;
       
  1611     aSig <<= 6;
       
  1612     bSig <<= 6;
       
  1613     if ( 0 < expDiff ) {
       
  1614         if ( aExp == 0xFF ) {
       
  1615             if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1616             return a;
       
  1617         }
       
  1618         if ( bExp == 0 ) {
       
  1619             --expDiff;
       
  1620         }
       
  1621         else {
       
  1622             bSig |= 0x20000000;
       
  1623         }
       
  1624         shift32RightJamming( bSig, expDiff, &bSig );
       
  1625         zExp = aExp;
       
  1626     }
       
  1627     else if ( expDiff < 0 ) {
       
  1628         if ( bExp == 0xFF ) {
       
  1629             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1630             return packFloat32( zSign, 0xFF, 0 );
       
  1631         }
       
  1632         if ( aExp == 0 ) {
       
  1633             ++expDiff;
       
  1634         }
       
  1635         else {
       
  1636             aSig |= 0x20000000;
       
  1637         }
       
  1638         shift32RightJamming( aSig, - expDiff, &aSig );
       
  1639         zExp = bExp;
       
  1640     }
       
  1641     else {
       
  1642         if ( aExp == 0xFF ) {
       
  1643             if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1644             return a;
       
  1645         }
       
  1646         if ( aExp == 0 ) {
       
  1647             if ( STATUS(flush_to_zero) ) return packFloat32( zSign, 0, 0 );
       
  1648             return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
       
  1649         }
       
  1650         zSig = 0x40000000 + aSig + bSig;
       
  1651         zExp = aExp;
       
  1652         goto roundAndPack;
       
  1653     }
       
  1654     aSig |= 0x20000000;
       
  1655     zSig = ( aSig + bSig )<<1;
       
  1656     --zExp;
       
  1657     if ( (sbits32) zSig < 0 ) {
       
  1658         zSig = aSig + bSig;
       
  1659         ++zExp;
       
  1660     }
       
  1661  roundAndPack:
       
  1662     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
       
  1663 
       
  1664 }
       
  1665 
       
  1666 /*----------------------------------------------------------------------------
       
  1667 | Returns the result of subtracting the absolute values of the single-
       
  1668 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
       
  1669 | difference is negated before being returned.  `zSign' is ignored if the
       
  1670 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
       
  1671 | Standard for Binary Floating-Point Arithmetic.
       
  1672 *----------------------------------------------------------------------------*/
       
  1673 
       
  1674 static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
       
  1675 {
       
  1676     int16 aExp, bExp, zExp;
       
  1677     bits32 aSig, bSig, zSig;
       
  1678     int16 expDiff;
       
  1679 
       
  1680     aSig = extractFloat32Frac( a );
       
  1681     aExp = extractFloat32Exp( a );
       
  1682     bSig = extractFloat32Frac( b );
       
  1683     bExp = extractFloat32Exp( b );
       
  1684     expDiff = aExp - bExp;
       
  1685     aSig <<= 7;
       
  1686     bSig <<= 7;
       
  1687     if ( 0 < expDiff ) goto aExpBigger;
       
  1688     if ( expDiff < 0 ) goto bExpBigger;
       
  1689     if ( aExp == 0xFF ) {
       
  1690         if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1691         float_raise( float_flag_invalid STATUS_VAR);
       
  1692         return float32_default_nan;
       
  1693     }
       
  1694     if ( aExp == 0 ) {
       
  1695         aExp = 1;
       
  1696         bExp = 1;
       
  1697     }
       
  1698     if ( bSig < aSig ) goto aBigger;
       
  1699     if ( aSig < bSig ) goto bBigger;
       
  1700     return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
       
  1701  bExpBigger:
       
  1702     if ( bExp == 0xFF ) {
       
  1703         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1704         return packFloat32( zSign ^ 1, 0xFF, 0 );
       
  1705     }
       
  1706     if ( aExp == 0 ) {
       
  1707         ++expDiff;
       
  1708     }
       
  1709     else {
       
  1710         aSig |= 0x40000000;
       
  1711     }
       
  1712     shift32RightJamming( aSig, - expDiff, &aSig );
       
  1713     bSig |= 0x40000000;
       
  1714  bBigger:
       
  1715     zSig = bSig - aSig;
       
  1716     zExp = bExp;
       
  1717     zSign ^= 1;
       
  1718     goto normalizeRoundAndPack;
       
  1719  aExpBigger:
       
  1720     if ( aExp == 0xFF ) {
       
  1721         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1722         return a;
       
  1723     }
       
  1724     if ( bExp == 0 ) {
       
  1725         --expDiff;
       
  1726     }
       
  1727     else {
       
  1728         bSig |= 0x40000000;
       
  1729     }
       
  1730     shift32RightJamming( bSig, expDiff, &bSig );
       
  1731     aSig |= 0x40000000;
       
  1732  aBigger:
       
  1733     zSig = aSig - bSig;
       
  1734     zExp = aExp;
       
  1735  normalizeRoundAndPack:
       
  1736     --zExp;
       
  1737     return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
       
  1738 
       
  1739 }
       
  1740 
       
  1741 /*----------------------------------------------------------------------------
       
  1742 | Returns the result of adding the single-precision floating-point values `a'
       
  1743 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
       
  1744 | Binary Floating-Point Arithmetic.
       
  1745 *----------------------------------------------------------------------------*/
       
  1746 
       
  1747 float32 float32_add( float32 a, float32 b STATUS_PARAM )
       
  1748 {
       
  1749     flag aSign, bSign;
       
  1750 
       
  1751     aSign = extractFloat32Sign( a );
       
  1752     bSign = extractFloat32Sign( b );
       
  1753     if ( aSign == bSign ) {
       
  1754         return addFloat32Sigs( a, b, aSign STATUS_VAR);
       
  1755     }
       
  1756     else {
       
  1757         return subFloat32Sigs( a, b, aSign STATUS_VAR );
       
  1758     }
       
  1759 
       
  1760 }
       
  1761 
       
  1762 /*----------------------------------------------------------------------------
       
  1763 | Returns the result of subtracting the single-precision floating-point values
       
  1764 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
       
  1765 | for Binary Floating-Point Arithmetic.
       
  1766 *----------------------------------------------------------------------------*/
       
  1767 
       
  1768 float32 float32_sub( float32 a, float32 b STATUS_PARAM )
       
  1769 {
       
  1770     flag aSign, bSign;
       
  1771 
       
  1772     aSign = extractFloat32Sign( a );
       
  1773     bSign = extractFloat32Sign( b );
       
  1774     if ( aSign == bSign ) {
       
  1775         return subFloat32Sigs( a, b, aSign STATUS_VAR );
       
  1776     }
       
  1777     else {
       
  1778         return addFloat32Sigs( a, b, aSign STATUS_VAR );
       
  1779     }
       
  1780 
       
  1781 }
       
  1782 
       
  1783 /*----------------------------------------------------------------------------
       
  1784 | Returns the result of multiplying the single-precision floating-point values
       
  1785 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
       
  1786 | for Binary Floating-Point Arithmetic.
       
  1787 *----------------------------------------------------------------------------*/
       
  1788 
       
  1789 float32 float32_mul( float32 a, float32 b STATUS_PARAM )
       
  1790 {
       
  1791     flag aSign, bSign, zSign;
       
  1792     int16 aExp, bExp, zExp;
       
  1793     bits32 aSig, bSig;
       
  1794     bits64 zSig64;
       
  1795     bits32 zSig;
       
  1796 
       
  1797     aSig = extractFloat32Frac( a );
       
  1798     aExp = extractFloat32Exp( a );
       
  1799     aSign = extractFloat32Sign( a );
       
  1800     bSig = extractFloat32Frac( b );
       
  1801     bExp = extractFloat32Exp( b );
       
  1802     bSign = extractFloat32Sign( b );
       
  1803     zSign = aSign ^ bSign;
       
  1804     if ( aExp == 0xFF ) {
       
  1805         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
       
  1806             return propagateFloat32NaN( a, b STATUS_VAR );
       
  1807         }
       
  1808         if ( ( bExp | bSig ) == 0 ) {
       
  1809             float_raise( float_flag_invalid STATUS_VAR);
       
  1810             return float32_default_nan;
       
  1811         }
       
  1812         return packFloat32( zSign, 0xFF, 0 );
       
  1813     }
       
  1814     if ( bExp == 0xFF ) {
       
  1815         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1816         if ( ( aExp | aSig ) == 0 ) {
       
  1817             float_raise( float_flag_invalid STATUS_VAR);
       
  1818             return float32_default_nan;
       
  1819         }
       
  1820         return packFloat32( zSign, 0xFF, 0 );
       
  1821     }
       
  1822     if ( aExp == 0 ) {
       
  1823         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
       
  1824         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
       
  1825     }
       
  1826     if ( bExp == 0 ) {
       
  1827         if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
       
  1828         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
       
  1829     }
       
  1830     zExp = aExp + bExp - 0x7F;
       
  1831     aSig = ( aSig | 0x00800000 )<<7;
       
  1832     bSig = ( bSig | 0x00800000 )<<8;
       
  1833     shift64RightJamming( ( (bits64) aSig ) * bSig, 32, &zSig64 );
       
  1834     zSig = zSig64;
       
  1835     if ( 0 <= (sbits32) ( zSig<<1 ) ) {
       
  1836         zSig <<= 1;
       
  1837         --zExp;
       
  1838     }
       
  1839     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
       
  1840 
       
  1841 }
       
  1842 
       
  1843 /*----------------------------------------------------------------------------
       
  1844 | Returns the result of dividing the single-precision floating-point value `a'
       
  1845 | by the corresponding value `b'.  The operation is performed according to the
       
  1846 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1847 *----------------------------------------------------------------------------*/
       
  1848 
       
  1849 float32 float32_div( float32 a, float32 b STATUS_PARAM )
       
  1850 {
       
  1851     flag aSign, bSign, zSign;
       
  1852     int16 aExp, bExp, zExp;
       
  1853     bits32 aSig, bSig, zSig;
       
  1854 
       
  1855     aSig = extractFloat32Frac( a );
       
  1856     aExp = extractFloat32Exp( a );
       
  1857     aSign = extractFloat32Sign( a );
       
  1858     bSig = extractFloat32Frac( b );
       
  1859     bExp = extractFloat32Exp( b );
       
  1860     bSign = extractFloat32Sign( b );
       
  1861     zSign = aSign ^ bSign;
       
  1862     if ( aExp == 0xFF ) {
       
  1863         if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1864         if ( bExp == 0xFF ) {
       
  1865             if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1866             float_raise( float_flag_invalid STATUS_VAR);
       
  1867             return float32_default_nan;
       
  1868         }
       
  1869         return packFloat32( zSign, 0xFF, 0 );
       
  1870     }
       
  1871     if ( bExp == 0xFF ) {
       
  1872         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1873         return packFloat32( zSign, 0, 0 );
       
  1874     }
       
  1875     if ( bExp == 0 ) {
       
  1876         if ( bSig == 0 ) {
       
  1877             if ( ( aExp | aSig ) == 0 ) {
       
  1878                 float_raise( float_flag_invalid STATUS_VAR);
       
  1879                 return float32_default_nan;
       
  1880             }
       
  1881             float_raise( float_flag_divbyzero STATUS_VAR);
       
  1882             return packFloat32( zSign, 0xFF, 0 );
       
  1883         }
       
  1884         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
       
  1885     }
       
  1886     if ( aExp == 0 ) {
       
  1887         if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
       
  1888         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
       
  1889     }
       
  1890     zExp = aExp - bExp + 0x7D;
       
  1891     aSig = ( aSig | 0x00800000 )<<7;
       
  1892     bSig = ( bSig | 0x00800000 )<<8;
       
  1893     if ( bSig <= ( aSig + aSig ) ) {
       
  1894         aSig >>= 1;
       
  1895         ++zExp;
       
  1896     }
       
  1897     zSig = ( ( (bits64) aSig )<<32 ) / bSig;
       
  1898     if ( ( zSig & 0x3F ) == 0 ) {
       
  1899         zSig |= ( (bits64) bSig * zSig != ( (bits64) aSig )<<32 );
       
  1900     }
       
  1901     return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
       
  1902 
       
  1903 }
       
  1904 
       
  1905 /*----------------------------------------------------------------------------
       
  1906 | Returns the remainder of the single-precision floating-point value `a'
       
  1907 | with respect to the corresponding value `b'.  The operation is performed
       
  1908 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  1909 *----------------------------------------------------------------------------*/
       
  1910 
       
  1911 float32 float32_rem( float32 a, float32 b STATUS_PARAM )
       
  1912 {
       
  1913     flag aSign, bSign, zSign;
       
  1914     int16 aExp, bExp, expDiff;
       
  1915     bits32 aSig, bSig;
       
  1916     bits32 q;
       
  1917     bits64 aSig64, bSig64, q64;
       
  1918     bits32 alternateASig;
       
  1919     sbits32 sigMean;
       
  1920 
       
  1921     aSig = extractFloat32Frac( a );
       
  1922     aExp = extractFloat32Exp( a );
       
  1923     aSign = extractFloat32Sign( a );
       
  1924     bSig = extractFloat32Frac( b );
       
  1925     bExp = extractFloat32Exp( b );
       
  1926     bSign = extractFloat32Sign( b );
       
  1927     if ( aExp == 0xFF ) {
       
  1928         if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
       
  1929             return propagateFloat32NaN( a, b STATUS_VAR );
       
  1930         }
       
  1931         float_raise( float_flag_invalid STATUS_VAR);
       
  1932         return float32_default_nan;
       
  1933     }
       
  1934     if ( bExp == 0xFF ) {
       
  1935         if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
       
  1936         return a;
       
  1937     }
       
  1938     if ( bExp == 0 ) {
       
  1939         if ( bSig == 0 ) {
       
  1940             float_raise( float_flag_invalid STATUS_VAR);
       
  1941             return float32_default_nan;
       
  1942         }
       
  1943         normalizeFloat32Subnormal( bSig, &bExp, &bSig );
       
  1944     }
       
  1945     if ( aExp == 0 ) {
       
  1946         if ( aSig == 0 ) return a;
       
  1947         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
       
  1948     }
       
  1949     expDiff = aExp - bExp;
       
  1950     aSig |= 0x00800000;
       
  1951     bSig |= 0x00800000;
       
  1952     if ( expDiff < 32 ) {
       
  1953         aSig <<= 8;
       
  1954         bSig <<= 8;
       
  1955         if ( expDiff < 0 ) {
       
  1956             if ( expDiff < -1 ) return a;
       
  1957             aSig >>= 1;
       
  1958         }
       
  1959         q = ( bSig <= aSig );
       
  1960         if ( q ) aSig -= bSig;
       
  1961         if ( 0 < expDiff ) {
       
  1962             q = ( ( (bits64) aSig )<<32 ) / bSig;
       
  1963             q >>= 32 - expDiff;
       
  1964             bSig >>= 2;
       
  1965             aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
       
  1966         }
       
  1967         else {
       
  1968             aSig >>= 2;
       
  1969             bSig >>= 2;
       
  1970         }
       
  1971     }
       
  1972     else {
       
  1973         if ( bSig <= aSig ) aSig -= bSig;
       
  1974         aSig64 = ( (bits64) aSig )<<40;
       
  1975         bSig64 = ( (bits64) bSig )<<40;
       
  1976         expDiff -= 64;
       
  1977         while ( 0 < expDiff ) {
       
  1978             q64 = estimateDiv128To64( aSig64, 0, bSig64 );
       
  1979             q64 = ( 2 < q64 ) ? q64 - 2 : 0;
       
  1980             aSig64 = - ( ( bSig * q64 )<<38 );
       
  1981             expDiff -= 62;
       
  1982         }
       
  1983         expDiff += 64;
       
  1984         q64 = estimateDiv128To64( aSig64, 0, bSig64 );
       
  1985         q64 = ( 2 < q64 ) ? q64 - 2 : 0;
       
  1986         q = q64>>( 64 - expDiff );
       
  1987         bSig <<= 6;
       
  1988         aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
       
  1989     }
       
  1990     do {
       
  1991         alternateASig = aSig;
       
  1992         ++q;
       
  1993         aSig -= bSig;
       
  1994     } while ( 0 <= (sbits32) aSig );
       
  1995     sigMean = aSig + alternateASig;
       
  1996     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
       
  1997         aSig = alternateASig;
       
  1998     }
       
  1999     zSign = ( (sbits32) aSig < 0 );
       
  2000     if ( zSign ) aSig = - aSig;
       
  2001     return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
       
  2002 
       
  2003 }
       
  2004 
       
  2005 /*----------------------------------------------------------------------------
       
  2006 | Returns the square root of the single-precision floating-point value `a'.
       
  2007 | The operation is performed according to the IEC/IEEE Standard for Binary
       
  2008 | Floating-Point Arithmetic.
       
  2009 *----------------------------------------------------------------------------*/
       
  2010 
       
  2011 float32 float32_sqrt( float32 a STATUS_PARAM )
       
  2012 {
       
  2013     flag aSign;
       
  2014     int16 aExp, zExp;
       
  2015     bits32 aSig, zSig;
       
  2016     bits64 rem, term;
       
  2017 
       
  2018     aSig = extractFloat32Frac( a );
       
  2019     aExp = extractFloat32Exp( a );
       
  2020     aSign = extractFloat32Sign( a );
       
  2021     if ( aExp == 0xFF ) {
       
  2022         if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
       
  2023         if ( ! aSign ) return a;
       
  2024         float_raise( float_flag_invalid STATUS_VAR);
       
  2025         return float32_default_nan;
       
  2026     }
       
  2027     if ( aSign ) {
       
  2028         if ( ( aExp | aSig ) == 0 ) return a;
       
  2029         float_raise( float_flag_invalid STATUS_VAR);
       
  2030         return float32_default_nan;
       
  2031     }
       
  2032     if ( aExp == 0 ) {
       
  2033         if ( aSig == 0 ) return float32_zero;
       
  2034         normalizeFloat32Subnormal( aSig, &aExp, &aSig );
       
  2035     }
       
  2036     zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
       
  2037     aSig = ( aSig | 0x00800000 )<<8;
       
  2038     zSig = estimateSqrt32( aExp, aSig ) + 2;
       
  2039     if ( ( zSig & 0x7F ) <= 5 ) {
       
  2040         if ( zSig < 2 ) {
       
  2041             zSig = 0x7FFFFFFF;
       
  2042             goto roundAndPack;
       
  2043         }
       
  2044         aSig >>= aExp & 1;
       
  2045         term = ( (bits64) zSig ) * zSig;
       
  2046         rem = ( ( (bits64) aSig )<<32 ) - term;
       
  2047         while ( (sbits64) rem < 0 ) {
       
  2048             --zSig;
       
  2049             rem += ( ( (bits64) zSig )<<1 ) | 1;
       
  2050         }
       
  2051         zSig |= ( rem != 0 );
       
  2052     }
       
  2053     shift32RightJamming( zSig, 1, &zSig );
       
  2054  roundAndPack:
       
  2055     return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
       
  2056 
       
  2057 }
       
  2058 
       
  2059 /*----------------------------------------------------------------------------
       
  2060 | Returns 1 if the single-precision floating-point value `a' is equal to
       
  2061 | the corresponding value `b', and 0 otherwise.  The comparison is performed
       
  2062 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  2063 *----------------------------------------------------------------------------*/
       
  2064 
       
  2065 int float32_eq( float32 a, float32 b STATUS_PARAM )
       
  2066 {
       
  2067 
       
  2068     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
       
  2069          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
       
  2070        ) {
       
  2071         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
       
  2072             float_raise( float_flag_invalid STATUS_VAR);
       
  2073         }
       
  2074         return 0;
       
  2075     }
       
  2076     return ( float32_val(a) == float32_val(b) ) ||
       
  2077             ( (bits32) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
       
  2078 
       
  2079 }
       
  2080 
       
  2081 /*----------------------------------------------------------------------------
       
  2082 | Returns 1 if the single-precision floating-point value `a' is less than
       
  2083 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
       
  2084 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2085 | Arithmetic.
       
  2086 *----------------------------------------------------------------------------*/
       
  2087 
       
  2088 int float32_le( float32 a, float32 b STATUS_PARAM )
       
  2089 {
       
  2090     flag aSign, bSign;
       
  2091     bits32 av, bv;
       
  2092 
       
  2093     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
       
  2094          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
       
  2095        ) {
       
  2096         float_raise( float_flag_invalid STATUS_VAR);
       
  2097         return 0;
       
  2098     }
       
  2099     aSign = extractFloat32Sign( a );
       
  2100     bSign = extractFloat32Sign( b );
       
  2101     av = float32_val(a);
       
  2102     bv = float32_val(b);
       
  2103     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
       
  2104     return ( av == bv ) || ( aSign ^ ( av < bv ) );
       
  2105 
       
  2106 }
       
  2107 
       
  2108 /*----------------------------------------------------------------------------
       
  2109 | Returns 1 if the single-precision floating-point value `a' is less than
       
  2110 | the corresponding value `b', and 0 otherwise.  The comparison is performed
       
  2111 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  2112 *----------------------------------------------------------------------------*/
       
  2113 
       
  2114 int float32_lt( float32 a, float32 b STATUS_PARAM )
       
  2115 {
       
  2116     flag aSign, bSign;
       
  2117     bits32 av, bv;
       
  2118 
       
  2119     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
       
  2120          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
       
  2121        ) {
       
  2122         float_raise( float_flag_invalid STATUS_VAR);
       
  2123         return 0;
       
  2124     }
       
  2125     aSign = extractFloat32Sign( a );
       
  2126     bSign = extractFloat32Sign( b );
       
  2127     av = float32_val(a);
       
  2128     bv = float32_val(b);
       
  2129     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
       
  2130     return ( av != bv ) && ( aSign ^ ( av < bv ) );
       
  2131 
       
  2132 }
       
  2133 
       
  2134 /*----------------------------------------------------------------------------
       
  2135 | Returns 1 if the single-precision floating-point value `a' is equal to
       
  2136 | the corresponding value `b', and 0 otherwise.  The invalid exception is
       
  2137 | raised if either operand is a NaN.  Otherwise, the comparison is performed
       
  2138 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  2139 *----------------------------------------------------------------------------*/
       
  2140 
       
  2141 int float32_eq_signaling( float32 a, float32 b STATUS_PARAM )
       
  2142 {
       
  2143     bits32 av, bv;
       
  2144 
       
  2145     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
       
  2146          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
       
  2147        ) {
       
  2148         float_raise( float_flag_invalid STATUS_VAR);
       
  2149         return 0;
       
  2150     }
       
  2151     av = float32_val(a);
       
  2152     bv = float32_val(b);
       
  2153     return ( av == bv ) || ( (bits32) ( ( av | bv )<<1 ) == 0 );
       
  2154 
       
  2155 }
       
  2156 
       
  2157 /*----------------------------------------------------------------------------
       
  2158 | Returns 1 if the single-precision floating-point value `a' is less than or
       
  2159 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
       
  2160 | cause an exception.  Otherwise, the comparison is performed according to the
       
  2161 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  2162 *----------------------------------------------------------------------------*/
       
  2163 
       
  2164 int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
       
  2165 {
       
  2166     flag aSign, bSign;
       
  2167     bits32 av, bv;
       
  2168 
       
  2169     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
       
  2170          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
       
  2171        ) {
       
  2172         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
       
  2173             float_raise( float_flag_invalid STATUS_VAR);
       
  2174         }
       
  2175         return 0;
       
  2176     }
       
  2177     aSign = extractFloat32Sign( a );
       
  2178     bSign = extractFloat32Sign( b );
       
  2179     av = float32_val(a);
       
  2180     bv = float32_val(b);
       
  2181     if ( aSign != bSign ) return aSign || ( (bits32) ( ( av | bv )<<1 ) == 0 );
       
  2182     return ( av == bv ) || ( aSign ^ ( av < bv ) );
       
  2183 
       
  2184 }
       
  2185 
       
  2186 /*----------------------------------------------------------------------------
       
  2187 | Returns 1 if the single-precision floating-point value `a' is less than
       
  2188 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
       
  2189 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
       
  2190 | Standard for Binary Floating-Point Arithmetic.
       
  2191 *----------------------------------------------------------------------------*/
       
  2192 
       
  2193 int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
       
  2194 {
       
  2195     flag aSign, bSign;
       
  2196     bits32 av, bv;
       
  2197 
       
  2198     if (    ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
       
  2199          || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
       
  2200        ) {
       
  2201         if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
       
  2202             float_raise( float_flag_invalid STATUS_VAR);
       
  2203         }
       
  2204         return 0;
       
  2205     }
       
  2206     aSign = extractFloat32Sign( a );
       
  2207     bSign = extractFloat32Sign( b );
       
  2208     av = float32_val(a);
       
  2209     bv = float32_val(b);
       
  2210     if ( aSign != bSign ) return aSign && ( (bits32) ( ( av | bv )<<1 ) != 0 );
       
  2211     return ( av != bv ) && ( aSign ^ ( av < bv ) );
       
  2212 
       
  2213 }
       
  2214 
       
  2215 /*----------------------------------------------------------------------------
       
  2216 | Returns the result of converting the double-precision floating-point value
       
  2217 | `a' to the 32-bit two's complement integer format.  The conversion is
       
  2218 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2219 | Arithmetic---which means in particular that the conversion is rounded
       
  2220 | according to the current rounding mode.  If `a' is a NaN, the largest
       
  2221 | positive integer is returned.  Otherwise, if the conversion overflows, the
       
  2222 | largest integer with the same sign as `a' is returned.
       
  2223 *----------------------------------------------------------------------------*/
       
  2224 
       
  2225 int32 float64_to_int32( float64 a STATUS_PARAM )
       
  2226 {
       
  2227     flag aSign;
       
  2228     int16 aExp, shiftCount;
       
  2229     bits64 aSig;
       
  2230 
       
  2231     aSig = extractFloat64Frac( a );
       
  2232     aExp = extractFloat64Exp( a );
       
  2233     aSign = extractFloat64Sign( a );
       
  2234     if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
       
  2235     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
       
  2236     shiftCount = 0x42C - aExp;
       
  2237     if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
       
  2238     return roundAndPackInt32( aSign, aSig STATUS_VAR );
       
  2239 
       
  2240 }
       
  2241 
       
  2242 /*----------------------------------------------------------------------------
       
  2243 | Returns the result of converting the double-precision floating-point value
       
  2244 | `a' to the 32-bit two's complement integer format.  The conversion is
       
  2245 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2246 | Arithmetic, except that the conversion is always rounded toward zero.
       
  2247 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
       
  2248 | the conversion overflows, the largest integer with the same sign as `a' is
       
  2249 | returned.
       
  2250 *----------------------------------------------------------------------------*/
       
  2251 
       
  2252 int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
       
  2253 {
       
  2254     flag aSign;
       
  2255     int16 aExp, shiftCount;
       
  2256     bits64 aSig, savedASig;
       
  2257     int32 z;
       
  2258 
       
  2259     aSig = extractFloat64Frac( a );
       
  2260     aExp = extractFloat64Exp( a );
       
  2261     aSign = extractFloat64Sign( a );
       
  2262     if ( 0x41E < aExp ) {
       
  2263         if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
       
  2264         goto invalid;
       
  2265     }
       
  2266     else if ( aExp < 0x3FF ) {
       
  2267         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  2268         return 0;
       
  2269     }
       
  2270     aSig |= LIT64( 0x0010000000000000 );
       
  2271     shiftCount = 0x433 - aExp;
       
  2272     savedASig = aSig;
       
  2273     aSig >>= shiftCount;
       
  2274     z = aSig;
       
  2275     if ( aSign ) z = - z;
       
  2276     if ( ( z < 0 ) ^ aSign ) {
       
  2277  invalid:
       
  2278         float_raise( float_flag_invalid STATUS_VAR);
       
  2279         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
       
  2280     }
       
  2281     if ( ( aSig<<shiftCount ) != savedASig ) {
       
  2282         STATUS(float_exception_flags) |= float_flag_inexact;
       
  2283     }
       
  2284     return z;
       
  2285 
       
  2286 }
       
  2287 
       
  2288 /*----------------------------------------------------------------------------
       
  2289 | Returns the result of converting the double-precision floating-point value
       
  2290 | `a' to the 64-bit two's complement integer format.  The conversion is
       
  2291 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2292 | Arithmetic---which means in particular that the conversion is rounded
       
  2293 | according to the current rounding mode.  If `a' is a NaN, the largest
       
  2294 | positive integer is returned.  Otherwise, if the conversion overflows, the
       
  2295 | largest integer with the same sign as `a' is returned.
       
  2296 *----------------------------------------------------------------------------*/
       
  2297 
       
  2298 int64 float64_to_int64( float64 a STATUS_PARAM )
       
  2299 {
       
  2300     flag aSign;
       
  2301     int16 aExp, shiftCount;
       
  2302     bits64 aSig, aSigExtra;
       
  2303 
       
  2304     aSig = extractFloat64Frac( a );
       
  2305     aExp = extractFloat64Exp( a );
       
  2306     aSign = extractFloat64Sign( a );
       
  2307     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
       
  2308     shiftCount = 0x433 - aExp;
       
  2309     if ( shiftCount <= 0 ) {
       
  2310         if ( 0x43E < aExp ) {
       
  2311             float_raise( float_flag_invalid STATUS_VAR);
       
  2312             if (    ! aSign
       
  2313                  || (    ( aExp == 0x7FF )
       
  2314                       && ( aSig != LIT64( 0x0010000000000000 ) ) )
       
  2315                ) {
       
  2316                 return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  2317             }
       
  2318             return (sbits64) LIT64( 0x8000000000000000 );
       
  2319         }
       
  2320         aSigExtra = 0;
       
  2321         aSig <<= - shiftCount;
       
  2322     }
       
  2323     else {
       
  2324         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
       
  2325     }
       
  2326     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
       
  2327 
       
  2328 }
       
  2329 
       
  2330 /*----------------------------------------------------------------------------
       
  2331 | Returns the result of converting the double-precision floating-point value
       
  2332 | `a' to the 64-bit two's complement integer format.  The conversion is
       
  2333 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2334 | Arithmetic, except that the conversion is always rounded toward zero.
       
  2335 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
       
  2336 | the conversion overflows, the largest integer with the same sign as `a' is
       
  2337 | returned.
       
  2338 *----------------------------------------------------------------------------*/
       
  2339 
       
  2340 int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
       
  2341 {
       
  2342     flag aSign;
       
  2343     int16 aExp, shiftCount;
       
  2344     bits64 aSig;
       
  2345     int64 z;
       
  2346 
       
  2347     aSig = extractFloat64Frac( a );
       
  2348     aExp = extractFloat64Exp( a );
       
  2349     aSign = extractFloat64Sign( a );
       
  2350     if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
       
  2351     shiftCount = aExp - 0x433;
       
  2352     if ( 0 <= shiftCount ) {
       
  2353         if ( 0x43E <= aExp ) {
       
  2354             if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
       
  2355                 float_raise( float_flag_invalid STATUS_VAR);
       
  2356                 if (    ! aSign
       
  2357                      || (    ( aExp == 0x7FF )
       
  2358                           && ( aSig != LIT64( 0x0010000000000000 ) ) )
       
  2359                    ) {
       
  2360                     return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  2361                 }
       
  2362             }
       
  2363             return (sbits64) LIT64( 0x8000000000000000 );
       
  2364         }
       
  2365         z = aSig<<shiftCount;
       
  2366     }
       
  2367     else {
       
  2368         if ( aExp < 0x3FE ) {
       
  2369             if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  2370             return 0;
       
  2371         }
       
  2372         z = aSig>>( - shiftCount );
       
  2373         if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
       
  2374             STATUS(float_exception_flags) |= float_flag_inexact;
       
  2375         }
       
  2376     }
       
  2377     if ( aSign ) z = - z;
       
  2378     return z;
       
  2379 
       
  2380 }
       
  2381 
       
  2382 /*----------------------------------------------------------------------------
       
  2383 | Returns the result of converting the double-precision floating-point value
       
  2384 | `a' to the single-precision floating-point format.  The conversion is
       
  2385 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2386 | Arithmetic.
       
  2387 *----------------------------------------------------------------------------*/
       
  2388 
       
  2389 float32 float64_to_float32( float64 a STATUS_PARAM )
       
  2390 {
       
  2391     flag aSign;
       
  2392     int16 aExp;
       
  2393     bits64 aSig;
       
  2394     bits32 zSig;
       
  2395 
       
  2396     aSig = extractFloat64Frac( a );
       
  2397     aExp = extractFloat64Exp( a );
       
  2398     aSign = extractFloat64Sign( a );
       
  2399     if ( aExp == 0x7FF ) {
       
  2400         if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) );
       
  2401         return packFloat32( aSign, 0xFF, 0 );
       
  2402     }
       
  2403     shift64RightJamming( aSig, 22, &aSig );
       
  2404     zSig = aSig;
       
  2405     if ( aExp || zSig ) {
       
  2406         zSig |= 0x40000000;
       
  2407         aExp -= 0x381;
       
  2408     }
       
  2409     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
       
  2410 
       
  2411 }
       
  2412 
       
  2413 
       
  2414 /*----------------------------------------------------------------------------
       
  2415 | Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
       
  2416 | half-precision floating-point value, returning the result.  After being
       
  2417 | shifted into the proper positions, the three fields are simply added
       
  2418 | together to form the result.  This means that any integer portion of `zSig'
       
  2419 | will be added into the exponent.  Since a properly normalized significand
       
  2420 | will have an integer portion equal to 1, the `zExp' input should be 1 less
       
  2421 | than the desired result exponent whenever `zSig' is a complete, normalized
       
  2422 | significand.
       
  2423 *----------------------------------------------------------------------------*/
       
  2424 static bits16 packFloat16(flag zSign, int16 zExp, bits16 zSig)
       
  2425 {
       
  2426     return (((bits32)zSign) << 15) + (((bits32)zExp) << 10) + zSig;
       
  2427 }
       
  2428 
       
  2429 float32 float16_to_float32( bits16 a, flag ieee STATUS_PARAM )
       
  2430 {
       
  2431     flag aSign;
       
  2432     int16 aExp;
       
  2433     bits32 aSig;
       
  2434 
       
  2435     aSign = a >> 15;
       
  2436     aExp = (a >> 10) & 0x1f;
       
  2437     aSig = a & 0x3ff;
       
  2438 
       
  2439     if (aExp == 0x1f && ieee) {
       
  2440         if (aSig) {
       
  2441             /* Make sure correct exceptions are raised.  */
       
  2442             float32ToCommonNaN(a STATUS_VAR);
       
  2443             aSig |= 0x200;
       
  2444         }
       
  2445         return packFloat32(aSign, 0xff, aSig << 13);
       
  2446     }
       
  2447     if (aExp == 0) {
       
  2448         int8 shiftCount;
       
  2449 
       
  2450         if (aSig == 0) {
       
  2451             return packFloat32(aSign, 0, 0);
       
  2452         }
       
  2453 
       
  2454         shiftCount = countLeadingZeros32( aSig ) - 21;
       
  2455         aSig = aSig << shiftCount;
       
  2456         aExp = -shiftCount;
       
  2457     }
       
  2458     return packFloat32( aSign, aExp + 0x70, aSig << 13);
       
  2459 }
       
  2460 
       
  2461 bits16 float32_to_float16( float32 a, flag ieee STATUS_PARAM)
       
  2462 {
       
  2463     flag aSign;
       
  2464     int16 aExp;
       
  2465     bits32 aSig;
       
  2466     bits32 mask;
       
  2467     bits32 increment;
       
  2468     int8 roundingMode;
       
  2469 
       
  2470     aSig = extractFloat32Frac( a );
       
  2471     aExp = extractFloat32Exp( a );
       
  2472     aSign = extractFloat32Sign( a );
       
  2473     if ( aExp == 0xFF ) {
       
  2474         if (aSig) {
       
  2475             /* Make sure correct exceptions are raised.  */
       
  2476             float32ToCommonNaN(a STATUS_VAR);
       
  2477             aSig |= 0x00400000;
       
  2478         }
       
  2479         return packFloat16(aSign, 0x1f, aSig >> 13);
       
  2480     }
       
  2481     if (aExp == 0 && aSign == 0) {
       
  2482         return packFloat16(aSign, 0, 0);
       
  2483     }
       
  2484     /* Decimal point between bits 22 and 23.  */
       
  2485     aSig |= 0x00800000;
       
  2486     aExp -= 0x7f;
       
  2487     if (aExp < -14) {
       
  2488         mask = 0x007fffff;
       
  2489         if (aExp < -25) {
       
  2490             aExp = -26;
       
  2491         } else if (aExp != -25) {
       
  2492             mask >>= 24 + aExp;
       
  2493         }
       
  2494     } else {
       
  2495         mask = 0x00001fff;
       
  2496     }
       
  2497     if (aSig & mask) {
       
  2498         float_raise( float_flag_underflow STATUS_VAR );
       
  2499         roundingMode = STATUS(float_rounding_mode);
       
  2500         switch (roundingMode) {
       
  2501         case float_round_nearest_even:
       
  2502             increment = (mask + 1) >> 1;
       
  2503             if ((aSig & mask) == increment) {
       
  2504                 increment = aSig & (increment << 1);
       
  2505             }
       
  2506             break;
       
  2507         case float_round_up:
       
  2508             increment = aSign ? 0 : mask;
       
  2509             break;
       
  2510         case float_round_down:
       
  2511             increment = aSign ? mask : 0;
       
  2512             break;
       
  2513         default: /* round_to_zero */
       
  2514             increment = 0;
       
  2515             break;
       
  2516         }
       
  2517         aSig += increment;
       
  2518         if (aSig >= 0x01000000) {
       
  2519             aSig >>= 1;
       
  2520             aExp++;
       
  2521         }
       
  2522     } else if (aExp < -14
       
  2523           && STATUS(float_detect_tininess) == float_tininess_before_rounding) {
       
  2524         float_raise( float_flag_underflow STATUS_VAR);
       
  2525     }
       
  2526 
       
  2527     if (ieee) {
       
  2528         if (aExp > 15) {
       
  2529             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
       
  2530             return packFloat16(aSign, 0x1f, 0);
       
  2531         }
       
  2532     } else {
       
  2533         if (aExp > 16) {
       
  2534             float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
       
  2535             return packFloat16(aSign, 0x1f, 0x3ff);
       
  2536         }
       
  2537     }
       
  2538     if (aExp < -24) {
       
  2539         return packFloat16(aSign, 0, 0);
       
  2540     }
       
  2541     if (aExp < -14) {
       
  2542         aSig >>= -14 - aExp;
       
  2543         aExp = -14;
       
  2544     }
       
  2545     return packFloat16(aSign, aExp + 14, aSig >> 13);
       
  2546 }
       
  2547 
       
  2548 #ifdef FLOATX80
       
  2549 
       
  2550 /*----------------------------------------------------------------------------
       
  2551 | Returns the result of converting the double-precision floating-point value
       
  2552 | `a' to the extended double-precision floating-point format.  The conversion
       
  2553 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2554 | Arithmetic.
       
  2555 *----------------------------------------------------------------------------*/
       
  2556 
       
  2557 floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
       
  2558 {
       
  2559     flag aSign;
       
  2560     int16 aExp;
       
  2561     bits64 aSig;
       
  2562 
       
  2563     aSig = extractFloat64Frac( a );
       
  2564     aExp = extractFloat64Exp( a );
       
  2565     aSign = extractFloat64Sign( a );
       
  2566     if ( aExp == 0x7FF ) {
       
  2567         if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) );
       
  2568         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  2569     }
       
  2570     if ( aExp == 0 ) {
       
  2571         if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
       
  2572         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
       
  2573     }
       
  2574     return
       
  2575         packFloatx80(
       
  2576             aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
       
  2577 
       
  2578 }
       
  2579 
       
  2580 #endif
       
  2581 
       
  2582 #ifdef FLOAT128
       
  2583 
       
  2584 /*----------------------------------------------------------------------------
       
  2585 | Returns the result of converting the double-precision floating-point value
       
  2586 | `a' to the quadruple-precision floating-point format.  The conversion is
       
  2587 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  2588 | Arithmetic.
       
  2589 *----------------------------------------------------------------------------*/
       
  2590 
       
  2591 float128 float64_to_float128( float64 a STATUS_PARAM )
       
  2592 {
       
  2593     flag aSign;
       
  2594     int16 aExp;
       
  2595     bits64 aSig, zSig0, zSig1;
       
  2596 
       
  2597     aSig = extractFloat64Frac( a );
       
  2598     aExp = extractFloat64Exp( a );
       
  2599     aSign = extractFloat64Sign( a );
       
  2600     if ( aExp == 0x7FF ) {
       
  2601         if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) );
       
  2602         return packFloat128( aSign, 0x7FFF, 0, 0 );
       
  2603     }
       
  2604     if ( aExp == 0 ) {
       
  2605         if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
       
  2606         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
       
  2607         --aExp;
       
  2608     }
       
  2609     shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
       
  2610     return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
       
  2611 
       
  2612 }
       
  2613 
       
  2614 #endif
       
  2615 
       
  2616 /*----------------------------------------------------------------------------
       
  2617 | Rounds the double-precision floating-point value `a' to an integer, and
       
  2618 | returns the result as a double-precision floating-point value.  The
       
  2619 | operation is performed according to the IEC/IEEE Standard for Binary
       
  2620 | Floating-Point Arithmetic.
       
  2621 *----------------------------------------------------------------------------*/
       
  2622 
       
  2623 float64 float64_round_to_int( float64 a STATUS_PARAM )
       
  2624 {
       
  2625     flag aSign;
       
  2626     int16 aExp;
       
  2627     bits64 lastBitMask, roundBitsMask;
       
  2628     int8 roundingMode;
       
  2629     bits64 z;
       
  2630 
       
  2631     aExp = extractFloat64Exp( a );
       
  2632     if ( 0x433 <= aExp ) {
       
  2633         if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
       
  2634             return propagateFloat64NaN( a, a STATUS_VAR );
       
  2635         }
       
  2636         return a;
       
  2637     }
       
  2638     if ( aExp < 0x3FF ) {
       
  2639         if ( (bits64) ( float64_val(a)<<1 ) == 0 ) return a;
       
  2640         STATUS(float_exception_flags) |= float_flag_inexact;
       
  2641         aSign = extractFloat64Sign( a );
       
  2642         switch ( STATUS(float_rounding_mode) ) {
       
  2643          case float_round_nearest_even:
       
  2644             if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
       
  2645                 return packFloat64( aSign, 0x3FF, 0 );
       
  2646             }
       
  2647             break;
       
  2648          case float_round_down:
       
  2649             return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
       
  2650          case float_round_up:
       
  2651             return make_float64(
       
  2652             aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
       
  2653         }
       
  2654         return packFloat64( aSign, 0, 0 );
       
  2655     }
       
  2656     lastBitMask = 1;
       
  2657     lastBitMask <<= 0x433 - aExp;
       
  2658     roundBitsMask = lastBitMask - 1;
       
  2659     z = float64_val(a);
       
  2660     roundingMode = STATUS(float_rounding_mode);
       
  2661     if ( roundingMode == float_round_nearest_even ) {
       
  2662         z += lastBitMask>>1;
       
  2663         if ( ( z & roundBitsMask ) == 0 ) z &= ~ lastBitMask;
       
  2664     }
       
  2665     else if ( roundingMode != float_round_to_zero ) {
       
  2666         if ( extractFloat64Sign( make_float64(z) ) ^ ( roundingMode == float_round_up ) ) {
       
  2667             z += roundBitsMask;
       
  2668         }
       
  2669     }
       
  2670     z &= ~ roundBitsMask;
       
  2671     if ( z != float64_val(a) )
       
  2672         STATUS(float_exception_flags) |= float_flag_inexact;
       
  2673     return make_float64(z);
       
  2674 
       
  2675 }
       
  2676 
       
  2677 float64 float64_trunc_to_int( float64 a STATUS_PARAM)
       
  2678 {
       
  2679     int oldmode;
       
  2680     float64 res;
       
  2681     oldmode = STATUS(float_rounding_mode);
       
  2682     STATUS(float_rounding_mode) = float_round_to_zero;
       
  2683     res = float64_round_to_int(a STATUS_VAR);
       
  2684     STATUS(float_rounding_mode) = oldmode;
       
  2685     return res;
       
  2686 }
       
  2687 
       
  2688 /*----------------------------------------------------------------------------
       
  2689 | Returns the result of adding the absolute values of the double-precision
       
  2690 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
       
  2691 | before being returned.  `zSign' is ignored if the result is a NaN.
       
  2692 | The addition is performed according to the IEC/IEEE Standard for Binary
       
  2693 | Floating-Point Arithmetic.
       
  2694 *----------------------------------------------------------------------------*/
       
  2695 
       
  2696 static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
       
  2697 {
       
  2698     int16 aExp, bExp, zExp;
       
  2699     bits64 aSig, bSig, zSig;
       
  2700     int16 expDiff;
       
  2701 
       
  2702     aSig = extractFloat64Frac( a );
       
  2703     aExp = extractFloat64Exp( a );
       
  2704     bSig = extractFloat64Frac( b );
       
  2705     bExp = extractFloat64Exp( b );
       
  2706     expDiff = aExp - bExp;
       
  2707     aSig <<= 9;
       
  2708     bSig <<= 9;
       
  2709     if ( 0 < expDiff ) {
       
  2710         if ( aExp == 0x7FF ) {
       
  2711             if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2712             return a;
       
  2713         }
       
  2714         if ( bExp == 0 ) {
       
  2715             --expDiff;
       
  2716         }
       
  2717         else {
       
  2718             bSig |= LIT64( 0x2000000000000000 );
       
  2719         }
       
  2720         shift64RightJamming( bSig, expDiff, &bSig );
       
  2721         zExp = aExp;
       
  2722     }
       
  2723     else if ( expDiff < 0 ) {
       
  2724         if ( bExp == 0x7FF ) {
       
  2725             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2726             return packFloat64( zSign, 0x7FF, 0 );
       
  2727         }
       
  2728         if ( aExp == 0 ) {
       
  2729             ++expDiff;
       
  2730         }
       
  2731         else {
       
  2732             aSig |= LIT64( 0x2000000000000000 );
       
  2733         }
       
  2734         shift64RightJamming( aSig, - expDiff, &aSig );
       
  2735         zExp = bExp;
       
  2736     }
       
  2737     else {
       
  2738         if ( aExp == 0x7FF ) {
       
  2739             if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2740             return a;
       
  2741         }
       
  2742         if ( aExp == 0 ) {
       
  2743             if ( STATUS(flush_to_zero) ) return packFloat64( zSign, 0, 0 );
       
  2744             return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
       
  2745         }
       
  2746         zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
       
  2747         zExp = aExp;
       
  2748         goto roundAndPack;
       
  2749     }
       
  2750     aSig |= LIT64( 0x2000000000000000 );
       
  2751     zSig = ( aSig + bSig )<<1;
       
  2752     --zExp;
       
  2753     if ( (sbits64) zSig < 0 ) {
       
  2754         zSig = aSig + bSig;
       
  2755         ++zExp;
       
  2756     }
       
  2757  roundAndPack:
       
  2758     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
       
  2759 
       
  2760 }
       
  2761 
       
  2762 /*----------------------------------------------------------------------------
       
  2763 | Returns the result of subtracting the absolute values of the double-
       
  2764 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
       
  2765 | difference is negated before being returned.  `zSign' is ignored if the
       
  2766 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
       
  2767 | Standard for Binary Floating-Point Arithmetic.
       
  2768 *----------------------------------------------------------------------------*/
       
  2769 
       
  2770 static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
       
  2771 {
       
  2772     int16 aExp, bExp, zExp;
       
  2773     bits64 aSig, bSig, zSig;
       
  2774     int16 expDiff;
       
  2775 
       
  2776     aSig = extractFloat64Frac( a );
       
  2777     aExp = extractFloat64Exp( a );
       
  2778     bSig = extractFloat64Frac( b );
       
  2779     bExp = extractFloat64Exp( b );
       
  2780     expDiff = aExp - bExp;
       
  2781     aSig <<= 10;
       
  2782     bSig <<= 10;
       
  2783     if ( 0 < expDiff ) goto aExpBigger;
       
  2784     if ( expDiff < 0 ) goto bExpBigger;
       
  2785     if ( aExp == 0x7FF ) {
       
  2786         if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2787         float_raise( float_flag_invalid STATUS_VAR);
       
  2788         return float64_default_nan;
       
  2789     }
       
  2790     if ( aExp == 0 ) {
       
  2791         aExp = 1;
       
  2792         bExp = 1;
       
  2793     }
       
  2794     if ( bSig < aSig ) goto aBigger;
       
  2795     if ( aSig < bSig ) goto bBigger;
       
  2796     return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
       
  2797  bExpBigger:
       
  2798     if ( bExp == 0x7FF ) {
       
  2799         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2800         return packFloat64( zSign ^ 1, 0x7FF, 0 );
       
  2801     }
       
  2802     if ( aExp == 0 ) {
       
  2803         ++expDiff;
       
  2804     }
       
  2805     else {
       
  2806         aSig |= LIT64( 0x4000000000000000 );
       
  2807     }
       
  2808     shift64RightJamming( aSig, - expDiff, &aSig );
       
  2809     bSig |= LIT64( 0x4000000000000000 );
       
  2810  bBigger:
       
  2811     zSig = bSig - aSig;
       
  2812     zExp = bExp;
       
  2813     zSign ^= 1;
       
  2814     goto normalizeRoundAndPack;
       
  2815  aExpBigger:
       
  2816     if ( aExp == 0x7FF ) {
       
  2817         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2818         return a;
       
  2819     }
       
  2820     if ( bExp == 0 ) {
       
  2821         --expDiff;
       
  2822     }
       
  2823     else {
       
  2824         bSig |= LIT64( 0x4000000000000000 );
       
  2825     }
       
  2826     shift64RightJamming( bSig, expDiff, &bSig );
       
  2827     aSig |= LIT64( 0x4000000000000000 );
       
  2828  aBigger:
       
  2829     zSig = aSig - bSig;
       
  2830     zExp = aExp;
       
  2831  normalizeRoundAndPack:
       
  2832     --zExp;
       
  2833     return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
       
  2834 
       
  2835 }
       
  2836 
       
  2837 /*----------------------------------------------------------------------------
       
  2838 | Returns the result of adding the double-precision floating-point values `a'
       
  2839 | and `b'.  The operation is performed according to the IEC/IEEE Standard for
       
  2840 | Binary Floating-Point Arithmetic.
       
  2841 *----------------------------------------------------------------------------*/
       
  2842 
       
  2843 float64 float64_add( float64 a, float64 b STATUS_PARAM )
       
  2844 {
       
  2845     flag aSign, bSign;
       
  2846 
       
  2847     aSign = extractFloat64Sign( a );
       
  2848     bSign = extractFloat64Sign( b );
       
  2849     if ( aSign == bSign ) {
       
  2850         return addFloat64Sigs( a, b, aSign STATUS_VAR );
       
  2851     }
       
  2852     else {
       
  2853         return subFloat64Sigs( a, b, aSign STATUS_VAR );
       
  2854     }
       
  2855 
       
  2856 }
       
  2857 
       
  2858 /*----------------------------------------------------------------------------
       
  2859 | Returns the result of subtracting the double-precision floating-point values
       
  2860 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
       
  2861 | for Binary Floating-Point Arithmetic.
       
  2862 *----------------------------------------------------------------------------*/
       
  2863 
       
  2864 float64 float64_sub( float64 a, float64 b STATUS_PARAM )
       
  2865 {
       
  2866     flag aSign, bSign;
       
  2867 
       
  2868     aSign = extractFloat64Sign( a );
       
  2869     bSign = extractFloat64Sign( b );
       
  2870     if ( aSign == bSign ) {
       
  2871         return subFloat64Sigs( a, b, aSign STATUS_VAR );
       
  2872     }
       
  2873     else {
       
  2874         return addFloat64Sigs( a, b, aSign STATUS_VAR );
       
  2875     }
       
  2876 
       
  2877 }
       
  2878 
       
  2879 /*----------------------------------------------------------------------------
       
  2880 | Returns the result of multiplying the double-precision floating-point values
       
  2881 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
       
  2882 | for Binary Floating-Point Arithmetic.
       
  2883 *----------------------------------------------------------------------------*/
       
  2884 
       
  2885 float64 float64_mul( float64 a, float64 b STATUS_PARAM )
       
  2886 {
       
  2887     flag aSign, bSign, zSign;
       
  2888     int16 aExp, bExp, zExp;
       
  2889     bits64 aSig, bSig, zSig0, zSig1;
       
  2890 
       
  2891     aSig = extractFloat64Frac( a );
       
  2892     aExp = extractFloat64Exp( a );
       
  2893     aSign = extractFloat64Sign( a );
       
  2894     bSig = extractFloat64Frac( b );
       
  2895     bExp = extractFloat64Exp( b );
       
  2896     bSign = extractFloat64Sign( b );
       
  2897     zSign = aSign ^ bSign;
       
  2898     if ( aExp == 0x7FF ) {
       
  2899         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
       
  2900             return propagateFloat64NaN( a, b STATUS_VAR );
       
  2901         }
       
  2902         if ( ( bExp | bSig ) == 0 ) {
       
  2903             float_raise( float_flag_invalid STATUS_VAR);
       
  2904             return float64_default_nan;
       
  2905         }
       
  2906         return packFloat64( zSign, 0x7FF, 0 );
       
  2907     }
       
  2908     if ( bExp == 0x7FF ) {
       
  2909         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2910         if ( ( aExp | aSig ) == 0 ) {
       
  2911             float_raise( float_flag_invalid STATUS_VAR);
       
  2912             return float64_default_nan;
       
  2913         }
       
  2914         return packFloat64( zSign, 0x7FF, 0 );
       
  2915     }
       
  2916     if ( aExp == 0 ) {
       
  2917         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
       
  2918         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
       
  2919     }
       
  2920     if ( bExp == 0 ) {
       
  2921         if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
       
  2922         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
       
  2923     }
       
  2924     zExp = aExp + bExp - 0x3FF;
       
  2925     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
       
  2926     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
       
  2927     mul64To128( aSig, bSig, &zSig0, &zSig1 );
       
  2928     zSig0 |= ( zSig1 != 0 );
       
  2929     if ( 0 <= (sbits64) ( zSig0<<1 ) ) {
       
  2930         zSig0 <<= 1;
       
  2931         --zExp;
       
  2932     }
       
  2933     return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
       
  2934 
       
  2935 }
       
  2936 
       
  2937 /*----------------------------------------------------------------------------
       
  2938 | Returns the result of dividing the double-precision floating-point value `a'
       
  2939 | by the corresponding value `b'.  The operation is performed according to
       
  2940 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  2941 *----------------------------------------------------------------------------*/
       
  2942 
       
  2943 float64 float64_div( float64 a, float64 b STATUS_PARAM )
       
  2944 {
       
  2945     flag aSign, bSign, zSign;
       
  2946     int16 aExp, bExp, zExp;
       
  2947     bits64 aSig, bSig, zSig;
       
  2948     bits64 rem0, rem1;
       
  2949     bits64 term0, term1;
       
  2950 
       
  2951     aSig = extractFloat64Frac( a );
       
  2952     aExp = extractFloat64Exp( a );
       
  2953     aSign = extractFloat64Sign( a );
       
  2954     bSig = extractFloat64Frac( b );
       
  2955     bExp = extractFloat64Exp( b );
       
  2956     bSign = extractFloat64Sign( b );
       
  2957     zSign = aSign ^ bSign;
       
  2958     if ( aExp == 0x7FF ) {
       
  2959         if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2960         if ( bExp == 0x7FF ) {
       
  2961             if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2962             float_raise( float_flag_invalid STATUS_VAR);
       
  2963             return float64_default_nan;
       
  2964         }
       
  2965         return packFloat64( zSign, 0x7FF, 0 );
       
  2966     }
       
  2967     if ( bExp == 0x7FF ) {
       
  2968         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  2969         return packFloat64( zSign, 0, 0 );
       
  2970     }
       
  2971     if ( bExp == 0 ) {
       
  2972         if ( bSig == 0 ) {
       
  2973             if ( ( aExp | aSig ) == 0 ) {
       
  2974                 float_raise( float_flag_invalid STATUS_VAR);
       
  2975                 return float64_default_nan;
       
  2976             }
       
  2977             float_raise( float_flag_divbyzero STATUS_VAR);
       
  2978             return packFloat64( zSign, 0x7FF, 0 );
       
  2979         }
       
  2980         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
       
  2981     }
       
  2982     if ( aExp == 0 ) {
       
  2983         if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
       
  2984         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
       
  2985     }
       
  2986     zExp = aExp - bExp + 0x3FD;
       
  2987     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
       
  2988     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
       
  2989     if ( bSig <= ( aSig + aSig ) ) {
       
  2990         aSig >>= 1;
       
  2991         ++zExp;
       
  2992     }
       
  2993     zSig = estimateDiv128To64( aSig, 0, bSig );
       
  2994     if ( ( zSig & 0x1FF ) <= 2 ) {
       
  2995         mul64To128( bSig, zSig, &term0, &term1 );
       
  2996         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
       
  2997         while ( (sbits64) rem0 < 0 ) {
       
  2998             --zSig;
       
  2999             add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
       
  3000         }
       
  3001         zSig |= ( rem1 != 0 );
       
  3002     }
       
  3003     return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
       
  3004 
       
  3005 }
       
  3006 
       
  3007 /*----------------------------------------------------------------------------
       
  3008 | Returns the remainder of the double-precision floating-point value `a'
       
  3009 | with respect to the corresponding value `b'.  The operation is performed
       
  3010 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3011 *----------------------------------------------------------------------------*/
       
  3012 
       
  3013 float64 float64_rem( float64 a, float64 b STATUS_PARAM )
       
  3014 {
       
  3015     flag aSign, bSign, zSign;
       
  3016     int16 aExp, bExp, expDiff;
       
  3017     bits64 aSig, bSig;
       
  3018     bits64 q, alternateASig;
       
  3019     sbits64 sigMean;
       
  3020 
       
  3021     aSig = extractFloat64Frac( a );
       
  3022     aExp = extractFloat64Exp( a );
       
  3023     aSign = extractFloat64Sign( a );
       
  3024     bSig = extractFloat64Frac( b );
       
  3025     bExp = extractFloat64Exp( b );
       
  3026     bSign = extractFloat64Sign( b );
       
  3027     if ( aExp == 0x7FF ) {
       
  3028         if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
       
  3029             return propagateFloat64NaN( a, b STATUS_VAR );
       
  3030         }
       
  3031         float_raise( float_flag_invalid STATUS_VAR);
       
  3032         return float64_default_nan;
       
  3033     }
       
  3034     if ( bExp == 0x7FF ) {
       
  3035         if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
       
  3036         return a;
       
  3037     }
       
  3038     if ( bExp == 0 ) {
       
  3039         if ( bSig == 0 ) {
       
  3040             float_raise( float_flag_invalid STATUS_VAR);
       
  3041             return float64_default_nan;
       
  3042         }
       
  3043         normalizeFloat64Subnormal( bSig, &bExp, &bSig );
       
  3044     }
       
  3045     if ( aExp == 0 ) {
       
  3046         if ( aSig == 0 ) return a;
       
  3047         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
       
  3048     }
       
  3049     expDiff = aExp - bExp;
       
  3050     aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
       
  3051     bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
       
  3052     if ( expDiff < 0 ) {
       
  3053         if ( expDiff < -1 ) return a;
       
  3054         aSig >>= 1;
       
  3055     }
       
  3056     q = ( bSig <= aSig );
       
  3057     if ( q ) aSig -= bSig;
       
  3058     expDiff -= 64;
       
  3059     while ( 0 < expDiff ) {
       
  3060         q = estimateDiv128To64( aSig, 0, bSig );
       
  3061         q = ( 2 < q ) ? q - 2 : 0;
       
  3062         aSig = - ( ( bSig>>2 ) * q );
       
  3063         expDiff -= 62;
       
  3064     }
       
  3065     expDiff += 64;
       
  3066     if ( 0 < expDiff ) {
       
  3067         q = estimateDiv128To64( aSig, 0, bSig );
       
  3068         q = ( 2 < q ) ? q - 2 : 0;
       
  3069         q >>= 64 - expDiff;
       
  3070         bSig >>= 2;
       
  3071         aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
       
  3072     }
       
  3073     else {
       
  3074         aSig >>= 2;
       
  3075         bSig >>= 2;
       
  3076     }
       
  3077     do {
       
  3078         alternateASig = aSig;
       
  3079         ++q;
       
  3080         aSig -= bSig;
       
  3081     } while ( 0 <= (sbits64) aSig );
       
  3082     sigMean = aSig + alternateASig;
       
  3083     if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
       
  3084         aSig = alternateASig;
       
  3085     }
       
  3086     zSign = ( (sbits64) aSig < 0 );
       
  3087     if ( zSign ) aSig = - aSig;
       
  3088     return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
       
  3089 
       
  3090 }
       
  3091 
       
  3092 /*----------------------------------------------------------------------------
       
  3093 | Returns the square root of the double-precision floating-point value `a'.
       
  3094 | The operation is performed according to the IEC/IEEE Standard for Binary
       
  3095 | Floating-Point Arithmetic.
       
  3096 *----------------------------------------------------------------------------*/
       
  3097 
       
  3098 float64 float64_sqrt( float64 a STATUS_PARAM )
       
  3099 {
       
  3100     flag aSign;
       
  3101     int16 aExp, zExp;
       
  3102     bits64 aSig, zSig, doubleZSig;
       
  3103     bits64 rem0, rem1, term0, term1;
       
  3104 
       
  3105     aSig = extractFloat64Frac( a );
       
  3106     aExp = extractFloat64Exp( a );
       
  3107     aSign = extractFloat64Sign( a );
       
  3108     if ( aExp == 0x7FF ) {
       
  3109         if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
       
  3110         if ( ! aSign ) return a;
       
  3111         float_raise( float_flag_invalid STATUS_VAR);
       
  3112         return float64_default_nan;
       
  3113     }
       
  3114     if ( aSign ) {
       
  3115         if ( ( aExp | aSig ) == 0 ) return a;
       
  3116         float_raise( float_flag_invalid STATUS_VAR);
       
  3117         return float64_default_nan;
       
  3118     }
       
  3119     if ( aExp == 0 ) {
       
  3120         if ( aSig == 0 ) return float64_zero;
       
  3121         normalizeFloat64Subnormal( aSig, &aExp, &aSig );
       
  3122     }
       
  3123     zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
       
  3124     aSig |= LIT64( 0x0010000000000000 );
       
  3125     zSig = estimateSqrt32( aExp, aSig>>21 );
       
  3126     aSig <<= 9 - ( aExp & 1 );
       
  3127     zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
       
  3128     if ( ( zSig & 0x1FF ) <= 5 ) {
       
  3129         doubleZSig = zSig<<1;
       
  3130         mul64To128( zSig, zSig, &term0, &term1 );
       
  3131         sub128( aSig, 0, term0, term1, &rem0, &rem1 );
       
  3132         while ( (sbits64) rem0 < 0 ) {
       
  3133             --zSig;
       
  3134             doubleZSig -= 2;
       
  3135             add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
       
  3136         }
       
  3137         zSig |= ( ( rem0 | rem1 ) != 0 );
       
  3138     }
       
  3139     return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
       
  3140 
       
  3141 }
       
  3142 
       
  3143 /*----------------------------------------------------------------------------
       
  3144 | Returns 1 if the double-precision floating-point value `a' is equal to the
       
  3145 | corresponding value `b', and 0 otherwise.  The comparison is performed
       
  3146 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3147 *----------------------------------------------------------------------------*/
       
  3148 
       
  3149 int float64_eq( float64 a, float64 b STATUS_PARAM )
       
  3150 {
       
  3151     bits64 av, bv;
       
  3152 
       
  3153     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
       
  3154          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
       
  3155        ) {
       
  3156         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
       
  3157             float_raise( float_flag_invalid STATUS_VAR);
       
  3158         }
       
  3159         return 0;
       
  3160     }
       
  3161     av = float64_val(a);
       
  3162     bv = float64_val(b);
       
  3163     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
       
  3164 
       
  3165 }
       
  3166 
       
  3167 /*----------------------------------------------------------------------------
       
  3168 | Returns 1 if the double-precision floating-point value `a' is less than or
       
  3169 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
       
  3170 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  3171 | Arithmetic.
       
  3172 *----------------------------------------------------------------------------*/
       
  3173 
       
  3174 int float64_le( float64 a, float64 b STATUS_PARAM )
       
  3175 {
       
  3176     flag aSign, bSign;
       
  3177     bits64 av, bv;
       
  3178 
       
  3179     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
       
  3180          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
       
  3181        ) {
       
  3182         float_raise( float_flag_invalid STATUS_VAR);
       
  3183         return 0;
       
  3184     }
       
  3185     aSign = extractFloat64Sign( a );
       
  3186     bSign = extractFloat64Sign( b );
       
  3187     av = float64_val(a);
       
  3188     bv = float64_val(b);
       
  3189     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
       
  3190     return ( av == bv ) || ( aSign ^ ( av < bv ) );
       
  3191 
       
  3192 }
       
  3193 
       
  3194 /*----------------------------------------------------------------------------
       
  3195 | Returns 1 if the double-precision floating-point value `a' is less than
       
  3196 | the corresponding value `b', and 0 otherwise.  The comparison is performed
       
  3197 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3198 *----------------------------------------------------------------------------*/
       
  3199 
       
  3200 int float64_lt( float64 a, float64 b STATUS_PARAM )
       
  3201 {
       
  3202     flag aSign, bSign;
       
  3203     bits64 av, bv;
       
  3204 
       
  3205     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
       
  3206          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
       
  3207        ) {
       
  3208         float_raise( float_flag_invalid STATUS_VAR);
       
  3209         return 0;
       
  3210     }
       
  3211     aSign = extractFloat64Sign( a );
       
  3212     bSign = extractFloat64Sign( b );
       
  3213     av = float64_val(a);
       
  3214     bv = float64_val(b);
       
  3215     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
       
  3216     return ( av != bv ) && ( aSign ^ ( av < bv ) );
       
  3217 
       
  3218 }
       
  3219 
       
  3220 /*----------------------------------------------------------------------------
       
  3221 | Returns 1 if the double-precision floating-point value `a' is equal to the
       
  3222 | corresponding value `b', and 0 otherwise.  The invalid exception is raised
       
  3223 | if either operand is a NaN.  Otherwise, the comparison is performed
       
  3224 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3225 *----------------------------------------------------------------------------*/
       
  3226 
       
  3227 int float64_eq_signaling( float64 a, float64 b STATUS_PARAM )
       
  3228 {
       
  3229     bits64 av, bv;
       
  3230 
       
  3231     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
       
  3232          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
       
  3233        ) {
       
  3234         float_raise( float_flag_invalid STATUS_VAR);
       
  3235         return 0;
       
  3236     }
       
  3237     av = float64_val(a);
       
  3238     bv = float64_val(b);
       
  3239     return ( av == bv ) || ( (bits64) ( ( av | bv )<<1 ) == 0 );
       
  3240 
       
  3241 }
       
  3242 
       
  3243 /*----------------------------------------------------------------------------
       
  3244 | Returns 1 if the double-precision floating-point value `a' is less than or
       
  3245 | equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
       
  3246 | cause an exception.  Otherwise, the comparison is performed according to the
       
  3247 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3248 *----------------------------------------------------------------------------*/
       
  3249 
       
  3250 int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
       
  3251 {
       
  3252     flag aSign, bSign;
       
  3253     bits64 av, bv;
       
  3254 
       
  3255     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
       
  3256          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
       
  3257        ) {
       
  3258         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
       
  3259             float_raise( float_flag_invalid STATUS_VAR);
       
  3260         }
       
  3261         return 0;
       
  3262     }
       
  3263     aSign = extractFloat64Sign( a );
       
  3264     bSign = extractFloat64Sign( b );
       
  3265     av = float64_val(a);
       
  3266     bv = float64_val(b);
       
  3267     if ( aSign != bSign ) return aSign || ( (bits64) ( ( av | bv )<<1 ) == 0 );
       
  3268     return ( av == bv ) || ( aSign ^ ( av < bv ) );
       
  3269 
       
  3270 }
       
  3271 
       
  3272 /*----------------------------------------------------------------------------
       
  3273 | Returns 1 if the double-precision floating-point value `a' is less than
       
  3274 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
       
  3275 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
       
  3276 | Standard for Binary Floating-Point Arithmetic.
       
  3277 *----------------------------------------------------------------------------*/
       
  3278 
       
  3279 int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
       
  3280 {
       
  3281     flag aSign, bSign;
       
  3282     bits64 av, bv;
       
  3283 
       
  3284     if (    ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
       
  3285          || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
       
  3286        ) {
       
  3287         if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
       
  3288             float_raise( float_flag_invalid STATUS_VAR);
       
  3289         }
       
  3290         return 0;
       
  3291     }
       
  3292     aSign = extractFloat64Sign( a );
       
  3293     bSign = extractFloat64Sign( b );
       
  3294     av = float64_val(a);
       
  3295     bv = float64_val(b);
       
  3296     if ( aSign != bSign ) return aSign && ( (bits64) ( ( av | bv )<<1 ) != 0 );
       
  3297     return ( av != bv ) && ( aSign ^ ( av < bv ) );
       
  3298 
       
  3299 }
       
  3300 
       
  3301 #ifdef FLOATX80
       
  3302 
       
  3303 /*----------------------------------------------------------------------------
       
  3304 | Returns the result of converting the extended double-precision floating-
       
  3305 | point value `a' to the 32-bit two's complement integer format.  The
       
  3306 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  3307 | Floating-Point Arithmetic---which means in particular that the conversion
       
  3308 | is rounded according to the current rounding mode.  If `a' is a NaN, the
       
  3309 | largest positive integer is returned.  Otherwise, if the conversion
       
  3310 | overflows, the largest integer with the same sign as `a' is returned.
       
  3311 *----------------------------------------------------------------------------*/
       
  3312 
       
  3313 int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
       
  3314 {
       
  3315     flag aSign;
       
  3316     int32 aExp, shiftCount;
       
  3317     bits64 aSig;
       
  3318 
       
  3319     aSig = extractFloatx80Frac( a );
       
  3320     aExp = extractFloatx80Exp( a );
       
  3321     aSign = extractFloatx80Sign( a );
       
  3322     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
       
  3323     shiftCount = 0x4037 - aExp;
       
  3324     if ( shiftCount <= 0 ) shiftCount = 1;
       
  3325     shift64RightJamming( aSig, shiftCount, &aSig );
       
  3326     return roundAndPackInt32( aSign, aSig STATUS_VAR );
       
  3327 
       
  3328 }
       
  3329 
       
  3330 /*----------------------------------------------------------------------------
       
  3331 | Returns the result of converting the extended double-precision floating-
       
  3332 | point value `a' to the 32-bit two's complement integer format.  The
       
  3333 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  3334 | Floating-Point Arithmetic, except that the conversion is always rounded
       
  3335 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
       
  3336 | Otherwise, if the conversion overflows, the largest integer with the same
       
  3337 | sign as `a' is returned.
       
  3338 *----------------------------------------------------------------------------*/
       
  3339 
       
  3340 int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
       
  3341 {
       
  3342     flag aSign;
       
  3343     int32 aExp, shiftCount;
       
  3344     bits64 aSig, savedASig;
       
  3345     int32 z;
       
  3346 
       
  3347     aSig = extractFloatx80Frac( a );
       
  3348     aExp = extractFloatx80Exp( a );
       
  3349     aSign = extractFloatx80Sign( a );
       
  3350     if ( 0x401E < aExp ) {
       
  3351         if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) aSign = 0;
       
  3352         goto invalid;
       
  3353     }
       
  3354     else if ( aExp < 0x3FFF ) {
       
  3355         if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  3356         return 0;
       
  3357     }
       
  3358     shiftCount = 0x403E - aExp;
       
  3359     savedASig = aSig;
       
  3360     aSig >>= shiftCount;
       
  3361     z = aSig;
       
  3362     if ( aSign ) z = - z;
       
  3363     if ( ( z < 0 ) ^ aSign ) {
       
  3364  invalid:
       
  3365         float_raise( float_flag_invalid STATUS_VAR);
       
  3366         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
       
  3367     }
       
  3368     if ( ( aSig<<shiftCount ) != savedASig ) {
       
  3369         STATUS(float_exception_flags) |= float_flag_inexact;
       
  3370     }
       
  3371     return z;
       
  3372 
       
  3373 }
       
  3374 
       
  3375 /*----------------------------------------------------------------------------
       
  3376 | Returns the result of converting the extended double-precision floating-
       
  3377 | point value `a' to the 64-bit two's complement integer format.  The
       
  3378 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  3379 | Floating-Point Arithmetic---which means in particular that the conversion
       
  3380 | is rounded according to the current rounding mode.  If `a' is a NaN,
       
  3381 | the largest positive integer is returned.  Otherwise, if the conversion
       
  3382 | overflows, the largest integer with the same sign as `a' is returned.
       
  3383 *----------------------------------------------------------------------------*/
       
  3384 
       
  3385 int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
       
  3386 {
       
  3387     flag aSign;
       
  3388     int32 aExp, shiftCount;
       
  3389     bits64 aSig, aSigExtra;
       
  3390 
       
  3391     aSig = extractFloatx80Frac( a );
       
  3392     aExp = extractFloatx80Exp( a );
       
  3393     aSign = extractFloatx80Sign( a );
       
  3394     shiftCount = 0x403E - aExp;
       
  3395     if ( shiftCount <= 0 ) {
       
  3396         if ( shiftCount ) {
       
  3397             float_raise( float_flag_invalid STATUS_VAR);
       
  3398             if (    ! aSign
       
  3399                  || (    ( aExp == 0x7FFF )
       
  3400                       && ( aSig != LIT64( 0x8000000000000000 ) ) )
       
  3401                ) {
       
  3402                 return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  3403             }
       
  3404             return (sbits64) LIT64( 0x8000000000000000 );
       
  3405         }
       
  3406         aSigExtra = 0;
       
  3407     }
       
  3408     else {
       
  3409         shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
       
  3410     }
       
  3411     return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
       
  3412 
       
  3413 }
       
  3414 
       
  3415 /*----------------------------------------------------------------------------
       
  3416 | Returns the result of converting the extended double-precision floating-
       
  3417 | point value `a' to the 64-bit two's complement integer format.  The
       
  3418 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  3419 | Floating-Point Arithmetic, except that the conversion is always rounded
       
  3420 | toward zero.  If `a' is a NaN, the largest positive integer is returned.
       
  3421 | Otherwise, if the conversion overflows, the largest integer with the same
       
  3422 | sign as `a' is returned.
       
  3423 *----------------------------------------------------------------------------*/
       
  3424 
       
  3425 int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
       
  3426 {
       
  3427     flag aSign;
       
  3428     int32 aExp, shiftCount;
       
  3429     bits64 aSig;
       
  3430     int64 z;
       
  3431 
       
  3432     aSig = extractFloatx80Frac( a );
       
  3433     aExp = extractFloatx80Exp( a );
       
  3434     aSign = extractFloatx80Sign( a );
       
  3435     shiftCount = aExp - 0x403E;
       
  3436     if ( 0 <= shiftCount ) {
       
  3437         aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
       
  3438         if ( ( a.high != 0xC03E ) || aSig ) {
       
  3439             float_raise( float_flag_invalid STATUS_VAR);
       
  3440             if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
       
  3441                 return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  3442             }
       
  3443         }
       
  3444         return (sbits64) LIT64( 0x8000000000000000 );
       
  3445     }
       
  3446     else if ( aExp < 0x3FFF ) {
       
  3447         if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  3448         return 0;
       
  3449     }
       
  3450     z = aSig>>( - shiftCount );
       
  3451     if ( (bits64) ( aSig<<( shiftCount & 63 ) ) ) {
       
  3452         STATUS(float_exception_flags) |= float_flag_inexact;
       
  3453     }
       
  3454     if ( aSign ) z = - z;
       
  3455     return z;
       
  3456 
       
  3457 }
       
  3458 
       
  3459 /*----------------------------------------------------------------------------
       
  3460 | Returns the result of converting the extended double-precision floating-
       
  3461 | point value `a' to the single-precision floating-point format.  The
       
  3462 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  3463 | Floating-Point Arithmetic.
       
  3464 *----------------------------------------------------------------------------*/
       
  3465 
       
  3466 float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
       
  3467 {
       
  3468     flag aSign;
       
  3469     int32 aExp;
       
  3470     bits64 aSig;
       
  3471 
       
  3472     aSig = extractFloatx80Frac( a );
       
  3473     aExp = extractFloatx80Exp( a );
       
  3474     aSign = extractFloatx80Sign( a );
       
  3475     if ( aExp == 0x7FFF ) {
       
  3476         if ( (bits64) ( aSig<<1 ) ) {
       
  3477             return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) );
       
  3478         }
       
  3479         return packFloat32( aSign, 0xFF, 0 );
       
  3480     }
       
  3481     shift64RightJamming( aSig, 33, &aSig );
       
  3482     if ( aExp || aSig ) aExp -= 0x3F81;
       
  3483     return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
       
  3484 
       
  3485 }
       
  3486 
       
  3487 /*----------------------------------------------------------------------------
       
  3488 | Returns the result of converting the extended double-precision floating-
       
  3489 | point value `a' to the double-precision floating-point format.  The
       
  3490 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  3491 | Floating-Point Arithmetic.
       
  3492 *----------------------------------------------------------------------------*/
       
  3493 
       
  3494 float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
       
  3495 {
       
  3496     flag aSign;
       
  3497     int32 aExp;
       
  3498     bits64 aSig, zSig;
       
  3499 
       
  3500     aSig = extractFloatx80Frac( a );
       
  3501     aExp = extractFloatx80Exp( a );
       
  3502     aSign = extractFloatx80Sign( a );
       
  3503     if ( aExp == 0x7FFF ) {
       
  3504         if ( (bits64) ( aSig<<1 ) ) {
       
  3505             return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) );
       
  3506         }
       
  3507         return packFloat64( aSign, 0x7FF, 0 );
       
  3508     }
       
  3509     shift64RightJamming( aSig, 1, &zSig );
       
  3510     if ( aExp || aSig ) aExp -= 0x3C01;
       
  3511     return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
       
  3512 
       
  3513 }
       
  3514 
       
  3515 #ifdef FLOAT128
       
  3516 
       
  3517 /*----------------------------------------------------------------------------
       
  3518 | Returns the result of converting the extended double-precision floating-
       
  3519 | point value `a' to the quadruple-precision floating-point format.  The
       
  3520 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  3521 | Floating-Point Arithmetic.
       
  3522 *----------------------------------------------------------------------------*/
       
  3523 
       
  3524 float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
       
  3525 {
       
  3526     flag aSign;
       
  3527     int16 aExp;
       
  3528     bits64 aSig, zSig0, zSig1;
       
  3529 
       
  3530     aSig = extractFloatx80Frac( a );
       
  3531     aExp = extractFloatx80Exp( a );
       
  3532     aSign = extractFloatx80Sign( a );
       
  3533     if ( ( aExp == 0x7FFF ) && (bits64) ( aSig<<1 ) ) {
       
  3534         return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) );
       
  3535     }
       
  3536     shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
       
  3537     return packFloat128( aSign, aExp, zSig0, zSig1 );
       
  3538 
       
  3539 }
       
  3540 
       
  3541 #endif
       
  3542 
       
  3543 /*----------------------------------------------------------------------------
       
  3544 | Rounds the extended double-precision floating-point value `a' to an integer,
       
  3545 | and returns the result as an extended quadruple-precision floating-point
       
  3546 | value.  The operation is performed according to the IEC/IEEE Standard for
       
  3547 | Binary Floating-Point Arithmetic.
       
  3548 *----------------------------------------------------------------------------*/
       
  3549 
       
  3550 floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
       
  3551 {
       
  3552     flag aSign;
       
  3553     int32 aExp;
       
  3554     bits64 lastBitMask, roundBitsMask;
       
  3555     int8 roundingMode;
       
  3556     floatx80 z;
       
  3557 
       
  3558     aExp = extractFloatx80Exp( a );
       
  3559     if ( 0x403E <= aExp ) {
       
  3560         if ( ( aExp == 0x7FFF ) && (bits64) ( extractFloatx80Frac( a )<<1 ) ) {
       
  3561             return propagateFloatx80NaN( a, a STATUS_VAR );
       
  3562         }
       
  3563         return a;
       
  3564     }
       
  3565     if ( aExp < 0x3FFF ) {
       
  3566         if (    ( aExp == 0 )
       
  3567              && ( (bits64) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
       
  3568             return a;
       
  3569         }
       
  3570         STATUS(float_exception_flags) |= float_flag_inexact;
       
  3571         aSign = extractFloatx80Sign( a );
       
  3572         switch ( STATUS(float_rounding_mode) ) {
       
  3573          case float_round_nearest_even:
       
  3574             if ( ( aExp == 0x3FFE ) && (bits64) ( extractFloatx80Frac( a )<<1 )
       
  3575                ) {
       
  3576                 return
       
  3577                     packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
       
  3578             }
       
  3579             break;
       
  3580          case float_round_down:
       
  3581             return
       
  3582                   aSign ?
       
  3583                       packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
       
  3584                 : packFloatx80( 0, 0, 0 );
       
  3585          case float_round_up:
       
  3586             return
       
  3587                   aSign ? packFloatx80( 1, 0, 0 )
       
  3588                 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
       
  3589         }
       
  3590         return packFloatx80( aSign, 0, 0 );
       
  3591     }
       
  3592     lastBitMask = 1;
       
  3593     lastBitMask <<= 0x403E - aExp;
       
  3594     roundBitsMask = lastBitMask - 1;
       
  3595     z = a;
       
  3596     roundingMode = STATUS(float_rounding_mode);
       
  3597     if ( roundingMode == float_round_nearest_even ) {
       
  3598         z.low += lastBitMask>>1;
       
  3599         if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
       
  3600     }
       
  3601     else if ( roundingMode != float_round_to_zero ) {
       
  3602         if ( extractFloatx80Sign( z ) ^ ( roundingMode == float_round_up ) ) {
       
  3603             z.low += roundBitsMask;
       
  3604         }
       
  3605     }
       
  3606     z.low &= ~ roundBitsMask;
       
  3607     if ( z.low == 0 ) {
       
  3608         ++z.high;
       
  3609         z.low = LIT64( 0x8000000000000000 );
       
  3610     }
       
  3611     if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  3612     return z;
       
  3613 
       
  3614 }
       
  3615 
       
  3616 /*----------------------------------------------------------------------------
       
  3617 | Returns the result of adding the absolute values of the extended double-
       
  3618 | precision floating-point values `a' and `b'.  If `zSign' is 1, the sum is
       
  3619 | negated before being returned.  `zSign' is ignored if the result is a NaN.
       
  3620 | The addition is performed according to the IEC/IEEE Standard for Binary
       
  3621 | Floating-Point Arithmetic.
       
  3622 *----------------------------------------------------------------------------*/
       
  3623 
       
  3624 static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
       
  3625 {
       
  3626     int32 aExp, bExp, zExp;
       
  3627     bits64 aSig, bSig, zSig0, zSig1;
       
  3628     int32 expDiff;
       
  3629 
       
  3630     aSig = extractFloatx80Frac( a );
       
  3631     aExp = extractFloatx80Exp( a );
       
  3632     bSig = extractFloatx80Frac( b );
       
  3633     bExp = extractFloatx80Exp( b );
       
  3634     expDiff = aExp - bExp;
       
  3635     if ( 0 < expDiff ) {
       
  3636         if ( aExp == 0x7FFF ) {
       
  3637             if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3638             return a;
       
  3639         }
       
  3640         if ( bExp == 0 ) --expDiff;
       
  3641         shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
       
  3642         zExp = aExp;
       
  3643     }
       
  3644     else if ( expDiff < 0 ) {
       
  3645         if ( bExp == 0x7FFF ) {
       
  3646             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3647             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  3648         }
       
  3649         if ( aExp == 0 ) ++expDiff;
       
  3650         shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
       
  3651         zExp = bExp;
       
  3652     }
       
  3653     else {
       
  3654         if ( aExp == 0x7FFF ) {
       
  3655             if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
       
  3656                 return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3657             }
       
  3658             return a;
       
  3659         }
       
  3660         zSig1 = 0;
       
  3661         zSig0 = aSig + bSig;
       
  3662         if ( aExp == 0 ) {
       
  3663             normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
       
  3664             goto roundAndPack;
       
  3665         }
       
  3666         zExp = aExp;
       
  3667         goto shiftRight1;
       
  3668     }
       
  3669     zSig0 = aSig + bSig;
       
  3670     if ( (sbits64) zSig0 < 0 ) goto roundAndPack;
       
  3671  shiftRight1:
       
  3672     shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
       
  3673     zSig0 |= LIT64( 0x8000000000000000 );
       
  3674     ++zExp;
       
  3675  roundAndPack:
       
  3676     return
       
  3677         roundAndPackFloatx80(
       
  3678             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
       
  3679 
       
  3680 }
       
  3681 
       
  3682 /*----------------------------------------------------------------------------
       
  3683 | Returns the result of subtracting the absolute values of the extended
       
  3684 | double-precision floating-point values `a' and `b'.  If `zSign' is 1, the
       
  3685 | difference is negated before being returned.  `zSign' is ignored if the
       
  3686 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
       
  3687 | Standard for Binary Floating-Point Arithmetic.
       
  3688 *----------------------------------------------------------------------------*/
       
  3689 
       
  3690 static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
       
  3691 {
       
  3692     int32 aExp, bExp, zExp;
       
  3693     bits64 aSig, bSig, zSig0, zSig1;
       
  3694     int32 expDiff;
       
  3695     floatx80 z;
       
  3696 
       
  3697     aSig = extractFloatx80Frac( a );
       
  3698     aExp = extractFloatx80Exp( a );
       
  3699     bSig = extractFloatx80Frac( b );
       
  3700     bExp = extractFloatx80Exp( b );
       
  3701     expDiff = aExp - bExp;
       
  3702     if ( 0 < expDiff ) goto aExpBigger;
       
  3703     if ( expDiff < 0 ) goto bExpBigger;
       
  3704     if ( aExp == 0x7FFF ) {
       
  3705         if ( (bits64) ( ( aSig | bSig )<<1 ) ) {
       
  3706             return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3707         }
       
  3708         float_raise( float_flag_invalid STATUS_VAR);
       
  3709         z.low = floatx80_default_nan_low;
       
  3710         z.high = floatx80_default_nan_high;
       
  3711         return z;
       
  3712     }
       
  3713     if ( aExp == 0 ) {
       
  3714         aExp = 1;
       
  3715         bExp = 1;
       
  3716     }
       
  3717     zSig1 = 0;
       
  3718     if ( bSig < aSig ) goto aBigger;
       
  3719     if ( aSig < bSig ) goto bBigger;
       
  3720     return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
       
  3721  bExpBigger:
       
  3722     if ( bExp == 0x7FFF ) {
       
  3723         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3724         return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  3725     }
       
  3726     if ( aExp == 0 ) ++expDiff;
       
  3727     shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
       
  3728  bBigger:
       
  3729     sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
       
  3730     zExp = bExp;
       
  3731     zSign ^= 1;
       
  3732     goto normalizeRoundAndPack;
       
  3733  aExpBigger:
       
  3734     if ( aExp == 0x7FFF ) {
       
  3735         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3736         return a;
       
  3737     }
       
  3738     if ( bExp == 0 ) --expDiff;
       
  3739     shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
       
  3740  aBigger:
       
  3741     sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
       
  3742     zExp = aExp;
       
  3743  normalizeRoundAndPack:
       
  3744     return
       
  3745         normalizeRoundAndPackFloatx80(
       
  3746             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
       
  3747 
       
  3748 }
       
  3749 
       
  3750 /*----------------------------------------------------------------------------
       
  3751 | Returns the result of adding the extended double-precision floating-point
       
  3752 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
       
  3753 | Standard for Binary Floating-Point Arithmetic.
       
  3754 *----------------------------------------------------------------------------*/
       
  3755 
       
  3756 floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
       
  3757 {
       
  3758     flag aSign, bSign;
       
  3759 
       
  3760     aSign = extractFloatx80Sign( a );
       
  3761     bSign = extractFloatx80Sign( b );
       
  3762     if ( aSign == bSign ) {
       
  3763         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
       
  3764     }
       
  3765     else {
       
  3766         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
       
  3767     }
       
  3768 
       
  3769 }
       
  3770 
       
  3771 /*----------------------------------------------------------------------------
       
  3772 | Returns the result of subtracting the extended double-precision floating-
       
  3773 | point values `a' and `b'.  The operation is performed according to the
       
  3774 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3775 *----------------------------------------------------------------------------*/
       
  3776 
       
  3777 floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
       
  3778 {
       
  3779     flag aSign, bSign;
       
  3780 
       
  3781     aSign = extractFloatx80Sign( a );
       
  3782     bSign = extractFloatx80Sign( b );
       
  3783     if ( aSign == bSign ) {
       
  3784         return subFloatx80Sigs( a, b, aSign STATUS_VAR );
       
  3785     }
       
  3786     else {
       
  3787         return addFloatx80Sigs( a, b, aSign STATUS_VAR );
       
  3788     }
       
  3789 
       
  3790 }
       
  3791 
       
  3792 /*----------------------------------------------------------------------------
       
  3793 | Returns the result of multiplying the extended double-precision floating-
       
  3794 | point values `a' and `b'.  The operation is performed according to the
       
  3795 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3796 *----------------------------------------------------------------------------*/
       
  3797 
       
  3798 floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
       
  3799 {
       
  3800     flag aSign, bSign, zSign;
       
  3801     int32 aExp, bExp, zExp;
       
  3802     bits64 aSig, bSig, zSig0, zSig1;
       
  3803     floatx80 z;
       
  3804 
       
  3805     aSig = extractFloatx80Frac( a );
       
  3806     aExp = extractFloatx80Exp( a );
       
  3807     aSign = extractFloatx80Sign( a );
       
  3808     bSig = extractFloatx80Frac( b );
       
  3809     bExp = extractFloatx80Exp( b );
       
  3810     bSign = extractFloatx80Sign( b );
       
  3811     zSign = aSign ^ bSign;
       
  3812     if ( aExp == 0x7FFF ) {
       
  3813         if (    (bits64) ( aSig<<1 )
       
  3814              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
       
  3815             return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3816         }
       
  3817         if ( ( bExp | bSig ) == 0 ) goto invalid;
       
  3818         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  3819     }
       
  3820     if ( bExp == 0x7FFF ) {
       
  3821         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3822         if ( ( aExp | aSig ) == 0 ) {
       
  3823  invalid:
       
  3824             float_raise( float_flag_invalid STATUS_VAR);
       
  3825             z.low = floatx80_default_nan_low;
       
  3826             z.high = floatx80_default_nan_high;
       
  3827             return z;
       
  3828         }
       
  3829         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  3830     }
       
  3831     if ( aExp == 0 ) {
       
  3832         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
       
  3833         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
       
  3834     }
       
  3835     if ( bExp == 0 ) {
       
  3836         if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
       
  3837         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
       
  3838     }
       
  3839     zExp = aExp + bExp - 0x3FFE;
       
  3840     mul64To128( aSig, bSig, &zSig0, &zSig1 );
       
  3841     if ( 0 < (sbits64) zSig0 ) {
       
  3842         shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
       
  3843         --zExp;
       
  3844     }
       
  3845     return
       
  3846         roundAndPackFloatx80(
       
  3847             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
       
  3848 
       
  3849 }
       
  3850 
       
  3851 /*----------------------------------------------------------------------------
       
  3852 | Returns the result of dividing the extended double-precision floating-point
       
  3853 | value `a' by the corresponding value `b'.  The operation is performed
       
  3854 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3855 *----------------------------------------------------------------------------*/
       
  3856 
       
  3857 floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
       
  3858 {
       
  3859     flag aSign, bSign, zSign;
       
  3860     int32 aExp, bExp, zExp;
       
  3861     bits64 aSig, bSig, zSig0, zSig1;
       
  3862     bits64 rem0, rem1, rem2, term0, term1, term2;
       
  3863     floatx80 z;
       
  3864 
       
  3865     aSig = extractFloatx80Frac( a );
       
  3866     aExp = extractFloatx80Exp( a );
       
  3867     aSign = extractFloatx80Sign( a );
       
  3868     bSig = extractFloatx80Frac( b );
       
  3869     bExp = extractFloatx80Exp( b );
       
  3870     bSign = extractFloatx80Sign( b );
       
  3871     zSign = aSign ^ bSign;
       
  3872     if ( aExp == 0x7FFF ) {
       
  3873         if ( (bits64) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3874         if ( bExp == 0x7FFF ) {
       
  3875             if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3876             goto invalid;
       
  3877         }
       
  3878         return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  3879     }
       
  3880     if ( bExp == 0x7FFF ) {
       
  3881         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3882         return packFloatx80( zSign, 0, 0 );
       
  3883     }
       
  3884     if ( bExp == 0 ) {
       
  3885         if ( bSig == 0 ) {
       
  3886             if ( ( aExp | aSig ) == 0 ) {
       
  3887  invalid:
       
  3888                 float_raise( float_flag_invalid STATUS_VAR);
       
  3889                 z.low = floatx80_default_nan_low;
       
  3890                 z.high = floatx80_default_nan_high;
       
  3891                 return z;
       
  3892             }
       
  3893             float_raise( float_flag_divbyzero STATUS_VAR);
       
  3894             return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  3895         }
       
  3896         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
       
  3897     }
       
  3898     if ( aExp == 0 ) {
       
  3899         if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
       
  3900         normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
       
  3901     }
       
  3902     zExp = aExp - bExp + 0x3FFE;
       
  3903     rem1 = 0;
       
  3904     if ( bSig <= aSig ) {
       
  3905         shift128Right( aSig, 0, 1, &aSig, &rem1 );
       
  3906         ++zExp;
       
  3907     }
       
  3908     zSig0 = estimateDiv128To64( aSig, rem1, bSig );
       
  3909     mul64To128( bSig, zSig0, &term0, &term1 );
       
  3910     sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
       
  3911     while ( (sbits64) rem0 < 0 ) {
       
  3912         --zSig0;
       
  3913         add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
       
  3914     }
       
  3915     zSig1 = estimateDiv128To64( rem1, 0, bSig );
       
  3916     if ( (bits64) ( zSig1<<1 ) <= 8 ) {
       
  3917         mul64To128( bSig, zSig1, &term1, &term2 );
       
  3918         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
       
  3919         while ( (sbits64) rem1 < 0 ) {
       
  3920             --zSig1;
       
  3921             add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
       
  3922         }
       
  3923         zSig1 |= ( ( rem1 | rem2 ) != 0 );
       
  3924     }
       
  3925     return
       
  3926         roundAndPackFloatx80(
       
  3927             STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
       
  3928 
       
  3929 }
       
  3930 
       
  3931 /*----------------------------------------------------------------------------
       
  3932 | Returns the remainder of the extended double-precision floating-point value
       
  3933 | `a' with respect to the corresponding value `b'.  The operation is performed
       
  3934 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  3935 *----------------------------------------------------------------------------*/
       
  3936 
       
  3937 floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
       
  3938 {
       
  3939     flag aSign, bSign, zSign;
       
  3940     int32 aExp, bExp, expDiff;
       
  3941     bits64 aSig0, aSig1, bSig;
       
  3942     bits64 q, term0, term1, alternateASig0, alternateASig1;
       
  3943     floatx80 z;
       
  3944 
       
  3945     aSig0 = extractFloatx80Frac( a );
       
  3946     aExp = extractFloatx80Exp( a );
       
  3947     aSign = extractFloatx80Sign( a );
       
  3948     bSig = extractFloatx80Frac( b );
       
  3949     bExp = extractFloatx80Exp( b );
       
  3950     bSign = extractFloatx80Sign( b );
       
  3951     if ( aExp == 0x7FFF ) {
       
  3952         if (    (bits64) ( aSig0<<1 )
       
  3953              || ( ( bExp == 0x7FFF ) && (bits64) ( bSig<<1 ) ) ) {
       
  3954             return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3955         }
       
  3956         goto invalid;
       
  3957     }
       
  3958     if ( bExp == 0x7FFF ) {
       
  3959         if ( (bits64) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
       
  3960         return a;
       
  3961     }
       
  3962     if ( bExp == 0 ) {
       
  3963         if ( bSig == 0 ) {
       
  3964  invalid:
       
  3965             float_raise( float_flag_invalid STATUS_VAR);
       
  3966             z.low = floatx80_default_nan_low;
       
  3967             z.high = floatx80_default_nan_high;
       
  3968             return z;
       
  3969         }
       
  3970         normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
       
  3971     }
       
  3972     if ( aExp == 0 ) {
       
  3973         if ( (bits64) ( aSig0<<1 ) == 0 ) return a;
       
  3974         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
       
  3975     }
       
  3976     bSig |= LIT64( 0x8000000000000000 );
       
  3977     zSign = aSign;
       
  3978     expDiff = aExp - bExp;
       
  3979     aSig1 = 0;
       
  3980     if ( expDiff < 0 ) {
       
  3981         if ( expDiff < -1 ) return a;
       
  3982         shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
       
  3983         expDiff = 0;
       
  3984     }
       
  3985     q = ( bSig <= aSig0 );
       
  3986     if ( q ) aSig0 -= bSig;
       
  3987     expDiff -= 64;
       
  3988     while ( 0 < expDiff ) {
       
  3989         q = estimateDiv128To64( aSig0, aSig1, bSig );
       
  3990         q = ( 2 < q ) ? q - 2 : 0;
       
  3991         mul64To128( bSig, q, &term0, &term1 );
       
  3992         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
       
  3993         shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
       
  3994         expDiff -= 62;
       
  3995     }
       
  3996     expDiff += 64;
       
  3997     if ( 0 < expDiff ) {
       
  3998         q = estimateDiv128To64( aSig0, aSig1, bSig );
       
  3999         q = ( 2 < q ) ? q - 2 : 0;
       
  4000         q >>= 64 - expDiff;
       
  4001         mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
       
  4002         sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
       
  4003         shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
       
  4004         while ( le128( term0, term1, aSig0, aSig1 ) ) {
       
  4005             ++q;
       
  4006             sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
       
  4007         }
       
  4008     }
       
  4009     else {
       
  4010         term1 = 0;
       
  4011         term0 = bSig;
       
  4012     }
       
  4013     sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
       
  4014     if (    lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
       
  4015          || (    eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
       
  4016               && ( q & 1 ) )
       
  4017        ) {
       
  4018         aSig0 = alternateASig0;
       
  4019         aSig1 = alternateASig1;
       
  4020         zSign = ! zSign;
       
  4021     }
       
  4022     return
       
  4023         normalizeRoundAndPackFloatx80(
       
  4024             80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
       
  4025 
       
  4026 }
       
  4027 
       
  4028 /*----------------------------------------------------------------------------
       
  4029 | Returns the square root of the extended double-precision floating-point
       
  4030 | value `a'.  The operation is performed according to the IEC/IEEE Standard
       
  4031 | for Binary Floating-Point Arithmetic.
       
  4032 *----------------------------------------------------------------------------*/
       
  4033 
       
  4034 floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
       
  4035 {
       
  4036     flag aSign;
       
  4037     int32 aExp, zExp;
       
  4038     bits64 aSig0, aSig1, zSig0, zSig1, doubleZSig0;
       
  4039     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
       
  4040     floatx80 z;
       
  4041 
       
  4042     aSig0 = extractFloatx80Frac( a );
       
  4043     aExp = extractFloatx80Exp( a );
       
  4044     aSign = extractFloatx80Sign( a );
       
  4045     if ( aExp == 0x7FFF ) {
       
  4046         if ( (bits64) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
       
  4047         if ( ! aSign ) return a;
       
  4048         goto invalid;
       
  4049     }
       
  4050     if ( aSign ) {
       
  4051         if ( ( aExp | aSig0 ) == 0 ) return a;
       
  4052  invalid:
       
  4053         float_raise( float_flag_invalid STATUS_VAR);
       
  4054         z.low = floatx80_default_nan_low;
       
  4055         z.high = floatx80_default_nan_high;
       
  4056         return z;
       
  4057     }
       
  4058     if ( aExp == 0 ) {
       
  4059         if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
       
  4060         normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
       
  4061     }
       
  4062     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
       
  4063     zSig0 = estimateSqrt32( aExp, aSig0>>32 );
       
  4064     shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
       
  4065     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
       
  4066     doubleZSig0 = zSig0<<1;
       
  4067     mul64To128( zSig0, zSig0, &term0, &term1 );
       
  4068     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
       
  4069     while ( (sbits64) rem0 < 0 ) {
       
  4070         --zSig0;
       
  4071         doubleZSig0 -= 2;
       
  4072         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
       
  4073     }
       
  4074     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
       
  4075     if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
       
  4076         if ( zSig1 == 0 ) zSig1 = 1;
       
  4077         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
       
  4078         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
       
  4079         mul64To128( zSig1, zSig1, &term2, &term3 );
       
  4080         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
       
  4081         while ( (sbits64) rem1 < 0 ) {
       
  4082             --zSig1;
       
  4083             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
       
  4084             term3 |= 1;
       
  4085             term2 |= doubleZSig0;
       
  4086             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
       
  4087         }
       
  4088         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
       
  4089     }
       
  4090     shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
       
  4091     zSig0 |= doubleZSig0;
       
  4092     return
       
  4093         roundAndPackFloatx80(
       
  4094             STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
       
  4095 
       
  4096 }
       
  4097 
       
  4098 /*----------------------------------------------------------------------------
       
  4099 | Returns 1 if the extended double-precision floating-point value `a' is
       
  4100 | equal to the corresponding value `b', and 0 otherwise.  The comparison is
       
  4101 | performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4102 | Arithmetic.
       
  4103 *----------------------------------------------------------------------------*/
       
  4104 
       
  4105 int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
       
  4106 {
       
  4107 
       
  4108     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
       
  4109               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
       
  4110          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
       
  4111               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
       
  4112        ) {
       
  4113         if (    floatx80_is_signaling_nan( a )
       
  4114              || floatx80_is_signaling_nan( b ) ) {
       
  4115             float_raise( float_flag_invalid STATUS_VAR);
       
  4116         }
       
  4117         return 0;
       
  4118     }
       
  4119     return
       
  4120            ( a.low == b.low )
       
  4121         && (    ( a.high == b.high )
       
  4122              || (    ( a.low == 0 )
       
  4123                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
       
  4124            );
       
  4125 
       
  4126 }
       
  4127 
       
  4128 /*----------------------------------------------------------------------------
       
  4129 | Returns 1 if the extended double-precision floating-point value `a' is
       
  4130 | less than or equal to the corresponding value `b', and 0 otherwise.  The
       
  4131 | comparison is performed according to the IEC/IEEE Standard for Binary
       
  4132 | Floating-Point Arithmetic.
       
  4133 *----------------------------------------------------------------------------*/
       
  4134 
       
  4135 int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
       
  4136 {
       
  4137     flag aSign, bSign;
       
  4138 
       
  4139     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
       
  4140               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
       
  4141          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
       
  4142               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
       
  4143        ) {
       
  4144         float_raise( float_flag_invalid STATUS_VAR);
       
  4145         return 0;
       
  4146     }
       
  4147     aSign = extractFloatx80Sign( a );
       
  4148     bSign = extractFloatx80Sign( b );
       
  4149     if ( aSign != bSign ) {
       
  4150         return
       
  4151                aSign
       
  4152             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  4153                  == 0 );
       
  4154     }
       
  4155     return
       
  4156           aSign ? le128( b.high, b.low, a.high, a.low )
       
  4157         : le128( a.high, a.low, b.high, b.low );
       
  4158 
       
  4159 }
       
  4160 
       
  4161 /*----------------------------------------------------------------------------
       
  4162 | Returns 1 if the extended double-precision floating-point value `a' is
       
  4163 | less than the corresponding value `b', and 0 otherwise.  The comparison
       
  4164 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4165 | Arithmetic.
       
  4166 *----------------------------------------------------------------------------*/
       
  4167 
       
  4168 int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
       
  4169 {
       
  4170     flag aSign, bSign;
       
  4171 
       
  4172     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
       
  4173               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
       
  4174          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
       
  4175               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
       
  4176        ) {
       
  4177         float_raise( float_flag_invalid STATUS_VAR);
       
  4178         return 0;
       
  4179     }
       
  4180     aSign = extractFloatx80Sign( a );
       
  4181     bSign = extractFloatx80Sign( b );
       
  4182     if ( aSign != bSign ) {
       
  4183         return
       
  4184                aSign
       
  4185             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  4186                  != 0 );
       
  4187     }
       
  4188     return
       
  4189           aSign ? lt128( b.high, b.low, a.high, a.low )
       
  4190         : lt128( a.high, a.low, b.high, b.low );
       
  4191 
       
  4192 }
       
  4193 
       
  4194 /*----------------------------------------------------------------------------
       
  4195 | Returns 1 if the extended double-precision floating-point value `a' is equal
       
  4196 | to the corresponding value `b', and 0 otherwise.  The invalid exception is
       
  4197 | raised if either operand is a NaN.  Otherwise, the comparison is performed
       
  4198 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  4199 *----------------------------------------------------------------------------*/
       
  4200 
       
  4201 int floatx80_eq_signaling( floatx80 a, floatx80 b STATUS_PARAM )
       
  4202 {
       
  4203 
       
  4204     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
       
  4205               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
       
  4206          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
       
  4207               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
       
  4208        ) {
       
  4209         float_raise( float_flag_invalid STATUS_VAR);
       
  4210         return 0;
       
  4211     }
       
  4212     return
       
  4213            ( a.low == b.low )
       
  4214         && (    ( a.high == b.high )
       
  4215              || (    ( a.low == 0 )
       
  4216                   && ( (bits16) ( ( a.high | b.high )<<1 ) == 0 ) )
       
  4217            );
       
  4218 
       
  4219 }
       
  4220 
       
  4221 /*----------------------------------------------------------------------------
       
  4222 | Returns 1 if the extended double-precision floating-point value `a' is less
       
  4223 | than or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs
       
  4224 | do not cause an exception.  Otherwise, the comparison is performed according
       
  4225 | to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  4226 *----------------------------------------------------------------------------*/
       
  4227 
       
  4228 int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
       
  4229 {
       
  4230     flag aSign, bSign;
       
  4231 
       
  4232     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
       
  4233               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
       
  4234          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
       
  4235               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
       
  4236        ) {
       
  4237         if (    floatx80_is_signaling_nan( a )
       
  4238              || floatx80_is_signaling_nan( b ) ) {
       
  4239             float_raise( float_flag_invalid STATUS_VAR);
       
  4240         }
       
  4241         return 0;
       
  4242     }
       
  4243     aSign = extractFloatx80Sign( a );
       
  4244     bSign = extractFloatx80Sign( b );
       
  4245     if ( aSign != bSign ) {
       
  4246         return
       
  4247                aSign
       
  4248             || (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  4249                  == 0 );
       
  4250     }
       
  4251     return
       
  4252           aSign ? le128( b.high, b.low, a.high, a.low )
       
  4253         : le128( a.high, a.low, b.high, b.low );
       
  4254 
       
  4255 }
       
  4256 
       
  4257 /*----------------------------------------------------------------------------
       
  4258 | Returns 1 if the extended double-precision floating-point value `a' is less
       
  4259 | than the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause
       
  4260 | an exception.  Otherwise, the comparison is performed according to the
       
  4261 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  4262 *----------------------------------------------------------------------------*/
       
  4263 
       
  4264 int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
       
  4265 {
       
  4266     flag aSign, bSign;
       
  4267 
       
  4268     if (    (    ( extractFloatx80Exp( a ) == 0x7FFF )
       
  4269               && (bits64) ( extractFloatx80Frac( a )<<1 ) )
       
  4270          || (    ( extractFloatx80Exp( b ) == 0x7FFF )
       
  4271               && (bits64) ( extractFloatx80Frac( b )<<1 ) )
       
  4272        ) {
       
  4273         if (    floatx80_is_signaling_nan( a )
       
  4274              || floatx80_is_signaling_nan( b ) ) {
       
  4275             float_raise( float_flag_invalid STATUS_VAR);
       
  4276         }
       
  4277         return 0;
       
  4278     }
       
  4279     aSign = extractFloatx80Sign( a );
       
  4280     bSign = extractFloatx80Sign( b );
       
  4281     if ( aSign != bSign ) {
       
  4282         return
       
  4283                aSign
       
  4284             && (    ( ( (bits16) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  4285                  != 0 );
       
  4286     }
       
  4287     return
       
  4288           aSign ? lt128( b.high, b.low, a.high, a.low )
       
  4289         : lt128( a.high, a.low, b.high, b.low );
       
  4290 
       
  4291 }
       
  4292 
       
  4293 #endif
       
  4294 
       
  4295 #ifdef FLOAT128
       
  4296 
       
  4297 /*----------------------------------------------------------------------------
       
  4298 | Returns the result of converting the quadruple-precision floating-point
       
  4299 | value `a' to the 32-bit two's complement integer format.  The conversion
       
  4300 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4301 | Arithmetic---which means in particular that the conversion is rounded
       
  4302 | according to the current rounding mode.  If `a' is a NaN, the largest
       
  4303 | positive integer is returned.  Otherwise, if the conversion overflows, the
       
  4304 | largest integer with the same sign as `a' is returned.
       
  4305 *----------------------------------------------------------------------------*/
       
  4306 
       
  4307 int32 float128_to_int32( float128 a STATUS_PARAM )
       
  4308 {
       
  4309     flag aSign;
       
  4310     int32 aExp, shiftCount;
       
  4311     bits64 aSig0, aSig1;
       
  4312 
       
  4313     aSig1 = extractFloat128Frac1( a );
       
  4314     aSig0 = extractFloat128Frac0( a );
       
  4315     aExp = extractFloat128Exp( a );
       
  4316     aSign = extractFloat128Sign( a );
       
  4317     if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
       
  4318     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
       
  4319     aSig0 |= ( aSig1 != 0 );
       
  4320     shiftCount = 0x4028 - aExp;
       
  4321     if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
       
  4322     return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
       
  4323 
       
  4324 }
       
  4325 
       
  4326 /*----------------------------------------------------------------------------
       
  4327 | Returns the result of converting the quadruple-precision floating-point
       
  4328 | value `a' to the 32-bit two's complement integer format.  The conversion
       
  4329 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4330 | Arithmetic, except that the conversion is always rounded toward zero.  If
       
  4331 | `a' is a NaN, the largest positive integer is returned.  Otherwise, if the
       
  4332 | conversion overflows, the largest integer with the same sign as `a' is
       
  4333 | returned.
       
  4334 *----------------------------------------------------------------------------*/
       
  4335 
       
  4336 int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
       
  4337 {
       
  4338     flag aSign;
       
  4339     int32 aExp, shiftCount;
       
  4340     bits64 aSig0, aSig1, savedASig;
       
  4341     int32 z;
       
  4342 
       
  4343     aSig1 = extractFloat128Frac1( a );
       
  4344     aSig0 = extractFloat128Frac0( a );
       
  4345     aExp = extractFloat128Exp( a );
       
  4346     aSign = extractFloat128Sign( a );
       
  4347     aSig0 |= ( aSig1 != 0 );
       
  4348     if ( 0x401E < aExp ) {
       
  4349         if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
       
  4350         goto invalid;
       
  4351     }
       
  4352     else if ( aExp < 0x3FFF ) {
       
  4353         if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  4354         return 0;
       
  4355     }
       
  4356     aSig0 |= LIT64( 0x0001000000000000 );
       
  4357     shiftCount = 0x402F - aExp;
       
  4358     savedASig = aSig0;
       
  4359     aSig0 >>= shiftCount;
       
  4360     z = aSig0;
       
  4361     if ( aSign ) z = - z;
       
  4362     if ( ( z < 0 ) ^ aSign ) {
       
  4363  invalid:
       
  4364         float_raise( float_flag_invalid STATUS_VAR);
       
  4365         return aSign ? (sbits32) 0x80000000 : 0x7FFFFFFF;
       
  4366     }
       
  4367     if ( ( aSig0<<shiftCount ) != savedASig ) {
       
  4368         STATUS(float_exception_flags) |= float_flag_inexact;
       
  4369     }
       
  4370     return z;
       
  4371 
       
  4372 }
       
  4373 
       
  4374 /*----------------------------------------------------------------------------
       
  4375 | Returns the result of converting the quadruple-precision floating-point
       
  4376 | value `a' to the 64-bit two's complement integer format.  The conversion
       
  4377 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4378 | Arithmetic---which means in particular that the conversion is rounded
       
  4379 | according to the current rounding mode.  If `a' is a NaN, the largest
       
  4380 | positive integer is returned.  Otherwise, if the conversion overflows, the
       
  4381 | largest integer with the same sign as `a' is returned.
       
  4382 *----------------------------------------------------------------------------*/
       
  4383 
       
  4384 int64 float128_to_int64( float128 a STATUS_PARAM )
       
  4385 {
       
  4386     flag aSign;
       
  4387     int32 aExp, shiftCount;
       
  4388     bits64 aSig0, aSig1;
       
  4389 
       
  4390     aSig1 = extractFloat128Frac1( a );
       
  4391     aSig0 = extractFloat128Frac0( a );
       
  4392     aExp = extractFloat128Exp( a );
       
  4393     aSign = extractFloat128Sign( a );
       
  4394     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
       
  4395     shiftCount = 0x402F - aExp;
       
  4396     if ( shiftCount <= 0 ) {
       
  4397         if ( 0x403E < aExp ) {
       
  4398             float_raise( float_flag_invalid STATUS_VAR);
       
  4399             if (    ! aSign
       
  4400                  || (    ( aExp == 0x7FFF )
       
  4401                       && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
       
  4402                     )
       
  4403                ) {
       
  4404                 return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  4405             }
       
  4406             return (sbits64) LIT64( 0x8000000000000000 );
       
  4407         }
       
  4408         shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
       
  4409     }
       
  4410     else {
       
  4411         shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
       
  4412     }
       
  4413     return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
       
  4414 
       
  4415 }
       
  4416 
       
  4417 /*----------------------------------------------------------------------------
       
  4418 | Returns the result of converting the quadruple-precision floating-point
       
  4419 | value `a' to the 64-bit two's complement integer format.  The conversion
       
  4420 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4421 | Arithmetic, except that the conversion is always rounded toward zero.
       
  4422 | If `a' is a NaN, the largest positive integer is returned.  Otherwise, if
       
  4423 | the conversion overflows, the largest integer with the same sign as `a' is
       
  4424 | returned.
       
  4425 *----------------------------------------------------------------------------*/
       
  4426 
       
  4427 int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
       
  4428 {
       
  4429     flag aSign;
       
  4430     int32 aExp, shiftCount;
       
  4431     bits64 aSig0, aSig1;
       
  4432     int64 z;
       
  4433 
       
  4434     aSig1 = extractFloat128Frac1( a );
       
  4435     aSig0 = extractFloat128Frac0( a );
       
  4436     aExp = extractFloat128Exp( a );
       
  4437     aSign = extractFloat128Sign( a );
       
  4438     if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
       
  4439     shiftCount = aExp - 0x402F;
       
  4440     if ( 0 < shiftCount ) {
       
  4441         if ( 0x403E <= aExp ) {
       
  4442             aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
       
  4443             if (    ( a.high == LIT64( 0xC03E000000000000 ) )
       
  4444                  && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
       
  4445                 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
       
  4446             }
       
  4447             else {
       
  4448                 float_raise( float_flag_invalid STATUS_VAR);
       
  4449                 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
       
  4450                     return LIT64( 0x7FFFFFFFFFFFFFFF );
       
  4451                 }
       
  4452             }
       
  4453             return (sbits64) LIT64( 0x8000000000000000 );
       
  4454         }
       
  4455         z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
       
  4456         if ( (bits64) ( aSig1<<shiftCount ) ) {
       
  4457             STATUS(float_exception_flags) |= float_flag_inexact;
       
  4458         }
       
  4459     }
       
  4460     else {
       
  4461         if ( aExp < 0x3FFF ) {
       
  4462             if ( aExp | aSig0 | aSig1 ) {
       
  4463                 STATUS(float_exception_flags) |= float_flag_inexact;
       
  4464             }
       
  4465             return 0;
       
  4466         }
       
  4467         z = aSig0>>( - shiftCount );
       
  4468         if (    aSig1
       
  4469              || ( shiftCount && (bits64) ( aSig0<<( shiftCount & 63 ) ) ) ) {
       
  4470             STATUS(float_exception_flags) |= float_flag_inexact;
       
  4471         }
       
  4472     }
       
  4473     if ( aSign ) z = - z;
       
  4474     return z;
       
  4475 
       
  4476 }
       
  4477 
       
  4478 /*----------------------------------------------------------------------------
       
  4479 | Returns the result of converting the quadruple-precision floating-point
       
  4480 | value `a' to the single-precision floating-point format.  The conversion
       
  4481 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4482 | Arithmetic.
       
  4483 *----------------------------------------------------------------------------*/
       
  4484 
       
  4485 float32 float128_to_float32( float128 a STATUS_PARAM )
       
  4486 {
       
  4487     flag aSign;
       
  4488     int32 aExp;
       
  4489     bits64 aSig0, aSig1;
       
  4490     bits32 zSig;
       
  4491 
       
  4492     aSig1 = extractFloat128Frac1( a );
       
  4493     aSig0 = extractFloat128Frac0( a );
       
  4494     aExp = extractFloat128Exp( a );
       
  4495     aSign = extractFloat128Sign( a );
       
  4496     if ( aExp == 0x7FFF ) {
       
  4497         if ( aSig0 | aSig1 ) {
       
  4498             return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) );
       
  4499         }
       
  4500         return packFloat32( aSign, 0xFF, 0 );
       
  4501     }
       
  4502     aSig0 |= ( aSig1 != 0 );
       
  4503     shift64RightJamming( aSig0, 18, &aSig0 );
       
  4504     zSig = aSig0;
       
  4505     if ( aExp || zSig ) {
       
  4506         zSig |= 0x40000000;
       
  4507         aExp -= 0x3F81;
       
  4508     }
       
  4509     return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
       
  4510 
       
  4511 }
       
  4512 
       
  4513 /*----------------------------------------------------------------------------
       
  4514 | Returns the result of converting the quadruple-precision floating-point
       
  4515 | value `a' to the double-precision floating-point format.  The conversion
       
  4516 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  4517 | Arithmetic.
       
  4518 *----------------------------------------------------------------------------*/
       
  4519 
       
  4520 float64 float128_to_float64( float128 a STATUS_PARAM )
       
  4521 {
       
  4522     flag aSign;
       
  4523     int32 aExp;
       
  4524     bits64 aSig0, aSig1;
       
  4525 
       
  4526     aSig1 = extractFloat128Frac1( a );
       
  4527     aSig0 = extractFloat128Frac0( a );
       
  4528     aExp = extractFloat128Exp( a );
       
  4529     aSign = extractFloat128Sign( a );
       
  4530     if ( aExp == 0x7FFF ) {
       
  4531         if ( aSig0 | aSig1 ) {
       
  4532             return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) );
       
  4533         }
       
  4534         return packFloat64( aSign, 0x7FF, 0 );
       
  4535     }
       
  4536     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
       
  4537     aSig0 |= ( aSig1 != 0 );
       
  4538     if ( aExp || aSig0 ) {
       
  4539         aSig0 |= LIT64( 0x4000000000000000 );
       
  4540         aExp -= 0x3C01;
       
  4541     }
       
  4542     return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
       
  4543 
       
  4544 }
       
  4545 
       
  4546 #ifdef FLOATX80
       
  4547 
       
  4548 /*----------------------------------------------------------------------------
       
  4549 | Returns the result of converting the quadruple-precision floating-point
       
  4550 | value `a' to the extended double-precision floating-point format.  The
       
  4551 | conversion is performed according to the IEC/IEEE Standard for Binary
       
  4552 | Floating-Point Arithmetic.
       
  4553 *----------------------------------------------------------------------------*/
       
  4554 
       
  4555 floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
       
  4556 {
       
  4557     flag aSign;
       
  4558     int32 aExp;
       
  4559     bits64 aSig0, aSig1;
       
  4560 
       
  4561     aSig1 = extractFloat128Frac1( a );
       
  4562     aSig0 = extractFloat128Frac0( a );
       
  4563     aExp = extractFloat128Exp( a );
       
  4564     aSign = extractFloat128Sign( a );
       
  4565     if ( aExp == 0x7FFF ) {
       
  4566         if ( aSig0 | aSig1 ) {
       
  4567             return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) );
       
  4568         }
       
  4569         return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
       
  4570     }
       
  4571     if ( aExp == 0 ) {
       
  4572         if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
       
  4573         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
       
  4574     }
       
  4575     else {
       
  4576         aSig0 |= LIT64( 0x0001000000000000 );
       
  4577     }
       
  4578     shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
       
  4579     return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
       
  4580 
       
  4581 }
       
  4582 
       
  4583 #endif
       
  4584 
       
  4585 /*----------------------------------------------------------------------------
       
  4586 | Rounds the quadruple-precision floating-point value `a' to an integer, and
       
  4587 | returns the result as a quadruple-precision floating-point value.  The
       
  4588 | operation is performed according to the IEC/IEEE Standard for Binary
       
  4589 | Floating-Point Arithmetic.
       
  4590 *----------------------------------------------------------------------------*/
       
  4591 
       
  4592 float128 float128_round_to_int( float128 a STATUS_PARAM )
       
  4593 {
       
  4594     flag aSign;
       
  4595     int32 aExp;
       
  4596     bits64 lastBitMask, roundBitsMask;
       
  4597     int8 roundingMode;
       
  4598     float128 z;
       
  4599 
       
  4600     aExp = extractFloat128Exp( a );
       
  4601     if ( 0x402F <= aExp ) {
       
  4602         if ( 0x406F <= aExp ) {
       
  4603             if (    ( aExp == 0x7FFF )
       
  4604                  && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
       
  4605                ) {
       
  4606                 return propagateFloat128NaN( a, a STATUS_VAR );
       
  4607             }
       
  4608             return a;
       
  4609         }
       
  4610         lastBitMask = 1;
       
  4611         lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
       
  4612         roundBitsMask = lastBitMask - 1;
       
  4613         z = a;
       
  4614         roundingMode = STATUS(float_rounding_mode);
       
  4615         if ( roundingMode == float_round_nearest_even ) {
       
  4616             if ( lastBitMask ) {
       
  4617                 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
       
  4618                 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
       
  4619             }
       
  4620             else {
       
  4621                 if ( (sbits64) z.low < 0 ) {
       
  4622                     ++z.high;
       
  4623                     if ( (bits64) ( z.low<<1 ) == 0 ) z.high &= ~1;
       
  4624                 }
       
  4625             }
       
  4626         }
       
  4627         else if ( roundingMode != float_round_to_zero ) {
       
  4628             if (   extractFloat128Sign( z )
       
  4629                  ^ ( roundingMode == float_round_up ) ) {
       
  4630                 add128( z.high, z.low, 0, roundBitsMask, &z.high, &z.low );
       
  4631             }
       
  4632         }
       
  4633         z.low &= ~ roundBitsMask;
       
  4634     }
       
  4635     else {
       
  4636         if ( aExp < 0x3FFF ) {
       
  4637             if ( ( ( (bits64) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
       
  4638             STATUS(float_exception_flags) |= float_flag_inexact;
       
  4639             aSign = extractFloat128Sign( a );
       
  4640             switch ( STATUS(float_rounding_mode) ) {
       
  4641              case float_round_nearest_even:
       
  4642                 if (    ( aExp == 0x3FFE )
       
  4643                      && (   extractFloat128Frac0( a )
       
  4644                           | extractFloat128Frac1( a ) )
       
  4645                    ) {
       
  4646                     return packFloat128( aSign, 0x3FFF, 0, 0 );
       
  4647                 }
       
  4648                 break;
       
  4649              case float_round_down:
       
  4650                 return
       
  4651                       aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
       
  4652                     : packFloat128( 0, 0, 0, 0 );
       
  4653              case float_round_up:
       
  4654                 return
       
  4655                       aSign ? packFloat128( 1, 0, 0, 0 )
       
  4656                     : packFloat128( 0, 0x3FFF, 0, 0 );
       
  4657             }
       
  4658             return packFloat128( aSign, 0, 0, 0 );
       
  4659         }
       
  4660         lastBitMask = 1;
       
  4661         lastBitMask <<= 0x402F - aExp;
       
  4662         roundBitsMask = lastBitMask - 1;
       
  4663         z.low = 0;
       
  4664         z.high = a.high;
       
  4665         roundingMode = STATUS(float_rounding_mode);
       
  4666         if ( roundingMode == float_round_nearest_even ) {
       
  4667             z.high += lastBitMask>>1;
       
  4668             if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
       
  4669                 z.high &= ~ lastBitMask;
       
  4670             }
       
  4671         }
       
  4672         else if ( roundingMode != float_round_to_zero ) {
       
  4673             if (   extractFloat128Sign( z )
       
  4674                  ^ ( roundingMode == float_round_up ) ) {
       
  4675                 z.high |= ( a.low != 0 );
       
  4676                 z.high += roundBitsMask;
       
  4677             }
       
  4678         }
       
  4679         z.high &= ~ roundBitsMask;
       
  4680     }
       
  4681     if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
       
  4682         STATUS(float_exception_flags) |= float_flag_inexact;
       
  4683     }
       
  4684     return z;
       
  4685 
       
  4686 }
       
  4687 
       
  4688 /*----------------------------------------------------------------------------
       
  4689 | Returns the result of adding the absolute values of the quadruple-precision
       
  4690 | floating-point values `a' and `b'.  If `zSign' is 1, the sum is negated
       
  4691 | before being returned.  `zSign' is ignored if the result is a NaN.
       
  4692 | The addition is performed according to the IEC/IEEE Standard for Binary
       
  4693 | Floating-Point Arithmetic.
       
  4694 *----------------------------------------------------------------------------*/
       
  4695 
       
  4696 static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
       
  4697 {
       
  4698     int32 aExp, bExp, zExp;
       
  4699     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
       
  4700     int32 expDiff;
       
  4701 
       
  4702     aSig1 = extractFloat128Frac1( a );
       
  4703     aSig0 = extractFloat128Frac0( a );
       
  4704     aExp = extractFloat128Exp( a );
       
  4705     bSig1 = extractFloat128Frac1( b );
       
  4706     bSig0 = extractFloat128Frac0( b );
       
  4707     bExp = extractFloat128Exp( b );
       
  4708     expDiff = aExp - bExp;
       
  4709     if ( 0 < expDiff ) {
       
  4710         if ( aExp == 0x7FFF ) {
       
  4711             if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4712             return a;
       
  4713         }
       
  4714         if ( bExp == 0 ) {
       
  4715             --expDiff;
       
  4716         }
       
  4717         else {
       
  4718             bSig0 |= LIT64( 0x0001000000000000 );
       
  4719         }
       
  4720         shift128ExtraRightJamming(
       
  4721             bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
       
  4722         zExp = aExp;
       
  4723     }
       
  4724     else if ( expDiff < 0 ) {
       
  4725         if ( bExp == 0x7FFF ) {
       
  4726             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4727             return packFloat128( zSign, 0x7FFF, 0, 0 );
       
  4728         }
       
  4729         if ( aExp == 0 ) {
       
  4730             ++expDiff;
       
  4731         }
       
  4732         else {
       
  4733             aSig0 |= LIT64( 0x0001000000000000 );
       
  4734         }
       
  4735         shift128ExtraRightJamming(
       
  4736             aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
       
  4737         zExp = bExp;
       
  4738     }
       
  4739     else {
       
  4740         if ( aExp == 0x7FFF ) {
       
  4741             if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
       
  4742                 return propagateFloat128NaN( a, b STATUS_VAR );
       
  4743             }
       
  4744             return a;
       
  4745         }
       
  4746         add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
       
  4747         if ( aExp == 0 ) {
       
  4748             if ( STATUS(flush_to_zero) ) return packFloat128( zSign, 0, 0, 0 );
       
  4749             return packFloat128( zSign, 0, zSig0, zSig1 );
       
  4750         }
       
  4751         zSig2 = 0;
       
  4752         zSig0 |= LIT64( 0x0002000000000000 );
       
  4753         zExp = aExp;
       
  4754         goto shiftRight1;
       
  4755     }
       
  4756     aSig0 |= LIT64( 0x0001000000000000 );
       
  4757     add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
       
  4758     --zExp;
       
  4759     if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
       
  4760     ++zExp;
       
  4761  shiftRight1:
       
  4762     shift128ExtraRightJamming(
       
  4763         zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
       
  4764  roundAndPack:
       
  4765     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
       
  4766 
       
  4767 }
       
  4768 
       
  4769 /*----------------------------------------------------------------------------
       
  4770 | Returns the result of subtracting the absolute values of the quadruple-
       
  4771 | precision floating-point values `a' and `b'.  If `zSign' is 1, the
       
  4772 | difference is negated before being returned.  `zSign' is ignored if the
       
  4773 | result is a NaN.  The subtraction is performed according to the IEC/IEEE
       
  4774 | Standard for Binary Floating-Point Arithmetic.
       
  4775 *----------------------------------------------------------------------------*/
       
  4776 
       
  4777 static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
       
  4778 {
       
  4779     int32 aExp, bExp, zExp;
       
  4780     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
       
  4781     int32 expDiff;
       
  4782     float128 z;
       
  4783 
       
  4784     aSig1 = extractFloat128Frac1( a );
       
  4785     aSig0 = extractFloat128Frac0( a );
       
  4786     aExp = extractFloat128Exp( a );
       
  4787     bSig1 = extractFloat128Frac1( b );
       
  4788     bSig0 = extractFloat128Frac0( b );
       
  4789     bExp = extractFloat128Exp( b );
       
  4790     expDiff = aExp - bExp;
       
  4791     shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
       
  4792     shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
       
  4793     if ( 0 < expDiff ) goto aExpBigger;
       
  4794     if ( expDiff < 0 ) goto bExpBigger;
       
  4795     if ( aExp == 0x7FFF ) {
       
  4796         if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
       
  4797             return propagateFloat128NaN( a, b STATUS_VAR );
       
  4798         }
       
  4799         float_raise( float_flag_invalid STATUS_VAR);
       
  4800         z.low = float128_default_nan_low;
       
  4801         z.high = float128_default_nan_high;
       
  4802         return z;
       
  4803     }
       
  4804     if ( aExp == 0 ) {
       
  4805         aExp = 1;
       
  4806         bExp = 1;
       
  4807     }
       
  4808     if ( bSig0 < aSig0 ) goto aBigger;
       
  4809     if ( aSig0 < bSig0 ) goto bBigger;
       
  4810     if ( bSig1 < aSig1 ) goto aBigger;
       
  4811     if ( aSig1 < bSig1 ) goto bBigger;
       
  4812     return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
       
  4813  bExpBigger:
       
  4814     if ( bExp == 0x7FFF ) {
       
  4815         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4816         return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
       
  4817     }
       
  4818     if ( aExp == 0 ) {
       
  4819         ++expDiff;
       
  4820     }
       
  4821     else {
       
  4822         aSig0 |= LIT64( 0x4000000000000000 );
       
  4823     }
       
  4824     shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
       
  4825     bSig0 |= LIT64( 0x4000000000000000 );
       
  4826  bBigger:
       
  4827     sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
       
  4828     zExp = bExp;
       
  4829     zSign ^= 1;
       
  4830     goto normalizeRoundAndPack;
       
  4831  aExpBigger:
       
  4832     if ( aExp == 0x7FFF ) {
       
  4833         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4834         return a;
       
  4835     }
       
  4836     if ( bExp == 0 ) {
       
  4837         --expDiff;
       
  4838     }
       
  4839     else {
       
  4840         bSig0 |= LIT64( 0x4000000000000000 );
       
  4841     }
       
  4842     shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
       
  4843     aSig0 |= LIT64( 0x4000000000000000 );
       
  4844  aBigger:
       
  4845     sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
       
  4846     zExp = aExp;
       
  4847  normalizeRoundAndPack:
       
  4848     --zExp;
       
  4849     return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
       
  4850 
       
  4851 }
       
  4852 
       
  4853 /*----------------------------------------------------------------------------
       
  4854 | Returns the result of adding the quadruple-precision floating-point values
       
  4855 | `a' and `b'.  The operation is performed according to the IEC/IEEE Standard
       
  4856 | for Binary Floating-Point Arithmetic.
       
  4857 *----------------------------------------------------------------------------*/
       
  4858 
       
  4859 float128 float128_add( float128 a, float128 b STATUS_PARAM )
       
  4860 {
       
  4861     flag aSign, bSign;
       
  4862 
       
  4863     aSign = extractFloat128Sign( a );
       
  4864     bSign = extractFloat128Sign( b );
       
  4865     if ( aSign == bSign ) {
       
  4866         return addFloat128Sigs( a, b, aSign STATUS_VAR );
       
  4867     }
       
  4868     else {
       
  4869         return subFloat128Sigs( a, b, aSign STATUS_VAR );
       
  4870     }
       
  4871 
       
  4872 }
       
  4873 
       
  4874 /*----------------------------------------------------------------------------
       
  4875 | Returns the result of subtracting the quadruple-precision floating-point
       
  4876 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
       
  4877 | Standard for Binary Floating-Point Arithmetic.
       
  4878 *----------------------------------------------------------------------------*/
       
  4879 
       
  4880 float128 float128_sub( float128 a, float128 b STATUS_PARAM )
       
  4881 {
       
  4882     flag aSign, bSign;
       
  4883 
       
  4884     aSign = extractFloat128Sign( a );
       
  4885     bSign = extractFloat128Sign( b );
       
  4886     if ( aSign == bSign ) {
       
  4887         return subFloat128Sigs( a, b, aSign STATUS_VAR );
       
  4888     }
       
  4889     else {
       
  4890         return addFloat128Sigs( a, b, aSign STATUS_VAR );
       
  4891     }
       
  4892 
       
  4893 }
       
  4894 
       
  4895 /*----------------------------------------------------------------------------
       
  4896 | Returns the result of multiplying the quadruple-precision floating-point
       
  4897 | values `a' and `b'.  The operation is performed according to the IEC/IEEE
       
  4898 | Standard for Binary Floating-Point Arithmetic.
       
  4899 *----------------------------------------------------------------------------*/
       
  4900 
       
  4901 float128 float128_mul( float128 a, float128 b STATUS_PARAM )
       
  4902 {
       
  4903     flag aSign, bSign, zSign;
       
  4904     int32 aExp, bExp, zExp;
       
  4905     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
       
  4906     float128 z;
       
  4907 
       
  4908     aSig1 = extractFloat128Frac1( a );
       
  4909     aSig0 = extractFloat128Frac0( a );
       
  4910     aExp = extractFloat128Exp( a );
       
  4911     aSign = extractFloat128Sign( a );
       
  4912     bSig1 = extractFloat128Frac1( b );
       
  4913     bSig0 = extractFloat128Frac0( b );
       
  4914     bExp = extractFloat128Exp( b );
       
  4915     bSign = extractFloat128Sign( b );
       
  4916     zSign = aSign ^ bSign;
       
  4917     if ( aExp == 0x7FFF ) {
       
  4918         if (    ( aSig0 | aSig1 )
       
  4919              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
       
  4920             return propagateFloat128NaN( a, b STATUS_VAR );
       
  4921         }
       
  4922         if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
       
  4923         return packFloat128( zSign, 0x7FFF, 0, 0 );
       
  4924     }
       
  4925     if ( bExp == 0x7FFF ) {
       
  4926         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4927         if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
       
  4928  invalid:
       
  4929             float_raise( float_flag_invalid STATUS_VAR);
       
  4930             z.low = float128_default_nan_low;
       
  4931             z.high = float128_default_nan_high;
       
  4932             return z;
       
  4933         }
       
  4934         return packFloat128( zSign, 0x7FFF, 0, 0 );
       
  4935     }
       
  4936     if ( aExp == 0 ) {
       
  4937         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
       
  4938         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
       
  4939     }
       
  4940     if ( bExp == 0 ) {
       
  4941         if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
       
  4942         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
       
  4943     }
       
  4944     zExp = aExp + bExp - 0x4000;
       
  4945     aSig0 |= LIT64( 0x0001000000000000 );
       
  4946     shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
       
  4947     mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
       
  4948     add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
       
  4949     zSig2 |= ( zSig3 != 0 );
       
  4950     if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
       
  4951         shift128ExtraRightJamming(
       
  4952             zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
       
  4953         ++zExp;
       
  4954     }
       
  4955     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
       
  4956 
       
  4957 }
       
  4958 
       
  4959 /*----------------------------------------------------------------------------
       
  4960 | Returns the result of dividing the quadruple-precision floating-point value
       
  4961 | `a' by the corresponding value `b'.  The operation is performed according to
       
  4962 | the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  4963 *----------------------------------------------------------------------------*/
       
  4964 
       
  4965 float128 float128_div( float128 a, float128 b STATUS_PARAM )
       
  4966 {
       
  4967     flag aSign, bSign, zSign;
       
  4968     int32 aExp, bExp, zExp;
       
  4969     bits64 aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
       
  4970     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
       
  4971     float128 z;
       
  4972 
       
  4973     aSig1 = extractFloat128Frac1( a );
       
  4974     aSig0 = extractFloat128Frac0( a );
       
  4975     aExp = extractFloat128Exp( a );
       
  4976     aSign = extractFloat128Sign( a );
       
  4977     bSig1 = extractFloat128Frac1( b );
       
  4978     bSig0 = extractFloat128Frac0( b );
       
  4979     bExp = extractFloat128Exp( b );
       
  4980     bSign = extractFloat128Sign( b );
       
  4981     zSign = aSign ^ bSign;
       
  4982     if ( aExp == 0x7FFF ) {
       
  4983         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4984         if ( bExp == 0x7FFF ) {
       
  4985             if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4986             goto invalid;
       
  4987         }
       
  4988         return packFloat128( zSign, 0x7FFF, 0, 0 );
       
  4989     }
       
  4990     if ( bExp == 0x7FFF ) {
       
  4991         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  4992         return packFloat128( zSign, 0, 0, 0 );
       
  4993     }
       
  4994     if ( bExp == 0 ) {
       
  4995         if ( ( bSig0 | bSig1 ) == 0 ) {
       
  4996             if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
       
  4997  invalid:
       
  4998                 float_raise( float_flag_invalid STATUS_VAR);
       
  4999                 z.low = float128_default_nan_low;
       
  5000                 z.high = float128_default_nan_high;
       
  5001                 return z;
       
  5002             }
       
  5003             float_raise( float_flag_divbyzero STATUS_VAR);
       
  5004             return packFloat128( zSign, 0x7FFF, 0, 0 );
       
  5005         }
       
  5006         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
       
  5007     }
       
  5008     if ( aExp == 0 ) {
       
  5009         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
       
  5010         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
       
  5011     }
       
  5012     zExp = aExp - bExp + 0x3FFD;
       
  5013     shortShift128Left(
       
  5014         aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
       
  5015     shortShift128Left(
       
  5016         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
       
  5017     if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
       
  5018         shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
       
  5019         ++zExp;
       
  5020     }
       
  5021     zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
       
  5022     mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
       
  5023     sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
       
  5024     while ( (sbits64) rem0 < 0 ) {
       
  5025         --zSig0;
       
  5026         add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
       
  5027     }
       
  5028     zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
       
  5029     if ( ( zSig1 & 0x3FFF ) <= 4 ) {
       
  5030         mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
       
  5031         sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
       
  5032         while ( (sbits64) rem1 < 0 ) {
       
  5033             --zSig1;
       
  5034             add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
       
  5035         }
       
  5036         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
       
  5037     }
       
  5038     shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
       
  5039     return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
       
  5040 
       
  5041 }
       
  5042 
       
  5043 /*----------------------------------------------------------------------------
       
  5044 | Returns the remainder of the quadruple-precision floating-point value `a'
       
  5045 | with respect to the corresponding value `b'.  The operation is performed
       
  5046 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  5047 *----------------------------------------------------------------------------*/
       
  5048 
       
  5049 float128 float128_rem( float128 a, float128 b STATUS_PARAM )
       
  5050 {
       
  5051     flag aSign, bSign, zSign;
       
  5052     int32 aExp, bExp, expDiff;
       
  5053     bits64 aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
       
  5054     bits64 allZero, alternateASig0, alternateASig1, sigMean1;
       
  5055     sbits64 sigMean0;
       
  5056     float128 z;
       
  5057 
       
  5058     aSig1 = extractFloat128Frac1( a );
       
  5059     aSig0 = extractFloat128Frac0( a );
       
  5060     aExp = extractFloat128Exp( a );
       
  5061     aSign = extractFloat128Sign( a );
       
  5062     bSig1 = extractFloat128Frac1( b );
       
  5063     bSig0 = extractFloat128Frac0( b );
       
  5064     bExp = extractFloat128Exp( b );
       
  5065     bSign = extractFloat128Sign( b );
       
  5066     if ( aExp == 0x7FFF ) {
       
  5067         if (    ( aSig0 | aSig1 )
       
  5068              || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
       
  5069             return propagateFloat128NaN( a, b STATUS_VAR );
       
  5070         }
       
  5071         goto invalid;
       
  5072     }
       
  5073     if ( bExp == 0x7FFF ) {
       
  5074         if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
       
  5075         return a;
       
  5076     }
       
  5077     if ( bExp == 0 ) {
       
  5078         if ( ( bSig0 | bSig1 ) == 0 ) {
       
  5079  invalid:
       
  5080             float_raise( float_flag_invalid STATUS_VAR);
       
  5081             z.low = float128_default_nan_low;
       
  5082             z.high = float128_default_nan_high;
       
  5083             return z;
       
  5084         }
       
  5085         normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
       
  5086     }
       
  5087     if ( aExp == 0 ) {
       
  5088         if ( ( aSig0 | aSig1 ) == 0 ) return a;
       
  5089         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
       
  5090     }
       
  5091     expDiff = aExp - bExp;
       
  5092     if ( expDiff < -1 ) return a;
       
  5093     shortShift128Left(
       
  5094         aSig0 | LIT64( 0x0001000000000000 ),
       
  5095         aSig1,
       
  5096         15 - ( expDiff < 0 ),
       
  5097         &aSig0,
       
  5098         &aSig1
       
  5099     );
       
  5100     shortShift128Left(
       
  5101         bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
       
  5102     q = le128( bSig0, bSig1, aSig0, aSig1 );
       
  5103     if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
       
  5104     expDiff -= 64;
       
  5105     while ( 0 < expDiff ) {
       
  5106         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
       
  5107         q = ( 4 < q ) ? q - 4 : 0;
       
  5108         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
       
  5109         shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
       
  5110         shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
       
  5111         sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
       
  5112         expDiff -= 61;
       
  5113     }
       
  5114     if ( -64 < expDiff ) {
       
  5115         q = estimateDiv128To64( aSig0, aSig1, bSig0 );
       
  5116         q = ( 4 < q ) ? q - 4 : 0;
       
  5117         q >>= - expDiff;
       
  5118         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
       
  5119         expDiff += 52;
       
  5120         if ( expDiff < 0 ) {
       
  5121             shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
       
  5122         }
       
  5123         else {
       
  5124             shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
       
  5125         }
       
  5126         mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
       
  5127         sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
       
  5128     }
       
  5129     else {
       
  5130         shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
       
  5131         shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
       
  5132     }
       
  5133     do {
       
  5134         alternateASig0 = aSig0;
       
  5135         alternateASig1 = aSig1;
       
  5136         ++q;
       
  5137         sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
       
  5138     } while ( 0 <= (sbits64) aSig0 );
       
  5139     add128(
       
  5140         aSig0, aSig1, alternateASig0, alternateASig1, (bits64 *)&sigMean0, &sigMean1 );
       
  5141     if (    ( sigMean0 < 0 )
       
  5142          || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
       
  5143         aSig0 = alternateASig0;
       
  5144         aSig1 = alternateASig1;
       
  5145     }
       
  5146     zSign = ( (sbits64) aSig0 < 0 );
       
  5147     if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
       
  5148     return
       
  5149         normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
       
  5150 
       
  5151 }
       
  5152 
       
  5153 /*----------------------------------------------------------------------------
       
  5154 | Returns the square root of the quadruple-precision floating-point value `a'.
       
  5155 | The operation is performed according to the IEC/IEEE Standard for Binary
       
  5156 | Floating-Point Arithmetic.
       
  5157 *----------------------------------------------------------------------------*/
       
  5158 
       
  5159 float128 float128_sqrt( float128 a STATUS_PARAM )
       
  5160 {
       
  5161     flag aSign;
       
  5162     int32 aExp, zExp;
       
  5163     bits64 aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
       
  5164     bits64 rem0, rem1, rem2, rem3, term0, term1, term2, term3;
       
  5165     float128 z;
       
  5166 
       
  5167     aSig1 = extractFloat128Frac1( a );
       
  5168     aSig0 = extractFloat128Frac0( a );
       
  5169     aExp = extractFloat128Exp( a );
       
  5170     aSign = extractFloat128Sign( a );
       
  5171     if ( aExp == 0x7FFF ) {
       
  5172         if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
       
  5173         if ( ! aSign ) return a;
       
  5174         goto invalid;
       
  5175     }
       
  5176     if ( aSign ) {
       
  5177         if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
       
  5178  invalid:
       
  5179         float_raise( float_flag_invalid STATUS_VAR);
       
  5180         z.low = float128_default_nan_low;
       
  5181         z.high = float128_default_nan_high;
       
  5182         return z;
       
  5183     }
       
  5184     if ( aExp == 0 ) {
       
  5185         if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
       
  5186         normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
       
  5187     }
       
  5188     zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
       
  5189     aSig0 |= LIT64( 0x0001000000000000 );
       
  5190     zSig0 = estimateSqrt32( aExp, aSig0>>17 );
       
  5191     shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
       
  5192     zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
       
  5193     doubleZSig0 = zSig0<<1;
       
  5194     mul64To128( zSig0, zSig0, &term0, &term1 );
       
  5195     sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
       
  5196     while ( (sbits64) rem0 < 0 ) {
       
  5197         --zSig0;
       
  5198         doubleZSig0 -= 2;
       
  5199         add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
       
  5200     }
       
  5201     zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
       
  5202     if ( ( zSig1 & 0x1FFF ) <= 5 ) {
       
  5203         if ( zSig1 == 0 ) zSig1 = 1;
       
  5204         mul64To128( doubleZSig0, zSig1, &term1, &term2 );
       
  5205         sub128( rem1, 0, term1, term2, &rem1, &rem2 );
       
  5206         mul64To128( zSig1, zSig1, &term2, &term3 );
       
  5207         sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
       
  5208         while ( (sbits64) rem1 < 0 ) {
       
  5209             --zSig1;
       
  5210             shortShift128Left( 0, zSig1, 1, &term2, &term3 );
       
  5211             term3 |= 1;
       
  5212             term2 |= doubleZSig0;
       
  5213             add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
       
  5214         }
       
  5215         zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
       
  5216     }
       
  5217     shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
       
  5218     return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
       
  5219 
       
  5220 }
       
  5221 
       
  5222 /*----------------------------------------------------------------------------
       
  5223 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
       
  5224 | the corresponding value `b', and 0 otherwise.  The comparison is performed
       
  5225 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  5226 *----------------------------------------------------------------------------*/
       
  5227 
       
  5228 int float128_eq( float128 a, float128 b STATUS_PARAM )
       
  5229 {
       
  5230 
       
  5231     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
       
  5232               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
       
  5233          || (    ( extractFloat128Exp( b ) == 0x7FFF )
       
  5234               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       
  5235        ) {
       
  5236         if (    float128_is_signaling_nan( a )
       
  5237              || float128_is_signaling_nan( b ) ) {
       
  5238             float_raise( float_flag_invalid STATUS_VAR);
       
  5239         }
       
  5240         return 0;
       
  5241     }
       
  5242     return
       
  5243            ( a.low == b.low )
       
  5244         && (    ( a.high == b.high )
       
  5245              || (    ( a.low == 0 )
       
  5246                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
       
  5247            );
       
  5248 
       
  5249 }
       
  5250 
       
  5251 /*----------------------------------------------------------------------------
       
  5252 | Returns 1 if the quadruple-precision floating-point value `a' is less than
       
  5253 | or equal to the corresponding value `b', and 0 otherwise.  The comparison
       
  5254 | is performed according to the IEC/IEEE Standard for Binary Floating-Point
       
  5255 | Arithmetic.
       
  5256 *----------------------------------------------------------------------------*/
       
  5257 
       
  5258 int float128_le( float128 a, float128 b STATUS_PARAM )
       
  5259 {
       
  5260     flag aSign, bSign;
       
  5261 
       
  5262     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
       
  5263               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
       
  5264          || (    ( extractFloat128Exp( b ) == 0x7FFF )
       
  5265               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       
  5266        ) {
       
  5267         float_raise( float_flag_invalid STATUS_VAR);
       
  5268         return 0;
       
  5269     }
       
  5270     aSign = extractFloat128Sign( a );
       
  5271     bSign = extractFloat128Sign( b );
       
  5272     if ( aSign != bSign ) {
       
  5273         return
       
  5274                aSign
       
  5275             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  5276                  == 0 );
       
  5277     }
       
  5278     return
       
  5279           aSign ? le128( b.high, b.low, a.high, a.low )
       
  5280         : le128( a.high, a.low, b.high, b.low );
       
  5281 
       
  5282 }
       
  5283 
       
  5284 /*----------------------------------------------------------------------------
       
  5285 | Returns 1 if the quadruple-precision floating-point value `a' is less than
       
  5286 | the corresponding value `b', and 0 otherwise.  The comparison is performed
       
  5287 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  5288 *----------------------------------------------------------------------------*/
       
  5289 
       
  5290 int float128_lt( float128 a, float128 b STATUS_PARAM )
       
  5291 {
       
  5292     flag aSign, bSign;
       
  5293 
       
  5294     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
       
  5295               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
       
  5296          || (    ( extractFloat128Exp( b ) == 0x7FFF )
       
  5297               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       
  5298        ) {
       
  5299         float_raise( float_flag_invalid STATUS_VAR);
       
  5300         return 0;
       
  5301     }
       
  5302     aSign = extractFloat128Sign( a );
       
  5303     bSign = extractFloat128Sign( b );
       
  5304     if ( aSign != bSign ) {
       
  5305         return
       
  5306                aSign
       
  5307             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  5308                  != 0 );
       
  5309     }
       
  5310     return
       
  5311           aSign ? lt128( b.high, b.low, a.high, a.low )
       
  5312         : lt128( a.high, a.low, b.high, b.low );
       
  5313 
       
  5314 }
       
  5315 
       
  5316 /*----------------------------------------------------------------------------
       
  5317 | Returns 1 if the quadruple-precision floating-point value `a' is equal to
       
  5318 | the corresponding value `b', and 0 otherwise.  The invalid exception is
       
  5319 | raised if either operand is a NaN.  Otherwise, the comparison is performed
       
  5320 | according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  5321 *----------------------------------------------------------------------------*/
       
  5322 
       
  5323 int float128_eq_signaling( float128 a, float128 b STATUS_PARAM )
       
  5324 {
       
  5325 
       
  5326     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
       
  5327               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
       
  5328          || (    ( extractFloat128Exp( b ) == 0x7FFF )
       
  5329               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       
  5330        ) {
       
  5331         float_raise( float_flag_invalid STATUS_VAR);
       
  5332         return 0;
       
  5333     }
       
  5334     return
       
  5335            ( a.low == b.low )
       
  5336         && (    ( a.high == b.high )
       
  5337              || (    ( a.low == 0 )
       
  5338                   && ( (bits64) ( ( a.high | b.high )<<1 ) == 0 ) )
       
  5339            );
       
  5340 
       
  5341 }
       
  5342 
       
  5343 /*----------------------------------------------------------------------------
       
  5344 | Returns 1 if the quadruple-precision floating-point value `a' is less than
       
  5345 | or equal to the corresponding value `b', and 0 otherwise.  Quiet NaNs do not
       
  5346 | cause an exception.  Otherwise, the comparison is performed according to the
       
  5347 | IEC/IEEE Standard for Binary Floating-Point Arithmetic.
       
  5348 *----------------------------------------------------------------------------*/
       
  5349 
       
  5350 int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
       
  5351 {
       
  5352     flag aSign, bSign;
       
  5353 
       
  5354     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
       
  5355               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
       
  5356          || (    ( extractFloat128Exp( b ) == 0x7FFF )
       
  5357               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       
  5358        ) {
       
  5359         if (    float128_is_signaling_nan( a )
       
  5360              || float128_is_signaling_nan( b ) ) {
       
  5361             float_raise( float_flag_invalid STATUS_VAR);
       
  5362         }
       
  5363         return 0;
       
  5364     }
       
  5365     aSign = extractFloat128Sign( a );
       
  5366     bSign = extractFloat128Sign( b );
       
  5367     if ( aSign != bSign ) {
       
  5368         return
       
  5369                aSign
       
  5370             || (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  5371                  == 0 );
       
  5372     }
       
  5373     return
       
  5374           aSign ? le128( b.high, b.low, a.high, a.low )
       
  5375         : le128( a.high, a.low, b.high, b.low );
       
  5376 
       
  5377 }
       
  5378 
       
  5379 /*----------------------------------------------------------------------------
       
  5380 | Returns 1 if the quadruple-precision floating-point value `a' is less than
       
  5381 | the corresponding value `b', and 0 otherwise.  Quiet NaNs do not cause an
       
  5382 | exception.  Otherwise, the comparison is performed according to the IEC/IEEE
       
  5383 | Standard for Binary Floating-Point Arithmetic.
       
  5384 *----------------------------------------------------------------------------*/
       
  5385 
       
  5386 int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
       
  5387 {
       
  5388     flag aSign, bSign;
       
  5389 
       
  5390     if (    (    ( extractFloat128Exp( a ) == 0x7FFF )
       
  5391               && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
       
  5392          || (    ( extractFloat128Exp( b ) == 0x7FFF )
       
  5393               && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
       
  5394        ) {
       
  5395         if (    float128_is_signaling_nan( a )
       
  5396              || float128_is_signaling_nan( b ) ) {
       
  5397             float_raise( float_flag_invalid STATUS_VAR);
       
  5398         }
       
  5399         return 0;
       
  5400     }
       
  5401     aSign = extractFloat128Sign( a );
       
  5402     bSign = extractFloat128Sign( b );
       
  5403     if ( aSign != bSign ) {
       
  5404         return
       
  5405                aSign
       
  5406             && (    ( ( (bits64) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
       
  5407                  != 0 );
       
  5408     }
       
  5409     return
       
  5410           aSign ? lt128( b.high, b.low, a.high, a.low )
       
  5411         : lt128( a.high, a.low, b.high, b.low );
       
  5412 
       
  5413 }
       
  5414 
       
  5415 #endif
       
  5416 
       
  5417 /* misc functions */
       
  5418 float32 uint32_to_float32( unsigned int a STATUS_PARAM )
       
  5419 {
       
  5420     return int64_to_float32(a STATUS_VAR);
       
  5421 }
       
  5422 
       
  5423 float64 uint32_to_float64( unsigned int a STATUS_PARAM )
       
  5424 {
       
  5425     return int64_to_float64(a STATUS_VAR);
       
  5426 }
       
  5427 
       
  5428 unsigned int float32_to_uint32( float32 a STATUS_PARAM )
       
  5429 {
       
  5430     int64_t v;
       
  5431     unsigned int res;
       
  5432 
       
  5433     v = float32_to_int64(a STATUS_VAR);
       
  5434     if (v < 0) {
       
  5435         res = 0;
       
  5436         float_raise( float_flag_invalid STATUS_VAR);
       
  5437     } else if (v > 0xffffffff) {
       
  5438         res = 0xffffffff;
       
  5439         float_raise( float_flag_invalid STATUS_VAR);
       
  5440     } else {
       
  5441         res = v;
       
  5442     }
       
  5443     return res;
       
  5444 }
       
  5445 
       
  5446 unsigned int float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
       
  5447 {
       
  5448     int64_t v;
       
  5449     unsigned int res;
       
  5450 
       
  5451     v = float32_to_int64_round_to_zero(a STATUS_VAR);
       
  5452     if (v < 0) {
       
  5453         res = 0;
       
  5454         float_raise( float_flag_invalid STATUS_VAR);
       
  5455     } else if (v > 0xffffffff) {
       
  5456         res = 0xffffffff;
       
  5457         float_raise( float_flag_invalid STATUS_VAR);
       
  5458     } else {
       
  5459         res = v;
       
  5460     }
       
  5461     return res;
       
  5462 }
       
  5463 
       
  5464 unsigned int float64_to_uint32( float64 a STATUS_PARAM )
       
  5465 {
       
  5466     int64_t v;
       
  5467     unsigned int res;
       
  5468 
       
  5469     v = float64_to_int64(a STATUS_VAR);
       
  5470     if (v < 0) {
       
  5471         res = 0;
       
  5472         float_raise( float_flag_invalid STATUS_VAR);
       
  5473     } else if (v > 0xffffffff) {
       
  5474         res = 0xffffffff;
       
  5475         float_raise( float_flag_invalid STATUS_VAR);
       
  5476     } else {
       
  5477         res = v;
       
  5478     }
       
  5479     return res;
       
  5480 }
       
  5481 
       
  5482 unsigned int float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
       
  5483 {
       
  5484     int64_t v;
       
  5485     unsigned int res;
       
  5486 
       
  5487     v = float64_to_int64_round_to_zero(a STATUS_VAR);
       
  5488     if (v < 0) {
       
  5489         res = 0;
       
  5490         float_raise( float_flag_invalid STATUS_VAR);
       
  5491     } else if (v > 0xffffffff) {
       
  5492         res = 0xffffffff;
       
  5493         float_raise( float_flag_invalid STATUS_VAR);
       
  5494     } else {
       
  5495         res = v;
       
  5496     }
       
  5497     return res;
       
  5498 }
       
  5499 
       
  5500 /* FIXME: This looks broken.  */
       
  5501 uint64_t float64_to_uint64 (float64 a STATUS_PARAM)
       
  5502 {
       
  5503     int64_t v;
       
  5504 
       
  5505     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
       
  5506     v += float64_val(a);
       
  5507     v = float64_to_int64(make_float64(v) STATUS_VAR);
       
  5508 
       
  5509     return v - INT64_MIN;
       
  5510 }
       
  5511 
       
  5512 uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
       
  5513 {
       
  5514     int64_t v;
       
  5515 
       
  5516     v = float64_val(int64_to_float64(INT64_MIN STATUS_VAR));
       
  5517     v += float64_val(a);
       
  5518     v = float64_to_int64_round_to_zero(make_float64(v) STATUS_VAR);
       
  5519 
       
  5520     return v - INT64_MIN;
       
  5521 }
       
  5522 
       
  5523 #define COMPARE(s, nan_exp)                                                  \
       
  5524 INLINE int float ## s ## _compare_internal( float ## s a, float ## s b,      \
       
  5525                                       int is_quiet STATUS_PARAM )            \
       
  5526 {                                                                            \
       
  5527     flag aSign, bSign;                                                       \
       
  5528     bits ## s av, bv;                                                        \
       
  5529                                                                              \
       
  5530     if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) &&                    \
       
  5531          extractFloat ## s ## Frac( a ) ) ||                                 \
       
  5532         ( ( extractFloat ## s ## Exp( b ) == nan_exp ) &&                    \
       
  5533           extractFloat ## s ## Frac( b ) )) {                                \
       
  5534         if (!is_quiet ||                                                     \
       
  5535             float ## s ## _is_signaling_nan( a ) ||                          \
       
  5536             float ## s ## _is_signaling_nan( b ) ) {                         \
       
  5537             float_raise( float_flag_invalid STATUS_VAR);                     \
       
  5538         }                                                                    \
       
  5539         return float_relation_unordered;                                     \
       
  5540     }                                                                        \
       
  5541     aSign = extractFloat ## s ## Sign( a );                                  \
       
  5542     bSign = extractFloat ## s ## Sign( b );                                  \
       
  5543     av = float ## s ## _val(a);                                              \
       
  5544     bv = float ## s ## _val(b);                                              \
       
  5545     if ( aSign != bSign ) {                                                  \
       
  5546         if ( (bits ## s) ( ( av | bv )<<1 ) == 0 ) {                         \
       
  5547             /* zero case */                                                  \
       
  5548             return float_relation_equal;                                     \
       
  5549         } else {                                                             \
       
  5550             return 1 - (2 * aSign);                                          \
       
  5551         }                                                                    \
       
  5552     } else {                                                                 \
       
  5553         if (av == bv) {                                                      \
       
  5554             return float_relation_equal;                                     \
       
  5555         } else {                                                             \
       
  5556             return 1 - 2 * (aSign ^ ( av < bv ));                            \
       
  5557         }                                                                    \
       
  5558     }                                                                        \
       
  5559 }                                                                            \
       
  5560                                                                              \
       
  5561 int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM )        \
       
  5562 {                                                                            \
       
  5563     return float ## s ## _compare_internal(a, b, 0 STATUS_VAR);              \
       
  5564 }                                                                            \
       
  5565                                                                              \
       
  5566 int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM )  \
       
  5567 {                                                                            \
       
  5568     return float ## s ## _compare_internal(a, b, 1 STATUS_VAR);              \
       
  5569 }
       
  5570 
       
  5571 COMPARE(32, 0xff)
       
  5572 COMPARE(64, 0x7ff)
       
  5573 
       
  5574 INLINE int float128_compare_internal( float128 a, float128 b,
       
  5575                                       int is_quiet STATUS_PARAM )
       
  5576 {
       
  5577     flag aSign, bSign;
       
  5578 
       
  5579     if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
       
  5580           ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
       
  5581         ( ( extractFloat128Exp( b ) == 0x7fff ) &&
       
  5582           ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
       
  5583         if (!is_quiet ||
       
  5584             float128_is_signaling_nan( a ) ||
       
  5585             float128_is_signaling_nan( b ) ) {
       
  5586             float_raise( float_flag_invalid STATUS_VAR);
       
  5587         }
       
  5588         return float_relation_unordered;
       
  5589     }
       
  5590     aSign = extractFloat128Sign( a );
       
  5591     bSign = extractFloat128Sign( b );
       
  5592     if ( aSign != bSign ) {
       
  5593         if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
       
  5594             /* zero case */
       
  5595             return float_relation_equal;
       
  5596         } else {
       
  5597             return 1 - (2 * aSign);
       
  5598         }
       
  5599     } else {
       
  5600         if (a.low == b.low && a.high == b.high) {
       
  5601             return float_relation_equal;
       
  5602         } else {
       
  5603             return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
       
  5604         }
       
  5605     }
       
  5606 }
       
  5607 
       
  5608 int float128_compare( float128 a, float128 b STATUS_PARAM )
       
  5609 {
       
  5610     return float128_compare_internal(a, b, 0 STATUS_VAR);
       
  5611 }
       
  5612 
       
  5613 int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
       
  5614 {
       
  5615     return float128_compare_internal(a, b, 1 STATUS_VAR);
       
  5616 }
       
  5617 
       
  5618 /* Multiply A by 2 raised to the power N.  */
       
  5619 float32 float32_scalbn( float32 a, int n STATUS_PARAM )
       
  5620 {
       
  5621     flag aSign;
       
  5622     int16 aExp;
       
  5623     bits32 aSig;
       
  5624 
       
  5625     aSig = extractFloat32Frac( a );
       
  5626     aExp = extractFloat32Exp( a );
       
  5627     aSign = extractFloat32Sign( a );
       
  5628 
       
  5629     if ( aExp == 0xFF ) {
       
  5630         return a;
       
  5631     }
       
  5632     if ( aExp != 0 )
       
  5633         aSig |= 0x00800000;
       
  5634     else if ( aSig == 0 )
       
  5635         return a;
       
  5636 
       
  5637     aExp += n - 1;
       
  5638     aSig <<= 7;
       
  5639     return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
       
  5640 }
       
  5641 
       
  5642 float64 float64_scalbn( float64 a, int n STATUS_PARAM )
       
  5643 {
       
  5644     flag aSign;
       
  5645     int16 aExp;
       
  5646     bits64 aSig;
       
  5647 
       
  5648     aSig = extractFloat64Frac( a );
       
  5649     aExp = extractFloat64Exp( a );
       
  5650     aSign = extractFloat64Sign( a );
       
  5651 
       
  5652     if ( aExp == 0x7FF ) {
       
  5653         return a;
       
  5654     }
       
  5655     if ( aExp != 0 )
       
  5656         aSig |= LIT64( 0x0010000000000000 );
       
  5657     else if ( aSig == 0 )
       
  5658         return a;
       
  5659 
       
  5660     aExp += n - 1;
       
  5661     aSig <<= 10;
       
  5662     return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
       
  5663 }
       
  5664 
       
  5665 #ifdef FLOATX80
       
  5666 floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
       
  5667 {
       
  5668     flag aSign;
       
  5669     int16 aExp;
       
  5670     bits64 aSig;
       
  5671 
       
  5672     aSig = extractFloatx80Frac( a );
       
  5673     aExp = extractFloatx80Exp( a );
       
  5674     aSign = extractFloatx80Sign( a );
       
  5675 
       
  5676     if ( aExp == 0x7FF ) {
       
  5677         return a;
       
  5678     }
       
  5679     if (aExp == 0 && aSig == 0)
       
  5680         return a;
       
  5681 
       
  5682     aExp += n;
       
  5683     return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
       
  5684                                           aSign, aExp, aSig, 0 STATUS_VAR );
       
  5685 }
       
  5686 #endif
       
  5687 
       
  5688 #ifdef FLOAT128
       
  5689 float128 float128_scalbn( float128 a, int n STATUS_PARAM )
       
  5690 {
       
  5691     flag aSign;
       
  5692     int32 aExp;
       
  5693     bits64 aSig0, aSig1;
       
  5694 
       
  5695     aSig1 = extractFloat128Frac1( a );
       
  5696     aSig0 = extractFloat128Frac0( a );
       
  5697     aExp = extractFloat128Exp( a );
       
  5698     aSign = extractFloat128Sign( a );
       
  5699     if ( aExp == 0x7FFF ) {
       
  5700         return a;
       
  5701     }
       
  5702     if ( aExp != 0 )
       
  5703         aSig0 |= LIT64( 0x0001000000000000 );
       
  5704     else if ( aSig0 == 0 && aSig1 == 0 )
       
  5705         return a;
       
  5706 
       
  5707     aExp += n - 1;
       
  5708     return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
       
  5709                                           STATUS_VAR );
       
  5710 
       
  5711 }
       
  5712 #endif