m3g/m3gcore11/src/m3g_math.c
changeset 0 5d03bc08d59c
child 33 25f95128741d
equal deleted inserted replaced
-1:000000000000 0:5d03bc08d59c
       
     1 /*
       
     2 * Copyright (c) 2003 Nokia Corporation and/or its subsidiary(-ies).
       
     3 * All rights reserved.
       
     4 * This component and the accompanying materials are made available
       
     5 * under the terms of the License "Eclipse Public License v1.0"
       
     6 * which accompanies this distribution, and is available
       
     7 * at the URL "http://www.eclipse.org/legal/epl-v10.html".
       
     8 *
       
     9 * Initial Contributors:
       
    10 * Nokia Corporation - initial contribution.
       
    11 *
       
    12 * Contributors:
       
    13 *
       
    14 * Description: Internal math function implementations
       
    15 *
       
    16 */
       
    17 
       
    18 
       
    19 /*!
       
    20  * \internal
       
    21  * \file
       
    22  * \brief Internal math function implementations
       
    23  *
       
    24  */
       
    25 
       
    26 #ifndef M3G_CORE_INCLUDE
       
    27 #   error included by m3g_core.c; do not compile separately.
       
    28 #endif
       
    29 
       
    30 #include "m3g_defs.h"
       
    31 #include "m3g_memory.h"
       
    32 
       
    33 #if defined(M3G_SOFT_FLOAT)
       
    34 #include <math.h>
       
    35 #endif
       
    36 
       
    37 /*----------------------------------------------------------------------
       
    38  * Private types and definitions
       
    39  *--------------------------------------------------------------------*/
       
    40 
       
    41 /* Enumerated common matrix classifications */
       
    42 #define MC_IDENTITY             0x40100401 // 01000000 00010000 00000100 00000001
       
    43 #define MC_FRUSTUM              0x30BF0C03 // 00110000 10111111 00001100 00000011
       
    44 #define MC_PERSPECTIVE          0x30B00C03 // 00110000 10110000 00001100 00000011
       
    45 #define MC_ORTHO                0x7F300C03 // 01111111 00110000 00001100 00000011
       
    46 #define MC_PARALLEL             0x70300C03 // 01110000 00110000 00001100 00000011
       
    47 #define MC_SCALING_ROTATION     0x403F3F3F // 01000000 00111111 00111111 00111111
       
    48 #define MC_SCALING              0x40300C03 // 01000000 00110000 00001100 00000011
       
    49 #define MC_TRANSLATION          0x7F100401 // 01111111 00010000 00000100 00000001
       
    50 #define MC_X_ROTATION           0x403C3C01 // 01000000 00111100 00111100 00000001
       
    51 #define MC_Y_ROTATION           0x40330433 // 01000000 00110011 00000100 00110011
       
    52 #define MC_Z_ROTATION           0x40100F0F // 01000000 00010000 00001111 00001111
       
    53 #define MC_W_UNITY              0x7F3F3F3F // 01111111 00111111 00111111 00111111
       
    54 #define MC_GENERIC              0xFFFFFFFF
       
    55 
       
    56 /* Partial masks for individual matrix components */
       
    57 #define MC_TRANSLATION_PART     0x3F000000 // 00111111 00000000 00000000 00000000
       
    58 #define MC_SCALE_PART           0x00300C03 // 00000000 00110000 00001100 00000011
       
    59 #define MC_SCALE_ROTATION_PART  0x003F3F3F // 00000000 00111111 00111111 00111111
       
    60 
       
    61 /* Matrix element classification masks */
       
    62 #define ELEM_ZERO       0x00
       
    63 #define ELEM_ONE        0x01
       
    64 #define ELEM_MINUS_ONE  0x02
       
    65 #define ELEM_ANY        0x03
       
    66 
       
    67 /*!
       
    68  * \internal
       
    69  * \brief calculates the offset of a 4x4 matrix element in a linear
       
    70  * array
       
    71  *
       
    72  * \notes The current convention is column-major, as in OpenGL ES
       
    73  */
       
    74 #define MELEM(row, col) ((row) + ((col) << 2))
       
    75 
       
    76 /*!
       
    77  * \internal
       
    78  * \brief Macro for accessing 4x4 float matrix elements
       
    79  *
       
    80  * \param mtx pointer to the first element of the matrix
       
    81  * \param row matrix row
       
    82  * \param col matrix column
       
    83  */
       
    84 #define M44F(mtx, row, col)     ((mtx)->elem[MELEM((row), (col))])
       
    85 
       
    86 /*--------------------------------------------------------------------*/
       
    87 
       
    88 /*----------------------------------------------------------------------
       
    89  * Private functions
       
    90  *--------------------------------------------------------------------*/
       
    91 
       
    92 
       
    93 /*!
       
    94  * \internal
       
    95  * \brief ARM VFPv2 implementation of 4x4 matrix multiplication.
       
    96  *
       
    97  * \param dst	multiplication result
       
    98  * \param left	left-hand matrix
       
    99  * \param right right-hand matrix
       
   100  */
       
   101 #if defined(M3G_HW_FLOAT_VFPV2)
       
   102 
       
   103 __asm void _m3gGenericMatrixProduct(Matrix *dst,
       
   104                                     const Matrix *left, const Matrix *right)
       
   105 {
       
   106 // r0 = *dst
       
   107 // r1 = *left
       
   108 // r2 = *right
       
   109 
       
   110 		CODE32
       
   111 
       
   112 // save the VFP registers and set the vector STRIDE = 1, LENGTH = 4
       
   113 
       
   114 		FSTMFDD	sp!, {d8-d15}
       
   115 
       
   116 		FMRX	r3, FPSCR		 
       
   117 		BIC		r12, r3, #0x00370000
       
   118 		ORR		r12, #0x00030000
       
   119 		FMXR	FPSCR, r12
       
   120 
       
   121 // left = [a0  a1  a2  a3	right = [b0  b1  b2  b3
       
   122 //	 	   a4  a5  a6  a7		     b4  b5  b6  b7
       
   123 //		   a8  a9 a10 a11		     b8  b9 b10 b11
       
   124 //		  a12 a13 a14 a15]		    b12 b13 b14 b15]
       
   125 //
       
   126 // dst = [a0*b0+a4*b1+a8*b2+a12*b3  a1*b0+a5*b1+a9*b2+a13*b3  ..
       
   127 //		  a0*b4+a4*b5+a8*b6+a12*b7  a1*b4+a5*b5+a9*b6+a13*b7  ..
       
   128 //					.							.
       
   129 //					.							.
       
   130 
       
   131 		FLDMIAS	r1!, {s8-s23}		// load the left matrix [a0-a15] to registers s8-s23
       
   132 		FLDMIAS	r2!, {s0-s7}		// load [b0-b7] of right matrix to registers s0-s7
       
   133 		FMULS	s24, s8, s0			// [s24-s27]  = [a0*b0  a1*b0  a2*b0  a3*b0]
       
   134 		FMULS	s28, s8, s4			// [s28-s31]  = [a0*b4  a1*b4  a2*b4  a3*b4]
       
   135 		FMACS	s24, s12, s1		// [s24-s27] += [a4*b1  a5*b1  a6*b1  a7*b1]
       
   136 		FMACS	s28, s12, s5		// [s28-s31] += [a4*b5  a5*b5  a6*b5  a7*b5]
       
   137 		FMACS	s24, s16, s2		// [s24-s27] += [a8*b2  a9*b2  a10*b2 a11*b2]
       
   138 		FMACS	s28, s16, s6		// [s28-s31] += [a8*b6  a9*b6  a10*b6 a11*b6]
       
   139 		FMACS	s24, s20, s3		// [s24-s27] += [a12*b3 a13*b3 a14*b3 a15*b3]
       
   140 		FMACS	s28, s20, s7		// [s28-s31] += [a12*b7 a13*b37a14*b7 a15*b7]
       
   141 		FLDMIAS	r2!, {s0-s7}		// load [b8-b15]
       
   142 		FSTMIAS	r0!, {s24-s31}		// write [dst0-dst7]
       
   143 		FMULS	s24, s8, s0			
       
   144 		FMULS	s28, s8, s4			
       
   145 		FMACS	s24, s12, s1		
       
   146 		FMACS	s28, s12, s5		
       
   147 		FMACS	s24, s16, s2		
       
   148 		FMACS	s28, s16, s6		
       
   149 		FMACS	s24, s20, s3		
       
   150 		FMACS	s28, s20, s7		
       
   151 		FSTMIAS	r0!, {s24-s31}
       
   152 
       
   153 // Restore the VFP registers and return.
       
   154 
       
   155 		FMXR	FPSCR, r3
       
   156 
       
   157 		FLDMFDD	sp!, {d8-d15}	
       
   158 
       
   159 		BX		lr
       
   160 
       
   161 }
       
   162 #endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
       
   163 
       
   164 
       
   165 /*------------------ Elementary float ------------------*/
       
   166 
       
   167 #if defined(M3G_SOFT_FLOAT)
       
   168 
       
   169 #if defined (M3G_BUILD_ARM)
       
   170 
       
   171 /*!
       
   172  * \internal
       
   173  * \brief Floating point multiplication implementation for ARM.
       
   174  */
       
   175 __asm M3Gfloat m3gMul(const M3Gfloat a,
       
   176                       const M3Gfloat b)
       
   177 {
       
   178     /**
       
   179      * Extract the exponents of the multiplicands and add them
       
   180      * together. Flush to zero if either exponent or their sum
       
   181      * is zero.
       
   182      */
       
   183 
       
   184     mov     r12, #0xff;
       
   185     ands    r2, r0, r12, lsl #23;   // exit if e1 == 0
       
   186     andnes  r3, r1, r12, lsl #23;   // exit if e2 == 0
       
   187     subne   r2, r2, #(127 << 23);
       
   188     addnes  r12, r2, r3;            // exit if e1+e2-127 <= 0
       
   189     movle   r0, #0;
       
   190     bxle    lr;
       
   191 
       
   192     /**
       
   193      * Determine the sign of the result. Note that the exponent
       
   194      * may have overflowed to the sign bit, and thus the result
       
   195      * may be an arbitrary negative value when it really should
       
   196      * be +Inf or -Inf.
       
   197      */
       
   198 
       
   199     teq     r0, r1;
       
   200     orrmi   r12, r12, #0x80000000;
       
   201 
       
   202     /**
       
   203      * Multiply the mantissas. First shift the mantissas up to
       
   204      * unsigned 1.31 fixed point, adding the leading "1" bit at
       
   205      * the same time, and finally do a 32x32 -> 64 bit unsigned
       
   206      * multiplication. The result is in unsigned 2.62 fixed point,
       
   207      * representing the interval [1.0, 4.0).
       
   208      */
       
   209 
       
   210     mov     r2, #0x80000000;
       
   211     orr     r0, r2, r0, lsl #8;
       
   212     orr     r1, r2, r1, lsl #8;
       
   213     umulls  r2, r3, r0, r1;
       
   214 
       
   215     /**
       
   216      * If the highest bit of the 64-bit result is set, then the
       
   217      * mantissa lies in [2.0, 4.0) and needs to be renormalized.
       
   218      * That is, the mantissa is shifted one bit to the right and
       
   219      * the exponent correspondingly increased by 1. Note that we
       
   220      * lose the leading "1" bit from the mantissa by adding it up
       
   221      * with the exponent.
       
   222      */
       
   223 
       
   224     subpl   r12, r12, #(1 << 23);    // no overflow: exponent -= 1
       
   225     addpl   r0, r12, r3, lsr #7;     // no overflow: exponent += 1
       
   226     addmi   r0, r12, r3, lsr #8;     // overflow: exponent += 1
       
   227     bx      lr;
       
   228 }
       
   229 
       
   230 /*!
       
   231  * \internal
       
   232  * \brief Floating point addition implementation for ARM.
       
   233  */
       
   234 __asm M3Gfloat m3gAdd(const M3Gfloat a,
       
   235                       const M3Gfloat b)
       
   236 {
       
   237     /**
       
   238      * If the operands have opposite signs then this is not really
       
   239      * an addition but a subtraction. Subtraction is much slower,
       
   240      * so we have a separate code path for it, rather than trying
       
   241      * to save space by handling both in the same place.
       
   242      */
       
   243     
       
   244     teq     r0, r1;
       
   245     bmi     _m3gSub;
       
   246 
       
   247     /**
       
   248      * Sort the operands such that the larger operand is in R0 and
       
   249      * the smaller in R1. The sign bits do not affect the ordering,
       
   250      * since they are known to be equal.
       
   251      */
       
   252     
       
   253     subs    r2, r0, r1;
       
   254     submi   r0, r0, r2;
       
   255     addmi   r1, r1, r2;
       
   256 
       
   257     /**
       
   258      * Extract the exponent of the smaller operand into R2 and compute
       
   259      * the difference between the larger and smaller exponent into R3.
       
   260      * (Note that the sign bits cancel out in the subtraction.) The
       
   261      * exponent delta tells how many bits the mantissa of the smaller
       
   262      * operand must be shifted to the right in order to bring the
       
   263      * operands into equal scale.
       
   264      */
       
   265 
       
   266     mov     r2, r1, lsr #23;
       
   267     rsb     r3, r2, r0, lsr #23;
       
   268 
       
   269     /**
       
   270      * Check if the exponent delta is bigger than 23 bits, or if the
       
   271      * smaller exponent is zero. In either case, exit the routine and
       
   272      * return the larger operand (which is already in R0). Note that
       
   273      * this means that subnormals are treated as zero.
       
   274      */
       
   275 
       
   276     rsbs    r12, r3, #23;               // N set, V clear if R3 > 23
       
   277     tstpl   r2, #0xff;                  // execute only if R3 <= 23
       
   278     bxle    lr;                         // exit if Z set or N != V
       
   279 
       
   280     /**
       
   281      * Extract the mantissas and shift them up to unsigned 1.31 fixed
       
   282      * point, inserting the implied leading "1" bit at the same time.
       
   283      * Finally, align the decimal points and add up the mantissas.
       
   284      */
       
   285     
       
   286     mov     r12, #0x80000000;
       
   287     orr     r0, r12, r0, lsl #8;
       
   288     orr     r1, r12, r1, lsl #8;
       
   289     adds    r0, r0, r1, lsr r3;
       
   290 
       
   291     /**
       
   292      * Compute the final exponent by adding up the smaller exponent
       
   293      * (R2), the exponent delta (R3), and the possible overflow bit
       
   294      * (carry flag). Note that in case of overflow, the leading "1"
       
   295      * has ended up in the carry flag and thus needs not be explicitly
       
   296      * discarded. Finally, put the mantissa together with the sign and
       
   297      * exponent.
       
   298      */
       
   299 
       
   300     adc     r2, r2, r3;                 // r2 = smallExp + deltaExp + overflow
       
   301     movcc   r0, r0, lsl #1;             // no overflow: discard leading 1
       
   302     mov     r0, r0, lsr #9;
       
   303     orr     r0, r0, r2, lsl #23;
       
   304     bx      lr;
       
   305     
       
   306 _m3gSub
       
   307 
       
   308     /**
       
   309      * Sort the operands such that the one with larger magnitude is
       
   310      * in R0 and has the correct sign (the sign of the final result),
       
   311      * while the smaller operand is in R1 with an inverted sign bit.
       
   312      */
       
   313     
       
   314     eor     r1, r1, #0x80000000;
       
   315     subs    r2, r0, r1;
       
   316     eormi   r2, r2, #0x80000000;
       
   317     submi   r0, r0, r2;
       
   318     addmi   r1, r1, r2;
       
   319 
       
   320     /**
       
   321      * Extract the exponent of the smaller operand into R2 and compute
       
   322      * the difference between the larger and smaller exponent into R3.
       
   323      * (Note that the sign bits cancel out in the subtraction.) The
       
   324      * exponent delta tells how many bits the mantissa of the smaller
       
   325      * operand must be shifted to the right in order to bring the
       
   326      * operands into equal scale.
       
   327      */
       
   328 
       
   329     mov     r2, r1, lsr #23;
       
   330     rsbs    r3, r2, r0, lsr #23;
       
   331 
       
   332     /**
       
   333      * Check if the exponent delta is bigger than 31 bits, or if the
       
   334      * smaller exponent is zero. In either case, exit the routine and
       
   335      * return the larger operand (which is already in R0). Note that
       
   336      * this means that subnormals are treated as zero.
       
   337      */
       
   338 
       
   339     rsbs    r12, r3, #31;               // N set, V clear if R3 > 31
       
   340     tstpl   r2, #0xff;                  // execute only if R3 <= 31
       
   341     bxle    lr;                         // exit if Z set or N != V
       
   342 
       
   343     /**
       
   344      * Extract the mantissas and shift them up to unsigned 1.31 fixed
       
   345      * point, inserting the implied leading "1" bit at the same time.
       
   346      * Then align the decimal points and subtract the smaller mantissa
       
   347      * from the larger one.
       
   348      */
       
   349     
       
   350     mov     r12, #0x80000000;
       
   351     orr     r0, r12, r0, lsl #8;
       
   352     orr     r1, r12, r1, lsl #8;
       
   353     subs    r0, r0, r1, lsr r3;
       
   354 
       
   355     /**
       
   356      * We split the range of possible results into three categories:
       
   357      * 
       
   358      *   1. [1.0, 2.0) ==> no renormalization needed (highest bit set)
       
   359      *   2. [0.5, 1.0) ==> only one left-shift needed
       
   360      *   3. (0.0, 0.5) ==> multiple left-shifts needed
       
   361      *   4. zero       ==> just return
       
   362      *   
       
   363      * Cases 1 and 2 are handled in the main code path. Cases 3 and 4
       
   364      * are less common by far, so we branch to a separate code fragment
       
   365      * for those.
       
   366      */
       
   367     
       
   368     movpls  r0, r0, lsl #1;             // Cases 2,3,4: shift left
       
   369     bpl     _m3gSubRenormalize;         // Cases 3 & 4: branch out
       
   370     
       
   371     /**
       
   372      * Now we have normalized the mantissa such that the highest bit
       
   373      * is set. Here we only need to adjust the exponent, if necessary,
       
   374      * and put the pieces together. Note that we lose the leading "1"
       
   375      * bit from the mantissa by adding it up with the exponent. We can
       
   376      * also do proper rounding (towards nearest) instead of truncation
       
   377      * (towards zero) at no extra cost!
       
   378      */
       
   379 
       
   380     sbc     r3, r3, #1;                 // deltaExp -= 1 (Case 1) or 2 (Case 2)
       
   381     add     r2, r2, r3;                 // resultExp = smallExp + deltaExp
       
   382     movs    r0, r0, lsr #8;             // shift mantissa, keep leading "1"
       
   383     adc     r0, r0, r2, lsl #23;        // resultExp += 1, mantissa += carry
       
   384     bx      lr;
       
   385 
       
   386     /**
       
   387      * Separate code path for cases 3 and 4 (see above). The mantissa
       
   388      * has already been shifted up by one, but the exponent has not
       
   389      * been correspondingly decreased. We also know that the highest
       
   390      * bit is still not set, and that the carry flag is clear.
       
   391      */
       
   392 
       
   393 _m3gSubRenormalize
       
   394     bxeq    lr;
       
   395     subcc   r3, r3, #2;
       
   396     movccs  r0, r0, lsl #2;
       
   397 
       
   398     /**
       
   399      * If the carry flag is still not set, i.e. there were more than
       
   400      * two leading zeros in the mantissa initially, loop until we
       
   401      * find the highest set bit.
       
   402      */
       
   403     
       
   404 _m3gSubRenormalizeLoop
       
   405     subcc   r3, r3, #1;
       
   406     movccs  r0, r0, lsl #1;
       
   407     bcc     _m3gSubRenormalizeLoop;
       
   408 
       
   409     /**
       
   410      * Now the leading "1" is in the carry flag, so we can just add up
       
   411      * the exponent and mantissa as usual, doing proper rounding at
       
   412      * the same time. However, cases where the exponent goes negative
       
   413      * (that is, underflows) must be detected and flushed to zero.
       
   414      */
       
   415     
       
   416     add     r3, r2, r3;
       
   417     movs    r0, r0, lsr #9;
       
   418     adc     r0, r0, r3, lsl #23;
       
   419     teq     r0, r2, lsl #23;
       
   420     movmi   r0, #0;
       
   421     bx      lr;
       
   422 }
       
   423 
       
   424 #else /* M3G_BUILD_ARM */
       
   425 
       
   426 /*!
       
   427  * \internal
       
   428  * \brief Floating point addition implementation
       
   429  *
       
   430  */
       
   431 static M3G_INLINE M3Gfloat m3gFloatAdd(const M3Guint aBits,
       
   432                                        const M3Guint bBits)
       
   433 {
       
   434     M3Guint large, small, signMask;
       
   435     
       
   436     /* Early exits for underflow cases */
       
   437 
       
   438     large = (M3Gint)(aBits & ~SIGN_MASK);
       
   439     if (large <= 0x00800000) {
       
   440         return INT_AS_FLOAT(bBits);
       
   441     }
       
   442     small = (M3Gint)(bBits & ~SIGN_MASK);
       
   443     if (small <= 0x00800000) {
       
   444         return INT_AS_FLOAT(aBits);
       
   445     }
       
   446 
       
   447     /* Swap the numbers so that "large" really is larger; the unsigned
       
   448      * (or de-signed) bitmasks for floats are nicely monotonous, so we
       
   449      * can compare directly.  Also store the sign of the larger number
       
   450      * for future reference. */
       
   451 
       
   452     if (small > large) {
       
   453         M3Gint temp = small;
       
   454         small = large;
       
   455         large = temp;
       
   456         signMask = (bBits & SIGN_MASK);
       
   457     }
       
   458     else {
       
   459         signMask = (aBits & SIGN_MASK);
       
   460     }
       
   461     
       
   462     {
       
   463         M3Guint res, overflow;
       
   464         M3Guint resExp, expDelta;
       
   465         
       
   466         /* Store the larger exponent as our candidate result exponent,
       
   467          * and compute the difference between the exponents */
       
   468 
       
   469         resExp = (large >> 23);
       
   470         expDelta = resExp - (small >> 23);
       
   471 
       
   472         /* Take an early exit if the change would be insignificant;
       
   473          * this also guards against odd results from shifting by more
       
   474          * than 31 (undefined in C) */
       
   475 
       
   476         if (expDelta >= 24) {
       
   477             res = large | signMask;
       
   478             return INT_AS_FLOAT(res);
       
   479         }
       
   480 
       
   481         /* Convert the mantissas into shifted integers, and shift the
       
   482          * smaller number to the same scale with the larger one. */
       
   483 
       
   484         large = (large & MANTISSA_MASK) | LEADING_ONE;
       
   485         small = (small & MANTISSA_MASK) | LEADING_ONE;
       
   486         small >>= expDelta;
       
   487         M3G_ASSERT(large >= small);
       
   488         
       
   489         /* Check whether we're really adding or subtracting the
       
   490          * smaller number, and branch to slightly different routines
       
   491          * respectively */
       
   492 
       
   493         if (((aBits ^ bBits) & SIGN_MASK) == 0) {
       
   494 
       
   495             /* Matching signs; just add the numbers and check for
       
   496              * overflow, shifting to compensate as necessary. */
       
   497 
       
   498             res = large + small;
       
   499             
       
   500             overflow = (res >> 24);
       
   501             res >>= overflow;
       
   502             resExp += overflow;
       
   503         }
       
   504         else {
       
   505 
       
   506             /* Different signs, so let's subtract the smaller value;
       
   507              * also check that we're not subtracting a number from
       
   508              * itself (so we don't have to normalize a zero below) */
       
   509             
       
   510             if (small == large) {
       
   511                 return 0.0f; /* x - x = 0 */
       
   512             }
       
   513 
       
   514             res = (large << 8) - (small << 8);
       
   515 
       
   516             /* Renormalize the number by shifting until we've got the
       
   517              * high bit in place */
       
   518 
       
   519             while ((res >> 24) == 0) {
       
   520                 res <<= 8;
       
   521                 resExp -= 8;
       
   522             }
       
   523             while ((res >> 31) == 0) {
       
   524                 res <<= 1;
       
   525                 --resExp;
       
   526             }
       
   527             res >>= 8;
       
   528         }
       
   529 
       
   530         /* Flush to zero in case of over/underflow of the exponent */
       
   531 
       
   532         if (resExp >= 255) {
       
   533             return 0.0f;
       
   534         }
       
   535         
       
   536         /* Compose the final number into "res"; note that we pull in
       
   537          * the sign of the original larger number, which should still
       
   538          * be valid */
       
   539 
       
   540         res &= MANTISSA_MASK;
       
   541         res |= (resExp << 23);
       
   542         res |= signMask;
       
   543 
       
   544         return INT_AS_FLOAT(res);
       
   545     }
       
   546 }
       
   547 
       
   548 /*!
       
   549  * \internal
       
   550  * \brief Floating point multiplication implementation
       
   551  */
       
   552 static M3G_INLINE M3Gfloat m3gFloatMul(const M3Guint aBits,
       
   553                                        const M3Guint bBits)
       
   554 {
       
   555     M3Guint a, b;
       
   556 
       
   557     /* Early exits for underflow and multiplication by zero */
       
   558 
       
   559     a = (aBits & ~SIGN_MASK);
       
   560     if (a <= 0x00800000) {
       
   561         return 0.0f;
       
   562     }
       
   563     
       
   564     b = (bBits & ~SIGN_MASK);
       
   565     if (b <= 0x00800000) {
       
   566         return 0.0f;
       
   567     }
       
   568     
       
   569     {
       
   570         M3Guint res, exponent, overflow;
       
   571         
       
   572         /* Compute the exponent of the result, assuming the mantissas
       
   573          * don't overflow; then mask out the original exponents */
       
   574         
       
   575         exponent = (a >> 23) + (b >> 23) - 127;
       
   576         a &= MANTISSA_MASK;
       
   577         b &= MANTISSA_MASK;
       
   578 
       
   579         /* Compute the new mantissa from:
       
   580          *
       
   581          *   (1 + a)(1 + b) = ab + a + b + 1
       
   582          *   
       
   583          * First shift the mantissas from 0.23 down to 0.16 for the
       
   584          * multiplication, then shift back to 0.23 for adding in the
       
   585          * "a + b + 1" part of the equation.  */
       
   586         
       
   587         res = (a >> 7) * (b >> 7);              /* "ab" at 0.32 */
       
   588         res = (res >> 9) + a + b + LEADING_ONE;
       
   589 
       
   590         /* Add the leading one, then normalize the result by checking
       
   591          * the overflow bit and dividing by two if necessary */
       
   592 
       
   593         overflow = (res >> 24);
       
   594         res >>= overflow;
       
   595         exponent += overflow;
       
   596 
       
   597         /* Flush to zero in case of over/underflow of the exponent */
       
   598 
       
   599         if (exponent >= 255) {
       
   600             return 0.0f;
       
   601         }
       
   602 
       
   603         /* Compose the final number into "res" */
       
   604 
       
   605         res &= MANTISSA_MASK;
       
   606         res |= (exponent << 23);
       
   607         res |= (aBits ^ bBits) & SIGN_MASK;
       
   608 
       
   609         return INT_AS_FLOAT(res);
       
   610     }
       
   611 }
       
   612 
       
   613 #endif /* M3G_BUILD_ARM */
       
   614 
       
   615 /*!
       
   616  * \internal
       
   617  * \brief Computes the signed fractional part of a floating point
       
   618  * number
       
   619  *
       
   620  * \param x  floating point value
       
   621  * \return x signed fraction of x in ]-1, 1[
       
   622  */
       
   623 static M3Gfloat m3gFrac(M3Gfloat x)
       
   624 {
       
   625     /* Quick exit for -1 < x < 1 */
       
   626     
       
   627     if (m3gAbs(x) < 1.0f) {
       
   628         return x;
       
   629     }
       
   630 
       
   631     /* Shift the mantissa to the proper place, mask out the integer
       
   632      * part, and renormalize */
       
   633     {
       
   634         M3Guint ix = FLOAT_AS_UINT(x);
       
   635         M3Gint expo = ((ix >> 23) & 0xFF) - 127;
       
   636         M3G_ASSERT(expo >= 0);
       
   637 
       
   638         /* The decimal part will always be zero for large values with
       
   639          * exponents over 24 */
       
   640         
       
   641         if (expo >= 24) {
       
   642             return 0.f;
       
   643         }
       
   644         else {
       
   645             
       
   646             /* Shift the integer part out of the mantissa and see what
       
   647              * we have left */
       
   648             
       
   649             M3Guint base = (ix & MANTISSA_MASK) | LEADING_ONE;
       
   650             base = (base << expo) & MANTISSA_MASK;
       
   651 
       
   652             /* Quick exit (and guard against infinite looping) for
       
   653              * zero */
       
   654 
       
   655             if (base == 0) {
       
   656                 return 0.f;
       
   657             }
       
   658 
       
   659             /* We now have an exponent of 0 (i.e. no shifting), but
       
   660              * must renormalize to get a set bit in place of the
       
   661              * hidden (implicit one) bit */
       
   662             
       
   663             expo = 0;
       
   664             
       
   665             while ((base >> 19) == 0) {
       
   666                 base <<= 4;
       
   667                 expo -= 4;
       
   668             }
       
   669             while ((base >> 23) == 0) {
       
   670                 base <<= 1;
       
   671                 --expo;
       
   672             }
       
   673 
       
   674             /* Compose the final number */
       
   675 
       
   676             ix =
       
   677                 (base & MANTISSA_MASK) |
       
   678                 ((expo + 127) << 23) |
       
   679                 (ix & SIGN_MASK);
       
   680             return INT_AS_FLOAT(ix);
       
   681         }
       
   682     }
       
   683 }
       
   684 
       
   685 #endif /* M3G_SOFT_FLOAT */
       
   686 
       
   687 #if defined(M3G_DEBUG)
       
   688 /*!
       
   689  * \internal
       
   690  * \brief Checks for NaN or infinity in a floating point input
       
   691  */
       
   692 static void m3gValidateFloats(int n, float *p)
       
   693 {
       
   694     while (n-- > 0) {
       
   695         M3G_ASSERT(EXPONENT(*p) < 120);
       
   696         ++p;
       
   697     }
       
   698 }
       
   699 #else
       
   700 #   define m3gValidateFloats(n, p)
       
   701 #endif
       
   702     
       
   703 /*------------------ Trigonometry and exp ----------*/
       
   704 
       
   705 
       
   706 #if defined(M3G_SOFT_FLOAT)
       
   707 /*!
       
   708  * \internal
       
   709  * \brief Sine for the first quadrant
       
   710  *
       
   711  * \param x floating point value in [0, PI/2]
       
   712  * \return sine of \c x
       
   713  */
       
   714 static M3Gfloat m3gSinFirstQuadrant(M3Gfloat x)
       
   715 {
       
   716     M3Guint bits = FLOAT_AS_UINT(x);
       
   717     
       
   718     if (bits <= 0x3ba3d70au)    /* 0.005f */
       
   719         return x;
       
   720     else {
       
   721         static const M3Gfloat sinTermLut[4] = {
       
   722             -1.0f / (2*3),
       
   723             -1.0f / (4*5),
       
   724             -1.0f / (6*7),
       
   725             -1.0f / (8*9)
       
   726         };
       
   727 
       
   728         M3Gfloat xx = m3gSquare(x);
       
   729         M3Gfloat sinx = x;
       
   730         int i;
       
   731 
       
   732         for (i = 0; i < 4; ++i) {
       
   733             x    = m3gMul(x, m3gMul(xx, sinTermLut[i]));
       
   734             sinx = m3gAdd(sinx, x);
       
   735         }
       
   736 
       
   737         return sinx;
       
   738     }
       
   739 }
       
   740 #endif /* M3G_SOFT_FLOAT */
       
   741 
       
   742 #if defined(M3G_SOFT_FLOAT)
       
   743 /*!
       
   744  * \internal
       
   745  * \brief Computes sine for the first period
       
   746  *
       
   747  * \param x floating point value in [0, 2PI]
       
   748  * \return sine of x
       
   749  */
       
   750 static M3Gfloat m3gSinFirstPeriod(const M3Gfloat x)
       
   751 {
       
   752     M3G_ASSERT(x >= 0 && x <= TWO_PI);
       
   753     
       
   754     if (x >= PI) {
       
   755         return m3gNegate(m3gSinFirstQuadrant(x >= ONE_AND_HALF_PI ?
       
   756                                              m3gSub(TWO_PI, x) :
       
   757                                              m3gSub(x, PI)));
       
   758     }
       
   759     return m3gSinFirstQuadrant((x >= HALF_PI) ? m3gSub(PI, x) : x);
       
   760 }
       
   761 #endif /* M3G_SOFT_FLOAT */
       
   762 
       
   763 /*------------- Float vs. int conversion helpers -------------*/
       
   764 
       
   765 /*!
       
   766  * \internal
       
   767  * \brief Scales and clamps a float to unsigned byte range
       
   768  */
       
   769 static M3G_INLINE M3Gint m3gFloatToByte(const M3Gfloat a)
       
   770 {
       
   771     return m3gRoundToInt(m3gMul(255.f, m3gClampFloat(a, 0.f, 1.f)));
       
   772 }
       
   773 
       
   774 /*------------------ Vector helpers ------------------*/
       
   775 
       
   776 /*!
       
   777  * \internal
       
   778  * \brief Computes the norm of a floating-point 3-vector
       
   779  */
       
   780 static M3G_INLINE M3Gfloat m3gNorm3(const M3Gfloat *v)
       
   781 {
       
   782     return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
       
   783                   m3gSquare(v[2]));
       
   784 }
       
   785 
       
   786 /*!
       
   787  * \internal
       
   788  * \brief Computes the norm of a floating-point 4-vector
       
   789  */
       
   790 static M3G_INLINE M3Gfloat m3gNorm4(const M3Gfloat *v)
       
   791 {
       
   792     return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
       
   793                   m3gAdd(m3gSquare(v[2]), m3gSquare(v[3])));
       
   794 }
       
   795 
       
   796 /*!
       
   797  * \internal
       
   798  * \brief Scales a floating-point 3-vector
       
   799  */
       
   800 static M3G_INLINE void m3gScale3(M3Gfloat *v, M3Gfloat s)
       
   801 {
       
   802     v[0] = m3gMul(v[0], s);
       
   803     v[1] = m3gMul(v[1], s);
       
   804     v[2] = m3gMul(v[2], s);
       
   805 }
       
   806 
       
   807 /*!
       
   808  * \internal
       
   809  * \brief Scales a floating-point 4-vector
       
   810  */
       
   811 static M3G_INLINE void m3gScale4(M3Gfloat *v, M3Gfloat s)
       
   812 {
       
   813     v[0] = m3gMul(v[0], s);
       
   814     v[1] = m3gMul(v[1], s);
       
   815     v[2] = m3gMul(v[2], s);
       
   816     v[3] = m3gMul(v[3], s);
       
   817 }
       
   818 
       
   819 
       
   820 /*------------------ Matrices ------------------*/
       
   821 
       
   822 /*!
       
   823  * \internal
       
   824  */
       
   825 static M3G_INLINE M3Gbool m3gIsClassified(const Matrix *mtx)
       
   826 {
       
   827     M3G_ASSERT(mtx != NULL);
       
   828     return (M3Gbool) mtx->classified;
       
   829 }
       
   830 
       
   831 /*!
       
   832  * \internal
       
   833  * \brief Returns a classification for a single floating point number
       
   834  */
       
   835 static M3G_INLINE M3Guint m3gElementClass(const M3Gfloat x)
       
   836 {
       
   837     if (IS_ZERO(x)) {
       
   838         return ELEM_ZERO;
       
   839     }
       
   840     else if (IS_ONE(x)) {
       
   841         return ELEM_ONE;
       
   842     }
       
   843     else if (IS_MINUS_ONE(x)) {
       
   844         return ELEM_MINUS_ONE;
       
   845     }
       
   846     return ELEM_ANY;
       
   847 }
       
   848 
       
   849 /*!
       
   850  * \internal
       
   851  * \brief Computes the classification mask of a matrix
       
   852  *
       
   853  * The mask is constructed from two bits per elements, with the lowest
       
   854  * two bits corresponding to the first element in the \c elem array of
       
   855  * the matrix.
       
   856  */
       
   857 static void m3gClassify(Matrix *mtx)
       
   858 {
       
   859     M3Guint mask = 0;
       
   860     const M3Gfloat *p;
       
   861     int i;
       
   862 
       
   863     M3G_ASSERT(mtx != NULL);
       
   864     M3G_ASSERT(!m3gIsClassified(mtx));
       
   865 
       
   866     p = mtx->elem;
       
   867     for (i = 0; i < 16; ++i) {
       
   868         M3Gfloat elem = *p++;
       
   869         mask |= (m3gElementClass(elem) << (i*2));
       
   870     }
       
   871     mtx->mask = mask;
       
   872     mtx->classified = M3G_TRUE;
       
   873 }
       
   874 
       
   875 /*!
       
   876  * \internal
       
   877  * \brief Sets matrix classification directly
       
   878  */
       
   879 static M3G_INLINE void m3gClassifyAs(M3Guint mask, Matrix *mtx)
       
   880 {
       
   881     M3G_ASSERT(mtx != NULL);
       
   882     mtx->mask = mask;
       
   883     mtx->classified = M3G_TRUE;
       
   884     mtx->complete = M3G_FALSE;
       
   885 }
       
   886 
       
   887 /*!
       
   888  * \internal
       
   889  * \brief Attempts to classify a matrix more precisely
       
   890  *
       
   891  * Tries to classify all "free" elements of a matrix into one of the
       
   892  * predefined constants to improve precision and performance in
       
   893  * subsequent calculations.
       
   894  */
       
   895 static void m3gSubClassify(Matrix *mtx)
       
   896 {
       
   897     M3G_ASSERT_PTR(mtx);
       
   898     M3G_ASSERT(m3gIsClassified(mtx));
       
   899     {
       
   900         const M3Gfloat *p = mtx->elem;
       
   901         M3Guint inMask = mtx->mask;
       
   902         M3Guint outMask = inMask;
       
   903         int i;
       
   904 
       
   905         for (i = 0; i < 16; ++i, inMask >>= 2) {
       
   906             M3Gfloat elem = *p++;
       
   907             if ((inMask & 0x03) == ELEM_ANY) {
       
   908                 outMask &= ~(0x03u << (i*2));
       
   909                 outMask |= (m3gElementClass(elem) << (i*2));
       
   910             }
       
   911         }
       
   912         mtx->mask = outMask;
       
   913     }
       
   914 }
       
   915 
       
   916 /*!
       
   917  * \internal
       
   918  * \brief Fills in the implicit values for a classified matrix
       
   919  */
       
   920 static void m3gFillClassifiedMatrix(Matrix *mtx)
       
   921 {
       
   922     int i;
       
   923     M3Guint mask;
       
   924     M3Gfloat *p;
       
   925 
       
   926     M3G_ASSERT(mtx != NULL);
       
   927     M3G_ASSERT(mtx->classified);
       
   928     M3G_ASSERT(!mtx->complete);
       
   929 
       
   930     mask = mtx->mask;
       
   931     p = mtx->elem;
       
   932 
       
   933     for (i = 0; i < 16; ++i, mask >>= 2) {
       
   934         unsigned elem = (mask & 0x03);
       
   935         switch (elem) {
       
   936         case ELEM_ZERO:         *p++ =  0.0f; break;
       
   937         case ELEM_ONE:          *p++ =  1.0f; break;
       
   938         case ELEM_MINUS_ONE:    *p++ = -1.0f; break;
       
   939         default:                ++p;
       
   940         }
       
   941     }
       
   942     mtx->complete = M3G_TRUE;
       
   943 }
       
   944 
       
   945 
       
   946 #if !defined(M3G_HW_FLOAT)
       
   947 /*!
       
   948  * \internal
       
   949  * \brief Performs one multiply-add of classified matrix elements
       
   950  *
       
   951  * \param amask element class of the first multiplicand
       
   952  * \param a     float value of the first multiplicand
       
   953  * \param bmask element class of the second multiplicand
       
   954  * \param b     float value of the second multiplicand
       
   955  * \param c     float value to add
       
   956  * \return a * b + c
       
   957  * 
       
   958  * \notes inline, as only called from the matrix product function
       
   959  */
       
   960 static M3G_INLINE M3Gfloat m3gClassifiedMadd(
       
   961     const M3Gbitmask amask, const M3Gfloat *pa,
       
   962     const M3Gbitmask bmask, const M3Gfloat *pb,
       
   963     const M3Gfloat c)
       
   964 {
       
   965     /* Check for zero product to reduce the switch cases below */
       
   966     
       
   967     if (amask == ELEM_ZERO || bmask == ELEM_ZERO) {
       
   968         return c;    
       
   969     }
       
   970 
       
   971     /* Branch based on the classification of a */
       
   972     
       
   973     switch (amask) {
       
   974         
       
   975     case ELEM_ANY:
       
   976         if (bmask == ELEM_ONE) {
       
   977             return m3gAdd(*pa, c);      /*  a * 1 + c  =  a + c  */
       
   978         }
       
   979         if (bmask == ELEM_MINUS_ONE) {
       
   980             return m3gSub(c, *pa);      /*  a * -1 + c = -a + c  */
       
   981         }
       
   982         return m3gMadd(*pa, *pb, c);    /*  a * b + c            */
       
   983         
       
   984     case ELEM_ONE:
       
   985         if (bmask == ELEM_ONE) {
       
   986             return m3gAdd(c, 1.f);      /*  1 * 1 + c  = 1 + c   */
       
   987         }
       
   988         if (bmask == ELEM_MINUS_ONE) {
       
   989             return m3gSub(c, 1.f);      /*  1 * -1 + c = -1 + c  */
       
   990         }
       
   991         return m3gAdd(*pb, c);          /*  1 * b + c  =  b + c  */
       
   992         
       
   993     case ELEM_MINUS_ONE:
       
   994         if (bmask == ELEM_ONE) {
       
   995             return m3gSub(c, 1.f);      /* -1 * 1 + c  = -1 + c  */
       
   996         }
       
   997         if (bmask == ELEM_MINUS_ONE) {
       
   998             return m3gAdd(c, 1.f);      /* -1 * -1 + c =  1 + c  */
       
   999         }
       
  1000         return m3gSub(c, *pb);          /* -1 * b + c  = -b + c  */
       
  1001         
       
  1002     default:
       
  1003         M3G_ASSERT(M3G_FALSE);
       
  1004         return 0.0f;
       
  1005     }
       
  1006 }
       
  1007 #endif /*!defined(M3G_HW_FLOAT)*/
       
  1008 
       
  1009 /*!
       
  1010  * \internal
       
  1011  * \brief Computes a generic 4x4 matrix product
       
  1012  */
       
  1013 static void m3gGenericMatrixProduct(Matrix *dst,
       
  1014                                     const Matrix *left, const Matrix *right)
       
  1015 {
       
  1016     M3G_ASSERT(dst != NULL && left != NULL && right != NULL);
       
  1017 
       
  1018     {
       
  1019         
       
  1020 #       if defined(M3G_HW_FLOAT)
       
  1021         if (!left->complete) {
       
  1022             m3gFillClassifiedMatrix((Matrix*)left);
       
  1023         }
       
  1024         if (!right->complete) {
       
  1025             m3gFillClassifiedMatrix((Matrix*)right);
       
  1026         }
       
  1027 #       else
       
  1028         int row;
       
  1029         const unsigned lmask = left->mask;
       
  1030         const unsigned rmask = right->mask;
       
  1031 #       endif
       
  1032         
       
  1033 #if defined(M3G_HW_FLOAT_VFPV2)
       
  1034 		_m3gGenericMatrixProduct(dst, left, right);
       
  1035 #else	
       
  1036         for (row = 0; row < 4; ++row) {
       
  1037             int col;
       
  1038             for (col = 0; col < 4; ++col) {
       
  1039                 int k;
       
  1040                 M3Gfloat a = 0;
       
  1041                 for (k = 0; k < 4; ++k) {
       
  1042                     M3Gint lidx = MELEM(row, k);
       
  1043                     M3Gint ridx = MELEM(k, col);
       
  1044 #                   if defined(M3G_HW_FLOAT)
       
  1045                     a = m3gMadd(left->elem[lidx], right->elem[ridx], a);
       
  1046 #                   else
       
  1047                     a = m3gClassifiedMadd((lmask >> (2 * lidx)) & 3,
       
  1048                                           &left->elem[lidx],
       
  1049                                           (rmask >> (2 * ridx)) & 3,
       
  1050                                           &right->elem[ridx],
       
  1051                                           a);
       
  1052 #                   endif /*!M3G_HW_FLOAT*/
       
  1053                 }
       
  1054                 M44F(dst, row, col) = a;
       
  1055             }
       
  1056         }
       
  1057 #endif /*!M3G_HW_FLOAT_VFPV2*/
       
  1058     }
       
  1059     dst->complete = M3G_TRUE;
       
  1060     dst->classified = M3G_FALSE;
       
  1061 }
       
  1062 
       
  1063 /*!
       
  1064  * \internal
       
  1065  * \brief Converts a float vector to 16-bit integers
       
  1066  *
       
  1067  * \param size   vector length
       
  1068  * \param vec    pointer to float vector
       
  1069  * \param scale  scale of output values, as the number of bits to
       
  1070  *               shift left to get actual values
       
  1071  * \param outVec output value vector
       
  1072  */
       
  1073 static void m3gFloatVecToShort(M3Gint size, const M3Gfloat *vec,
       
  1074                                M3Gint scale, M3Gshort *outVec)
       
  1075 {
       
  1076     const M3Guint *vecInt = (const M3Guint*) vec;
       
  1077     M3Gint i;
       
  1078     
       
  1079     for (i = 0; i < size; ++i) {
       
  1080         M3Guint a = vecInt[i];
       
  1081         if ((a & ~SIGN_MASK) < (1 << 23)) {
       
  1082             *outVec++ = 0;
       
  1083         }
       
  1084         else {
       
  1085             M3Gint shift =
       
  1086                 scale - ((M3Gint)((vecInt[i] >> 23) & 0xFFu) - (127 + 23));
       
  1087             M3G_ASSERT(shift > 8); /* or the high bits will overflow */
       
  1088             
       
  1089             if (shift > 23) {
       
  1090                 *outVec++ = 0;
       
  1091             }
       
  1092             else {
       
  1093                 M3Gint out =
       
  1094                     (M3Gint) (((a & MANTISSA_MASK) | LEADING_ONE) >> shift);
       
  1095                 if (a >> 31) {
       
  1096                     out = -out;
       
  1097                 }
       
  1098                 M3G_ASSERT(m3gInRange(out, -32767, 32767));
       
  1099                 *outVec++ = (M3Gshort) out;
       
  1100             }
       
  1101         }
       
  1102     }
       
  1103 }
       
  1104 
       
  1105 /*----------------------------------------------------------------------
       
  1106  * Internal functions
       
  1107  *--------------------------------------------------------------------*/
       
  1108 
       
  1109 /*--------------------------------------------------------------------*/
       
  1110 
       
  1111 #if defined(M3G_SOFT_FLOAT)
       
  1112 
       
  1113 #if !defined (M3G_BUILD_ARM)
       
  1114 
       
  1115 static M3Gfloat m3gAdd(const M3Gfloat a, const M3Gfloat b)
       
  1116 {
       
  1117     return m3gFloatAdd(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
       
  1118 }
       
  1119 
       
  1120 static M3Gfloat m3gMul(const M3Gfloat a, const M3Gfloat b)
       
  1121 {
       
  1122     return m3gFloatMul(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
       
  1123 }
       
  1124 
       
  1125 #endif /* M3G_BUILD_ARM */
       
  1126 
       
  1127 /*!
       
  1128  * \internal
       
  1129  * \brief Computes the reciprocal of the square root
       
  1130  *
       
  1131  * \param x a floating point value
       
  1132  * \return 1 / square root of \c x
       
  1133  */
       
  1134 static M3Gfloat m3gRcpSqrt(const M3Gfloat x)
       
  1135 {
       
  1136     /* Approximation followed by Newton-Raphson iteration a'la
       
  1137      * "Floating-point tricks" by J. Blinn, but we iterate several
       
  1138      * times to improve precision */
       
  1139     
       
  1140     M3Gint i = (M3G_FLOAT_ONE + (M3G_FLOAT_ONE >> 1))
       
  1141         - (FLOAT_AS_UINT(x) >> 1);
       
  1142     M3Gfloat y = INT_AS_FLOAT(i);
       
  1143     for (i = 0; i < 3; ++i) {
       
  1144         y = m3gMul(y, m3gSub(1.5f, m3gHalf(m3gMul(x, m3gSquare(y)))));
       
  1145     }
       
  1146     return y;
       
  1147 }
       
  1148 
       
  1149 /*!
       
  1150  * \internal
       
  1151  * \brief Computes the square root
       
  1152  *
       
  1153  * \param x a floating point value
       
  1154  * \return square root of \c x
       
  1155  */
       
  1156 static M3Gfloat m3gSqrt(const M3Gfloat x)
       
  1157 {
       
  1158     /* Approximation followed by Newton-Raphson iteration a'la
       
  1159      * "Floating-point tricks" by J. Blinn, but we iterate several
       
  1160      * times to improve precision */
       
  1161 
       
  1162     M3Gint i = (FLOAT_AS_UINT(x) >> 1) + (M3G_FLOAT_ONE >> 1);
       
  1163     M3Gfloat y = INT_AS_FLOAT(i);
       
  1164     for (i = 0; i < 2; ++i) {
       
  1165         y = m3gDiv(m3gAdd(m3gSquare(y), x), m3gDouble(y));
       
  1166     }
       
  1167     return y;
       
  1168 }
       
  1169 
       
  1170 #endif /* M3G_SOFT_FLOAT */
       
  1171 
       
  1172 /*--------------------------------------------------------------------*/
       
  1173 
       
  1174 #if defined(M3G_SOFT_FLOAT)
       
  1175 /*!
       
  1176  * \internal
       
  1177  */
       
  1178 static M3Gfloat m3gArcCos(const M3Gfloat x)
       
  1179 {
       
  1180     return (M3Gfloat) acos(x);
       
  1181 }
       
  1182 
       
  1183 /*!
       
  1184  * \internal
       
  1185  */
       
  1186 static M3Gfloat m3gArcTan(const M3Gfloat y, const M3Gfloat x)
       
  1187 {
       
  1188     return (M3Gfloat) atan2(y, x);
       
  1189 }
       
  1190 
       
  1191 /*!
       
  1192  * \internal
       
  1193  */
       
  1194 static M3Gfloat m3gCos(const M3Gfloat x)
       
  1195 {
       
  1196     return m3gSin(m3gAdd(x, HALF_PI));
       
  1197 }
       
  1198 
       
  1199 /*!
       
  1200  * \internal
       
  1201  * \brief
       
  1202  */
       
  1203 static M3Gfloat m3gSin(const M3Gfloat x)
       
  1204 {
       
  1205     M3Gfloat f = x;
       
  1206     
       
  1207     /* If x is greater than two pi, do a modulo operation to bring it
       
  1208      * back in range for the internal sine function */
       
  1209     
       
  1210     if (m3gAbs(f) >= TWO_PI) {
       
  1211         f = m3gMul (f, (1.f / TWO_PI));
       
  1212         f = m3gFrac(f);
       
  1213         f = m3gMul (f, TWO_PI);
       
  1214     }
       
  1215 
       
  1216     /* Compute the result, negating both the input value and the
       
  1217      * result if x was negative */
       
  1218     {
       
  1219         M3Guint i = FLOAT_AS_UINT(f);
       
  1220         M3Guint neg = (i & SIGN_MASK);
       
  1221         i ^= neg;
       
  1222         f = m3gSinFirstPeriod(INT_AS_FLOAT(i));
       
  1223         i = FLOAT_AS_UINT(f) ^ neg;
       
  1224         return INT_AS_FLOAT(i);
       
  1225     }
       
  1226 }
       
  1227 
       
  1228 /*!
       
  1229  * \internal
       
  1230  */
       
  1231 static M3Gfloat m3gTan(const M3Gfloat x)
       
  1232 {
       
  1233     return (M3Gfloat) tan(x);
       
  1234 }
       
  1235 
       
  1236 /*!
       
  1237  * \internal
       
  1238  */
       
  1239 static M3Gfloat m3gExp(const M3Gfloat a)
       
  1240 {
       
  1241     return (M3Gfloat) exp(a);
       
  1242 }
       
  1243 #endif /* M3G_SOFT_FLOAT */
       
  1244 
       
  1245 /*!
       
  1246  * \brief Checks whether the bottom row of a matrix is 0 0 0 1
       
  1247  */
       
  1248 static M3Gbool m3gIsWUnity(const Matrix *mtx)
       
  1249 {
       
  1250     M3G_ASSERT_PTR(mtx);
       
  1251 
       
  1252     if (!m3gIsClassified(mtx)) {
       
  1253         return (IS_ZERO(M44F(mtx, 3, 0)) &&
       
  1254                 IS_ZERO(M44F(mtx, 3, 1)) &&
       
  1255                 IS_ZERO(M44F(mtx, 3, 2)) &&
       
  1256                 IS_ONE (M44F(mtx, 3, 3)));
       
  1257     }
       
  1258     else {
       
  1259         return ((mtx->mask & 0xC0C0C0C0) == (ELEM_ONE << 30));
       
  1260     }
       
  1261 }
       
  1262 
       
  1263 /*!
       
  1264  * \brief Makes a quaternion by exponentiating a quaternion logarithm
       
  1265  */
       
  1266 static void m3gExpQuat(Quat *quat, const Vec3 *qExp)
       
  1267 {
       
  1268     M3Gfloat theta;
       
  1269 
       
  1270     M3G_ASSERT_PTR(quat);
       
  1271     M3G_ASSERT_PTR(qExp);
       
  1272 
       
  1273     theta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(qExp->x),
       
  1274                                   m3gSquare(qExp->y)),
       
  1275                            m3gSquare(qExp->z)));
       
  1276 
       
  1277     if (theta > EPSILON) {
       
  1278         M3Gfloat s = m3gMul(m3gSin(theta), m3gRcp(theta));
       
  1279         quat->x = m3gMul(qExp->x, s);
       
  1280         quat->y = m3gMul(qExp->y, s);
       
  1281         quat->z = m3gMul(qExp->z, s);
       
  1282         quat->w = m3gCos(theta);
       
  1283     }
       
  1284     else {
       
  1285         quat->x = quat->y = quat->z = 0.0f;
       
  1286         quat->w = 1.0f;
       
  1287     }
       
  1288 }
       
  1289 
       
  1290 /*!
       
  1291  * \brief Natural logarithm of a unit quaternion.
       
  1292  */
       
  1293 static void m3gLogQuat(Vec3 *qLog, const Quat *quat)
       
  1294 {
       
  1295     M3Gfloat sinTheta = m3gSqrt(m3gNorm3((const float *) quat));
       
  1296 
       
  1297     if (sinTheta > EPSILON) {
       
  1298         M3Gfloat s = m3gArcTan(sinTheta, quat->w) / sinTheta;
       
  1299         qLog->x = m3gMul(s, quat->x);
       
  1300         qLog->y = m3gMul(s, quat->y);
       
  1301         qLog->z = m3gMul(s, quat->z);
       
  1302     }
       
  1303     else {
       
  1304         qLog->x = qLog->y = qLog->z = 0.0f;
       
  1305     }
       
  1306 }
       
  1307 
       
  1308 /*!
       
  1309  * \brief Make quaternion the "logarithmic difference" between two
       
  1310  * other quaternions.
       
  1311  */
       
  1312 static void m3gLogDiffQuat(Vec3 *logDiff,
       
  1313                            const Quat *from, const Quat *to)
       
  1314 {
       
  1315     Quat temp;
       
  1316     temp.x = m3gNegate(from->x);
       
  1317     temp.y = m3gNegate(from->y);
       
  1318     temp.z = m3gNegate(from->z);
       
  1319     temp.w =           from->w;
       
  1320     m3gMulQuat(&temp, to);
       
  1321     m3gLogQuat(logDiff, &temp);
       
  1322 }
       
  1323 
       
  1324 /*!
       
  1325  * \brief Rounds a float to the nearest integer
       
  1326  *
       
  1327  * Overflows are clamped to the maximum or minimum representable
       
  1328  * value.
       
  1329  */
       
  1330 static M3Gint m3gRoundToInt(const M3Gfloat a)
       
  1331 {
       
  1332     M3Guint base = FLOAT_AS_UINT(a);
       
  1333     M3Gint signMask, expo;
       
  1334 
       
  1335     /* Decompose the number into sign, exponent, and base number */
       
  1336     
       
  1337     signMask = ((M3Gint) base >> 31);   /* -> 0 or 0xFFFFFFFF */
       
  1338     expo = (M3Gint)((base >> 23) & 0xFF) - 127;
       
  1339     
       
  1340     /* First check for large values and return either the negative or
       
  1341      * the positive maximum integer in case of overflow.  The overflow
       
  1342      * check can be made on the exponent alone, as large floats are
       
  1343      * spaced several integer values apart so that nothing will
       
  1344      * overflow because of rounding later on */
       
  1345     
       
  1346     if (expo >= 31) {
       
  1347         return (M3Gint)((1U << 31) - 1 + (((M3Guint) signMask) & 1));
       
  1348     }
       
  1349 
       
  1350     /* Also check for underflow to avoid problems with shifting by
       
  1351      * more than 31 */
       
  1352 
       
  1353     if (expo < -1) {
       
  1354         return 0;
       
  1355     }
       
  1356     
       
  1357     /* Mask out the sign and exponent bits, shift the base number so
       
  1358      * that the lowest bit corresponds to one half, then add one
       
  1359      * (half) and shift to round to the closest integer. */
       
  1360 
       
  1361     base = (base | LEADING_ONE) << 8;   /* shift mantissa to 1.31 */
       
  1362     base =  base >> (30 - expo);        /* integer value as 31.1 */
       
  1363     base = (base + 1) >> 1;             /* round to nearest 32.0 */
       
  1364     
       
  1365     /* Factor in the sign (negate if originally negative) and
       
  1366      * return */
       
  1367 
       
  1368     return ((M3Gint) base ^ signMask) - signMask;
       
  1369 }
       
  1370 
       
  1371 /*!
       
  1372  * \brief Calculates ray-triangle intersection.
       
  1373  *
       
  1374  * http://www.acm.org/jgt/papers/MollerTrumbore97
       
  1375  */
       
  1376 static M3Gbool m3gIntersectTriangle(const Vec3 *orig, const Vec3 *dir,
       
  1377                                     const Vec3 *vert0, const Vec3 *vert1, const Vec3 *vert2,
       
  1378                                     Vec3 *tuv, M3Gint cullMode)
       
  1379 {
       
  1380     Vec3 edge1, edge2, tvec, pvec, qvec;
       
  1381     M3Gfloat det,inv_det;
       
  1382 
       
  1383     /* find vectors for two edges sharing vert0 */
       
  1384     edge1 = *vert1;
       
  1385     edge2 = *vert2;
       
  1386     m3gSubVec3(&edge1, vert0);
       
  1387     m3gSubVec3(&edge2, vert0);
       
  1388 
       
  1389     /* begin calculating determinant - also used to calculate U parameter */
       
  1390     m3gCross(&pvec, dir, &edge2);
       
  1391 
       
  1392     /* if determinant is near zero, ray lies in plane of triangle */
       
  1393     det = m3gDot3(&edge1, &pvec);
       
  1394 
       
  1395     if (cullMode == 0 && det <= 0) return M3G_FALSE;
       
  1396     if (cullMode == 1 && det >= 0) return M3G_FALSE;
       
  1397 
       
  1398     if (det > -EPSILON && det < EPSILON)
       
  1399         return M3G_FALSE;
       
  1400     inv_det = m3gRcp(det);
       
  1401 
       
  1402     /* calculate distance from vert0 to ray origin */
       
  1403     tvec = *orig;
       
  1404     m3gSubVec3(&tvec, vert0);
       
  1405 
       
  1406     /* calculate U parameter and test bounds */
       
  1407     tuv->y = m3gMul(m3gDot3(&tvec, &pvec), inv_det);
       
  1408     if (tuv->y < 0.0f || tuv->y > 1.0f)
       
  1409         return M3G_FALSE;
       
  1410 
       
  1411     /* prepare to test V parameter */
       
  1412     m3gCross(&qvec, &tvec, &edge1);
       
  1413 
       
  1414     /* calculate V parameter and test bounds */
       
  1415     tuv->z = m3gMul(m3gDot3(dir, &qvec), inv_det);
       
  1416     if (tuv->z < 0.0f || m3gAdd(tuv->y, tuv->z) > 1.0f)
       
  1417         return M3G_FALSE;
       
  1418 
       
  1419     /* calculate t, ray intersects triangle */
       
  1420     tuv->x = m3gMul(m3gDot3(&edge2, &qvec), inv_det);
       
  1421 
       
  1422     return M3G_TRUE;
       
  1423 }
       
  1424 
       
  1425 /*!
       
  1426  * \brief Calculates ray-box intersection.
       
  1427  *
       
  1428  * http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm
       
  1429  */
       
  1430 
       
  1431 #define XO	orig->x
       
  1432 #define YO	orig->y
       
  1433 #define ZO	orig->z
       
  1434 #define XD	dir->x
       
  1435 #define YD	dir->y
       
  1436 #define ZD	dir->z
       
  1437 
       
  1438 #define XL	box->min[0]
       
  1439 #define YL	box->min[1]
       
  1440 #define ZL	box->min[2]
       
  1441 #define XH	box->max[0]
       
  1442 #define YH	box->max[1]
       
  1443 #define ZH	box->max[2]
       
  1444 
       
  1445 /*!
       
  1446  * \internal
       
  1447  * \brief Ray - bounding box intersection
       
  1448  *
       
  1449  */
       
  1450 static M3Gbool m3gIntersectBox(const Vec3 *orig, const Vec3 *dir, const AABB *box)
       
  1451 {
       
  1452 	M3Gfloat tnear = M3G_MIN_NEGATIVE_FLOAT;
       
  1453 	M3Gfloat tfar  = M3G_MAX_POSITIVE_FLOAT;
       
  1454 	M3Gfloat t1, t2, temp;
       
  1455 
       
  1456 	/* X slab */
       
  1457 	if(XD != 0) {
       
  1458 		t1 = m3gSub(XL, XO) / XD;
       
  1459 		t2 = m3gSub(XH, XO) / XD;
       
  1460 
       
  1461 		if(t1 > t2) {
       
  1462 			temp = t1;
       
  1463 			t1 = t2;
       
  1464 			t2 = temp;
       
  1465 		}
       
  1466 
       
  1467 		if(t1 > tnear) tnear = t1;
       
  1468 		if(t2 < tfar) tfar = t2;
       
  1469 
       
  1470 		if(tnear > tfar) return M3G_FALSE;
       
  1471 		if(tfar < 0) return M3G_FALSE;
       
  1472 	}
       
  1473 	else {
       
  1474 		if(XO > XH || XO < XL) return M3G_FALSE;
       
  1475 	}
       
  1476 
       
  1477 	/* Y slab */
       
  1478 	if(YD != 0) {
       
  1479 		t1 = m3gSub(YL, YO) / YD;
       
  1480 		t2 = m3gSub(YH, YO) / YD;
       
  1481 
       
  1482 		if(t1 > t2) {
       
  1483 			temp = t1;
       
  1484 			t1 = t2;
       
  1485 			t2 = temp;
       
  1486 		}
       
  1487 
       
  1488 		if(t1 > tnear) tnear = t1;
       
  1489 		if(t2 < tfar) tfar = t2;
       
  1490 
       
  1491 		if(tnear > tfar) return M3G_FALSE;
       
  1492 		if(tfar < 0) return M3G_FALSE;
       
  1493 	}
       
  1494 	else {
       
  1495 		if(YO > YH || YO < YL) return M3G_FALSE;
       
  1496 	}
       
  1497 
       
  1498 	/* Z slab */
       
  1499 	if(ZD != 0) {
       
  1500 		t1 = m3gSub(ZL, ZO) / ZD;
       
  1501 		t2 = m3gSub(ZH, ZO) / ZD;
       
  1502 
       
  1503 		if(t1 > t2) {
       
  1504 			temp = t1;
       
  1505 			t1 = t2;
       
  1506 			t2 = temp;
       
  1507 		}
       
  1508 
       
  1509 		if(t1 > tnear) tnear = t1;
       
  1510 		if(t2 < tfar) tfar = t2;
       
  1511 
       
  1512 		if(tnear > tfar) return M3G_FALSE;
       
  1513 		if(tfar < 0) return M3G_FALSE;
       
  1514 	}
       
  1515 	else {
       
  1516 		if(ZO > ZH || ZO < ZL) return M3G_FALSE;
       
  1517 	}
       
  1518 
       
  1519 	return M3G_TRUE;
       
  1520 }
       
  1521 
       
  1522 /*!
       
  1523  * \brief Calculates the intersection of two rectangles. Always fills
       
  1524  * the intersection result.
       
  1525  *
       
  1526  * \param dst   result of the intersection
       
  1527  * \param r1    rectangle 1
       
  1528  * \param r2    rectangle 2
       
  1529  */
       
  1530 static M3Gbool m3gIntersectRectangle(M3GRectangle *dst, M3GRectangle *r1, M3GRectangle *r2)
       
  1531 {
       
  1532     M3Gbool intersects = M3G_TRUE;
       
  1533     M3Gint min, max;
       
  1534 
       
  1535     max = (r1->x) >= (r2->x) ? (r1->x) : (r2->x);
       
  1536     min = (r1->x + r1->width) <= (r2->x + r2->width) ? (r1->x + r1->width) : (r2->x + r2->width);
       
  1537     if ((min - max) < 0) intersects = M3G_FALSE;
       
  1538     dst->x = max;
       
  1539     dst->width = min - max;
       
  1540 
       
  1541     max = (r1->y) >= (r2->y) ? (r1->y) : (r2->y);
       
  1542     min = (r1->y + r1->height) <= (r2->y + r2->height) ? (r1->y + r1->height) : (r2->y + r2->height);
       
  1543     if ((min - max) < 0) intersects = M3G_FALSE;
       
  1544     dst->y = max;
       
  1545     dst->height = min - max;
       
  1546 
       
  1547     return intersects;
       
  1548 }
       
  1549 
       
  1550 /*-------- float-to-int color conversions --------*/
       
  1551 
       
  1552 static M3Guint m3gAlpha1f(M3Gfloat a)
       
  1553 {
       
  1554     M3Guint alpha = (M3Guint) m3gFloatToByte(a);
       
  1555     return (alpha << 24) | M3G_RGB_MASK;
       
  1556 }
       
  1557 
       
  1558 static M3Guint m3gColor3f(M3Gfloat r, M3Gfloat g, M3Gfloat b)
       
  1559 {
       
  1560     return ((M3Guint) m3gFloatToByte(r) << 16)
       
  1561         |  ((M3Guint) m3gFloatToByte(g) <<  8)
       
  1562         |   (M3Guint) m3gFloatToByte(b)
       
  1563         |   M3G_ALPHA_MASK;
       
  1564 }
       
  1565 
       
  1566 static M3Guint m3gColor4f(M3Gfloat r, M3Gfloat g, M3Gfloat b, M3Gfloat a)
       
  1567 {
       
  1568     return ((M3Guint) m3gFloatToByte(r) << 16)
       
  1569         |  ((M3Guint) m3gFloatToByte(g) <<  8)
       
  1570         |   (M3Guint) m3gFloatToByte(b)
       
  1571         |  ((M3Guint) m3gFloatToByte(a) << 24);
       
  1572 }
       
  1573 
       
  1574 static void m3gFloatColor(M3Gint argb, M3Gfloat intensity, M3Gfloat *rgba)
       
  1575 {
       
  1576     /* NOTE we intentionally aim a bit high here -- some GL
       
  1577      * implementations may round down instead of closest */
       
  1578     
       
  1579     const M3Gfloat oneOver255 = (M3Gfloat)(1.0001 / 255.0);
       
  1580     
       
  1581 	rgba[0] = (M3Gfloat)((argb >> 16) & 0xFF);
       
  1582 	rgba[1] = (M3Gfloat)((argb >>  8) & 0xFF);
       
  1583 	rgba[2] = (M3Gfloat)((argb      ) & 0xFF);
       
  1584 	rgba[3] = (M3Gfloat)((argb >> 24) & 0xFF);
       
  1585     
       
  1586     m3gScale4(rgba, m3gMul(oneOver255, intensity));
       
  1587 }
       
  1588 
       
  1589 /*!
       
  1590  * \brief Converts the 3x3 submatrix of a matrix into fixed point
       
  1591  *
       
  1592  * The input matrix must be an affine one, i.e. the bottom row must be
       
  1593  * 0 0 0 1.  The output matrix is guaranteed to be such that it can be
       
  1594  * multiplied with a 16-bit 3-vector without overflowing, while using
       
  1595  * the 32-bit range optimally.
       
  1596  *
       
  1597  * \param mtx  the original matrix
       
  1598  * \param elem 9 shorts to hold the fixed point base numbers
       
  1599  * \return floating point exponent for the values in \c elem
       
  1600  *         (number of bits to shift left for actual values)
       
  1601  */
       
  1602 static M3Gint m3gGetFixedPoint3x3Basis(const Matrix *mtx, M3Gshort *elem)
       
  1603 {
       
  1604     M3Gint outExp;
       
  1605     M3Gint row, col;
       
  1606     const M3Guint *m;
       
  1607     
       
  1608     if (!mtx->complete) {
       
  1609         m3gFillClassifiedMatrix((Matrix*) mtx);
       
  1610     }
       
  1611     m = (const M3Guint*) mtx->elem;
       
  1612     
       
  1613     /* First, find the maximum exponent value in the whole matrix */
       
  1614 
       
  1615     outExp = 0;
       
  1616     for (col = 0; col < 3; ++col) {
       
  1617         for (row = 0; row < 3; ++row) {
       
  1618             M3Gint element = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
       
  1619             outExp = M3G_MAX(outExp, element);
       
  1620         }
       
  1621     }
       
  1622     outExp >>= 23;
       
  1623 
       
  1624     /* Our candidate exponent is the maximum found plus 9, which is
       
  1625      * guaranteed to shift the maximum unsigned 24-bit mantissa (which
       
  1626      * always will have the high bit set) down to the signed 16-bit
       
  1627      * range */
       
  1628 
       
  1629     outExp += 9;
       
  1630     
       
  1631     /* Now proceed to sum each row and see what's the actual smallest
       
  1632      * exponent we can safely use without overflowing in a 16+16
       
  1633      * matrix-vector multiplication; this will win us one bit
       
  1634      * (doubling the precision) compared to the conservative approach
       
  1635      * of just shifting everything down by 10 bits */
       
  1636 
       
  1637     for (row = 0; row < 3; ++row) {
       
  1638 
       
  1639         /* Sum the absolute values on this row */
       
  1640             
       
  1641         M3Gint rowSum = 0;
       
  1642         for (col = 0; col < 3; ++col) {
       
  1643             M3Gint a = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
       
  1644             M3Gint shift = outExp - (a >> 23);
       
  1645             M3G_ASSERT(shift < 265);
       
  1646                 
       
  1647             if (shift < 24) {
       
  1648                 rowSum += ((a & MANTISSA_MASK) | LEADING_ONE) >> shift;
       
  1649             }
       
  1650         }
       
  1651 
       
  1652         /* Now we have a 26-bit sum of the absolute values on this
       
  1653          * row, and shift that down until we fit the target range of
       
  1654          * [0, 65535].  Note that this still leaves *exactly* enough
       
  1655          * space for adding in an arbitrary 16-bit translation vector
       
  1656          * after multiplying with the matrix! */
       
  1657             
       
  1658         while (rowSum >= (1 << 16)) {
       
  1659             rowSum >>= 1;
       
  1660             ++outExp;
       
  1661         }
       
  1662     }
       
  1663 
       
  1664     /* De-bias the exponent, but add in an extra 23 to account for the
       
  1665      * decimal bits in the floating point mantissa values we started
       
  1666      * with (we're returning the exponent as "bits to shift left to
       
  1667      * get integers", so we're off by 23 from IEEE notation) */
       
  1668     
       
  1669     outExp = (outExp - 127) - 23;
       
  1670     
       
  1671     /* Finally, shift all the matrix elements to our final output
       
  1672      * precision */
       
  1673     
       
  1674     for (col = 0; col < 3; ++col) {
       
  1675         m3gFloatVecToShort(3, mtx->elem + MELEM(0, col), outExp, elem);
       
  1676         elem += 3;
       
  1677     }
       
  1678     return outExp;
       
  1679 }
       
  1680 
       
  1681 /*!
       
  1682  * \brief Gets the translation component of a matrix as fixed point
       
  1683  *
       
  1684  * \param mtx  the matrix
       
  1685  * \param elem 3 shorts to write the vector into
       
  1686  * \return floating point exponent for the values in \c elem
       
  1687  *         (number of bits to shift left for actual values)
       
  1688  */
       
  1689 static M3Gint m3gGetFixedPointTranslation(const Matrix *mtx, M3Gshort *elem)
       
  1690 {
       
  1691     const M3Guint *m;
       
  1692     
       
  1693     M3G_ASSERT(m3gIsWUnity(mtx));
       
  1694     if (!mtx->complete) {
       
  1695         m3gFillClassifiedMatrix((Matrix*) mtx);
       
  1696     }
       
  1697     m = (const M3Guint*) &mtx->elem[MELEM(0, 3)];
       
  1698 
       
  1699     /* Find the maximum exponent, then scale down by 9 bits from that
       
  1700      * to shift the unsigned 24-bit mantissas to the signed 16-bit
       
  1701      * range */
       
  1702     {
       
  1703         M3Gint outExp;
       
  1704         M3Guint maxElem = m[0] & ~SIGN_MASK;
       
  1705         maxElem = M3G_MAX(maxElem, m[1] & ~SIGN_MASK);
       
  1706         maxElem = M3G_MAX(maxElem, m[2] & ~SIGN_MASK);
       
  1707         
       
  1708         outExp = (M3Gint)(maxElem >> 23) - (127 + 23) + 9;
       
  1709         m3gFloatVecToShort(3, mtx->elem + MELEM(0, 3), outExp, elem);
       
  1710         return outExp;
       
  1711     }
       
  1712 }
       
  1713 
       
  1714 /*!
       
  1715  * \internal
       
  1716  * \brief Compute a bounding box enclosing two other boxes
       
  1717  *
       
  1718  * \param box   box to fit
       
  1719  * \param a     first box to enclose or NULL
       
  1720  * \param b     second box to enclose or NULL
       
  1721  * 
       
  1722  * \note If both input boxes are NULL, the box is not modified.
       
  1723  */
       
  1724 static void m3gFitAABB(AABB *box, const AABB *a, const AABB *b)
       
  1725 {
       
  1726     int i;
       
  1727 
       
  1728     M3G_ASSERT_PTR(box);
       
  1729     
       
  1730     if (a) {
       
  1731         m3gValidateAABB(a);
       
  1732     }
       
  1733     if (b) {
       
  1734         m3gValidateAABB(b);
       
  1735     }
       
  1736 
       
  1737     if (a && b) {
       
  1738         for (i = 0; i < 3; ++i) {
       
  1739             box->min[i] = M3G_MIN(a->min[i], b->min[i]);
       
  1740             box->max[i] = M3G_MAX(a->max[i], b->max[i]);
       
  1741         }
       
  1742     }
       
  1743     else if (a) {
       
  1744         *box = *a;
       
  1745     }
       
  1746     else if (b) {
       
  1747         *box = *b;
       
  1748     }
       
  1749 }
       
  1750 
       
  1751 /*
       
  1752  * \internal
       
  1753  * \brief Transform an axis-aligned bounding box with a matrix
       
  1754  *
       
  1755  * This results in a box that encloses the transformed original box.
       
  1756  * 
       
  1757  * Based on "Transforming Axis-Aligned Bounding Boxes" by Jim Arvo
       
  1758  * from Graphics Gems.
       
  1759  * 
       
  1760  * \note The bottom row of the matrix is ignored in the transformation.
       
  1761  */
       
  1762 static void m3gTransformAABB(AABB *box, const Matrix *mtx)
       
  1763 {
       
  1764     M3Gfloat boxMin[3], boxMax[3];
       
  1765     M3Gfloat newMin[3], newMax[3];
       
  1766 
       
  1767     m3gValidateAABB(box);
       
  1768     
       
  1769     if (!mtx->complete) {
       
  1770         m3gFillClassifiedMatrix((Matrix*) mtx);
       
  1771     }
       
  1772 
       
  1773     /* Get the original minimum and maximum extents as floats, and add
       
  1774      * the translation as the base for the transformed box */
       
  1775     {
       
  1776         int i;
       
  1777         for (i = 0; i < 3; ++i) {
       
  1778             boxMin[i] = box->min[i];
       
  1779             boxMax[i] = box->max[i];
       
  1780             newMin[i] = newMax[i] = M44F(mtx, i, 3);
       
  1781         }
       
  1782     }
       
  1783 
       
  1784     /* Transform into the new minimum and maximum coordinates using
       
  1785      * the upper left 3x3 part of the matrix */
       
  1786     {
       
  1787         M3Gint row, col;
       
  1788         
       
  1789         for (row = 0; row < 3; ++row) {
       
  1790             for (col = 0; col < 3; ++col) {
       
  1791                 M3Gfloat a = m3gMul(M44F(mtx, row, col), boxMin[col]);
       
  1792                 M3Gfloat b = m3gMul(M44F(mtx, row, col), boxMax[col]);
       
  1793                 
       
  1794                 if (a < b) { 
       
  1795                     newMin[row] = m3gAdd(newMin[row], a);
       
  1796                     newMax[row] = m3gAdd(newMax[row], b);
       
  1797                 }
       
  1798                 else { 
       
  1799                     newMin[row] = m3gAdd(newMin[row], b);
       
  1800                     newMax[row] = m3gAdd(newMax[row], a);
       
  1801                 }
       
  1802             }
       
  1803         }
       
  1804     }
       
  1805 
       
  1806     /* Write back into the bounding box */
       
  1807     {
       
  1808         int i;
       
  1809         for (i = 0; i < 3; ++i) {
       
  1810             box->min[i] = newMin[i];
       
  1811             box->max[i] = newMax[i];
       
  1812         }
       
  1813     }
       
  1814     
       
  1815     m3gValidateAABB(box);
       
  1816 }
       
  1817 
       
  1818 #if defined(M3G_DEBUG)
       
  1819 /*!
       
  1820  * \brief
       
  1821  */
       
  1822 static void m3gValidateAABB(const AABB *aabb)
       
  1823 {
       
  1824     m3gValidateFloats(6, (float*) aabb);
       
  1825 }
       
  1826 #endif
       
  1827 
       
  1828 /*----------------------------------------------------------------------
       
  1829  * Public functions
       
  1830  *--------------------------------------------------------------------*/
       
  1831 
       
  1832 /*!
       
  1833  * \brief Linear interpolation of vectors
       
  1834  *
       
  1835  * \param size     number of components
       
  1836  * \param vec      output vector
       
  1837  * \param s        interpolation factor
       
  1838  * \param start    initial value
       
  1839  * \param end      target value
       
  1840  */
       
  1841 #if defined(M3G_HW_FLOAT_VFPV2)
       
  1842 
       
  1843 M3G_API __asm void m3gLerp(M3Gint size,
       
  1844 				   M3Gfloat *vec,
       
  1845 				   M3Gfloat s,
       
  1846 				   const M3Gfloat *start, const M3Gfloat *end)
       
  1847 {
       
  1848 // r0 = size
       
  1849 // r1 = *vec
       
  1850 // r2 = s
       
  1851 // r3 = *start
       
  1852 // sp[0] = *end
       
  1853 
       
  1854 		CODE32
       
  1855 /*
       
  1856     M3Gfloat sCompl = 1.0 - s;
       
  1857     for (i = 0; i < size; ++i) {
       
  1858         vec[i] = sCompl*start[i] + s*end[i];
       
  1859     }
       
  1860 */
       
  1861 //
       
  1862 // if size = 0, return
       
  1863 //
       
  1864 		CMP		r0, #0
       
  1865 		BXEQ	lr
       
  1866 
       
  1867 		FMSR	s0, r2
       
  1868 		MOV		r2, r3
       
  1869 		LDR		r3, [sp]
       
  1870 
       
  1871 		FLDS	s1,=1.0
       
  1872 		STMFD	sp!, {r4-r5}
       
  1873 		FSUBS	s2, s1, s0			// sCompl = 1 - s
       
  1874 
       
  1875 		FMRX	r4, FPSCR
       
  1876 		CMP		r0, #4
       
  1877 		BGT		_m3gLerp_over4Loop
       
  1878 
       
  1879 //
       
  1880 // if 1 < size <= 4
       
  1881 //
       
  1882 // set vector STRIDE = 1, LENGTH = 4
       
  1883 		BIC		r12, r4, #0x00370000
       
  1884 		ORR		r12, #(3<<16)
       
  1885 		FMXR	FPSCR, r12
       
  1886 
       
  1887 		FLDMIAS	r2!, {s4-s7}		// load the start[i] values
       
  1888 		FLDMIAS	r3!, {s8-s11}		// load the end[i] values
       
  1889 		FMULS	s12, s4, s2			// [s12-s15]  = [sCompl*start[0] .. sCompl*start[3]]
       
  1890 		FMACS	s12, s8, s0			// [s12-s15] += [    	s*end[0] ..        s*end[3]]
       
  1891 		CMP		r0, #1
       
  1892 		FSTS	s12, [r1]
       
  1893 		FSTSGT	s13, [r1, #4]
       
  1894 		CMP		r0, #3
       
  1895 		FSTSGE	s14, [r1, #8]
       
  1896 		FSTSGT	s15, [r1, #12]
       
  1897 
       
  1898 		FMXR	FPSCR, r4
       
  1899 
       
  1900 		LDMFD	sp!, {r4-r5}
       
  1901 
       
  1902 		BX		lr
       
  1903 //
       
  1904 // if size > 4, interpolate 8 values in one loop
       
  1905 //
       
  1906 _m3gLerp_over4Loop
       
  1907 
       
  1908 		FSTMFDD	sp!, {d8-d9}
       
  1909 		MOVS	r5, r0, ASR #3			// size/8
       
  1910 		SUB		r0, r0, r5, LSL #3		// tail length
       
  1911 
       
  1912 // set vector STRIDE = 1, LENGTH = 8
       
  1913 		BIC		r12, r4, #0x00370000
       
  1914 		ORR		r12, #(7<<16)
       
  1915 		FMXR	FPSCR, r12
       
  1916 
       
  1917 
       
  1918 _m3gLerp_alignedLoop
       
  1919 
       
  1920 		FLDMIASNE	r2!, {s8-s15}		// load the start[i] values
       
  1921 		FLDMIASNE	r3!, {s16-s23}		// load the end[i] values
       
  1922 		FMULSNE		s24, s8, s2			// [s16-s23]  = [ sCompl*start[0] sCompl*start[1] .. sCompl*start[7]]
       
  1923 		FMACSNE		s24, s16, s0		// [s16-s23] += [		 s*end[0]        s*end[1] ..		s*end[7]]
       
  1924 		FSTMIASNE	r1!, {s24-s31}
       
  1925 		SUBSNE		r5, #1
       
  1926 
       
  1927 		BNE			_m3gLerp_alignedLoop
       
  1928 
       
  1929 // process the 0-7 remaining values in the tail
       
  1930 
       
  1931 		CMP			r0, #1
       
  1932 		FLDMIASGE	r2!, {s8-s15}		
       
  1933 		FLDMIASGE	r3!, {s16-s23}		
       
  1934 		FMULSGE		s24, s8, s2			
       
  1935 		FMACSGE		s24, s16, s0		
       
  1936 		FSTSGE		s24, [r1]
       
  1937 		FSTSGT		s25, [r1, #4]
       
  1938 		CMP			r0, #3
       
  1939 		FSTSGE		s26, [r1, #8]
       
  1940 		FSTSGT		s27, [r1, #12]
       
  1941 		CMP			r0, #5
       
  1942 		FSTSGE		s28, [r1, #16]
       
  1943 		FSTSGT		s29, [r1, #20]
       
  1944 		CMP			r0, #7
       
  1945 		FSTSEQ		s30, [r1, #24]
       
  1946 
       
  1947 		FMXR	FPSCR, r4
       
  1948 
       
  1949 		FLDMFDD	sp!, {d8-d9}
       
  1950 		LDMFD	sp!, {r4-r5}
       
  1951 
       
  1952 		BX		lr
       
  1953 
       
  1954 }
       
  1955 #else /* #if defined(M3G_HW_FLOAT_VFPV2) */
       
  1956 
       
  1957 M3G_API void m3gLerp(M3Gint size,
       
  1958              M3Gfloat *vec,
       
  1959              M3Gfloat s,
       
  1960              const M3Gfloat *start, const M3Gfloat *end)
       
  1961 {
       
  1962     int i;
       
  1963 
       
  1964     M3Gfloat sCompl = m3gSub(1.f, s);
       
  1965     for (i = 0; i < size; ++i) {
       
  1966         vec[i] = m3gAdd(m3gMul(sCompl, start[i]), m3gMul(s, end[i]));
       
  1967     }
       
  1968 }
       
  1969 #endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
       
  1970 
       
  1971 /*!
       
  1972  * \brief Hermite spline interpolation of vectors
       
  1973  *
       
  1974  * \param size      number of components
       
  1975  * \param vec       output vector
       
  1976  * \param s         interpolation factor
       
  1977  * \param start     start value vector
       
  1978  * \param end       end value vector
       
  1979  * \param tStart    start tangent vector
       
  1980  * \param tEnd      end tangent vector
       
  1981  */
       
  1982 M3G_API void m3gHermite(M3Gint size,
       
  1983                         M3Gfloat *vec,
       
  1984                         M3Gfloat s,
       
  1985                         const M3Gfloat *start, const M3Gfloat *end,
       
  1986                         const M3Gfloat *tStart, const M3Gfloat *tEnd)
       
  1987 {
       
  1988     M3Gfloat s2 = m3gSquare(s);
       
  1989     M3Gfloat s3 = m3gMul(s2, s);
       
  1990     int i;
       
  1991     
       
  1992     for (i = 0; i < size; ++i) {
       
  1993         vec[i] =
       
  1994             m3gMadd(start[i],
       
  1995                     m3gAdd(m3gSub(m3gDouble(s3), m3gMul(3.f, s2)), 1.f),
       
  1996                     m3gMadd(end[i],
       
  1997                             m3gSub(m3gMul(3.f, s2), m3gDouble(s3)),
       
  1998                             m3gMadd(tStart[i],
       
  1999                                     m3gAdd(m3gSub(s3, m3gDouble(s2)), s),
       
  2000                                     m3gMul(tEnd[i],
       
  2001                                            m3gSub(s3, s2)))));
       
  2002 
       
  2003     }
       
  2004     
       
  2005     /*  vec = ( 2*s3 - 3*s2 + 1) * start
       
  2006             + (-2*s3 + 3*s2    ) * end
       
  2007             + (   s3 - 2*s2 + s) * tStart
       
  2008             + (   s3 -   s2    ) * tEnd;    */
       
  2009 }
       
  2010 
       
  2011 /*--------------------------------------------------------------------*/
       
  2012 
       
  2013 /*!
       
  2014  * \brief Sets a matrix to a copy of another matrix
       
  2015  */
       
  2016 M3G_API void m3gCopyMatrix(Matrix *dst, const Matrix *src)
       
  2017 {
       
  2018     M3G_ASSERT(dst != NULL && src != NULL);
       
  2019     *dst = *src;
       
  2020 }
       
  2021 
       
  2022 /*!
       
  2023  * \brief Vector addition
       
  2024  */
       
  2025 M3G_API void m3gAddVec3(Vec3 *vec, const Vec3 *other)
       
  2026 {
       
  2027     vec->x = m3gAdd(vec->x, other->x);
       
  2028     vec->y = m3gAdd(vec->y, other->y);
       
  2029     vec->z = m3gAdd(vec->z, other->z);
       
  2030 }
       
  2031 
       
  2032 /*!
       
  2033  * \brief Vector addition
       
  2034  */
       
  2035 M3G_API void m3gAddVec4(Vec4 *vec, const Vec4 *other)
       
  2036 {
       
  2037     vec->x = m3gAdd(vec->x, other->x);
       
  2038     vec->y = m3gAdd(vec->y, other->y);
       
  2039     vec->z = m3gAdd(vec->z, other->z);
       
  2040     vec->w = m3gAdd(vec->w, other->w);
       
  2041 }
       
  2042 
       
  2043 /*!
       
  2044  * \brief Cross product of two 3D vectors expressed as 4D vectors
       
  2045  */
       
  2046 M3G_API void m3gCross(Vec3 *dst, const Vec3 *a, const Vec3 *b)
       
  2047 {
       
  2048     dst->x = m3gSub(m3gMul(a->y, b->z), m3gMul(a->z, b->y));
       
  2049     dst->y = m3gSub(m3gMul(a->z, b->x), m3gMul(a->x, b->z));
       
  2050     dst->z = m3gSub(m3gMul(a->x, b->y), m3gMul(a->y, b->x));
       
  2051 }
       
  2052 
       
  2053 /*!
       
  2054  * \brief Dot product of two vectors
       
  2055  */
       
  2056 M3G_API M3Gfloat m3gDot3(const Vec3 *a, const Vec3 *b)
       
  2057 {
       
  2058     M3Gfloat d;
       
  2059     d = m3gMul(a->x, b->x);
       
  2060     d = m3gMadd(a->y, b->y, d);
       
  2061     d = m3gMadd(a->z, b->z, d);
       
  2062     return d;
       
  2063 }
       
  2064 
       
  2065 /*!
       
  2066  * \brief Dot product of two vectors
       
  2067  */
       
  2068 M3G_API M3Gfloat m3gDot4(const Vec4 *a, const Vec4 *b)
       
  2069 {
       
  2070     M3Gfloat d;
       
  2071     d = m3gMul(a->x, b->x);
       
  2072     d = m3gMadd(a->y, b->y, d);
       
  2073     d = m3gMadd(a->z, b->z, d);
       
  2074     d = m3gMadd(a->w, b->w, d);
       
  2075     return d;
       
  2076 }
       
  2077 
       
  2078 /*!
       
  2079  * \brief
       
  2080  */
       
  2081 M3G_API void m3gSetVec3(Vec3 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z)
       
  2082 {
       
  2083     M3G_ASSERT_PTR(v);
       
  2084     v->x = x;
       
  2085     v->y = y;
       
  2086     v->z = z;
       
  2087 }
       
  2088 
       
  2089 /*!
       
  2090  * \brief
       
  2091  */
       
  2092 M3G_API void m3gSetVec4(Vec4 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z, M3Gfloat w)
       
  2093 {
       
  2094     M3G_ASSERT_PTR(v);
       
  2095     v->x = x;
       
  2096     v->y = y;
       
  2097     v->z = z;
       
  2098     v->w = w;
       
  2099 }
       
  2100 
       
  2101 /*!
       
  2102  * \brief Vector subtraction
       
  2103  */
       
  2104 M3G_API void m3gSubVec3(Vec3 *vec, const Vec3 *other)
       
  2105 {
       
  2106     vec->x = m3gSub(vec->x, other->x);
       
  2107     vec->y = m3gSub(vec->y, other->y);
       
  2108     vec->z = m3gSub(vec->z, other->z);
       
  2109 }
       
  2110 
       
  2111 /*!
       
  2112  * \brief Vector subtraction
       
  2113  */
       
  2114 M3G_API void m3gSubVec4(Vec4 *vec, const Vec4 *other)
       
  2115 {
       
  2116     vec->x = m3gSub(vec->x, other->x);
       
  2117     vec->y = m3gSub(vec->y, other->y);
       
  2118     vec->z = m3gSub(vec->z, other->z);
       
  2119     vec->w = m3gSub(vec->w, other->w);
       
  2120 }
       
  2121 
       
  2122 /*!
       
  2123  * \brief Vector length
       
  2124  */
       
  2125 M3G_API M3Gfloat m3gLengthVec3(const Vec3 *vec)
       
  2126 {
       
  2127     return m3gSqrt(m3gAdd(m3gAdd(m3gSquare(vec->x),
       
  2128                                  m3gSquare(vec->y)),
       
  2129                           m3gSquare(vec->z)));
       
  2130 }
       
  2131 
       
  2132 /*!
       
  2133  * \brief Vector scaling
       
  2134  */
       
  2135 M3G_API void m3gScaleVec3(Vec3 *vec, const M3Gfloat s)
       
  2136 {
       
  2137     vec->x = m3gMul(vec->x, s);
       
  2138     vec->y = m3gMul(vec->y, s);
       
  2139     vec->z = m3gMul(vec->z, s);
       
  2140 }
       
  2141 
       
  2142 /*!
       
  2143  * \brief Vector scaling
       
  2144  */
       
  2145 M3G_API void m3gScaleVec4(Vec4 *vec, const M3Gfloat s)
       
  2146 {
       
  2147     vec->x = m3gMul(vec->x, s);
       
  2148     vec->y = m3gMul(vec->y, s);
       
  2149     vec->z = m3gMul(vec->z, s);
       
  2150     vec->w = m3gMul(vec->w, s);
       
  2151 }
       
  2152 
       
  2153 /*!
       
  2154  * \brief Returns an angle-axis representation for a quaternion
       
  2155  *
       
  2156  * \note There are many, and this is not guaranteed to return a
       
  2157  * particular one
       
  2158  */
       
  2159 M3G_API void m3gGetAngleAxis(const Quat *quat, M3Gfloat *angle, Vec3 *axis)
       
  2160 {
       
  2161     M3Gfloat x, y, z, sinTheta;
       
  2162 
       
  2163     M3G_ASSERT_PTR(quat);
       
  2164     M3G_ASSERT_PTR(angle);
       
  2165     M3G_ASSERT_PTR(axis);
       
  2166 
       
  2167     x = quat->x;
       
  2168     y = quat->y;
       
  2169     z = quat->z;
       
  2170 
       
  2171     sinTheta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(x), m3gSquare(y)),
       
  2172                               m3gSquare(z)));
       
  2173 
       
  2174     if (sinTheta > EPSILON) {
       
  2175         M3Gfloat ooSinTheta = m3gRcp(sinTheta);
       
  2176         axis->x = m3gMul(x, ooSinTheta);
       
  2177         axis->y = m3gMul(y, ooSinTheta);
       
  2178         axis->z = m3gMul(z, ooSinTheta);
       
  2179     }
       
  2180     else {
       
  2181         /* return a valid axis even for no rotation */
       
  2182         axis->x = axis->y = 0.0f;
       
  2183         axis->z = 1.0f;
       
  2184     }
       
  2185     *angle = m3gMul(2.0f * RAD2DEG, m3gArcCos(quat->w));
       
  2186 }
       
  2187 
       
  2188 /*!
       
  2189  * \brief Gets a single matrix column
       
  2190  */
       
  2191 M3G_API void m3gGetMatrixColumn(const Matrix *mtx, M3Gint col, Vec4 *dst)
       
  2192 {
       
  2193     if ((col & ~3) == 0) {
       
  2194         if (!mtx->complete) {
       
  2195             m3gFillClassifiedMatrix((Matrix*)mtx);
       
  2196         }
       
  2197         dst->x = M44F(mtx, 0, col);
       
  2198         dst->y = M44F(mtx, 1, col);
       
  2199         dst->z = M44F(mtx, 2, col);
       
  2200         dst->w = M44F(mtx, 3, col);
       
  2201     }
       
  2202     else {
       
  2203         M3G_ASSERT(M3G_FALSE);
       
  2204     }
       
  2205 }
       
  2206 
       
  2207 /*!
       
  2208  * \brief Returns the floating point values of a matrix as consecutive
       
  2209  * columns
       
  2210  */
       
  2211 M3G_API void m3gGetMatrixColumns(const Matrix *mtx, M3Gfloat *dst)
       
  2212 {
       
  2213     M3G_ASSERT(mtx != NULL && dst != NULL);
       
  2214 
       
  2215     if (!mtx->complete) {
       
  2216         m3gFillClassifiedMatrix((Matrix*)mtx);
       
  2217     }
       
  2218     m3gCopy(dst, mtx->elem, sizeof(mtx->elem));
       
  2219 }
       
  2220 
       
  2221 /*!
       
  2222  * \brief Gets a single matrix row
       
  2223  */
       
  2224 M3G_API void m3gGetMatrixRow(const Matrix *mtx, M3Gint row, Vec4 *dst)
       
  2225 {
       
  2226     if ((row & ~3) == 0) {
       
  2227         if (!mtx->complete) {
       
  2228             m3gFillClassifiedMatrix((Matrix*)mtx);
       
  2229         }
       
  2230         dst->x = M44F(mtx, row, 0);
       
  2231         dst->y = M44F(mtx, row, 1);
       
  2232         dst->z = M44F(mtx, row, 2);
       
  2233         dst->w = M44F(mtx, row, 3);
       
  2234     }
       
  2235     else {
       
  2236         M3G_ASSERT(M3G_FALSE);
       
  2237     }
       
  2238 }
       
  2239 
       
  2240 /*!
       
  2241  * \brief Returns the floating point values of a matrix as consecutive
       
  2242  * rows
       
  2243  */
       
  2244 M3G_API void m3gGetMatrixRows(const Matrix *mtx, M3Gfloat *dst)
       
  2245 {
       
  2246     M3G_ASSERT(mtx != NULL && dst != NULL);
       
  2247 
       
  2248     if (!mtx->complete) {
       
  2249         m3gFillClassifiedMatrix((Matrix*)mtx);
       
  2250     }
       
  2251     {
       
  2252         int row;
       
  2253         for (row = 0; row < 4; ++row) {
       
  2254             *dst++ = mtx->elem[ 0 + row];
       
  2255             *dst++ = mtx->elem[ 4 + row];
       
  2256             *dst++ = mtx->elem[ 8 + row];
       
  2257             *dst++ = mtx->elem[12 + row];
       
  2258         }
       
  2259     }
       
  2260 }
       
  2261 
       
  2262 /*!
       
  2263  * \brief Sets a matrix to identity
       
  2264  */
       
  2265 M3G_API void m3gIdentityMatrix(Matrix *mtx)
       
  2266 {
       
  2267     M3G_ASSERT(mtx != NULL);
       
  2268     m3gClassifyAs(MC_IDENTITY, mtx);
       
  2269 }
       
  2270 
       
  2271 /*!
       
  2272  * \brief Sets a quaternion to identity
       
  2273  */
       
  2274 M3G_API void m3gIdentityQuat(Quat *quat)
       
  2275 {
       
  2276     M3G_ASSERT(quat != NULL);
       
  2277     quat->x = quat->y = quat->z = 0.0f;
       
  2278     quat->w = 1.0f;
       
  2279 }
       
  2280 
       
  2281 /*!
       
  2282  * \brief Inverts a matrix
       
  2283  */
       
  2284 M3G_API M3Gbool m3gInvertMatrix(Matrix *mtx)
       
  2285 {
       
  2286     M3Gfloat *matrix;
       
  2287     M3Gint i;
       
  2288     M3Gfloat tmp[12];
       
  2289     M3Gfloat src[16];
       
  2290     M3Gfloat det;
       
  2291 
       
  2292     M3G_ASSERT(mtx != NULL);
       
  2293 
       
  2294     if (!m3gIsClassified(mtx)) {
       
  2295         m3gClassify(mtx);
       
  2296     }
       
  2297 
       
  2298     /* Quick exit for identity */
       
  2299     
       
  2300     if (mtx->mask == MC_IDENTITY) {
       
  2301         return M3G_TRUE;
       
  2302     }
       
  2303 
       
  2304     /* Look for other common cases; these require that we have valid
       
  2305      * values in all the elements, so fill in the values first */
       
  2306     {
       
  2307         M3Guint mask = mtx->mask;
       
  2308         
       
  2309         if (!mtx->complete) {
       
  2310             m3gFillClassifiedMatrix(mtx);
       
  2311         }
       
  2312         
       
  2313         if ((mask | (0x3F << 24)) == MC_TRANSLATION) {
       
  2314             M44F(mtx, 0, 3) = m3gNegate(M44F(mtx, 0, 3));
       
  2315             M44F(mtx, 1, 3) = m3gNegate(M44F(mtx, 1, 3));
       
  2316             M44F(mtx, 2, 3) = m3gNegate(M44F(mtx, 2, 3));
       
  2317             mtx->mask = MC_TRANSLATION;
       
  2318             return M3G_TRUE;
       
  2319         }
       
  2320         if ((mask | 0x300C03) == MC_SCALING) {
       
  2321             if ((mask &  3       ) == 0 ||
       
  2322                 (mask & (3 << 10)) == 0 ||
       
  2323                 (mask & (3 << 20)) == 0) {
       
  2324                 return M3G_FALSE; /* zero scale for at least one axis */
       
  2325             }
       
  2326             M44F(mtx, 0, 0) = m3gRcp(M44F(mtx, 0, 0));
       
  2327             M44F(mtx, 1, 1) = m3gRcp(M44F(mtx, 1, 1));
       
  2328             M44F(mtx, 2, 2) = m3gRcp(M44F(mtx, 2, 2));
       
  2329             return M3G_TRUE;
       
  2330         }
       
  2331     }
       
  2332 
       
  2333     /* Do a full 4x4 inversion as a last resort */
       
  2334         
       
  2335 	matrix = mtx->elem;
       
  2336 
       
  2337     /* transpose matrix */
       
  2338     for (i = 0; i < 4; i++) {
       
  2339         src[i] = matrix[i*4];
       
  2340         src[i+4] = matrix[i*4+1];
       
  2341         src[i+8] = matrix[i*4+2];
       
  2342         src[i+12] = matrix[i*4+3];
       
  2343     }
       
  2344 
       
  2345     /* calculate pairs for first 8 elements (cofactors) */
       
  2346     tmp[0] = src[10]*src[15];
       
  2347     tmp[1] = src[11]*src[14];
       
  2348     tmp[2] = src[9]*src[15];
       
  2349     tmp[3] = src[11]*src[13];
       
  2350     tmp[4] = src[9]*src[14];
       
  2351     tmp[5] = src[10]*src[13];
       
  2352     tmp[6] = src[8]*src[15];
       
  2353     tmp[7] = src[11]*src[12];
       
  2354     tmp[8] = src[8]*src[14];
       
  2355     tmp[9] = src[10]*src[12];
       
  2356     tmp[10] = src[8]*src[13];
       
  2357     tmp[11] = src[9]*src[12];
       
  2358 
       
  2359     /* calculate first 8 elements (cofactors) */
       
  2360     matrix[0] = tmp[0]*src[5] + tmp[3]*src[6] + tmp[4]*src[7];
       
  2361     matrix[0] -= tmp[1]*src[5] + tmp[2]*src[6] + tmp[5]*src[7];
       
  2362     matrix[1] = tmp[1]*src[4] + tmp[6]*src[6] + tmp[9]*src[7];
       
  2363     matrix[1] -= tmp[0]*src[4] + tmp[7]*src[6] + tmp[8]*src[7];
       
  2364     matrix[2] = tmp[2]*src[4] + tmp[7]*src[5] + tmp[10]*src[7];
       
  2365     matrix[2] -= tmp[3]*src[4] + tmp[6]*src[5] + tmp[11]*src[7];
       
  2366     matrix[3] = tmp[5]*src[4] + tmp[8]*src[5] + tmp[11]*src[6];
       
  2367     matrix[3] -= tmp[4]*src[4] + tmp[9]*src[5] + tmp[10]*src[6];
       
  2368     matrix[4] = tmp[1]*src[1] + tmp[2]*src[2] + tmp[5]*src[3];
       
  2369     matrix[4] -= tmp[0]*src[1] + tmp[3]*src[2] + tmp[4]*src[3];
       
  2370     matrix[5] = tmp[0]*src[0] + tmp[7]*src[2] + tmp[8]*src[3];
       
  2371     matrix[5] -= tmp[1]*src[0] + tmp[6]*src[2] + tmp[9]*src[3];
       
  2372     matrix[6] = tmp[3]*src[0] + tmp[6]*src[1] + tmp[11]*src[3];
       
  2373     matrix[6] -= tmp[2]*src[0] + tmp[7]*src[1] + tmp[10]*src[3];
       
  2374     matrix[7] = tmp[4]*src[0] + tmp[9]*src[1] + tmp[10]*src[2];
       
  2375     matrix[7] -= tmp[5]*src[0] + tmp[8]*src[1] + tmp[11]*src[2];
       
  2376 
       
  2377     /* calculate pairs for second 8 elements (cofactors) */
       
  2378     tmp[0] = src[2]*src[7];
       
  2379     tmp[1] = src[3]*src[6];
       
  2380     tmp[2] = src[1]*src[7];
       
  2381     tmp[3] = src[3]*src[5];
       
  2382     tmp[4] = src[1]*src[6];
       
  2383     tmp[5] = src[2]*src[5];
       
  2384     tmp[6] = src[0]*src[7];
       
  2385     tmp[7] = src[3]*src[4];
       
  2386     tmp[8] = src[0]*src[6];
       
  2387     tmp[9] = src[2]*src[4];
       
  2388     tmp[10] = src[0]*src[5];
       
  2389     tmp[11] = src[1]*src[4];
       
  2390 
       
  2391     /* calculate second 8 elements (cofactors) */
       
  2392     matrix[8] = tmp[0]*src[13] + tmp[3]*src[14] + tmp[4]*src[15];
       
  2393     matrix[8] -= tmp[1]*src[13] + tmp[2]*src[14] + tmp[5]*src[15];
       
  2394     matrix[9] = tmp[1]*src[12] + tmp[6]*src[14] + tmp[9]*src[15];
       
  2395     matrix[9] -= tmp[0]*src[12] + tmp[7]*src[14] + tmp[8]*src[15];
       
  2396     matrix[10] = tmp[2]*src[12] + tmp[7]*src[13] + tmp[10]*src[15];
       
  2397     matrix[10] -= tmp[3]*src[12] + tmp[6]*src[13] + tmp[11]*src[15];
       
  2398     matrix[11] = tmp[5]*src[12] + tmp[8]*src[13] + tmp[11]*src[14];
       
  2399     matrix[11] -= tmp[4]*src[12] + tmp[9]*src[13] + tmp[10]*src[14];
       
  2400     matrix[12] = tmp[2]*src[10] + tmp[5]*src[11] + tmp[1]*src[9];
       
  2401     matrix[12] -= tmp[4]*src[11] + tmp[0]*src[9] + tmp[3]*src[10];
       
  2402     matrix[13] = tmp[8]*src[11] + tmp[0]*src[8] + tmp[7]*src[10];
       
  2403     matrix[13] -= tmp[6]*src[10] + tmp[9]*src[11] + tmp[1]*src[8];
       
  2404     matrix[14] = tmp[6]*src[9] + tmp[11]*src[11] + tmp[3]*src[8];
       
  2405     matrix[14] -= tmp[10]*src[11] + tmp[2]*src[8] + tmp[7]*src[9];
       
  2406     matrix[15] = tmp[10]*src[10] + tmp[4]*src[8] + tmp[9]*src[9];
       
  2407     matrix[15] -= tmp[8]*src[9] + tmp[11]*src[10] + tmp[5]*src[8];
       
  2408 
       
  2409     /* calculate determinant */
       
  2410     det = src[0]*matrix[0]+src[1]*matrix[1]+src[2]*matrix[2]+src[3]*matrix[3];
       
  2411 
       
  2412     /* matrix has no inverse */
       
  2413     if (det == 0.0f) {
       
  2414         return M3G_FALSE;
       
  2415     }
       
  2416 
       
  2417     /* calculate matrix inverse */
       
  2418     det = 1/det;
       
  2419     for (i = 0; i < 16; i++) {
       
  2420         matrix[i] *= det;
       
  2421     }
       
  2422 
       
  2423     mtx->classified = M3G_FALSE;
       
  2424 	return M3G_TRUE;
       
  2425 }
       
  2426 
       
  2427 /*!
       
  2428  * \brief Sets a matrix to the inverse of another matrix
       
  2429  */
       
  2430 M3G_API M3Gbool m3gMatrixInverse(Matrix *mtx, const Matrix *other)
       
  2431 {
       
  2432     M3G_ASSERT(mtx != NULL && other != NULL);
       
  2433 
       
  2434     if (!m3gIsClassified(other)) {
       
  2435         m3gClassify((Matrix*)other);
       
  2436     }
       
  2437 
       
  2438 	m3gCopyMatrix(mtx, other);
       
  2439 	return m3gInvertMatrix(mtx);
       
  2440 }
       
  2441 
       
  2442 /*!
       
  2443  * \brief Sets a matrix to the transpose of another matrix
       
  2444  */
       
  2445 M3G_API void m3gMatrixTranspose(Matrix *mtx, const Matrix *other)
       
  2446 {
       
  2447     M3Gbyte i;
       
  2448     M3G_ASSERT(mtx != NULL && other != NULL);
       
  2449 
       
  2450     if (!other->complete) {
       
  2451         m3gFillClassifiedMatrix((Matrix *)other);
       
  2452     }
       
  2453 
       
  2454     for (i = 0; i < 4; i++) {
       
  2455         mtx->elem[i] = other->elem[i*4];
       
  2456         mtx->elem[i+4] = other->elem[i*4+1];
       
  2457         mtx->elem[i+8] = other->elem[i*4+2];
       
  2458         mtx->elem[i+12] = other->elem[i*4+3];
       
  2459     }
       
  2460     mtx->classified = M3G_FALSE;
       
  2461     mtx->complete = M3G_TRUE;
       
  2462 }
       
  2463 
       
  2464 M3G_API M3Gbool m3gInverseTranspose(Matrix *mtx, const Matrix *other)
       
  2465 {
       
  2466     Matrix temp;
       
  2467     if (!m3gMatrixInverse(&temp, other)) {
       
  2468         return M3G_FALSE;
       
  2469     }
       
  2470     m3gMatrixTranspose(mtx, &temp);
       
  2471     return M3G_TRUE;
       
  2472 }
       
  2473 
       
  2474 /*!
       
  2475  * \brief Sets a matrix to the product of two other matrices
       
  2476  *
       
  2477  * \note \c dst can not be either of \c left or \c right; if it is,
       
  2478  * the results are undefined
       
  2479  */
       
  2480 M3G_API void m3gMatrixProduct(Matrix *dst, const Matrix *left, const Matrix *right)
       
  2481 {
       
  2482     M3G_ASSERT_PTR(dst);
       
  2483     M3G_ASSERT_PTR(left);
       
  2484     M3G_ASSERT_PTR(right);
       
  2485     M3G_ASSERT(dst != left && dst != right);
       
  2486 
       
  2487     /* Classify input matrices and take early exits for identities */
       
  2488 
       
  2489     if (!m3gIsClassified(left)) {
       
  2490         m3gClassify((Matrix*)left);
       
  2491     }
       
  2492     if (left->mask == MC_IDENTITY) {
       
  2493         m3gCopyMatrix(dst, right);
       
  2494         return;
       
  2495     }
       
  2496 
       
  2497     if (!m3gIsClassified(right)) {
       
  2498         m3gClassify((Matrix*)right);
       
  2499     }
       
  2500     if (right->mask == MC_IDENTITY) {
       
  2501         m3gCopyMatrix(dst, left);
       
  2502         return;
       
  2503     }
       
  2504 
       
  2505     /* Special quick paths for 3x4 matrices */
       
  2506     
       
  2507     if (m3gIsWUnity(left) && m3gIsWUnity(right)) {
       
  2508 
       
  2509         /* Translation? */
       
  2510         
       
  2511         if ((left->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
       
  2512             
       
  2513             if (left->mask != MC_TRANSLATION && !left->complete) {
       
  2514                 m3gFillClassifiedMatrix((Matrix*)left);
       
  2515             }
       
  2516             if (right->mask != MC_TRANSLATION && !right->complete) {
       
  2517                 m3gFillClassifiedMatrix((Matrix*)right);
       
  2518             }
       
  2519 
       
  2520             m3gCopyMatrix(dst, right);
       
  2521             
       
  2522             M44F(dst, 0, 3) = m3gAdd(M44F(left, 0, 3), M44F(dst, 0, 3));
       
  2523             M44F(dst, 1, 3) = m3gAdd(M44F(left, 1, 3), M44F(dst, 1, 3));
       
  2524             M44F(dst, 2, 3) = m3gAdd(M44F(left, 2, 3), M44F(dst, 2, 3));
       
  2525             
       
  2526             dst->mask |= MC_TRANSLATION_PART;
       
  2527             return;
       
  2528         }
       
  2529 
       
  2530         if ((right->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
       
  2531             Vec4 tvec;
       
  2532 
       
  2533             if (left->mask != MC_TRANSLATION && !left->complete) {
       
  2534                 m3gFillClassifiedMatrix((Matrix*)left);
       
  2535             }
       
  2536             if (right->mask != MC_TRANSLATION && !right->complete) {
       
  2537                 m3gFillClassifiedMatrix((Matrix*)right);
       
  2538             }
       
  2539 
       
  2540             m3gCopyMatrix(dst, left);
       
  2541             
       
  2542             m3gGetMatrixColumn(right, 3, &tvec);
       
  2543             m3gTransformVec4(dst, &tvec);
       
  2544             
       
  2545             M44F(dst, 0, 3) = tvec.x;
       
  2546             M44F(dst, 1, 3) = tvec.y;
       
  2547             M44F(dst, 2, 3) = tvec.z;
       
  2548             
       
  2549             dst->mask |= MC_TRANSLATION_PART;
       
  2550             return;
       
  2551         }
       
  2552 
       
  2553     }
       
  2554         
       
  2555     /* Compute product and set output classification */
       
  2556 
       
  2557     m3gGenericMatrixProduct(dst, left, right);
       
  2558 }
       
  2559 
       
  2560 /*!
       
  2561  * \brief Normalizes a quaternion
       
  2562  */
       
  2563 M3G_API void m3gNormalizeQuat(Quat *q)
       
  2564 {
       
  2565     M3Gfloat norm;
       
  2566     M3G_ASSERT_PTR(q);
       
  2567     
       
  2568     norm = m3gNorm4(&q->x);
       
  2569     
       
  2570     if (norm > EPSILON) {
       
  2571         norm = m3gRcpSqrt(norm);
       
  2572         m3gScale4(&q->x, norm);
       
  2573     }
       
  2574     else {
       
  2575         m3gIdentityQuat(q);
       
  2576     }
       
  2577 }
       
  2578 
       
  2579 /*!
       
  2580  * \brief Normalizes a three-vector
       
  2581  */
       
  2582 M3G_API void m3gNormalizeVec3(Vec3 *v)
       
  2583 {
       
  2584     M3Gfloat norm;
       
  2585     M3G_ASSERT_PTR(v);
       
  2586     
       
  2587     norm = m3gNorm3(&v->x);
       
  2588     
       
  2589     if (norm > EPSILON) {
       
  2590         norm = m3gRcpSqrt(norm);
       
  2591         m3gScale3(&v->x, norm);
       
  2592     }
       
  2593     else {
       
  2594         m3gZero(v, sizeof(Vec3));
       
  2595     }
       
  2596 }
       
  2597 
       
  2598 /*!
       
  2599  * \brief Normalizes a four-vector
       
  2600  */
       
  2601 M3G_API void m3gNormalizeVec4(Vec4 *v)
       
  2602 {
       
  2603     M3Gfloat norm;
       
  2604     M3G_ASSERT_PTR(v);
       
  2605     
       
  2606     norm = m3gNorm4(&v->x);
       
  2607     
       
  2608     if (norm > EPSILON) {
       
  2609         norm = m3gRcpSqrt(norm);
       
  2610         m3gScale4(&v->x, norm);
       
  2611     }
       
  2612     else {
       
  2613         m3gZero(v, sizeof(Vec4));
       
  2614     }
       
  2615 }
       
  2616 
       
  2617 /*!
       
  2618  * \brief Multiplies a matrix from the right with another matrix
       
  2619  */
       
  2620 M3G_API void m3gPostMultiplyMatrix(Matrix *mtx, const Matrix *other)
       
  2621 {
       
  2622     Matrix temp;
       
  2623     M3G_ASSERT_PTR(mtx);
       
  2624     M3G_ASSERT_PTR(other);
       
  2625 
       
  2626     m3gCopyMatrix(&temp, mtx);
       
  2627     m3gMatrixProduct(mtx, &temp, other);
       
  2628 }
       
  2629 
       
  2630 /*!
       
  2631  * \brief Multiplies a matrix from the left with another matrix
       
  2632  */
       
  2633 M3G_API void m3gPreMultiplyMatrix(Matrix *mtx, const Matrix *other)
       
  2634 {
       
  2635     Matrix temp;
       
  2636     M3G_ASSERT_PTR(mtx);
       
  2637     M3G_ASSERT_PTR(other);
       
  2638 
       
  2639     m3gCopyMatrix(&temp, mtx);
       
  2640     m3gMatrixProduct(mtx, other, &temp);
       
  2641 }
       
  2642 
       
  2643 /*!
       
  2644  * \brief Multiplies a matrix with a rotation matrix.
       
  2645  */
       
  2646 M3G_API void m3gPostRotateMatrix(Matrix *mtx,
       
  2647                          M3Gfloat angle,
       
  2648                          M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
       
  2649 {
       
  2650     Quat q;
       
  2651     m3gSetAngleAxis(&q, angle, ax, ay, az);
       
  2652     m3gPostRotateMatrixQuat(mtx, &q);
       
  2653 }
       
  2654 
       
  2655 /*!
       
  2656  * \brief Multiplies a matrix with a translation matrix.
       
  2657  */
       
  2658 M3G_API void m3gPostRotateMatrixQuat(Matrix *mtx, const Quat *quat)
       
  2659 {
       
  2660     Matrix temp;
       
  2661     m3gQuatMatrix(&temp, quat);
       
  2662     m3gPostMultiplyMatrix(mtx, &temp);
       
  2663 }
       
  2664 
       
  2665 /*!
       
  2666  * \brief Multiplies a matrix with a scale matrix.
       
  2667  */
       
  2668 M3G_API void m3gPostScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
       
  2669 {
       
  2670     Matrix temp;
       
  2671     m3gScalingMatrix(&temp, sx, sy, sz);
       
  2672     m3gPostMultiplyMatrix(mtx, &temp);
       
  2673 }
       
  2674 
       
  2675 /*!
       
  2676  * \brief Multiplies a matrix with a translation (matrix).
       
  2677  */
       
  2678 M3G_API void m3gPostTranslateMatrix(Matrix *mtx,
       
  2679                             M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
       
  2680 {
       
  2681     Matrix temp;
       
  2682     m3gTranslationMatrix(&temp, tx, ty, tz);
       
  2683     m3gPostMultiplyMatrix(mtx, &temp);
       
  2684 }
       
  2685 
       
  2686 /*!
       
  2687  * \brief Multiplies a matrix with a rotation matrix
       
  2688  */
       
  2689 M3G_API void m3gPreRotateMatrix(Matrix *mtx,
       
  2690                         M3Gfloat angle,
       
  2691                         M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
       
  2692 {
       
  2693     Quat q;
       
  2694     m3gSetAngleAxis(&q, angle, ax, ay, az);
       
  2695     m3gPreRotateMatrixQuat(mtx, &q);
       
  2696 }
       
  2697 
       
  2698 /*!
       
  2699  * \brief Multiplies a matrix with a quaternion rotation
       
  2700  */
       
  2701 M3G_API void m3gPreRotateMatrixQuat(Matrix *mtx, const Quat *quat)
       
  2702 {
       
  2703     Matrix temp;
       
  2704     m3gQuatMatrix(&temp, quat);
       
  2705     m3gPreMultiplyMatrix(mtx, &temp);
       
  2706 }
       
  2707 
       
  2708 /*!
       
  2709  * \brief Multiplies a matrix with a scale matrix.
       
  2710  */
       
  2711 M3G_API void m3gPreScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
       
  2712 {
       
  2713     Matrix temp;
       
  2714     m3gScalingMatrix(&temp, sx, sy, sz);
       
  2715     m3gPreMultiplyMatrix(mtx, &temp);
       
  2716 }
       
  2717 
       
  2718 /*!
       
  2719  * \brief Multiplies a matrix with a translation (matrix).
       
  2720  */
       
  2721 M3G_API void m3gPreTranslateMatrix(Matrix *mtx,
       
  2722                            M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
       
  2723 {
       
  2724     Matrix temp;
       
  2725     m3gTranslationMatrix(&temp, tx, ty, tz);
       
  2726     m3gPreMultiplyMatrix(mtx, &temp);
       
  2727 }
       
  2728 
       
  2729 /*!
       
  2730  * \brief Converts a quaternion into a matrix
       
  2731  *
       
  2732  * The output is a matrix effecting the same rotation as the
       
  2733  * quaternion passed as input
       
  2734  */
       
  2735 M3G_API void m3gQuatMatrix(Matrix *mtx, const Quat *quat)
       
  2736 {
       
  2737     M3Gfloat qx = quat->x;
       
  2738     M3Gfloat qy = quat->y;
       
  2739     M3Gfloat qz = quat->z;
       
  2740     M3Gfloat qw = quat->w;
       
  2741 
       
  2742     /* Quick exit for identity rotations */
       
  2743 
       
  2744     if (IS_ZERO(qx) && IS_ZERO(qy) && IS_ZERO(qz)) {
       
  2745         m3gIdentityMatrix(mtx);
       
  2746         return;
       
  2747     }
       
  2748 
       
  2749     {
       
  2750         /* Determine the rough type of the output matrix */
       
  2751 
       
  2752         M3Guint type = MC_SCALING_ROTATION;
       
  2753         if (IS_ZERO(qz) && IS_ZERO(qy)) {
       
  2754             type = MC_X_ROTATION;
       
  2755         }
       
  2756         else if (IS_ZERO(qz) && IS_ZERO(qx)) {
       
  2757             type = MC_Y_ROTATION;
       
  2758         }
       
  2759         else if (IS_ZERO(qx) && IS_ZERO(qy)) {
       
  2760             type = MC_Z_ROTATION;
       
  2761         }
       
  2762         m3gClassifyAs(type, mtx);
       
  2763 
       
  2764         /* Generate the non-constant parts of the matrix */
       
  2765         {
       
  2766             M3Gfloat wx, wy, wz, xx, yy, yz, xy, xz, zz;
       
  2767 
       
  2768             xx = m3gMul(qx, qx);
       
  2769             xy = m3gMul(qx, qy);
       
  2770             xz = m3gMul(qx, qz);
       
  2771             yy = m3gMul(qy, qy);
       
  2772             yz = m3gMul(qy, qz);
       
  2773             zz = m3gMul(qz, qz);
       
  2774             wx = m3gMul(qw, qx);
       
  2775             wy = m3gMul(qw, qy);
       
  2776             wz = m3gMul(qw, qz);
       
  2777 
       
  2778             if (type != MC_X_ROTATION) {
       
  2779                 M44F(mtx, 0, 0) = m3gSub(1.f, m3gDouble(m3gAdd(yy, zz)));
       
  2780                 M44F(mtx, 0, 1) =             m3gDouble(m3gSub(xy, wz));
       
  2781                 M44F(mtx, 0, 2) =             m3gDouble(m3gAdd(xz, wy));
       
  2782             }
       
  2783 
       
  2784             if (type != MC_Y_ROTATION) {
       
  2785                 M44F(mtx, 1, 0) =             m3gDouble(m3gAdd(xy, wz));
       
  2786                 M44F(mtx, 1, 1) = m3gSub(1.f, m3gDouble(m3gAdd(xx, zz)));
       
  2787                 M44F(mtx, 1, 2) =             m3gDouble(m3gSub(yz, wx));
       
  2788             }
       
  2789 
       
  2790             if (type != MC_Z_ROTATION) {
       
  2791                 M44F(mtx, 2, 0) =             m3gDouble(m3gSub(xz, wy));
       
  2792                 M44F(mtx, 2, 1) =             m3gDouble(m3gAdd(yz, wx));
       
  2793                 M44F(mtx, 2, 2) = m3gSub(1.f, m3gDouble(m3gAdd(xx, yy)));
       
  2794             }
       
  2795         }
       
  2796         m3gSubClassify(mtx);
       
  2797     }
       
  2798 }
       
  2799 
       
  2800 /*!
       
  2801  * \brief Generates a scaling matrix
       
  2802  */
       
  2803 M3G_API void m3gScalingMatrix(
       
  2804     Matrix *mtx,
       
  2805     const M3Gfloat sx, const M3Gfloat sy, const M3Gfloat sz)
       
  2806 {
       
  2807     M3G_ASSERT_PTR(mtx);
       
  2808     M44F(mtx, 0, 0) = sx;
       
  2809     M44F(mtx, 1, 1) = sy;
       
  2810     M44F(mtx, 2, 2) = sz;
       
  2811     m3gClassifyAs(MC_SCALING, mtx);
       
  2812     m3gSubClassify(mtx);
       
  2813 }
       
  2814 
       
  2815 /*!
       
  2816  * \brief Sets a quaternion to represent an angle-axis rotation
       
  2817  */
       
  2818 M3G_API void m3gSetAngleAxis(Quat *quat,
       
  2819                      M3Gfloat angle,
       
  2820                      M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
       
  2821 {
       
  2822     m3gSetAngleAxisRad(quat, m3gMul(angle, M3G_DEG2RAD), ax, ay, az);
       
  2823 }
       
  2824 
       
  2825 /*!
       
  2826  * \brief Sets a quaternion to represent an angle-axis rotation
       
  2827  */
       
  2828 M3G_API void m3gSetAngleAxisRad(Quat *quat,
       
  2829                         M3Gfloat angleRad,
       
  2830                         M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
       
  2831 {
       
  2832     M3G_ASSERT_PTR(quat);
       
  2833     
       
  2834     if (!IS_ZERO(angleRad)) {
       
  2835         M3Gfloat s;
       
  2836         M3Gfloat halfAngle = m3gHalf(angleRad);
       
  2837 
       
  2838         s = m3gSin(halfAngle);
       
  2839         
       
  2840         {
       
  2841             M3Gfloat sqrNorm = m3gMadd(ax, ax, m3gMadd(ay, ay, m3gMul(az, az)));
       
  2842             if (sqrNorm < 0.995f || sqrNorm > 1.005f) {
       
  2843                 if (sqrNorm > EPSILON) {
       
  2844                     M3Gfloat ooNorm = m3gRcpSqrt(sqrNorm);
       
  2845                     ax = m3gMul(ax, ooNorm);
       
  2846                     ay = m3gMul(ay, ooNorm);
       
  2847                     az = m3gMul(az, ooNorm);
       
  2848                 }
       
  2849                 else {
       
  2850                     ax = ay = az = 0.0f;
       
  2851                 }
       
  2852             }
       
  2853         }
       
  2854 
       
  2855         quat->x = m3gMul(s, ax);
       
  2856         quat->y = m3gMul(s, ay);
       
  2857         quat->z = m3gMul(s, az);
       
  2858         quat->w = m3gCos(halfAngle);
       
  2859     }
       
  2860     else {
       
  2861         m3gIdentityQuat(quat);
       
  2862     }
       
  2863 }
       
  2864 
       
  2865 /*!
       
  2866  * \brief Quaternion multiplication.
       
  2867  */
       
  2868 M3G_API void m3gMulQuat(Quat *quat, const Quat *other)
       
  2869 {
       
  2870     Quat q;
       
  2871     q = *quat;
       
  2872 
       
  2873     quat->w = m3gMul(q.w, other->w) - m3gMul(q.x, other->x) - m3gMul(q.y, other->y) - m3gMul(q.z, other->z);
       
  2874     quat->x = m3gMul(q.w, other->x) + m3gMul(q.x, other->w) + m3gMul(q.y, other->z) - m3gMul(q.z, other->y);
       
  2875     quat->y = m3gMul(q.w, other->y) - m3gMul(q.x, other->z) + m3gMul(q.y, other->w) + m3gMul(q.z, other->x);
       
  2876     quat->z = m3gMul(q.w, other->z) + m3gMul(q.x, other->y) - m3gMul(q.y, other->x) + m3gMul(q.z, other->w);
       
  2877 }
       
  2878 
       
  2879 /*!
       
  2880  * \brief Makes this quaternion represent the rotation from one 3D
       
  2881  * vector to another
       
  2882  */
       
  2883 M3G_API void m3gSetQuatRotation(Quat *quat,
       
  2884                                 const Vec3 *from, const Vec3 *to)
       
  2885 {
       
  2886     M3Gfloat cosAngle;
       
  2887 
       
  2888     M3G_ASSERT_PTR(quat);
       
  2889     M3G_ASSERT_PTR(from);
       
  2890     M3G_ASSERT_PTR(to);
       
  2891 
       
  2892     cosAngle = m3gDot3(from, to);
       
  2893 
       
  2894     if (cosAngle > (1.0f - EPSILON)) {  /* zero angle */
       
  2895         m3gIdentityQuat(quat);
       
  2896         return;
       
  2897     }
       
  2898     else if (cosAngle > (1.0e-3f - 1.0f)) { /* normal case */
       
  2899         Vec3 axis;
       
  2900         m3gCross(&axis, from, to);
       
  2901         m3gSetAngleAxisRad(quat, m3gArcCos(cosAngle), axis.x, axis.y, axis.z);
       
  2902     }
       
  2903     else {
       
  2904 
       
  2905         /* Opposite vectors; must generate an arbitrary perpendicular
       
  2906          * vector and use that as the rotation axis. Here, we try the
       
  2907          * Z axis first, and if that seems too parallel to the
       
  2908          * vectors, project the Y axis instead: Z is the only good
       
  2909          * choice for Z-constrained rotations, and Y by definition
       
  2910          * must be perpendicular to that. */
       
  2911 
       
  2912         Vec3 axis, temp;
       
  2913         M3Gfloat s;
       
  2914 
       
  2915         axis.x = axis.y = axis.z = 0.0f;
       
  2916         if (m3gAbs(from->z) < (1.0f - EPSILON)) {
       
  2917             axis.z = 1.0f;
       
  2918         }
       
  2919         else {
       
  2920             axis.y = 1.0f;
       
  2921         }
       
  2922 
       
  2923         s = m3gDot3(&axis, from);
       
  2924         temp = *from;
       
  2925         m3gScaleVec3(&temp, s);
       
  2926         m3gSubVec3(&axis, &temp);
       
  2927 
       
  2928         m3gSetAngleAxis(quat, 180.f, axis.x, axis.y, axis.z);
       
  2929     }
       
  2930 }
       
  2931 
       
  2932 /*!
       
  2933  * \brief Sets the values of a matrix
       
  2934  *
       
  2935  */
       
  2936 M3G_API void m3gSetMatrixColumns(Matrix *mtx, const M3Gfloat *src)
       
  2937 {
       
  2938     M3G_ASSERT(mtx != NULL && src != NULL);
       
  2939 
       
  2940     m3gCopy(mtx->elem, src, sizeof(mtx->elem));
       
  2941     mtx->classified = M3G_FALSE;
       
  2942     mtx->complete = M3G_TRUE;
       
  2943 }
       
  2944 
       
  2945 /*!
       
  2946  * \brief Sets the values of a matrix
       
  2947  *
       
  2948  */
       
  2949 M3G_API void m3gSetMatrixRows(Matrix *mtx, const M3Gfloat *src)
       
  2950 {
       
  2951     M3G_ASSERT(mtx != NULL && src != NULL);
       
  2952     {
       
  2953         int row;
       
  2954         for (row = 0; row < 4; ++row) {
       
  2955             mtx->elem[ 0 + row] = *src++;
       
  2956             mtx->elem[ 4 + row] = *src++;
       
  2957             mtx->elem[ 8 + row] = *src++;
       
  2958             mtx->elem[12 + row] = *src++;
       
  2959         }
       
  2960     }
       
  2961     mtx->classified = M3G_FALSE;
       
  2962     mtx->complete = M3G_TRUE;
       
  2963 }
       
  2964 
       
  2965 /*!
       
  2966  * \brief Transforms a 4-vector with a matrix
       
  2967  */
       
  2968 #if defined(M3G_HW_FLOAT_VFPV2)
       
  2969 
       
  2970 __asm void _m3gTransformVec4(const Matrix *mtx, Vec4 *vec, M3Gint n)
       
  2971 {
       
  2972 
       
  2973 		CODE32
       
  2974 
       
  2975 		FSTMFDD	sp!, {d8-d11}
       
  2976 
       
  2977 		FMRX	r3, FPSCR		 
       
  2978 		BIC		r12, r3, #0x00370000
       
  2979 		ORR		r12, #(3<<16)
       
  2980 		FMXR	FPSCR, r12
       
  2981 		CMP		r2, #4
       
  2982 
       
  2983 		FLDMIAS	r0, {s4-s19}		// [mtx0  mtx1 ..]
       
  2984 		FLDMIAS	r1, {s0-s3}			// [vec.x  vec.y  vec.z  vec.w]
       
  2985 		FMULS	s20, s4, s0			// [s20-s23]  = [v.x*mtx0  v.x*mtx1  v.x*mtx2  v.x*mtx3 ]
       
  2986 		FMACS	s20, s8, s1			// [s20-s23] += [v.y*mtx4  v.y*mtx5  v.y*mtx6  v.y*mtx7 ]
       
  2987 		FMACS	s20, s12, s2		// [s20-s23] += [v.z*mtx8  v.z*mtx9  v.z*mtx10 v.z*mtx11]
       
  2988 		FMACS	s20, s16, s3		// [s20-s23] += [v.w*mtx12 v.w*mtx13 v.w*mtx14 v.w*mtx15]
       
  2989 		FSTMIAS		r1!, {s20-s22}
       
  2990 		FSTMIASEQ	r1, {s23}
       
  2991 
       
  2992 		FMXR	FPSCR, r3
       
  2993 		FLDMFDD	sp!, {d8-d11}
       
  2994 
       
  2995 		BX		lr
       
  2996 }
       
  2997 #endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
       
  2998 
       
  2999 M3G_API void m3gTransformVec4(const Matrix *mtx, Vec4 *vec)
       
  3000 {
       
  3001     M3Guint type;
       
  3002     M3G_ASSERT(mtx != NULL && vec != NULL);
       
  3003 
       
  3004     if (!m3gIsClassified(mtx)) {
       
  3005         m3gClassify((Matrix*)mtx);
       
  3006     }
       
  3007 
       
  3008     type = mtx->mask;
       
  3009 
       
  3010     if (type == MC_IDENTITY) {
       
  3011         return;
       
  3012     }
       
  3013     else {
       
  3014         Vec4 v = *vec;
       
  3015         int i;
       
  3016         int n = m3gIsWUnity(mtx) ? 3 : 4;
       
  3017 
       
  3018         if (!mtx->complete) {
       
  3019             m3gFillClassifiedMatrix((Matrix*)mtx);
       
  3020         }
       
  3021 #if	defined(M3G_HW_FLOAT_VFPV2)
       
  3022 		_m3gTransformVec4(mtx, vec, n);
       
  3023 #else
       
  3024         for (i = 0; i < n; ++i) {
       
  3025             M3Gfloat d = m3gMul(M44F(mtx, i, 0), v.x);
       
  3026             d = m3gMadd(M44F(mtx, i, 1), v.y, d);
       
  3027             d = m3gMadd(M44F(mtx, i, 2), v.z, d);
       
  3028             d = m3gMadd(M44F(mtx, i, 3), v.w, d);
       
  3029             (&vec->x)[i] = d;
       
  3030         }
       
  3031 #endif
       
  3032     }
       
  3033 }
       
  3034 
       
  3035 /*!
       
  3036  * \brief Generates a translation matrix
       
  3037  */
       
  3038 M3G_API void m3gTranslationMatrix(
       
  3039     Matrix *mtx,
       
  3040     const M3Gfloat tx, const M3Gfloat ty, const M3Gfloat tz)
       
  3041 {
       
  3042     M3G_ASSERT_PTR(mtx);
       
  3043     M44F(mtx, 0, 3) = tx;
       
  3044     M44F(mtx, 1, 3) = ty;
       
  3045     M44F(mtx, 2, 3) = tz;
       
  3046     m3gClassifyAs(MC_TRANSLATION, mtx);
       
  3047     m3gSubClassify(mtx);
       
  3048 }
       
  3049 
       
  3050 /*!
       
  3051  * \brief Float vector assignment.
       
  3052  */
       
  3053 M3G_API void m3gSetQuat(Quat *quat, const M3Gfloat *vec)
       
  3054 {
       
  3055     quat->x = vec[0];
       
  3056     quat->y = vec[1];
       
  3057     quat->z = vec[2];
       
  3058     quat->w = vec[3];
       
  3059 }
       
  3060 
       
  3061 /*!
       
  3062  * \brief Slerp between quaternions q0 and q1, storing the result in quat.
       
  3063  */
       
  3064 M3G_API void m3gSlerpQuat(Quat *quat,
       
  3065                   M3Gfloat s,
       
  3066                   const Quat *q0, const Quat *q1)
       
  3067 {
       
  3068     M3Gfloat s0, s1;
       
  3069     M3Gfloat cosTheta = m3gDot4((const Vec4 *)q0, (const Vec4 *)q1);
       
  3070     M3Gfloat oneMinusS = m3gSub(1.0f, s);
       
  3071 
       
  3072     if (cosTheta > EPSILON - 1.0f) {
       
  3073         if (cosTheta < 1.0f - EPSILON) {
       
  3074             M3Gfloat theta    = m3gArcCos(cosTheta);
       
  3075             M3Gfloat sinTheta = m3gSin(theta);
       
  3076             s0 = m3gSin(m3gMul(oneMinusS, theta)) / sinTheta;
       
  3077             s1 = m3gSin(m3gMul(s, theta)) / sinTheta;
       
  3078         }
       
  3079         else {
       
  3080             /* For quaternions very close to each other, use plain
       
  3081                linear interpolation for numerical stability. */
       
  3082             s0 = oneMinusS;
       
  3083             s1 = s;
       
  3084         }
       
  3085         quat->x = m3gMadd(s0, q0->x, m3gMul(s1, q1->x));
       
  3086         quat->y = m3gMadd(s0, q0->y, m3gMul(s1, q1->y));
       
  3087         quat->z = m3gMadd(s0, q0->z, m3gMul(s1, q1->z));
       
  3088         quat->w = m3gMadd(s0, q0->w, m3gMul(s1, q1->w));
       
  3089     }
       
  3090     else {
       
  3091         /* Slerp is undefined if the two quaternions are (nearly)
       
  3092            opposite, so we just pick an arbitrary plane of
       
  3093            rotation here. */
       
  3094 
       
  3095         quat->x = -q0->y;
       
  3096         quat->y =  q0->x;
       
  3097         quat->z = -q0->w;
       
  3098         quat->w =  q0->z;
       
  3099 
       
  3100         s0 = m3gSin(m3gMul(oneMinusS, HALF_PI));
       
  3101         s1 = m3gSin(m3gMul(s, HALF_PI));
       
  3102 
       
  3103         quat->x = m3gMadd(s0, q0->x, m3gMul(s1, quat->x));
       
  3104         quat->y = m3gMadd(s0, q0->y, m3gMul(s1, quat->y));
       
  3105         quat->z = m3gMadd(s0, q0->z, m3gMul(s1, quat->z));
       
  3106     }
       
  3107 }
       
  3108 
       
  3109 /*!
       
  3110  * \brief Interpolate quaternions using spline, or "squad" interpolation.
       
  3111  */
       
  3112 M3G_API void m3gSquadQuat(Quat *quat,
       
  3113                   M3Gfloat s,
       
  3114                   const Quat *q0, const Quat *a, const Quat *b, const Quat *q1)
       
  3115 {
       
  3116 
       
  3117     Quat temp0;
       
  3118     Quat temp1;
       
  3119     m3gSlerpQuat(&temp0, s, q0, q1);
       
  3120     m3gSlerpQuat(&temp1, s, a, b);
       
  3121 
       
  3122     m3gSlerpQuat(quat, m3gDouble(m3gMul(s, m3gSub(1.0f, s))), &temp0, &temp1);
       
  3123 }
       
  3124 
       
  3125 
       
  3126