m3g/m3gcore11/src/m3g_math.c
changeset 0 5d03bc08d59c
child 33 25f95128741d
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/m3g/m3gcore11/src/m3g_math.c	Tue Feb 02 01:47:50 2010 +0200
@@ -0,0 +1,3126 @@
+/*
+* Copyright (c) 2003 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of the License "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: Internal math function implementations
+*
+*/
+
+
+/*!
+ * \internal
+ * \file
+ * \brief Internal math function implementations
+ *
+ */
+
+#ifndef M3G_CORE_INCLUDE
+#   error included by m3g_core.c; do not compile separately.
+#endif
+
+#include "m3g_defs.h"
+#include "m3g_memory.h"
+
+#if defined(M3G_SOFT_FLOAT)
+#include <math.h>
+#endif
+
+/*----------------------------------------------------------------------
+ * Private types and definitions
+ *--------------------------------------------------------------------*/
+
+/* Enumerated common matrix classifications */
+#define MC_IDENTITY             0x40100401 // 01000000 00010000 00000100 00000001
+#define MC_FRUSTUM              0x30BF0C03 // 00110000 10111111 00001100 00000011
+#define MC_PERSPECTIVE          0x30B00C03 // 00110000 10110000 00001100 00000011
+#define MC_ORTHO                0x7F300C03 // 01111111 00110000 00001100 00000011
+#define MC_PARALLEL             0x70300C03 // 01110000 00110000 00001100 00000011
+#define MC_SCALING_ROTATION     0x403F3F3F // 01000000 00111111 00111111 00111111
+#define MC_SCALING              0x40300C03 // 01000000 00110000 00001100 00000011
+#define MC_TRANSLATION          0x7F100401 // 01111111 00010000 00000100 00000001
+#define MC_X_ROTATION           0x403C3C01 // 01000000 00111100 00111100 00000001
+#define MC_Y_ROTATION           0x40330433 // 01000000 00110011 00000100 00110011
+#define MC_Z_ROTATION           0x40100F0F // 01000000 00010000 00001111 00001111
+#define MC_W_UNITY              0x7F3F3F3F // 01111111 00111111 00111111 00111111
+#define MC_GENERIC              0xFFFFFFFF
+
+/* Partial masks for individual matrix components */
+#define MC_TRANSLATION_PART     0x3F000000 // 00111111 00000000 00000000 00000000
+#define MC_SCALE_PART           0x00300C03 // 00000000 00110000 00001100 00000011
+#define MC_SCALE_ROTATION_PART  0x003F3F3F // 00000000 00111111 00111111 00111111
+
+/* Matrix element classification masks */
+#define ELEM_ZERO       0x00
+#define ELEM_ONE        0x01
+#define ELEM_MINUS_ONE  0x02
+#define ELEM_ANY        0x03
+
+/*!
+ * \internal
+ * \brief calculates the offset of a 4x4 matrix element in a linear
+ * array
+ *
+ * \notes The current convention is column-major, as in OpenGL ES
+ */
+#define MELEM(row, col) ((row) + ((col) << 2))
+
+/*!
+ * \internal
+ * \brief Macro for accessing 4x4 float matrix elements
+ *
+ * \param mtx pointer to the first element of the matrix
+ * \param row matrix row
+ * \param col matrix column
+ */
+#define M44F(mtx, row, col)     ((mtx)->elem[MELEM((row), (col))])
+
+/*--------------------------------------------------------------------*/
+
+/*----------------------------------------------------------------------
+ * Private functions
+ *--------------------------------------------------------------------*/
+
+
+/*!
+ * \internal
+ * \brief ARM VFPv2 implementation of 4x4 matrix multiplication.
+ *
+ * \param dst	multiplication result
+ * \param left	left-hand matrix
+ * \param right right-hand matrix
+ */
+#if defined(M3G_HW_FLOAT_VFPV2)
+
+__asm void _m3gGenericMatrixProduct(Matrix *dst,
+                                    const Matrix *left, const Matrix *right)
+{
+// r0 = *dst
+// r1 = *left
+// r2 = *right
+
+		CODE32
+
+// save the VFP registers and set the vector STRIDE = 1, LENGTH = 4
+
+		FSTMFDD	sp!, {d8-d15}
+
+		FMRX	r3, FPSCR		 
+		BIC		r12, r3, #0x00370000
+		ORR		r12, #0x00030000
+		FMXR	FPSCR, r12
+
+// left = [a0  a1  a2  a3	right = [b0  b1  b2  b3
+//	 	   a4  a5  a6  a7		     b4  b5  b6  b7
+//		   a8  a9 a10 a11		     b8  b9 b10 b11
+//		  a12 a13 a14 a15]		    b12 b13 b14 b15]
+//
+// dst = [a0*b0+a4*b1+a8*b2+a12*b3  a1*b0+a5*b1+a9*b2+a13*b3  ..
+//		  a0*b4+a4*b5+a8*b6+a12*b7  a1*b4+a5*b5+a9*b6+a13*b7  ..
+//					.							.
+//					.							.
+
+		FLDMIAS	r1!, {s8-s23}		// load the left matrix [a0-a15] to registers s8-s23
+		FLDMIAS	r2!, {s0-s7}		// load [b0-b7] of right matrix to registers s0-s7
+		FMULS	s24, s8, s0			// [s24-s27]  = [a0*b0  a1*b0  a2*b0  a3*b0]
+		FMULS	s28, s8, s4			// [s28-s31]  = [a0*b4  a1*b4  a2*b4  a3*b4]
+		FMACS	s24, s12, s1		// [s24-s27] += [a4*b1  a5*b1  a6*b1  a7*b1]
+		FMACS	s28, s12, s5		// [s28-s31] += [a4*b5  a5*b5  a6*b5  a7*b5]
+		FMACS	s24, s16, s2		// [s24-s27] += [a8*b2  a9*b2  a10*b2 a11*b2]
+		FMACS	s28, s16, s6		// [s28-s31] += [a8*b6  a9*b6  a10*b6 a11*b6]
+		FMACS	s24, s20, s3		// [s24-s27] += [a12*b3 a13*b3 a14*b3 a15*b3]
+		FMACS	s28, s20, s7		// [s28-s31] += [a12*b7 a13*b37a14*b7 a15*b7]
+		FLDMIAS	r2!, {s0-s7}		// load [b8-b15]
+		FSTMIAS	r0!, {s24-s31}		// write [dst0-dst7]
+		FMULS	s24, s8, s0			
+		FMULS	s28, s8, s4			
+		FMACS	s24, s12, s1		
+		FMACS	s28, s12, s5		
+		FMACS	s24, s16, s2		
+		FMACS	s28, s16, s6		
+		FMACS	s24, s20, s3		
+		FMACS	s28, s20, s7		
+		FSTMIAS	r0!, {s24-s31}
+
+// Restore the VFP registers and return.
+
+		FMXR	FPSCR, r3
+
+		FLDMFDD	sp!, {d8-d15}	
+
+		BX		lr
+
+}
+#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
+
+
+/*------------------ Elementary float ------------------*/
+
+#if defined(M3G_SOFT_FLOAT)
+
+#if defined (M3G_BUILD_ARM)
+
+/*!
+ * \internal
+ * \brief Floating point multiplication implementation for ARM.
+ */
+__asm M3Gfloat m3gMul(const M3Gfloat a,
+                      const M3Gfloat b)
+{
+    /**
+     * Extract the exponents of the multiplicands and add them
+     * together. Flush to zero if either exponent or their sum
+     * is zero.
+     */
+
+    mov     r12, #0xff;
+    ands    r2, r0, r12, lsl #23;   // exit if e1 == 0
+    andnes  r3, r1, r12, lsl #23;   // exit if e2 == 0
+    subne   r2, r2, #(127 << 23);
+    addnes  r12, r2, r3;            // exit if e1+e2-127 <= 0
+    movle   r0, #0;
+    bxle    lr;
+
+    /**
+     * Determine the sign of the result. Note that the exponent
+     * may have overflowed to the sign bit, and thus the result
+     * may be an arbitrary negative value when it really should
+     * be +Inf or -Inf.
+     */
+
+    teq     r0, r1;
+    orrmi   r12, r12, #0x80000000;
+
+    /**
+     * Multiply the mantissas. First shift the mantissas up to
+     * unsigned 1.31 fixed point, adding the leading "1" bit at
+     * the same time, and finally do a 32x32 -> 64 bit unsigned
+     * multiplication. The result is in unsigned 2.62 fixed point,
+     * representing the interval [1.0, 4.0).
+     */
+
+    mov     r2, #0x80000000;
+    orr     r0, r2, r0, lsl #8;
+    orr     r1, r2, r1, lsl #8;
+    umulls  r2, r3, r0, r1;
+
+    /**
+     * If the highest bit of the 64-bit result is set, then the
+     * mantissa lies in [2.0, 4.0) and needs to be renormalized.
+     * That is, the mantissa is shifted one bit to the right and
+     * the exponent correspondingly increased by 1. Note that we
+     * lose the leading "1" bit from the mantissa by adding it up
+     * with the exponent.
+     */
+
+    subpl   r12, r12, #(1 << 23);    // no overflow: exponent -= 1
+    addpl   r0, r12, r3, lsr #7;     // no overflow: exponent += 1
+    addmi   r0, r12, r3, lsr #8;     // overflow: exponent += 1
+    bx      lr;
+}
+
+/*!
+ * \internal
+ * \brief Floating point addition implementation for ARM.
+ */
+__asm M3Gfloat m3gAdd(const M3Gfloat a,
+                      const M3Gfloat b)
+{
+    /**
+     * If the operands have opposite signs then this is not really
+     * an addition but a subtraction. Subtraction is much slower,
+     * so we have a separate code path for it, rather than trying
+     * to save space by handling both in the same place.
+     */
+    
+    teq     r0, r1;
+    bmi     _m3gSub;
+
+    /**
+     * Sort the operands such that the larger operand is in R0 and
+     * the smaller in R1. The sign bits do not affect the ordering,
+     * since they are known to be equal.
+     */
+    
+    subs    r2, r0, r1;
+    submi   r0, r0, r2;
+    addmi   r1, r1, r2;
+
+    /**
+     * Extract the exponent of the smaller operand into R2 and compute
+     * the difference between the larger and smaller exponent into R3.
+     * (Note that the sign bits cancel out in the subtraction.) The
+     * exponent delta tells how many bits the mantissa of the smaller
+     * operand must be shifted to the right in order to bring the
+     * operands into equal scale.
+     */
+
+    mov     r2, r1, lsr #23;
+    rsb     r3, r2, r0, lsr #23;
+
+    /**
+     * Check if the exponent delta is bigger than 23 bits, or if the
+     * smaller exponent is zero. In either case, exit the routine and
+     * return the larger operand (which is already in R0). Note that
+     * this means that subnormals are treated as zero.
+     */
+
+    rsbs    r12, r3, #23;               // N set, V clear if R3 > 23
+    tstpl   r2, #0xff;                  // execute only if R3 <= 23
+    bxle    lr;                         // exit if Z set or N != V
+
+    /**
+     * Extract the mantissas and shift them up to unsigned 1.31 fixed
+     * point, inserting the implied leading "1" bit at the same time.
+     * Finally, align the decimal points and add up the mantissas.
+     */
+    
+    mov     r12, #0x80000000;
+    orr     r0, r12, r0, lsl #8;
+    orr     r1, r12, r1, lsl #8;
+    adds    r0, r0, r1, lsr r3;
+
+    /**
+     * Compute the final exponent by adding up the smaller exponent
+     * (R2), the exponent delta (R3), and the possible overflow bit
+     * (carry flag). Note that in case of overflow, the leading "1"
+     * has ended up in the carry flag and thus needs not be explicitly
+     * discarded. Finally, put the mantissa together with the sign and
+     * exponent.
+     */
+
+    adc     r2, r2, r3;                 // r2 = smallExp + deltaExp + overflow
+    movcc   r0, r0, lsl #1;             // no overflow: discard leading 1
+    mov     r0, r0, lsr #9;
+    orr     r0, r0, r2, lsl #23;
+    bx      lr;
+    
+_m3gSub
+
+    /**
+     * Sort the operands such that the one with larger magnitude is
+     * in R0 and has the correct sign (the sign of the final result),
+     * while the smaller operand is in R1 with an inverted sign bit.
+     */
+    
+    eor     r1, r1, #0x80000000;
+    subs    r2, r0, r1;
+    eormi   r2, r2, #0x80000000;
+    submi   r0, r0, r2;
+    addmi   r1, r1, r2;
+
+    /**
+     * Extract the exponent of the smaller operand into R2 and compute
+     * the difference between the larger and smaller exponent into R3.
+     * (Note that the sign bits cancel out in the subtraction.) The
+     * exponent delta tells how many bits the mantissa of the smaller
+     * operand must be shifted to the right in order to bring the
+     * operands into equal scale.
+     */
+
+    mov     r2, r1, lsr #23;
+    rsbs    r3, r2, r0, lsr #23;
+
+    /**
+     * Check if the exponent delta is bigger than 31 bits, or if the
+     * smaller exponent is zero. In either case, exit the routine and
+     * return the larger operand (which is already in R0). Note that
+     * this means that subnormals are treated as zero.
+     */
+
+    rsbs    r12, r3, #31;               // N set, V clear if R3 > 31
+    tstpl   r2, #0xff;                  // execute only if R3 <= 31
+    bxle    lr;                         // exit if Z set or N != V
+
+    /**
+     * Extract the mantissas and shift them up to unsigned 1.31 fixed
+     * point, inserting the implied leading "1" bit at the same time.
+     * Then align the decimal points and subtract the smaller mantissa
+     * from the larger one.
+     */
+    
+    mov     r12, #0x80000000;
+    orr     r0, r12, r0, lsl #8;
+    orr     r1, r12, r1, lsl #8;
+    subs    r0, r0, r1, lsr r3;
+
+    /**
+     * We split the range of possible results into three categories:
+     * 
+     *   1. [1.0, 2.0) ==> no renormalization needed (highest bit set)
+     *   2. [0.5, 1.0) ==> only one left-shift needed
+     *   3. (0.0, 0.5) ==> multiple left-shifts needed
+     *   4. zero       ==> just return
+     *   
+     * Cases 1 and 2 are handled in the main code path. Cases 3 and 4
+     * are less common by far, so we branch to a separate code fragment
+     * for those.
+     */
+    
+    movpls  r0, r0, lsl #1;             // Cases 2,3,4: shift left
+    bpl     _m3gSubRenormalize;         // Cases 3 & 4: branch out
+    
+    /**
+     * Now we have normalized the mantissa such that the highest bit
+     * is set. Here we only need to adjust the exponent, if necessary,
+     * and put the pieces together. Note that we lose the leading "1"
+     * bit from the mantissa by adding it up with the exponent. We can
+     * also do proper rounding (towards nearest) instead of truncation
+     * (towards zero) at no extra cost!
+     */
+
+    sbc     r3, r3, #1;                 // deltaExp -= 1 (Case 1) or 2 (Case 2)
+    add     r2, r2, r3;                 // resultExp = smallExp + deltaExp
+    movs    r0, r0, lsr #8;             // shift mantissa, keep leading "1"
+    adc     r0, r0, r2, lsl #23;        // resultExp += 1, mantissa += carry
+    bx      lr;
+
+    /**
+     * Separate code path for cases 3 and 4 (see above). The mantissa
+     * has already been shifted up by one, but the exponent has not
+     * been correspondingly decreased. We also know that the highest
+     * bit is still not set, and that the carry flag is clear.
+     */
+
+_m3gSubRenormalize
+    bxeq    lr;
+    subcc   r3, r3, #2;
+    movccs  r0, r0, lsl #2;
+
+    /**
+     * If the carry flag is still not set, i.e. there were more than
+     * two leading zeros in the mantissa initially, loop until we
+     * find the highest set bit.
+     */
+    
+_m3gSubRenormalizeLoop
+    subcc   r3, r3, #1;
+    movccs  r0, r0, lsl #1;
+    bcc     _m3gSubRenormalizeLoop;
+
+    /**
+     * Now the leading "1" is in the carry flag, so we can just add up
+     * the exponent and mantissa as usual, doing proper rounding at
+     * the same time. However, cases where the exponent goes negative
+     * (that is, underflows) must be detected and flushed to zero.
+     */
+    
+    add     r3, r2, r3;
+    movs    r0, r0, lsr #9;
+    adc     r0, r0, r3, lsl #23;
+    teq     r0, r2, lsl #23;
+    movmi   r0, #0;
+    bx      lr;
+}
+
+#else /* M3G_BUILD_ARM */
+
+/*!
+ * \internal
+ * \brief Floating point addition implementation
+ *
+ */
+static M3G_INLINE M3Gfloat m3gFloatAdd(const M3Guint aBits,
+                                       const M3Guint bBits)
+{
+    M3Guint large, small, signMask;
+    
+    /* Early exits for underflow cases */
+
+    large = (M3Gint)(aBits & ~SIGN_MASK);
+    if (large <= 0x00800000) {
+        return INT_AS_FLOAT(bBits);
+    }
+    small = (M3Gint)(bBits & ~SIGN_MASK);
+    if (small <= 0x00800000) {
+        return INT_AS_FLOAT(aBits);
+    }
+
+    /* Swap the numbers so that "large" really is larger; the unsigned
+     * (or de-signed) bitmasks for floats are nicely monotonous, so we
+     * can compare directly.  Also store the sign of the larger number
+     * for future reference. */
+
+    if (small > large) {
+        M3Gint temp = small;
+        small = large;
+        large = temp;
+        signMask = (bBits & SIGN_MASK);
+    }
+    else {
+        signMask = (aBits & SIGN_MASK);
+    }
+    
+    {
+        M3Guint res, overflow;
+        M3Guint resExp, expDelta;
+        
+        /* Store the larger exponent as our candidate result exponent,
+         * and compute the difference between the exponents */
+
+        resExp = (large >> 23);
+        expDelta = resExp - (small >> 23);
+
+        /* Take an early exit if the change would be insignificant;
+         * this also guards against odd results from shifting by more
+         * than 31 (undefined in C) */
+
+        if (expDelta >= 24) {
+            res = large | signMask;
+            return INT_AS_FLOAT(res);
+        }
+
+        /* Convert the mantissas into shifted integers, and shift the
+         * smaller number to the same scale with the larger one. */
+
+        large = (large & MANTISSA_MASK) | LEADING_ONE;
+        small = (small & MANTISSA_MASK) | LEADING_ONE;
+        small >>= expDelta;
+        M3G_ASSERT(large >= small);
+        
+        /* Check whether we're really adding or subtracting the
+         * smaller number, and branch to slightly different routines
+         * respectively */
+
+        if (((aBits ^ bBits) & SIGN_MASK) == 0) {
+
+            /* Matching signs; just add the numbers and check for
+             * overflow, shifting to compensate as necessary. */
+
+            res = large + small;
+            
+            overflow = (res >> 24);
+            res >>= overflow;
+            resExp += overflow;
+        }
+        else {
+
+            /* Different signs, so let's subtract the smaller value;
+             * also check that we're not subtracting a number from
+             * itself (so we don't have to normalize a zero below) */
+            
+            if (small == large) {
+                return 0.0f; /* x - x = 0 */
+            }
+
+            res = (large << 8) - (small << 8);
+
+            /* Renormalize the number by shifting until we've got the
+             * high bit in place */
+
+            while ((res >> 24) == 0) {
+                res <<= 8;
+                resExp -= 8;
+            }
+            while ((res >> 31) == 0) {
+                res <<= 1;
+                --resExp;
+            }
+            res >>= 8;
+        }
+
+        /* Flush to zero in case of over/underflow of the exponent */
+
+        if (resExp >= 255) {
+            return 0.0f;
+        }
+        
+        /* Compose the final number into "res"; note that we pull in
+         * the sign of the original larger number, which should still
+         * be valid */
+
+        res &= MANTISSA_MASK;
+        res |= (resExp << 23);
+        res |= signMask;
+
+        return INT_AS_FLOAT(res);
+    }
+}
+
+/*!
+ * \internal
+ * \brief Floating point multiplication implementation
+ */
+static M3G_INLINE M3Gfloat m3gFloatMul(const M3Guint aBits,
+                                       const M3Guint bBits)
+{
+    M3Guint a, b;
+
+    /* Early exits for underflow and multiplication by zero */
+
+    a = (aBits & ~SIGN_MASK);
+    if (a <= 0x00800000) {
+        return 0.0f;
+    }
+    
+    b = (bBits & ~SIGN_MASK);
+    if (b <= 0x00800000) {
+        return 0.0f;
+    }
+    
+    {
+        M3Guint res, exponent, overflow;
+        
+        /* Compute the exponent of the result, assuming the mantissas
+         * don't overflow; then mask out the original exponents */
+        
+        exponent = (a >> 23) + (b >> 23) - 127;
+        a &= MANTISSA_MASK;
+        b &= MANTISSA_MASK;
+
+        /* Compute the new mantissa from:
+         *
+         *   (1 + a)(1 + b) = ab + a + b + 1
+         *   
+         * First shift the mantissas from 0.23 down to 0.16 for the
+         * multiplication, then shift back to 0.23 for adding in the
+         * "a + b + 1" part of the equation.  */
+        
+        res = (a >> 7) * (b >> 7);              /* "ab" at 0.32 */
+        res = (res >> 9) + a + b + LEADING_ONE;
+
+        /* Add the leading one, then normalize the result by checking
+         * the overflow bit and dividing by two if necessary */
+
+        overflow = (res >> 24);
+        res >>= overflow;
+        exponent += overflow;
+
+        /* Flush to zero in case of over/underflow of the exponent */
+
+        if (exponent >= 255) {
+            return 0.0f;
+        }
+
+        /* Compose the final number into "res" */
+
+        res &= MANTISSA_MASK;
+        res |= (exponent << 23);
+        res |= (aBits ^ bBits) & SIGN_MASK;
+
+        return INT_AS_FLOAT(res);
+    }
+}
+
+#endif /* M3G_BUILD_ARM */
+
+/*!
+ * \internal
+ * \brief Computes the signed fractional part of a floating point
+ * number
+ *
+ * \param x  floating point value
+ * \return x signed fraction of x in ]-1, 1[
+ */
+static M3Gfloat m3gFrac(M3Gfloat x)
+{
+    /* Quick exit for -1 < x < 1 */
+    
+    if (m3gAbs(x) < 1.0f) {
+        return x;
+    }
+
+    /* Shift the mantissa to the proper place, mask out the integer
+     * part, and renormalize */
+    {
+        M3Guint ix = FLOAT_AS_UINT(x);
+        M3Gint expo = ((ix >> 23) & 0xFF) - 127;
+        M3G_ASSERT(expo >= 0);
+
+        /* The decimal part will always be zero for large values with
+         * exponents over 24 */
+        
+        if (expo >= 24) {
+            return 0.f;
+        }
+        else {
+            
+            /* Shift the integer part out of the mantissa and see what
+             * we have left */
+            
+            M3Guint base = (ix & MANTISSA_MASK) | LEADING_ONE;
+            base = (base << expo) & MANTISSA_MASK;
+
+            /* Quick exit (and guard against infinite looping) for
+             * zero */
+
+            if (base == 0) {
+                return 0.f;
+            }
+
+            /* We now have an exponent of 0 (i.e. no shifting), but
+             * must renormalize to get a set bit in place of the
+             * hidden (implicit one) bit */
+            
+            expo = 0;
+            
+            while ((base >> 19) == 0) {
+                base <<= 4;
+                expo -= 4;
+            }
+            while ((base >> 23) == 0) {
+                base <<= 1;
+                --expo;
+            }
+
+            /* Compose the final number */
+
+            ix =
+                (base & MANTISSA_MASK) |
+                ((expo + 127) << 23) |
+                (ix & SIGN_MASK);
+            return INT_AS_FLOAT(ix);
+        }
+    }
+}
+
+#endif /* M3G_SOFT_FLOAT */
+
+#if defined(M3G_DEBUG)
+/*!
+ * \internal
+ * \brief Checks for NaN or infinity in a floating point input
+ */
+static void m3gValidateFloats(int n, float *p)
+{
+    while (n-- > 0) {
+        M3G_ASSERT(EXPONENT(*p) < 120);
+        ++p;
+    }
+}
+#else
+#   define m3gValidateFloats(n, p)
+#endif
+    
+/*------------------ Trigonometry and exp ----------*/
+
+
+#if defined(M3G_SOFT_FLOAT)
+/*!
+ * \internal
+ * \brief Sine for the first quadrant
+ *
+ * \param x floating point value in [0, PI/2]
+ * \return sine of \c x
+ */
+static M3Gfloat m3gSinFirstQuadrant(M3Gfloat x)
+{
+    M3Guint bits = FLOAT_AS_UINT(x);
+    
+    if (bits <= 0x3ba3d70au)    /* 0.005f */
+        return x;
+    else {
+        static const M3Gfloat sinTermLut[4] = {
+            -1.0f / (2*3),
+            -1.0f / (4*5),
+            -1.0f / (6*7),
+            -1.0f / (8*9)
+        };
+
+        M3Gfloat xx = m3gSquare(x);
+        M3Gfloat sinx = x;
+        int i;
+
+        for (i = 0; i < 4; ++i) {
+            x    = m3gMul(x, m3gMul(xx, sinTermLut[i]));
+            sinx = m3gAdd(sinx, x);
+        }
+
+        return sinx;
+    }
+}
+#endif /* M3G_SOFT_FLOAT */
+
+#if defined(M3G_SOFT_FLOAT)
+/*!
+ * \internal
+ * \brief Computes sine for the first period
+ *
+ * \param x floating point value in [0, 2PI]
+ * \return sine of x
+ */
+static M3Gfloat m3gSinFirstPeriod(const M3Gfloat x)
+{
+    M3G_ASSERT(x >= 0 && x <= TWO_PI);
+    
+    if (x >= PI) {
+        return m3gNegate(m3gSinFirstQuadrant(x >= ONE_AND_HALF_PI ?
+                                             m3gSub(TWO_PI, x) :
+                                             m3gSub(x, PI)));
+    }
+    return m3gSinFirstQuadrant((x >= HALF_PI) ? m3gSub(PI, x) : x);
+}
+#endif /* M3G_SOFT_FLOAT */
+
+/*------------- Float vs. int conversion helpers -------------*/
+
+/*!
+ * \internal
+ * \brief Scales and clamps a float to unsigned byte range
+ */
+static M3G_INLINE M3Gint m3gFloatToByte(const M3Gfloat a)
+{
+    return m3gRoundToInt(m3gMul(255.f, m3gClampFloat(a, 0.f, 1.f)));
+}
+
+/*------------------ Vector helpers ------------------*/
+
+/*!
+ * \internal
+ * \brief Computes the norm of a floating-point 3-vector
+ */
+static M3G_INLINE M3Gfloat m3gNorm3(const M3Gfloat *v)
+{
+    return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
+                  m3gSquare(v[2]));
+}
+
+/*!
+ * \internal
+ * \brief Computes the norm of a floating-point 4-vector
+ */
+static M3G_INLINE M3Gfloat m3gNorm4(const M3Gfloat *v)
+{
+    return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
+                  m3gAdd(m3gSquare(v[2]), m3gSquare(v[3])));
+}
+
+/*!
+ * \internal
+ * \brief Scales a floating-point 3-vector
+ */
+static M3G_INLINE void m3gScale3(M3Gfloat *v, M3Gfloat s)
+{
+    v[0] = m3gMul(v[0], s);
+    v[1] = m3gMul(v[1], s);
+    v[2] = m3gMul(v[2], s);
+}
+
+/*!
+ * \internal
+ * \brief Scales a floating-point 4-vector
+ */
+static M3G_INLINE void m3gScale4(M3Gfloat *v, M3Gfloat s)
+{
+    v[0] = m3gMul(v[0], s);
+    v[1] = m3gMul(v[1], s);
+    v[2] = m3gMul(v[2], s);
+    v[3] = m3gMul(v[3], s);
+}
+
+
+/*------------------ Matrices ------------------*/
+
+/*!
+ * \internal
+ */
+static M3G_INLINE M3Gbool m3gIsClassified(const Matrix *mtx)
+{
+    M3G_ASSERT(mtx != NULL);
+    return (M3Gbool) mtx->classified;
+}
+
+/*!
+ * \internal
+ * \brief Returns a classification for a single floating point number
+ */
+static M3G_INLINE M3Guint m3gElementClass(const M3Gfloat x)
+{
+    if (IS_ZERO(x)) {
+        return ELEM_ZERO;
+    }
+    else if (IS_ONE(x)) {
+        return ELEM_ONE;
+    }
+    else if (IS_MINUS_ONE(x)) {
+        return ELEM_MINUS_ONE;
+    }
+    return ELEM_ANY;
+}
+
+/*!
+ * \internal
+ * \brief Computes the classification mask of a matrix
+ *
+ * The mask is constructed from two bits per elements, with the lowest
+ * two bits corresponding to the first element in the \c elem array of
+ * the matrix.
+ */
+static void m3gClassify(Matrix *mtx)
+{
+    M3Guint mask = 0;
+    const M3Gfloat *p;
+    int i;
+
+    M3G_ASSERT(mtx != NULL);
+    M3G_ASSERT(!m3gIsClassified(mtx));
+
+    p = mtx->elem;
+    for (i = 0; i < 16; ++i) {
+        M3Gfloat elem = *p++;
+        mask |= (m3gElementClass(elem) << (i*2));
+    }
+    mtx->mask = mask;
+    mtx->classified = M3G_TRUE;
+}
+
+/*!
+ * \internal
+ * \brief Sets matrix classification directly
+ */
+static M3G_INLINE void m3gClassifyAs(M3Guint mask, Matrix *mtx)
+{
+    M3G_ASSERT(mtx != NULL);
+    mtx->mask = mask;
+    mtx->classified = M3G_TRUE;
+    mtx->complete = M3G_FALSE;
+}
+
+/*!
+ * \internal
+ * \brief Attempts to classify a matrix more precisely
+ *
+ * Tries to classify all "free" elements of a matrix into one of the
+ * predefined constants to improve precision and performance in
+ * subsequent calculations.
+ */
+static void m3gSubClassify(Matrix *mtx)
+{
+    M3G_ASSERT_PTR(mtx);
+    M3G_ASSERT(m3gIsClassified(mtx));
+    {
+        const M3Gfloat *p = mtx->elem;
+        M3Guint inMask = mtx->mask;
+        M3Guint outMask = inMask;
+        int i;
+
+        for (i = 0; i < 16; ++i, inMask >>= 2) {
+            M3Gfloat elem = *p++;
+            if ((inMask & 0x03) == ELEM_ANY) {
+                outMask &= ~(0x03u << (i*2));
+                outMask |= (m3gElementClass(elem) << (i*2));
+            }
+        }
+        mtx->mask = outMask;
+    }
+}
+
+/*!
+ * \internal
+ * \brief Fills in the implicit values for a classified matrix
+ */
+static void m3gFillClassifiedMatrix(Matrix *mtx)
+{
+    int i;
+    M3Guint mask;
+    M3Gfloat *p;
+
+    M3G_ASSERT(mtx != NULL);
+    M3G_ASSERT(mtx->classified);
+    M3G_ASSERT(!mtx->complete);
+
+    mask = mtx->mask;
+    p = mtx->elem;
+
+    for (i = 0; i < 16; ++i, mask >>= 2) {
+        unsigned elem = (mask & 0x03);
+        switch (elem) {
+        case ELEM_ZERO:         *p++ =  0.0f; break;
+        case ELEM_ONE:          *p++ =  1.0f; break;
+        case ELEM_MINUS_ONE:    *p++ = -1.0f; break;
+        default:                ++p;
+        }
+    }
+    mtx->complete = M3G_TRUE;
+}
+
+
+#if !defined(M3G_HW_FLOAT)
+/*!
+ * \internal
+ * \brief Performs one multiply-add of classified matrix elements
+ *
+ * \param amask element class of the first multiplicand
+ * \param a     float value of the first multiplicand
+ * \param bmask element class of the second multiplicand
+ * \param b     float value of the second multiplicand
+ * \param c     float value to add
+ * \return a * b + c
+ * 
+ * \notes inline, as only called from the matrix product function
+ */
+static M3G_INLINE M3Gfloat m3gClassifiedMadd(
+    const M3Gbitmask amask, const M3Gfloat *pa,
+    const M3Gbitmask bmask, const M3Gfloat *pb,
+    const M3Gfloat c)
+{
+    /* Check for zero product to reduce the switch cases below */
+    
+    if (amask == ELEM_ZERO || bmask == ELEM_ZERO) {
+        return c;    
+    }
+
+    /* Branch based on the classification of a */
+    
+    switch (amask) {
+        
+    case ELEM_ANY:
+        if (bmask == ELEM_ONE) {
+            return m3gAdd(*pa, c);      /*  a * 1 + c  =  a + c  */
+        }
+        if (bmask == ELEM_MINUS_ONE) {
+            return m3gSub(c, *pa);      /*  a * -1 + c = -a + c  */
+        }
+        return m3gMadd(*pa, *pb, c);    /*  a * b + c            */
+        
+    case ELEM_ONE:
+        if (bmask == ELEM_ONE) {
+            return m3gAdd(c, 1.f);      /*  1 * 1 + c  = 1 + c   */
+        }
+        if (bmask == ELEM_MINUS_ONE) {
+            return m3gSub(c, 1.f);      /*  1 * -1 + c = -1 + c  */
+        }
+        return m3gAdd(*pb, c);          /*  1 * b + c  =  b + c  */
+        
+    case ELEM_MINUS_ONE:
+        if (bmask == ELEM_ONE) {
+            return m3gSub(c, 1.f);      /* -1 * 1 + c  = -1 + c  */
+        }
+        if (bmask == ELEM_MINUS_ONE) {
+            return m3gAdd(c, 1.f);      /* -1 * -1 + c =  1 + c  */
+        }
+        return m3gSub(c, *pb);          /* -1 * b + c  = -b + c  */
+        
+    default:
+        M3G_ASSERT(M3G_FALSE);
+        return 0.0f;
+    }
+}
+#endif /*!defined(M3G_HW_FLOAT)*/
+
+/*!
+ * \internal
+ * \brief Computes a generic 4x4 matrix product
+ */
+static void m3gGenericMatrixProduct(Matrix *dst,
+                                    const Matrix *left, const Matrix *right)
+{
+    M3G_ASSERT(dst != NULL && left != NULL && right != NULL);
+
+    {
+        
+#       if defined(M3G_HW_FLOAT)
+        if (!left->complete) {
+            m3gFillClassifiedMatrix((Matrix*)left);
+        }
+        if (!right->complete) {
+            m3gFillClassifiedMatrix((Matrix*)right);
+        }
+#       else
+        int row;
+        const unsigned lmask = left->mask;
+        const unsigned rmask = right->mask;
+#       endif
+        
+#if defined(M3G_HW_FLOAT_VFPV2)
+		_m3gGenericMatrixProduct(dst, left, right);
+#else	
+        for (row = 0; row < 4; ++row) {
+            int col;
+            for (col = 0; col < 4; ++col) {
+                int k;
+                M3Gfloat a = 0;
+                for (k = 0; k < 4; ++k) {
+                    M3Gint lidx = MELEM(row, k);
+                    M3Gint ridx = MELEM(k, col);
+#                   if defined(M3G_HW_FLOAT)
+                    a = m3gMadd(left->elem[lidx], right->elem[ridx], a);
+#                   else
+                    a = m3gClassifiedMadd((lmask >> (2 * lidx)) & 3,
+                                          &left->elem[lidx],
+                                          (rmask >> (2 * ridx)) & 3,
+                                          &right->elem[ridx],
+                                          a);
+#                   endif /*!M3G_HW_FLOAT*/
+                }
+                M44F(dst, row, col) = a;
+            }
+        }
+#endif /*!M3G_HW_FLOAT_VFPV2*/
+    }
+    dst->complete = M3G_TRUE;
+    dst->classified = M3G_FALSE;
+}
+
+/*!
+ * \internal
+ * \brief Converts a float vector to 16-bit integers
+ *
+ * \param size   vector length
+ * \param vec    pointer to float vector
+ * \param scale  scale of output values, as the number of bits to
+ *               shift left to get actual values
+ * \param outVec output value vector
+ */
+static void m3gFloatVecToShort(M3Gint size, const M3Gfloat *vec,
+                               M3Gint scale, M3Gshort *outVec)
+{
+    const M3Guint *vecInt = (const M3Guint*) vec;
+    M3Gint i;
+    
+    for (i = 0; i < size; ++i) {
+        M3Guint a = vecInt[i];
+        if ((a & ~SIGN_MASK) < (1 << 23)) {
+            *outVec++ = 0;
+        }
+        else {
+            M3Gint shift =
+                scale - ((M3Gint)((vecInt[i] >> 23) & 0xFFu) - (127 + 23));
+            M3G_ASSERT(shift > 8); /* or the high bits will overflow */
+            
+            if (shift > 23) {
+                *outVec++ = 0;
+            }
+            else {
+                M3Gint out =
+                    (M3Gint) (((a & MANTISSA_MASK) | LEADING_ONE) >> shift);
+                if (a >> 31) {
+                    out = -out;
+                }
+                M3G_ASSERT(m3gInRange(out, -32767, 32767));
+                *outVec++ = (M3Gshort) out;
+            }
+        }
+    }
+}
+
+/*----------------------------------------------------------------------
+ * Internal functions
+ *--------------------------------------------------------------------*/
+
+/*--------------------------------------------------------------------*/
+
+#if defined(M3G_SOFT_FLOAT)
+
+#if !defined (M3G_BUILD_ARM)
+
+static M3Gfloat m3gAdd(const M3Gfloat a, const M3Gfloat b)
+{
+    return m3gFloatAdd(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
+}
+
+static M3Gfloat m3gMul(const M3Gfloat a, const M3Gfloat b)
+{
+    return m3gFloatMul(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
+}
+
+#endif /* M3G_BUILD_ARM */
+
+/*!
+ * \internal
+ * \brief Computes the reciprocal of the square root
+ *
+ * \param x a floating point value
+ * \return 1 / square root of \c x
+ */
+static M3Gfloat m3gRcpSqrt(const M3Gfloat x)
+{
+    /* Approximation followed by Newton-Raphson iteration a'la
+     * "Floating-point tricks" by J. Blinn, but we iterate several
+     * times to improve precision */
+    
+    M3Gint i = (M3G_FLOAT_ONE + (M3G_FLOAT_ONE >> 1))
+        - (FLOAT_AS_UINT(x) >> 1);
+    M3Gfloat y = INT_AS_FLOAT(i);
+    for (i = 0; i < 3; ++i) {
+        y = m3gMul(y, m3gSub(1.5f, m3gHalf(m3gMul(x, m3gSquare(y)))));
+    }
+    return y;
+}
+
+/*!
+ * \internal
+ * \brief Computes the square root
+ *
+ * \param x a floating point value
+ * \return square root of \c x
+ */
+static M3Gfloat m3gSqrt(const M3Gfloat x)
+{
+    /* Approximation followed by Newton-Raphson iteration a'la
+     * "Floating-point tricks" by J. Blinn, but we iterate several
+     * times to improve precision */
+
+    M3Gint i = (FLOAT_AS_UINT(x) >> 1) + (M3G_FLOAT_ONE >> 1);
+    M3Gfloat y = INT_AS_FLOAT(i);
+    for (i = 0; i < 2; ++i) {
+        y = m3gDiv(m3gAdd(m3gSquare(y), x), m3gDouble(y));
+    }
+    return y;
+}
+
+#endif /* M3G_SOFT_FLOAT */
+
+/*--------------------------------------------------------------------*/
+
+#if defined(M3G_SOFT_FLOAT)
+/*!
+ * \internal
+ */
+static M3Gfloat m3gArcCos(const M3Gfloat x)
+{
+    return (M3Gfloat) acos(x);
+}
+
+/*!
+ * \internal
+ */
+static M3Gfloat m3gArcTan(const M3Gfloat y, const M3Gfloat x)
+{
+    return (M3Gfloat) atan2(y, x);
+}
+
+/*!
+ * \internal
+ */
+static M3Gfloat m3gCos(const M3Gfloat x)
+{
+    return m3gSin(m3gAdd(x, HALF_PI));
+}
+
+/*!
+ * \internal
+ * \brief
+ */
+static M3Gfloat m3gSin(const M3Gfloat x)
+{
+    M3Gfloat f = x;
+    
+    /* If x is greater than two pi, do a modulo operation to bring it
+     * back in range for the internal sine function */
+    
+    if (m3gAbs(f) >= TWO_PI) {
+        f = m3gMul (f, (1.f / TWO_PI));
+        f = m3gFrac(f);
+        f = m3gMul (f, TWO_PI);
+    }
+
+    /* Compute the result, negating both the input value and the
+     * result if x was negative */
+    {
+        M3Guint i = FLOAT_AS_UINT(f);
+        M3Guint neg = (i & SIGN_MASK);
+        i ^= neg;
+        f = m3gSinFirstPeriod(INT_AS_FLOAT(i));
+        i = FLOAT_AS_UINT(f) ^ neg;
+        return INT_AS_FLOAT(i);
+    }
+}
+
+/*!
+ * \internal
+ */
+static M3Gfloat m3gTan(const M3Gfloat x)
+{
+    return (M3Gfloat) tan(x);
+}
+
+/*!
+ * \internal
+ */
+static M3Gfloat m3gExp(const M3Gfloat a)
+{
+    return (M3Gfloat) exp(a);
+}
+#endif /* M3G_SOFT_FLOAT */
+
+/*!
+ * \brief Checks whether the bottom row of a matrix is 0 0 0 1
+ */
+static M3Gbool m3gIsWUnity(const Matrix *mtx)
+{
+    M3G_ASSERT_PTR(mtx);
+
+    if (!m3gIsClassified(mtx)) {
+        return (IS_ZERO(M44F(mtx, 3, 0)) &&
+                IS_ZERO(M44F(mtx, 3, 1)) &&
+                IS_ZERO(M44F(mtx, 3, 2)) &&
+                IS_ONE (M44F(mtx, 3, 3)));
+    }
+    else {
+        return ((mtx->mask & 0xC0C0C0C0) == (ELEM_ONE << 30));
+    }
+}
+
+/*!
+ * \brief Makes a quaternion by exponentiating a quaternion logarithm
+ */
+static void m3gExpQuat(Quat *quat, const Vec3 *qExp)
+{
+    M3Gfloat theta;
+
+    M3G_ASSERT_PTR(quat);
+    M3G_ASSERT_PTR(qExp);
+
+    theta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(qExp->x),
+                                  m3gSquare(qExp->y)),
+                           m3gSquare(qExp->z)));
+
+    if (theta > EPSILON) {
+        M3Gfloat s = m3gMul(m3gSin(theta), m3gRcp(theta));
+        quat->x = m3gMul(qExp->x, s);
+        quat->y = m3gMul(qExp->y, s);
+        quat->z = m3gMul(qExp->z, s);
+        quat->w = m3gCos(theta);
+    }
+    else {
+        quat->x = quat->y = quat->z = 0.0f;
+        quat->w = 1.0f;
+    }
+}
+
+/*!
+ * \brief Natural logarithm of a unit quaternion.
+ */
+static void m3gLogQuat(Vec3 *qLog, const Quat *quat)
+{
+    M3Gfloat sinTheta = m3gSqrt(m3gNorm3((const float *) quat));
+
+    if (sinTheta > EPSILON) {
+        M3Gfloat s = m3gArcTan(sinTheta, quat->w) / sinTheta;
+        qLog->x = m3gMul(s, quat->x);
+        qLog->y = m3gMul(s, quat->y);
+        qLog->z = m3gMul(s, quat->z);
+    }
+    else {
+        qLog->x = qLog->y = qLog->z = 0.0f;
+    }
+}
+
+/*!
+ * \brief Make quaternion the "logarithmic difference" between two
+ * other quaternions.
+ */
+static void m3gLogDiffQuat(Vec3 *logDiff,
+                           const Quat *from, const Quat *to)
+{
+    Quat temp;
+    temp.x = m3gNegate(from->x);
+    temp.y = m3gNegate(from->y);
+    temp.z = m3gNegate(from->z);
+    temp.w =           from->w;
+    m3gMulQuat(&temp, to);
+    m3gLogQuat(logDiff, &temp);
+}
+
+/*!
+ * \brief Rounds a float to the nearest integer
+ *
+ * Overflows are clamped to the maximum or minimum representable
+ * value.
+ */
+static M3Gint m3gRoundToInt(const M3Gfloat a)
+{
+    M3Guint base = FLOAT_AS_UINT(a);
+    M3Gint signMask, expo;
+
+    /* Decompose the number into sign, exponent, and base number */
+    
+    signMask = ((M3Gint) base >> 31);   /* -> 0 or 0xFFFFFFFF */
+    expo = (M3Gint)((base >> 23) & 0xFF) - 127;
+    
+    /* First check for large values and return either the negative or
+     * the positive maximum integer in case of overflow.  The overflow
+     * check can be made on the exponent alone, as large floats are
+     * spaced several integer values apart so that nothing will
+     * overflow because of rounding later on */
+    
+    if (expo >= 31) {
+        return (M3Gint)((1U << 31) - 1 + (((M3Guint) signMask) & 1));
+    }
+
+    /* Also check for underflow to avoid problems with shifting by
+     * more than 31 */
+
+    if (expo < -1) {
+        return 0;
+    }
+    
+    /* Mask out the sign and exponent bits, shift the base number so
+     * that the lowest bit corresponds to one half, then add one
+     * (half) and shift to round to the closest integer. */
+
+    base = (base | LEADING_ONE) << 8;   /* shift mantissa to 1.31 */
+    base =  base >> (30 - expo);        /* integer value as 31.1 */
+    base = (base + 1) >> 1;             /* round to nearest 32.0 */
+    
+    /* Factor in the sign (negate if originally negative) and
+     * return */
+
+    return ((M3Gint) base ^ signMask) - signMask;
+}
+
+/*!
+ * \brief Calculates ray-triangle intersection.
+ *
+ * http://www.acm.org/jgt/papers/MollerTrumbore97
+ */
+static M3Gbool m3gIntersectTriangle(const Vec3 *orig, const Vec3 *dir,
+                                    const Vec3 *vert0, const Vec3 *vert1, const Vec3 *vert2,
+                                    Vec3 *tuv, M3Gint cullMode)
+{
+    Vec3 edge1, edge2, tvec, pvec, qvec;
+    M3Gfloat det,inv_det;
+
+    /* find vectors for two edges sharing vert0 */
+    edge1 = *vert1;
+    edge2 = *vert2;
+    m3gSubVec3(&edge1, vert0);
+    m3gSubVec3(&edge2, vert0);
+
+    /* begin calculating determinant - also used to calculate U parameter */
+    m3gCross(&pvec, dir, &edge2);
+
+    /* if determinant is near zero, ray lies in plane of triangle */
+    det = m3gDot3(&edge1, &pvec);
+
+    if (cullMode == 0 && det <= 0) return M3G_FALSE;
+    if (cullMode == 1 && det >= 0) return M3G_FALSE;
+
+    if (det > -EPSILON && det < EPSILON)
+        return M3G_FALSE;
+    inv_det = m3gRcp(det);
+
+    /* calculate distance from vert0 to ray origin */
+    tvec = *orig;
+    m3gSubVec3(&tvec, vert0);
+
+    /* calculate U parameter and test bounds */
+    tuv->y = m3gMul(m3gDot3(&tvec, &pvec), inv_det);
+    if (tuv->y < 0.0f || tuv->y > 1.0f)
+        return M3G_FALSE;
+
+    /* prepare to test V parameter */
+    m3gCross(&qvec, &tvec, &edge1);
+
+    /* calculate V parameter and test bounds */
+    tuv->z = m3gMul(m3gDot3(dir, &qvec), inv_det);
+    if (tuv->z < 0.0f || m3gAdd(tuv->y, tuv->z) > 1.0f)
+        return M3G_FALSE;
+
+    /* calculate t, ray intersects triangle */
+    tuv->x = m3gMul(m3gDot3(&edge2, &qvec), inv_det);
+
+    return M3G_TRUE;
+}
+
+/*!
+ * \brief Calculates ray-box intersection.
+ *
+ * http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm
+ */
+
+#define XO	orig->x
+#define YO	orig->y
+#define ZO	orig->z
+#define XD	dir->x
+#define YD	dir->y
+#define ZD	dir->z
+
+#define XL	box->min[0]
+#define YL	box->min[1]
+#define ZL	box->min[2]
+#define XH	box->max[0]
+#define YH	box->max[1]
+#define ZH	box->max[2]
+
+/*!
+ * \internal
+ * \brief Ray - bounding box intersection
+ *
+ */
+static M3Gbool m3gIntersectBox(const Vec3 *orig, const Vec3 *dir, const AABB *box)
+{
+	M3Gfloat tnear = M3G_MIN_NEGATIVE_FLOAT;
+	M3Gfloat tfar  = M3G_MAX_POSITIVE_FLOAT;
+	M3Gfloat t1, t2, temp;
+
+	/* X slab */
+	if(XD != 0) {
+		t1 = m3gSub(XL, XO) / XD;
+		t2 = m3gSub(XH, XO) / XD;
+
+		if(t1 > t2) {
+			temp = t1;
+			t1 = t2;
+			t2 = temp;
+		}
+
+		if(t1 > tnear) tnear = t1;
+		if(t2 < tfar) tfar = t2;
+
+		if(tnear > tfar) return M3G_FALSE;
+		if(tfar < 0) return M3G_FALSE;
+	}
+	else {
+		if(XO > XH || XO < XL) return M3G_FALSE;
+	}
+
+	/* Y slab */
+	if(YD != 0) {
+		t1 = m3gSub(YL, YO) / YD;
+		t2 = m3gSub(YH, YO) / YD;
+
+		if(t1 > t2) {
+			temp = t1;
+			t1 = t2;
+			t2 = temp;
+		}
+
+		if(t1 > tnear) tnear = t1;
+		if(t2 < tfar) tfar = t2;
+
+		if(tnear > tfar) return M3G_FALSE;
+		if(tfar < 0) return M3G_FALSE;
+	}
+	else {
+		if(YO > YH || YO < YL) return M3G_FALSE;
+	}
+
+	/* Z slab */
+	if(ZD != 0) {
+		t1 = m3gSub(ZL, ZO) / ZD;
+		t2 = m3gSub(ZH, ZO) / ZD;
+
+		if(t1 > t2) {
+			temp = t1;
+			t1 = t2;
+			t2 = temp;
+		}
+
+		if(t1 > tnear) tnear = t1;
+		if(t2 < tfar) tfar = t2;
+
+		if(tnear > tfar) return M3G_FALSE;
+		if(tfar < 0) return M3G_FALSE;
+	}
+	else {
+		if(ZO > ZH || ZO < ZL) return M3G_FALSE;
+	}
+
+	return M3G_TRUE;
+}
+
+/*!
+ * \brief Calculates the intersection of two rectangles. Always fills
+ * the intersection result.
+ *
+ * \param dst   result of the intersection
+ * \param r1    rectangle 1
+ * \param r2    rectangle 2
+ */
+static M3Gbool m3gIntersectRectangle(M3GRectangle *dst, M3GRectangle *r1, M3GRectangle *r2)
+{
+    M3Gbool intersects = M3G_TRUE;
+    M3Gint min, max;
+
+    max = (r1->x) >= (r2->x) ? (r1->x) : (r2->x);
+    min = (r1->x + r1->width) <= (r2->x + r2->width) ? (r1->x + r1->width) : (r2->x + r2->width);
+    if ((min - max) < 0) intersects = M3G_FALSE;
+    dst->x = max;
+    dst->width = min - max;
+
+    max = (r1->y) >= (r2->y) ? (r1->y) : (r2->y);
+    min = (r1->y + r1->height) <= (r2->y + r2->height) ? (r1->y + r1->height) : (r2->y + r2->height);
+    if ((min - max) < 0) intersects = M3G_FALSE;
+    dst->y = max;
+    dst->height = min - max;
+
+    return intersects;
+}
+
+/*-------- float-to-int color conversions --------*/
+
+static M3Guint m3gAlpha1f(M3Gfloat a)
+{
+    M3Guint alpha = (M3Guint) m3gFloatToByte(a);
+    return (alpha << 24) | M3G_RGB_MASK;
+}
+
+static M3Guint m3gColor3f(M3Gfloat r, M3Gfloat g, M3Gfloat b)
+{
+    return ((M3Guint) m3gFloatToByte(r) << 16)
+        |  ((M3Guint) m3gFloatToByte(g) <<  8)
+        |   (M3Guint) m3gFloatToByte(b)
+        |   M3G_ALPHA_MASK;
+}
+
+static M3Guint m3gColor4f(M3Gfloat r, M3Gfloat g, M3Gfloat b, M3Gfloat a)
+{
+    return ((M3Guint) m3gFloatToByte(r) << 16)
+        |  ((M3Guint) m3gFloatToByte(g) <<  8)
+        |   (M3Guint) m3gFloatToByte(b)
+        |  ((M3Guint) m3gFloatToByte(a) << 24);
+}
+
+static void m3gFloatColor(M3Gint argb, M3Gfloat intensity, M3Gfloat *rgba)
+{
+    /* NOTE we intentionally aim a bit high here -- some GL
+     * implementations may round down instead of closest */
+    
+    const M3Gfloat oneOver255 = (M3Gfloat)(1.0001 / 255.0);
+    
+	rgba[0] = (M3Gfloat)((argb >> 16) & 0xFF);
+	rgba[1] = (M3Gfloat)((argb >>  8) & 0xFF);
+	rgba[2] = (M3Gfloat)((argb      ) & 0xFF);
+	rgba[3] = (M3Gfloat)((argb >> 24) & 0xFF);
+    
+    m3gScale4(rgba, m3gMul(oneOver255, intensity));
+}
+
+/*!
+ * \brief Converts the 3x3 submatrix of a matrix into fixed point
+ *
+ * The input matrix must be an affine one, i.e. the bottom row must be
+ * 0 0 0 1.  The output matrix is guaranteed to be such that it can be
+ * multiplied with a 16-bit 3-vector without overflowing, while using
+ * the 32-bit range optimally.
+ *
+ * \param mtx  the original matrix
+ * \param elem 9 shorts to hold the fixed point base numbers
+ * \return floating point exponent for the values in \c elem
+ *         (number of bits to shift left for actual values)
+ */
+static M3Gint m3gGetFixedPoint3x3Basis(const Matrix *mtx, M3Gshort *elem)
+{
+    M3Gint outExp;
+    M3Gint row, col;
+    const M3Guint *m;
+    
+    if (!mtx->complete) {
+        m3gFillClassifiedMatrix((Matrix*) mtx);
+    }
+    m = (const M3Guint*) mtx->elem;
+    
+    /* First, find the maximum exponent value in the whole matrix */
+
+    outExp = 0;
+    for (col = 0; col < 3; ++col) {
+        for (row = 0; row < 3; ++row) {
+            M3Gint element = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
+            outExp = M3G_MAX(outExp, element);
+        }
+    }
+    outExp >>= 23;
+
+    /* Our candidate exponent is the maximum found plus 9, which is
+     * guaranteed to shift the maximum unsigned 24-bit mantissa (which
+     * always will have the high bit set) down to the signed 16-bit
+     * range */
+
+    outExp += 9;
+    
+    /* Now proceed to sum each row and see what's the actual smallest
+     * exponent we can safely use without overflowing in a 16+16
+     * matrix-vector multiplication; this will win us one bit
+     * (doubling the precision) compared to the conservative approach
+     * of just shifting everything down by 10 bits */
+
+    for (row = 0; row < 3; ++row) {
+
+        /* Sum the absolute values on this row */
+            
+        M3Gint rowSum = 0;
+        for (col = 0; col < 3; ++col) {
+            M3Gint a = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
+            M3Gint shift = outExp - (a >> 23);
+            M3G_ASSERT(shift < 265);
+                
+            if (shift < 24) {
+                rowSum += ((a & MANTISSA_MASK) | LEADING_ONE) >> shift;
+            }
+        }
+
+        /* Now we have a 26-bit sum of the absolute values on this
+         * row, and shift that down until we fit the target range of
+         * [0, 65535].  Note that this still leaves *exactly* enough
+         * space for adding in an arbitrary 16-bit translation vector
+         * after multiplying with the matrix! */
+            
+        while (rowSum >= (1 << 16)) {
+            rowSum >>= 1;
+            ++outExp;
+        }
+    }
+
+    /* De-bias the exponent, but add in an extra 23 to account for the
+     * decimal bits in the floating point mantissa values we started
+     * with (we're returning the exponent as "bits to shift left to
+     * get integers", so we're off by 23 from IEEE notation) */
+    
+    outExp = (outExp - 127) - 23;
+    
+    /* Finally, shift all the matrix elements to our final output
+     * precision */
+    
+    for (col = 0; col < 3; ++col) {
+        m3gFloatVecToShort(3, mtx->elem + MELEM(0, col), outExp, elem);
+        elem += 3;
+    }
+    return outExp;
+}
+
+/*!
+ * \brief Gets the translation component of a matrix as fixed point
+ *
+ * \param mtx  the matrix
+ * \param elem 3 shorts to write the vector into
+ * \return floating point exponent for the values in \c elem
+ *         (number of bits to shift left for actual values)
+ */
+static M3Gint m3gGetFixedPointTranslation(const Matrix *mtx, M3Gshort *elem)
+{
+    const M3Guint *m;
+    
+    M3G_ASSERT(m3gIsWUnity(mtx));
+    if (!mtx->complete) {
+        m3gFillClassifiedMatrix((Matrix*) mtx);
+    }
+    m = (const M3Guint*) &mtx->elem[MELEM(0, 3)];
+
+    /* Find the maximum exponent, then scale down by 9 bits from that
+     * to shift the unsigned 24-bit mantissas to the signed 16-bit
+     * range */
+    {
+        M3Gint outExp;
+        M3Guint maxElem = m[0] & ~SIGN_MASK;
+        maxElem = M3G_MAX(maxElem, m[1] & ~SIGN_MASK);
+        maxElem = M3G_MAX(maxElem, m[2] & ~SIGN_MASK);
+        
+        outExp = (M3Gint)(maxElem >> 23) - (127 + 23) + 9;
+        m3gFloatVecToShort(3, mtx->elem + MELEM(0, 3), outExp, elem);
+        return outExp;
+    }
+}
+
+/*!
+ * \internal
+ * \brief Compute a bounding box enclosing two other boxes
+ *
+ * \param box   box to fit
+ * \param a     first box to enclose or NULL
+ * \param b     second box to enclose or NULL
+ * 
+ * \note If both input boxes are NULL, the box is not modified.
+ */
+static void m3gFitAABB(AABB *box, const AABB *a, const AABB *b)
+{
+    int i;
+
+    M3G_ASSERT_PTR(box);
+    
+    if (a) {
+        m3gValidateAABB(a);
+    }
+    if (b) {
+        m3gValidateAABB(b);
+    }
+
+    if (a && b) {
+        for (i = 0; i < 3; ++i) {
+            box->min[i] = M3G_MIN(a->min[i], b->min[i]);
+            box->max[i] = M3G_MAX(a->max[i], b->max[i]);
+        }
+    }
+    else if (a) {
+        *box = *a;
+    }
+    else if (b) {
+        *box = *b;
+    }
+}
+
+/*
+ * \internal
+ * \brief Transform an axis-aligned bounding box with a matrix
+ *
+ * This results in a box that encloses the transformed original box.
+ * 
+ * Based on "Transforming Axis-Aligned Bounding Boxes" by Jim Arvo
+ * from Graphics Gems.
+ * 
+ * \note The bottom row of the matrix is ignored in the transformation.
+ */
+static void m3gTransformAABB(AABB *box, const Matrix *mtx)
+{
+    M3Gfloat boxMin[3], boxMax[3];
+    M3Gfloat newMin[3], newMax[3];
+
+    m3gValidateAABB(box);
+    
+    if (!mtx->complete) {
+        m3gFillClassifiedMatrix((Matrix*) mtx);
+    }
+
+    /* Get the original minimum and maximum extents as floats, and add
+     * the translation as the base for the transformed box */
+    {
+        int i;
+        for (i = 0; i < 3; ++i) {
+            boxMin[i] = box->min[i];
+            boxMax[i] = box->max[i];
+            newMin[i] = newMax[i] = M44F(mtx, i, 3);
+        }
+    }
+
+    /* Transform into the new minimum and maximum coordinates using
+     * the upper left 3x3 part of the matrix */
+    {
+        M3Gint row, col;
+        
+        for (row = 0; row < 3; ++row) {
+            for (col = 0; col < 3; ++col) {
+                M3Gfloat a = m3gMul(M44F(mtx, row, col), boxMin[col]);
+                M3Gfloat b = m3gMul(M44F(mtx, row, col), boxMax[col]);
+                
+                if (a < b) { 
+                    newMin[row] = m3gAdd(newMin[row], a);
+                    newMax[row] = m3gAdd(newMax[row], b);
+                }
+                else { 
+                    newMin[row] = m3gAdd(newMin[row], b);
+                    newMax[row] = m3gAdd(newMax[row], a);
+                }
+            }
+        }
+    }
+
+    /* Write back into the bounding box */
+    {
+        int i;
+        for (i = 0; i < 3; ++i) {
+            box->min[i] = newMin[i];
+            box->max[i] = newMax[i];
+        }
+    }
+    
+    m3gValidateAABB(box);
+}
+
+#if defined(M3G_DEBUG)
+/*!
+ * \brief
+ */
+static void m3gValidateAABB(const AABB *aabb)
+{
+    m3gValidateFloats(6, (float*) aabb);
+}
+#endif
+
+/*----------------------------------------------------------------------
+ * Public functions
+ *--------------------------------------------------------------------*/
+
+/*!
+ * \brief Linear interpolation of vectors
+ *
+ * \param size     number of components
+ * \param vec      output vector
+ * \param s        interpolation factor
+ * \param start    initial value
+ * \param end      target value
+ */
+#if defined(M3G_HW_FLOAT_VFPV2)
+
+M3G_API __asm void m3gLerp(M3Gint size,
+				   M3Gfloat *vec,
+				   M3Gfloat s,
+				   const M3Gfloat *start, const M3Gfloat *end)
+{
+// r0 = size
+// r1 = *vec
+// r2 = s
+// r3 = *start
+// sp[0] = *end
+
+		CODE32
+/*
+    M3Gfloat sCompl = 1.0 - s;
+    for (i = 0; i < size; ++i) {
+        vec[i] = sCompl*start[i] + s*end[i];
+    }
+*/
+//
+// if size = 0, return
+//
+		CMP		r0, #0
+		BXEQ	lr
+
+		FMSR	s0, r2
+		MOV		r2, r3
+		LDR		r3, [sp]
+
+		FLDS	s1,=1.0
+		STMFD	sp!, {r4-r5}
+		FSUBS	s2, s1, s0			// sCompl = 1 - s
+
+		FMRX	r4, FPSCR
+		CMP		r0, #4
+		BGT		_m3gLerp_over4Loop
+
+//
+// if 1 < size <= 4
+//
+// set vector STRIDE = 1, LENGTH = 4
+		BIC		r12, r4, #0x00370000
+		ORR		r12, #(3<<16)
+		FMXR	FPSCR, r12
+
+		FLDMIAS	r2!, {s4-s7}		// load the start[i] values
+		FLDMIAS	r3!, {s8-s11}		// load the end[i] values
+		FMULS	s12, s4, s2			// [s12-s15]  = [sCompl*start[0] .. sCompl*start[3]]
+		FMACS	s12, s8, s0			// [s12-s15] += [    	s*end[0] ..        s*end[3]]
+		CMP		r0, #1
+		FSTS	s12, [r1]
+		FSTSGT	s13, [r1, #4]
+		CMP		r0, #3
+		FSTSGE	s14, [r1, #8]
+		FSTSGT	s15, [r1, #12]
+
+		FMXR	FPSCR, r4
+
+		LDMFD	sp!, {r4-r5}
+
+		BX		lr
+//
+// if size > 4, interpolate 8 values in one loop
+//
+_m3gLerp_over4Loop
+
+		FSTMFDD	sp!, {d8-d9}
+		MOVS	r5, r0, ASR #3			// size/8
+		SUB		r0, r0, r5, LSL #3		// tail length
+
+// set vector STRIDE = 1, LENGTH = 8
+		BIC		r12, r4, #0x00370000
+		ORR		r12, #(7<<16)
+		FMXR	FPSCR, r12
+
+
+_m3gLerp_alignedLoop
+
+		FLDMIASNE	r2!, {s8-s15}		// load the start[i] values
+		FLDMIASNE	r3!, {s16-s23}		// load the end[i] values
+		FMULSNE		s24, s8, s2			// [s16-s23]  = [ sCompl*start[0] sCompl*start[1] .. sCompl*start[7]]
+		FMACSNE		s24, s16, s0		// [s16-s23] += [		 s*end[0]        s*end[1] ..		s*end[7]]
+		FSTMIASNE	r1!, {s24-s31}
+		SUBSNE		r5, #1
+
+		BNE			_m3gLerp_alignedLoop
+
+// process the 0-7 remaining values in the tail
+
+		CMP			r0, #1
+		FLDMIASGE	r2!, {s8-s15}		
+		FLDMIASGE	r3!, {s16-s23}		
+		FMULSGE		s24, s8, s2			
+		FMACSGE		s24, s16, s0		
+		FSTSGE		s24, [r1]
+		FSTSGT		s25, [r1, #4]
+		CMP			r0, #3
+		FSTSGE		s26, [r1, #8]
+		FSTSGT		s27, [r1, #12]
+		CMP			r0, #5
+		FSTSGE		s28, [r1, #16]
+		FSTSGT		s29, [r1, #20]
+		CMP			r0, #7
+		FSTSEQ		s30, [r1, #24]
+
+		FMXR	FPSCR, r4
+
+		FLDMFDD	sp!, {d8-d9}
+		LDMFD	sp!, {r4-r5}
+
+		BX		lr
+
+}
+#else /* #if defined(M3G_HW_FLOAT_VFPV2) */
+
+M3G_API void m3gLerp(M3Gint size,
+             M3Gfloat *vec,
+             M3Gfloat s,
+             const M3Gfloat *start, const M3Gfloat *end)
+{
+    int i;
+
+    M3Gfloat sCompl = m3gSub(1.f, s);
+    for (i = 0; i < size; ++i) {
+        vec[i] = m3gAdd(m3gMul(sCompl, start[i]), m3gMul(s, end[i]));
+    }
+}
+#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
+
+/*!
+ * \brief Hermite spline interpolation of vectors
+ *
+ * \param size      number of components
+ * \param vec       output vector
+ * \param s         interpolation factor
+ * \param start     start value vector
+ * \param end       end value vector
+ * \param tStart    start tangent vector
+ * \param tEnd      end tangent vector
+ */
+M3G_API void m3gHermite(M3Gint size,
+                        M3Gfloat *vec,
+                        M3Gfloat s,
+                        const M3Gfloat *start, const M3Gfloat *end,
+                        const M3Gfloat *tStart, const M3Gfloat *tEnd)
+{
+    M3Gfloat s2 = m3gSquare(s);
+    M3Gfloat s3 = m3gMul(s2, s);
+    int i;
+    
+    for (i = 0; i < size; ++i) {
+        vec[i] =
+            m3gMadd(start[i],
+                    m3gAdd(m3gSub(m3gDouble(s3), m3gMul(3.f, s2)), 1.f),
+                    m3gMadd(end[i],
+                            m3gSub(m3gMul(3.f, s2), m3gDouble(s3)),
+                            m3gMadd(tStart[i],
+                                    m3gAdd(m3gSub(s3, m3gDouble(s2)), s),
+                                    m3gMul(tEnd[i],
+                                           m3gSub(s3, s2)))));
+
+    }
+    
+    /*  vec = ( 2*s3 - 3*s2 + 1) * start
+            + (-2*s3 + 3*s2    ) * end
+            + (   s3 - 2*s2 + s) * tStart
+            + (   s3 -   s2    ) * tEnd;    */
+}
+
+/*--------------------------------------------------------------------*/
+
+/*!
+ * \brief Sets a matrix to a copy of another matrix
+ */
+M3G_API void m3gCopyMatrix(Matrix *dst, const Matrix *src)
+{
+    M3G_ASSERT(dst != NULL && src != NULL);
+    *dst = *src;
+}
+
+/*!
+ * \brief Vector addition
+ */
+M3G_API void m3gAddVec3(Vec3 *vec, const Vec3 *other)
+{
+    vec->x = m3gAdd(vec->x, other->x);
+    vec->y = m3gAdd(vec->y, other->y);
+    vec->z = m3gAdd(vec->z, other->z);
+}
+
+/*!
+ * \brief Vector addition
+ */
+M3G_API void m3gAddVec4(Vec4 *vec, const Vec4 *other)
+{
+    vec->x = m3gAdd(vec->x, other->x);
+    vec->y = m3gAdd(vec->y, other->y);
+    vec->z = m3gAdd(vec->z, other->z);
+    vec->w = m3gAdd(vec->w, other->w);
+}
+
+/*!
+ * \brief Cross product of two 3D vectors expressed as 4D vectors
+ */
+M3G_API void m3gCross(Vec3 *dst, const Vec3 *a, const Vec3 *b)
+{
+    dst->x = m3gSub(m3gMul(a->y, b->z), m3gMul(a->z, b->y));
+    dst->y = m3gSub(m3gMul(a->z, b->x), m3gMul(a->x, b->z));
+    dst->z = m3gSub(m3gMul(a->x, b->y), m3gMul(a->y, b->x));
+}
+
+/*!
+ * \brief Dot product of two vectors
+ */
+M3G_API M3Gfloat m3gDot3(const Vec3 *a, const Vec3 *b)
+{
+    M3Gfloat d;
+    d = m3gMul(a->x, b->x);
+    d = m3gMadd(a->y, b->y, d);
+    d = m3gMadd(a->z, b->z, d);
+    return d;
+}
+
+/*!
+ * \brief Dot product of two vectors
+ */
+M3G_API M3Gfloat m3gDot4(const Vec4 *a, const Vec4 *b)
+{
+    M3Gfloat d;
+    d = m3gMul(a->x, b->x);
+    d = m3gMadd(a->y, b->y, d);
+    d = m3gMadd(a->z, b->z, d);
+    d = m3gMadd(a->w, b->w, d);
+    return d;
+}
+
+/*!
+ * \brief
+ */
+M3G_API void m3gSetVec3(Vec3 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z)
+{
+    M3G_ASSERT_PTR(v);
+    v->x = x;
+    v->y = y;
+    v->z = z;
+}
+
+/*!
+ * \brief
+ */
+M3G_API void m3gSetVec4(Vec4 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z, M3Gfloat w)
+{
+    M3G_ASSERT_PTR(v);
+    v->x = x;
+    v->y = y;
+    v->z = z;
+    v->w = w;
+}
+
+/*!
+ * \brief Vector subtraction
+ */
+M3G_API void m3gSubVec3(Vec3 *vec, const Vec3 *other)
+{
+    vec->x = m3gSub(vec->x, other->x);
+    vec->y = m3gSub(vec->y, other->y);
+    vec->z = m3gSub(vec->z, other->z);
+}
+
+/*!
+ * \brief Vector subtraction
+ */
+M3G_API void m3gSubVec4(Vec4 *vec, const Vec4 *other)
+{
+    vec->x = m3gSub(vec->x, other->x);
+    vec->y = m3gSub(vec->y, other->y);
+    vec->z = m3gSub(vec->z, other->z);
+    vec->w = m3gSub(vec->w, other->w);
+}
+
+/*!
+ * \brief Vector length
+ */
+M3G_API M3Gfloat m3gLengthVec3(const Vec3 *vec)
+{
+    return m3gSqrt(m3gAdd(m3gAdd(m3gSquare(vec->x),
+                                 m3gSquare(vec->y)),
+                          m3gSquare(vec->z)));
+}
+
+/*!
+ * \brief Vector scaling
+ */
+M3G_API void m3gScaleVec3(Vec3 *vec, const M3Gfloat s)
+{
+    vec->x = m3gMul(vec->x, s);
+    vec->y = m3gMul(vec->y, s);
+    vec->z = m3gMul(vec->z, s);
+}
+
+/*!
+ * \brief Vector scaling
+ */
+M3G_API void m3gScaleVec4(Vec4 *vec, const M3Gfloat s)
+{
+    vec->x = m3gMul(vec->x, s);
+    vec->y = m3gMul(vec->y, s);
+    vec->z = m3gMul(vec->z, s);
+    vec->w = m3gMul(vec->w, s);
+}
+
+/*!
+ * \brief Returns an angle-axis representation for a quaternion
+ *
+ * \note There are many, and this is not guaranteed to return a
+ * particular one
+ */
+M3G_API void m3gGetAngleAxis(const Quat *quat, M3Gfloat *angle, Vec3 *axis)
+{
+    M3Gfloat x, y, z, sinTheta;
+
+    M3G_ASSERT_PTR(quat);
+    M3G_ASSERT_PTR(angle);
+    M3G_ASSERT_PTR(axis);
+
+    x = quat->x;
+    y = quat->y;
+    z = quat->z;
+
+    sinTheta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(x), m3gSquare(y)),
+                              m3gSquare(z)));
+
+    if (sinTheta > EPSILON) {
+        M3Gfloat ooSinTheta = m3gRcp(sinTheta);
+        axis->x = m3gMul(x, ooSinTheta);
+        axis->y = m3gMul(y, ooSinTheta);
+        axis->z = m3gMul(z, ooSinTheta);
+    }
+    else {
+        /* return a valid axis even for no rotation */
+        axis->x = axis->y = 0.0f;
+        axis->z = 1.0f;
+    }
+    *angle = m3gMul(2.0f * RAD2DEG, m3gArcCos(quat->w));
+}
+
+/*!
+ * \brief Gets a single matrix column
+ */
+M3G_API void m3gGetMatrixColumn(const Matrix *mtx, M3Gint col, Vec4 *dst)
+{
+    if ((col & ~3) == 0) {
+        if (!mtx->complete) {
+            m3gFillClassifiedMatrix((Matrix*)mtx);
+        }
+        dst->x = M44F(mtx, 0, col);
+        dst->y = M44F(mtx, 1, col);
+        dst->z = M44F(mtx, 2, col);
+        dst->w = M44F(mtx, 3, col);
+    }
+    else {
+        M3G_ASSERT(M3G_FALSE);
+    }
+}
+
+/*!
+ * \brief Returns the floating point values of a matrix as consecutive
+ * columns
+ */
+M3G_API void m3gGetMatrixColumns(const Matrix *mtx, M3Gfloat *dst)
+{
+    M3G_ASSERT(mtx != NULL && dst != NULL);
+
+    if (!mtx->complete) {
+        m3gFillClassifiedMatrix((Matrix*)mtx);
+    }
+    m3gCopy(dst, mtx->elem, sizeof(mtx->elem));
+}
+
+/*!
+ * \brief Gets a single matrix row
+ */
+M3G_API void m3gGetMatrixRow(const Matrix *mtx, M3Gint row, Vec4 *dst)
+{
+    if ((row & ~3) == 0) {
+        if (!mtx->complete) {
+            m3gFillClassifiedMatrix((Matrix*)mtx);
+        }
+        dst->x = M44F(mtx, row, 0);
+        dst->y = M44F(mtx, row, 1);
+        dst->z = M44F(mtx, row, 2);
+        dst->w = M44F(mtx, row, 3);
+    }
+    else {
+        M3G_ASSERT(M3G_FALSE);
+    }
+}
+
+/*!
+ * \brief Returns the floating point values of a matrix as consecutive
+ * rows
+ */
+M3G_API void m3gGetMatrixRows(const Matrix *mtx, M3Gfloat *dst)
+{
+    M3G_ASSERT(mtx != NULL && dst != NULL);
+
+    if (!mtx->complete) {
+        m3gFillClassifiedMatrix((Matrix*)mtx);
+    }
+    {
+        int row;
+        for (row = 0; row < 4; ++row) {
+            *dst++ = mtx->elem[ 0 + row];
+            *dst++ = mtx->elem[ 4 + row];
+            *dst++ = mtx->elem[ 8 + row];
+            *dst++ = mtx->elem[12 + row];
+        }
+    }
+}
+
+/*!
+ * \brief Sets a matrix to identity
+ */
+M3G_API void m3gIdentityMatrix(Matrix *mtx)
+{
+    M3G_ASSERT(mtx != NULL);
+    m3gClassifyAs(MC_IDENTITY, mtx);
+}
+
+/*!
+ * \brief Sets a quaternion to identity
+ */
+M3G_API void m3gIdentityQuat(Quat *quat)
+{
+    M3G_ASSERT(quat != NULL);
+    quat->x = quat->y = quat->z = 0.0f;
+    quat->w = 1.0f;
+}
+
+/*!
+ * \brief Inverts a matrix
+ */
+M3G_API M3Gbool m3gInvertMatrix(Matrix *mtx)
+{
+    M3Gfloat *matrix;
+    M3Gint i;
+    M3Gfloat tmp[12];
+    M3Gfloat src[16];
+    M3Gfloat det;
+
+    M3G_ASSERT(mtx != NULL);
+
+    if (!m3gIsClassified(mtx)) {
+        m3gClassify(mtx);
+    }
+
+    /* Quick exit for identity */
+    
+    if (mtx->mask == MC_IDENTITY) {
+        return M3G_TRUE;
+    }
+
+    /* Look for other common cases; these require that we have valid
+     * values in all the elements, so fill in the values first */
+    {
+        M3Guint mask = mtx->mask;
+        
+        if (!mtx->complete) {
+            m3gFillClassifiedMatrix(mtx);
+        }
+        
+        if ((mask | (0x3F << 24)) == MC_TRANSLATION) {
+            M44F(mtx, 0, 3) = m3gNegate(M44F(mtx, 0, 3));
+            M44F(mtx, 1, 3) = m3gNegate(M44F(mtx, 1, 3));
+            M44F(mtx, 2, 3) = m3gNegate(M44F(mtx, 2, 3));
+            mtx->mask = MC_TRANSLATION;
+            return M3G_TRUE;
+        }
+        if ((mask | 0x300C03) == MC_SCALING) {
+            if ((mask &  3       ) == 0 ||
+                (mask & (3 << 10)) == 0 ||
+                (mask & (3 << 20)) == 0) {
+                return M3G_FALSE; /* zero scale for at least one axis */
+            }
+            M44F(mtx, 0, 0) = m3gRcp(M44F(mtx, 0, 0));
+            M44F(mtx, 1, 1) = m3gRcp(M44F(mtx, 1, 1));
+            M44F(mtx, 2, 2) = m3gRcp(M44F(mtx, 2, 2));
+            return M3G_TRUE;
+        }
+    }
+
+    /* Do a full 4x4 inversion as a last resort */
+        
+	matrix = mtx->elem;
+
+    /* transpose matrix */
+    for (i = 0; i < 4; i++) {
+        src[i] = matrix[i*4];
+        src[i+4] = matrix[i*4+1];
+        src[i+8] = matrix[i*4+2];
+        src[i+12] = matrix[i*4+3];
+    }
+
+    /* calculate pairs for first 8 elements (cofactors) */
+    tmp[0] = src[10]*src[15];
+    tmp[1] = src[11]*src[14];
+    tmp[2] = src[9]*src[15];
+    tmp[3] = src[11]*src[13];
+    tmp[4] = src[9]*src[14];
+    tmp[5] = src[10]*src[13];
+    tmp[6] = src[8]*src[15];
+    tmp[7] = src[11]*src[12];
+    tmp[8] = src[8]*src[14];
+    tmp[9] = src[10]*src[12];
+    tmp[10] = src[8]*src[13];
+    tmp[11] = src[9]*src[12];
+
+    /* calculate first 8 elements (cofactors) */
+    matrix[0] = tmp[0]*src[5] + tmp[3]*src[6] + tmp[4]*src[7];
+    matrix[0] -= tmp[1]*src[5] + tmp[2]*src[6] + tmp[5]*src[7];
+    matrix[1] = tmp[1]*src[4] + tmp[6]*src[6] + tmp[9]*src[7];
+    matrix[1] -= tmp[0]*src[4] + tmp[7]*src[6] + tmp[8]*src[7];
+    matrix[2] = tmp[2]*src[4] + tmp[7]*src[5] + tmp[10]*src[7];
+    matrix[2] -= tmp[3]*src[4] + tmp[6]*src[5] + tmp[11]*src[7];
+    matrix[3] = tmp[5]*src[4] + tmp[8]*src[5] + tmp[11]*src[6];
+    matrix[3] -= tmp[4]*src[4] + tmp[9]*src[5] + tmp[10]*src[6];
+    matrix[4] = tmp[1]*src[1] + tmp[2]*src[2] + tmp[5]*src[3];
+    matrix[4] -= tmp[0]*src[1] + tmp[3]*src[2] + tmp[4]*src[3];
+    matrix[5] = tmp[0]*src[0] + tmp[7]*src[2] + tmp[8]*src[3];
+    matrix[5] -= tmp[1]*src[0] + tmp[6]*src[2] + tmp[9]*src[3];
+    matrix[6] = tmp[3]*src[0] + tmp[6]*src[1] + tmp[11]*src[3];
+    matrix[6] -= tmp[2]*src[0] + tmp[7]*src[1] + tmp[10]*src[3];
+    matrix[7] = tmp[4]*src[0] + tmp[9]*src[1] + tmp[10]*src[2];
+    matrix[7] -= tmp[5]*src[0] + tmp[8]*src[1] + tmp[11]*src[2];
+
+    /* calculate pairs for second 8 elements (cofactors) */
+    tmp[0] = src[2]*src[7];
+    tmp[1] = src[3]*src[6];
+    tmp[2] = src[1]*src[7];
+    tmp[3] = src[3]*src[5];
+    tmp[4] = src[1]*src[6];
+    tmp[5] = src[2]*src[5];
+    tmp[6] = src[0]*src[7];
+    tmp[7] = src[3]*src[4];
+    tmp[8] = src[0]*src[6];
+    tmp[9] = src[2]*src[4];
+    tmp[10] = src[0]*src[5];
+    tmp[11] = src[1]*src[4];
+
+    /* calculate second 8 elements (cofactors) */
+    matrix[8] = tmp[0]*src[13] + tmp[3]*src[14] + tmp[4]*src[15];
+    matrix[8] -= tmp[1]*src[13] + tmp[2]*src[14] + tmp[5]*src[15];
+    matrix[9] = tmp[1]*src[12] + tmp[6]*src[14] + tmp[9]*src[15];
+    matrix[9] -= tmp[0]*src[12] + tmp[7]*src[14] + tmp[8]*src[15];
+    matrix[10] = tmp[2]*src[12] + tmp[7]*src[13] + tmp[10]*src[15];
+    matrix[10] -= tmp[3]*src[12] + tmp[6]*src[13] + tmp[11]*src[15];
+    matrix[11] = tmp[5]*src[12] + tmp[8]*src[13] + tmp[11]*src[14];
+    matrix[11] -= tmp[4]*src[12] + tmp[9]*src[13] + tmp[10]*src[14];
+    matrix[12] = tmp[2]*src[10] + tmp[5]*src[11] + tmp[1]*src[9];
+    matrix[12] -= tmp[4]*src[11] + tmp[0]*src[9] + tmp[3]*src[10];
+    matrix[13] = tmp[8]*src[11] + tmp[0]*src[8] + tmp[7]*src[10];
+    matrix[13] -= tmp[6]*src[10] + tmp[9]*src[11] + tmp[1]*src[8];
+    matrix[14] = tmp[6]*src[9] + tmp[11]*src[11] + tmp[3]*src[8];
+    matrix[14] -= tmp[10]*src[11] + tmp[2]*src[8] + tmp[7]*src[9];
+    matrix[15] = tmp[10]*src[10] + tmp[4]*src[8] + tmp[9]*src[9];
+    matrix[15] -= tmp[8]*src[9] + tmp[11]*src[10] + tmp[5]*src[8];
+
+    /* calculate determinant */
+    det = src[0]*matrix[0]+src[1]*matrix[1]+src[2]*matrix[2]+src[3]*matrix[3];
+
+    /* matrix has no inverse */
+    if (det == 0.0f) {
+        return M3G_FALSE;
+    }
+
+    /* calculate matrix inverse */
+    det = 1/det;
+    for (i = 0; i < 16; i++) {
+        matrix[i] *= det;
+    }
+
+    mtx->classified = M3G_FALSE;
+	return M3G_TRUE;
+}
+
+/*!
+ * \brief Sets a matrix to the inverse of another matrix
+ */
+M3G_API M3Gbool m3gMatrixInverse(Matrix *mtx, const Matrix *other)
+{
+    M3G_ASSERT(mtx != NULL && other != NULL);
+
+    if (!m3gIsClassified(other)) {
+        m3gClassify((Matrix*)other);
+    }
+
+	m3gCopyMatrix(mtx, other);
+	return m3gInvertMatrix(mtx);
+}
+
+/*!
+ * \brief Sets a matrix to the transpose of another matrix
+ */
+M3G_API void m3gMatrixTranspose(Matrix *mtx, const Matrix *other)
+{
+    M3Gbyte i;
+    M3G_ASSERT(mtx != NULL && other != NULL);
+
+    if (!other->complete) {
+        m3gFillClassifiedMatrix((Matrix *)other);
+    }
+
+    for (i = 0; i < 4; i++) {
+        mtx->elem[i] = other->elem[i*4];
+        mtx->elem[i+4] = other->elem[i*4+1];
+        mtx->elem[i+8] = other->elem[i*4+2];
+        mtx->elem[i+12] = other->elem[i*4+3];
+    }
+    mtx->classified = M3G_FALSE;
+    mtx->complete = M3G_TRUE;
+}
+
+M3G_API M3Gbool m3gInverseTranspose(Matrix *mtx, const Matrix *other)
+{
+    Matrix temp;
+    if (!m3gMatrixInverse(&temp, other)) {
+        return M3G_FALSE;
+    }
+    m3gMatrixTranspose(mtx, &temp);
+    return M3G_TRUE;
+}
+
+/*!
+ * \brief Sets a matrix to the product of two other matrices
+ *
+ * \note \c dst can not be either of \c left or \c right; if it is,
+ * the results are undefined
+ */
+M3G_API void m3gMatrixProduct(Matrix *dst, const Matrix *left, const Matrix *right)
+{
+    M3G_ASSERT_PTR(dst);
+    M3G_ASSERT_PTR(left);
+    M3G_ASSERT_PTR(right);
+    M3G_ASSERT(dst != left && dst != right);
+
+    /* Classify input matrices and take early exits for identities */
+
+    if (!m3gIsClassified(left)) {
+        m3gClassify((Matrix*)left);
+    }
+    if (left->mask == MC_IDENTITY) {
+        m3gCopyMatrix(dst, right);
+        return;
+    }
+
+    if (!m3gIsClassified(right)) {
+        m3gClassify((Matrix*)right);
+    }
+    if (right->mask == MC_IDENTITY) {
+        m3gCopyMatrix(dst, left);
+        return;
+    }
+
+    /* Special quick paths for 3x4 matrices */
+    
+    if (m3gIsWUnity(left) && m3gIsWUnity(right)) {
+
+        /* Translation? */
+        
+        if ((left->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
+            
+            if (left->mask != MC_TRANSLATION && !left->complete) {
+                m3gFillClassifiedMatrix((Matrix*)left);
+            }
+            if (right->mask != MC_TRANSLATION && !right->complete) {
+                m3gFillClassifiedMatrix((Matrix*)right);
+            }
+
+            m3gCopyMatrix(dst, right);
+            
+            M44F(dst, 0, 3) = m3gAdd(M44F(left, 0, 3), M44F(dst, 0, 3));
+            M44F(dst, 1, 3) = m3gAdd(M44F(left, 1, 3), M44F(dst, 1, 3));
+            M44F(dst, 2, 3) = m3gAdd(M44F(left, 2, 3), M44F(dst, 2, 3));
+            
+            dst->mask |= MC_TRANSLATION_PART;
+            return;
+        }
+
+        if ((right->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
+            Vec4 tvec;
+
+            if (left->mask != MC_TRANSLATION && !left->complete) {
+                m3gFillClassifiedMatrix((Matrix*)left);
+            }
+            if (right->mask != MC_TRANSLATION && !right->complete) {
+                m3gFillClassifiedMatrix((Matrix*)right);
+            }
+
+            m3gCopyMatrix(dst, left);
+            
+            m3gGetMatrixColumn(right, 3, &tvec);
+            m3gTransformVec4(dst, &tvec);
+            
+            M44F(dst, 0, 3) = tvec.x;
+            M44F(dst, 1, 3) = tvec.y;
+            M44F(dst, 2, 3) = tvec.z;
+            
+            dst->mask |= MC_TRANSLATION_PART;
+            return;
+        }
+
+    }
+        
+    /* Compute product and set output classification */
+
+    m3gGenericMatrixProduct(dst, left, right);
+}
+
+/*!
+ * \brief Normalizes a quaternion
+ */
+M3G_API void m3gNormalizeQuat(Quat *q)
+{
+    M3Gfloat norm;
+    M3G_ASSERT_PTR(q);
+    
+    norm = m3gNorm4(&q->x);
+    
+    if (norm > EPSILON) {
+        norm = m3gRcpSqrt(norm);
+        m3gScale4(&q->x, norm);
+    }
+    else {
+        m3gIdentityQuat(q);
+    }
+}
+
+/*!
+ * \brief Normalizes a three-vector
+ */
+M3G_API void m3gNormalizeVec3(Vec3 *v)
+{
+    M3Gfloat norm;
+    M3G_ASSERT_PTR(v);
+    
+    norm = m3gNorm3(&v->x);
+    
+    if (norm > EPSILON) {
+        norm = m3gRcpSqrt(norm);
+        m3gScale3(&v->x, norm);
+    }
+    else {
+        m3gZero(v, sizeof(Vec3));
+    }
+}
+
+/*!
+ * \brief Normalizes a four-vector
+ */
+M3G_API void m3gNormalizeVec4(Vec4 *v)
+{
+    M3Gfloat norm;
+    M3G_ASSERT_PTR(v);
+    
+    norm = m3gNorm4(&v->x);
+    
+    if (norm > EPSILON) {
+        norm = m3gRcpSqrt(norm);
+        m3gScale4(&v->x, norm);
+    }
+    else {
+        m3gZero(v, sizeof(Vec4));
+    }
+}
+
+/*!
+ * \brief Multiplies a matrix from the right with another matrix
+ */
+M3G_API void m3gPostMultiplyMatrix(Matrix *mtx, const Matrix *other)
+{
+    Matrix temp;
+    M3G_ASSERT_PTR(mtx);
+    M3G_ASSERT_PTR(other);
+
+    m3gCopyMatrix(&temp, mtx);
+    m3gMatrixProduct(mtx, &temp, other);
+}
+
+/*!
+ * \brief Multiplies a matrix from the left with another matrix
+ */
+M3G_API void m3gPreMultiplyMatrix(Matrix *mtx, const Matrix *other)
+{
+    Matrix temp;
+    M3G_ASSERT_PTR(mtx);
+    M3G_ASSERT_PTR(other);
+
+    m3gCopyMatrix(&temp, mtx);
+    m3gMatrixProduct(mtx, other, &temp);
+}
+
+/*!
+ * \brief Multiplies a matrix with a rotation matrix.
+ */
+M3G_API void m3gPostRotateMatrix(Matrix *mtx,
+                         M3Gfloat angle,
+                         M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+    Quat q;
+    m3gSetAngleAxis(&q, angle, ax, ay, az);
+    m3gPostRotateMatrixQuat(mtx, &q);
+}
+
+/*!
+ * \brief Multiplies a matrix with a translation matrix.
+ */
+M3G_API void m3gPostRotateMatrixQuat(Matrix *mtx, const Quat *quat)
+{
+    Matrix temp;
+    m3gQuatMatrix(&temp, quat);
+    m3gPostMultiplyMatrix(mtx, &temp);
+}
+
+/*!
+ * \brief Multiplies a matrix with a scale matrix.
+ */
+M3G_API void m3gPostScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
+{
+    Matrix temp;
+    m3gScalingMatrix(&temp, sx, sy, sz);
+    m3gPostMultiplyMatrix(mtx, &temp);
+}
+
+/*!
+ * \brief Multiplies a matrix with a translation (matrix).
+ */
+M3G_API void m3gPostTranslateMatrix(Matrix *mtx,
+                            M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
+{
+    Matrix temp;
+    m3gTranslationMatrix(&temp, tx, ty, tz);
+    m3gPostMultiplyMatrix(mtx, &temp);
+}
+
+/*!
+ * \brief Multiplies a matrix with a rotation matrix
+ */
+M3G_API void m3gPreRotateMatrix(Matrix *mtx,
+                        M3Gfloat angle,
+                        M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+    Quat q;
+    m3gSetAngleAxis(&q, angle, ax, ay, az);
+    m3gPreRotateMatrixQuat(mtx, &q);
+}
+
+/*!
+ * \brief Multiplies a matrix with a quaternion rotation
+ */
+M3G_API void m3gPreRotateMatrixQuat(Matrix *mtx, const Quat *quat)
+{
+    Matrix temp;
+    m3gQuatMatrix(&temp, quat);
+    m3gPreMultiplyMatrix(mtx, &temp);
+}
+
+/*!
+ * \brief Multiplies a matrix with a scale matrix.
+ */
+M3G_API void m3gPreScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
+{
+    Matrix temp;
+    m3gScalingMatrix(&temp, sx, sy, sz);
+    m3gPreMultiplyMatrix(mtx, &temp);
+}
+
+/*!
+ * \brief Multiplies a matrix with a translation (matrix).
+ */
+M3G_API void m3gPreTranslateMatrix(Matrix *mtx,
+                           M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
+{
+    Matrix temp;
+    m3gTranslationMatrix(&temp, tx, ty, tz);
+    m3gPreMultiplyMatrix(mtx, &temp);
+}
+
+/*!
+ * \brief Converts a quaternion into a matrix
+ *
+ * The output is a matrix effecting the same rotation as the
+ * quaternion passed as input
+ */
+M3G_API void m3gQuatMatrix(Matrix *mtx, const Quat *quat)
+{
+    M3Gfloat qx = quat->x;
+    M3Gfloat qy = quat->y;
+    M3Gfloat qz = quat->z;
+    M3Gfloat qw = quat->w;
+
+    /* Quick exit for identity rotations */
+
+    if (IS_ZERO(qx) && IS_ZERO(qy) && IS_ZERO(qz)) {
+        m3gIdentityMatrix(mtx);
+        return;
+    }
+
+    {
+        /* Determine the rough type of the output matrix */
+
+        M3Guint type = MC_SCALING_ROTATION;
+        if (IS_ZERO(qz) && IS_ZERO(qy)) {
+            type = MC_X_ROTATION;
+        }
+        else if (IS_ZERO(qz) && IS_ZERO(qx)) {
+            type = MC_Y_ROTATION;
+        }
+        else if (IS_ZERO(qx) && IS_ZERO(qy)) {
+            type = MC_Z_ROTATION;
+        }
+        m3gClassifyAs(type, mtx);
+
+        /* Generate the non-constant parts of the matrix */
+        {
+            M3Gfloat wx, wy, wz, xx, yy, yz, xy, xz, zz;
+
+            xx = m3gMul(qx, qx);
+            xy = m3gMul(qx, qy);
+            xz = m3gMul(qx, qz);
+            yy = m3gMul(qy, qy);
+            yz = m3gMul(qy, qz);
+            zz = m3gMul(qz, qz);
+            wx = m3gMul(qw, qx);
+            wy = m3gMul(qw, qy);
+            wz = m3gMul(qw, qz);
+
+            if (type != MC_X_ROTATION) {
+                M44F(mtx, 0, 0) = m3gSub(1.f, m3gDouble(m3gAdd(yy, zz)));
+                M44F(mtx, 0, 1) =             m3gDouble(m3gSub(xy, wz));
+                M44F(mtx, 0, 2) =             m3gDouble(m3gAdd(xz, wy));
+            }
+
+            if (type != MC_Y_ROTATION) {
+                M44F(mtx, 1, 0) =             m3gDouble(m3gAdd(xy, wz));
+                M44F(mtx, 1, 1) = m3gSub(1.f, m3gDouble(m3gAdd(xx, zz)));
+                M44F(mtx, 1, 2) =             m3gDouble(m3gSub(yz, wx));
+            }
+
+            if (type != MC_Z_ROTATION) {
+                M44F(mtx, 2, 0) =             m3gDouble(m3gSub(xz, wy));
+                M44F(mtx, 2, 1) =             m3gDouble(m3gAdd(yz, wx));
+                M44F(mtx, 2, 2) = m3gSub(1.f, m3gDouble(m3gAdd(xx, yy)));
+            }
+        }
+        m3gSubClassify(mtx);
+    }
+}
+
+/*!
+ * \brief Generates a scaling matrix
+ */
+M3G_API void m3gScalingMatrix(
+    Matrix *mtx,
+    const M3Gfloat sx, const M3Gfloat sy, const M3Gfloat sz)
+{
+    M3G_ASSERT_PTR(mtx);
+    M44F(mtx, 0, 0) = sx;
+    M44F(mtx, 1, 1) = sy;
+    M44F(mtx, 2, 2) = sz;
+    m3gClassifyAs(MC_SCALING, mtx);
+    m3gSubClassify(mtx);
+}
+
+/*!
+ * \brief Sets a quaternion to represent an angle-axis rotation
+ */
+M3G_API void m3gSetAngleAxis(Quat *quat,
+                     M3Gfloat angle,
+                     M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+    m3gSetAngleAxisRad(quat, m3gMul(angle, M3G_DEG2RAD), ax, ay, az);
+}
+
+/*!
+ * \brief Sets a quaternion to represent an angle-axis rotation
+ */
+M3G_API void m3gSetAngleAxisRad(Quat *quat,
+                        M3Gfloat angleRad,
+                        M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+    M3G_ASSERT_PTR(quat);
+    
+    if (!IS_ZERO(angleRad)) {
+        M3Gfloat s;
+        M3Gfloat halfAngle = m3gHalf(angleRad);
+
+        s = m3gSin(halfAngle);
+        
+        {
+            M3Gfloat sqrNorm = m3gMadd(ax, ax, m3gMadd(ay, ay, m3gMul(az, az)));
+            if (sqrNorm < 0.995f || sqrNorm > 1.005f) {
+                if (sqrNorm > EPSILON) {
+                    M3Gfloat ooNorm = m3gRcpSqrt(sqrNorm);
+                    ax = m3gMul(ax, ooNorm);
+                    ay = m3gMul(ay, ooNorm);
+                    az = m3gMul(az, ooNorm);
+                }
+                else {
+                    ax = ay = az = 0.0f;
+                }
+            }
+        }
+
+        quat->x = m3gMul(s, ax);
+        quat->y = m3gMul(s, ay);
+        quat->z = m3gMul(s, az);
+        quat->w = m3gCos(halfAngle);
+    }
+    else {
+        m3gIdentityQuat(quat);
+    }
+}
+
+/*!
+ * \brief Quaternion multiplication.
+ */
+M3G_API void m3gMulQuat(Quat *quat, const Quat *other)
+{
+    Quat q;
+    q = *quat;
+
+    quat->w = m3gMul(q.w, other->w) - m3gMul(q.x, other->x) - m3gMul(q.y, other->y) - m3gMul(q.z, other->z);
+    quat->x = m3gMul(q.w, other->x) + m3gMul(q.x, other->w) + m3gMul(q.y, other->z) - m3gMul(q.z, other->y);
+    quat->y = m3gMul(q.w, other->y) - m3gMul(q.x, other->z) + m3gMul(q.y, other->w) + m3gMul(q.z, other->x);
+    quat->z = m3gMul(q.w, other->z) + m3gMul(q.x, other->y) - m3gMul(q.y, other->x) + m3gMul(q.z, other->w);
+}
+
+/*!
+ * \brief Makes this quaternion represent the rotation from one 3D
+ * vector to another
+ */
+M3G_API void m3gSetQuatRotation(Quat *quat,
+                                const Vec3 *from, const Vec3 *to)
+{
+    M3Gfloat cosAngle;
+
+    M3G_ASSERT_PTR(quat);
+    M3G_ASSERT_PTR(from);
+    M3G_ASSERT_PTR(to);
+
+    cosAngle = m3gDot3(from, to);
+
+    if (cosAngle > (1.0f - EPSILON)) {  /* zero angle */
+        m3gIdentityQuat(quat);
+        return;
+    }
+    else if (cosAngle > (1.0e-3f - 1.0f)) { /* normal case */
+        Vec3 axis;
+        m3gCross(&axis, from, to);
+        m3gSetAngleAxisRad(quat, m3gArcCos(cosAngle), axis.x, axis.y, axis.z);
+    }
+    else {
+
+        /* Opposite vectors; must generate an arbitrary perpendicular
+         * vector and use that as the rotation axis. Here, we try the
+         * Z axis first, and if that seems too parallel to the
+         * vectors, project the Y axis instead: Z is the only good
+         * choice for Z-constrained rotations, and Y by definition
+         * must be perpendicular to that. */
+
+        Vec3 axis, temp;
+        M3Gfloat s;
+
+        axis.x = axis.y = axis.z = 0.0f;
+        if (m3gAbs(from->z) < (1.0f - EPSILON)) {
+            axis.z = 1.0f;
+        }
+        else {
+            axis.y = 1.0f;
+        }
+
+        s = m3gDot3(&axis, from);
+        temp = *from;
+        m3gScaleVec3(&temp, s);
+        m3gSubVec3(&axis, &temp);
+
+        m3gSetAngleAxis(quat, 180.f, axis.x, axis.y, axis.z);
+    }
+}
+
+/*!
+ * \brief Sets the values of a matrix
+ *
+ */
+M3G_API void m3gSetMatrixColumns(Matrix *mtx, const M3Gfloat *src)
+{
+    M3G_ASSERT(mtx != NULL && src != NULL);
+
+    m3gCopy(mtx->elem, src, sizeof(mtx->elem));
+    mtx->classified = M3G_FALSE;
+    mtx->complete = M3G_TRUE;
+}
+
+/*!
+ * \brief Sets the values of a matrix
+ *
+ */
+M3G_API void m3gSetMatrixRows(Matrix *mtx, const M3Gfloat *src)
+{
+    M3G_ASSERT(mtx != NULL && src != NULL);
+    {
+        int row;
+        for (row = 0; row < 4; ++row) {
+            mtx->elem[ 0 + row] = *src++;
+            mtx->elem[ 4 + row] = *src++;
+            mtx->elem[ 8 + row] = *src++;
+            mtx->elem[12 + row] = *src++;
+        }
+    }
+    mtx->classified = M3G_FALSE;
+    mtx->complete = M3G_TRUE;
+}
+
+/*!
+ * \brief Transforms a 4-vector with a matrix
+ */
+#if defined(M3G_HW_FLOAT_VFPV2)
+
+__asm void _m3gTransformVec4(const Matrix *mtx, Vec4 *vec, M3Gint n)
+{
+
+		CODE32
+
+		FSTMFDD	sp!, {d8-d11}
+
+		FMRX	r3, FPSCR		 
+		BIC		r12, r3, #0x00370000
+		ORR		r12, #(3<<16)
+		FMXR	FPSCR, r12
+		CMP		r2, #4
+
+		FLDMIAS	r0, {s4-s19}		// [mtx0  mtx1 ..]
+		FLDMIAS	r1, {s0-s3}			// [vec.x  vec.y  vec.z  vec.w]
+		FMULS	s20, s4, s0			// [s20-s23]  = [v.x*mtx0  v.x*mtx1  v.x*mtx2  v.x*mtx3 ]
+		FMACS	s20, s8, s1			// [s20-s23] += [v.y*mtx4  v.y*mtx5  v.y*mtx6  v.y*mtx7 ]
+		FMACS	s20, s12, s2		// [s20-s23] += [v.z*mtx8  v.z*mtx9  v.z*mtx10 v.z*mtx11]
+		FMACS	s20, s16, s3		// [s20-s23] += [v.w*mtx12 v.w*mtx13 v.w*mtx14 v.w*mtx15]
+		FSTMIAS		r1!, {s20-s22}
+		FSTMIASEQ	r1, {s23}
+
+		FMXR	FPSCR, r3
+		FLDMFDD	sp!, {d8-d11}
+
+		BX		lr
+}
+#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
+
+M3G_API void m3gTransformVec4(const Matrix *mtx, Vec4 *vec)
+{
+    M3Guint type;
+    M3G_ASSERT(mtx != NULL && vec != NULL);
+
+    if (!m3gIsClassified(mtx)) {
+        m3gClassify((Matrix*)mtx);
+    }
+
+    type = mtx->mask;
+
+    if (type == MC_IDENTITY) {
+        return;
+    }
+    else {
+        Vec4 v = *vec;
+        int i;
+        int n = m3gIsWUnity(mtx) ? 3 : 4;
+
+        if (!mtx->complete) {
+            m3gFillClassifiedMatrix((Matrix*)mtx);
+        }
+#if	defined(M3G_HW_FLOAT_VFPV2)
+		_m3gTransformVec4(mtx, vec, n);
+#else
+        for (i = 0; i < n; ++i) {
+            M3Gfloat d = m3gMul(M44F(mtx, i, 0), v.x);
+            d = m3gMadd(M44F(mtx, i, 1), v.y, d);
+            d = m3gMadd(M44F(mtx, i, 2), v.z, d);
+            d = m3gMadd(M44F(mtx, i, 3), v.w, d);
+            (&vec->x)[i] = d;
+        }
+#endif
+    }
+}
+
+/*!
+ * \brief Generates a translation matrix
+ */
+M3G_API void m3gTranslationMatrix(
+    Matrix *mtx,
+    const M3Gfloat tx, const M3Gfloat ty, const M3Gfloat tz)
+{
+    M3G_ASSERT_PTR(mtx);
+    M44F(mtx, 0, 3) = tx;
+    M44F(mtx, 1, 3) = ty;
+    M44F(mtx, 2, 3) = tz;
+    m3gClassifyAs(MC_TRANSLATION, mtx);
+    m3gSubClassify(mtx);
+}
+
+/*!
+ * \brief Float vector assignment.
+ */
+M3G_API void m3gSetQuat(Quat *quat, const M3Gfloat *vec)
+{
+    quat->x = vec[0];
+    quat->y = vec[1];
+    quat->z = vec[2];
+    quat->w = vec[3];
+}
+
+/*!
+ * \brief Slerp between quaternions q0 and q1, storing the result in quat.
+ */
+M3G_API void m3gSlerpQuat(Quat *quat,
+                  M3Gfloat s,
+                  const Quat *q0, const Quat *q1)
+{
+    M3Gfloat s0, s1;
+    M3Gfloat cosTheta = m3gDot4((const Vec4 *)q0, (const Vec4 *)q1);
+    M3Gfloat oneMinusS = m3gSub(1.0f, s);
+
+    if (cosTheta > EPSILON - 1.0f) {
+        if (cosTheta < 1.0f - EPSILON) {
+            M3Gfloat theta    = m3gArcCos(cosTheta);
+            M3Gfloat sinTheta = m3gSin(theta);
+            s0 = m3gSin(m3gMul(oneMinusS, theta)) / sinTheta;
+            s1 = m3gSin(m3gMul(s, theta)) / sinTheta;
+        }
+        else {
+            /* For quaternions very close to each other, use plain
+               linear interpolation for numerical stability. */
+            s0 = oneMinusS;
+            s1 = s;
+        }
+        quat->x = m3gMadd(s0, q0->x, m3gMul(s1, q1->x));
+        quat->y = m3gMadd(s0, q0->y, m3gMul(s1, q1->y));
+        quat->z = m3gMadd(s0, q0->z, m3gMul(s1, q1->z));
+        quat->w = m3gMadd(s0, q0->w, m3gMul(s1, q1->w));
+    }
+    else {
+        /* Slerp is undefined if the two quaternions are (nearly)
+           opposite, so we just pick an arbitrary plane of
+           rotation here. */
+
+        quat->x = -q0->y;
+        quat->y =  q0->x;
+        quat->z = -q0->w;
+        quat->w =  q0->z;
+
+        s0 = m3gSin(m3gMul(oneMinusS, HALF_PI));
+        s1 = m3gSin(m3gMul(s, HALF_PI));
+
+        quat->x = m3gMadd(s0, q0->x, m3gMul(s1, quat->x));
+        quat->y = m3gMadd(s0, q0->y, m3gMul(s1, quat->y));
+        quat->z = m3gMadd(s0, q0->z, m3gMul(s1, quat->z));
+    }
+}
+
+/*!
+ * \brief Interpolate quaternions using spline, or "squad" interpolation.
+ */
+M3G_API void m3gSquadQuat(Quat *quat,
+                  M3Gfloat s,
+                  const Quat *q0, const Quat *a, const Quat *b, const Quat *q1)
+{
+
+    Quat temp0;
+    Quat temp1;
+    m3gSlerpQuat(&temp0, s, q0, q1);
+    m3gSlerpQuat(&temp1, s, a, b);
+
+    m3gSlerpQuat(quat, m3gDouble(m3gMul(s, m3gSub(1.0f, s))), &temp0, &temp1);
+}
+
+
+