FCL/sf/os/graphics: comparison m3g/m3gcore11/src/m3g

equal deleted inserted replaced

--1:000000000000
+:5d03bc08d59c
+/*
+* Copyright (c) 2003 Nokia Corporation and/or its subsidiary(-ies).
+* All rights reserved.
+* This component and the accompanying materials are made available
+* under the terms of the License "Eclipse Public License v1.0"
+* which accompanies this distribution, and is available
+* at the URL "http://www.eclipse.org/legal/epl-v10.html".
+*
+* Initial Contributors:
+* Nokia Corporation - initial contribution.
+*
+* Contributors:
+*
+* Description: Internal math function implementations
+*
+*/
+/*!
+* \internal
+* \file
+* \brief Internal math function implementations
+*
+*/
+#ifndef M3G_CORE_INCLUDE
+#   error included by m3g_core.c; do not compile separately.
+#endif
+#include "m3g_defs.h"
+#include "m3g_memory.h"
+#if defined(M3G_SOFT_FLOAT)
+#include <math.h>
+#endif
+/*----------------------------------------------------------------------
+* Private types and definitions
+*--------------------------------------------------------------------*/
+/* Enumerated common matrix classifications */
+#define MC_IDENTITY             0x40100401 // 01000000 00010000 00000100 00000001
+#define MC_FRUSTUM              0x30BF0C03 // 00110000 10111111 00001100 00000011
+#define MC_PERSPECTIVE          0x30B00C03 // 00110000 10110000 00001100 00000011
+#define MC_ORTHO                0x7F300C03 // 01111111 00110000 00001100 00000011
+#define MC_PARALLEL             0x70300C03 // 01110000 00110000 00001100 00000011
+#define MC_SCALING_ROTATION     0x403F3F3F // 01000000 00111111 00111111 00111111
+#define MC_SCALING              0x40300C03 // 01000000 00110000 00001100 00000011
+#define MC_TRANSLATION          0x7F100401 // 01111111 00010000 00000100 00000001
+#define MC_X_ROTATION           0x403C3C01 // 01000000 00111100 00111100 00000001
+#define MC_Y_ROTATION           0x40330433 // 01000000 00110011 00000100 00110011
+#define MC_Z_ROTATION           0x40100F0F // 01000000 00010000 00001111 00001111
+#define MC_W_UNITY              0x7F3F3F3F // 01111111 00111111 00111111 00111111
+#define MC_GENERIC              0xFFFFFFFF
+/* Partial masks for individual matrix components */
+#define MC_TRANSLATION_PART     0x3F000000 // 00111111 00000000 00000000 00000000
+#define MC_SCALE_PART           0x00300C03 // 00000000 00110000 00001100 00000011
+#define MC_SCALE_ROTATION_PART  0x003F3F3F // 00000000 00111111 00111111 00111111
+/* Matrix element classification masks */
+#define ELEM_ZERO       0x00
+#define ELEM_ONE        0x01
+#define ELEM_MINUS_ONE  0x02
+#define ELEM_ANY        0x03
+/*!
+* \internal
+* \brief calculates the offset of a 4x4 matrix element in a linear
+* array
+*
+* \notes The current convention is column-major, as in OpenGL ES
+*/
+#define MELEM(row, col) ((row) + ((col) << 2))
+/*!
+* \internal
+* \brief Macro for accessing 4x4 float matrix elements
+*
+* \param mtx pointer to the first element of the matrix
+* \param row matrix row
+* \param col matrix column
+*/
+#define M44F(mtx, row, col)     ((mtx)->elem[MELEM((row), (col))])
+/*--------------------------------------------------------------------*/
+/*----------------------------------------------------------------------
+* Private functions
+*--------------------------------------------------------------------*/
+/*!
+* \internal
+* \brief ARM VFPv2 implementation of 4x4 matrix multiplication.
+*
+* \param dst	multiplication result
+* \param left	left-hand matrix
+* \param right right-hand matrix
+*/
+#if defined(M3G_HW_FLOAT_VFPV2)
+__asm void _m3gGenericMatrixProduct(Matrix *dst,
+const Matrix *left, const Matrix *right)
+{
+// r0 = *dst
+// r1 = *left
+// r2 = *right
+		CODE32
+// save the VFP registers and set the vector STRIDE = 1, LENGTH = 4
+		FSTMFDD	sp!, {d8-d15}
+		FMRX	r3, FPSCR
+		BIC		r12, r3, #0x00370000
+		ORR		r12, #0x00030000
+		FMXR	FPSCR, r12
+// left = [a0  a1  a2  a3	right = [b0  b1  b2  b3
+//	 	   a4  a5  a6  a7		     b4  b5  b6  b7
+//		   a8  a9 a10 a11		     b8  b9 b10 b11
+//		  a12 a13 a14 a15]		    b12 b13 b14 b15]
+//
+// dst = [a0*b0+a4*b1+a8*b2+a12*b3  a1*b0+a5*b1+a9*b2+a13*b3  ..
+//		  a0*b4+a4*b5+a8*b6+a12*b7  a1*b4+a5*b5+a9*b6+a13*b7  ..
+//					.							.
+//					.							.
+		FLDMIAS	r1!, {s8-s23}		// load the left matrix [a0-a15] to registers s8-s23
+		FLDMIAS	r2!, {s0-s7}		// load [b0-b7] of right matrix to registers s0-s7
+		FMULS	s24, s8, s0			// [s24-s27]  = [a0*b0  a1*b0  a2*b0  a3*b0]
+		FMULS	s28, s8, s4			// [s28-s31]  = [a0*b4  a1*b4  a2*b4  a3*b4]
+		FMACS	s24, s12, s1		// [s24-s27] += [a4*b1  a5*b1  a6*b1  a7*b1]
+		FMACS	s28, s12, s5		// [s28-s31] += [a4*b5  a5*b5  a6*b5  a7*b5]
+		FMACS	s24, s16, s2		// [s24-s27] += [a8*b2  a9*b2  a10*b2 a11*b2]
+		FMACS	s28, s16, s6		// [s28-s31] += [a8*b6  a9*b6  a10*b6 a11*b6]
+		FMACS	s24, s20, s3		// [s24-s27] += [a12*b3 a13*b3 a14*b3 a15*b3]
+		FMACS	s28, s20, s7		// [s28-s31] += [a12*b7 a13*b37a14*b7 a15*b7]
+		FLDMIAS	r2!, {s0-s7}		// load [b8-b15]
+		FSTMIAS	r0!, {s24-s31}		// write [dst0-dst7]
+		FMULS	s24, s8, s0
+		FMULS	s28, s8, s4
+		FMACS	s24, s12, s1
+		FMACS	s28, s12, s5
+		FMACS	s24, s16, s2
+		FMACS	s28, s16, s6
+		FMACS	s24, s20, s3
+		FMACS	s28, s20, s7
+		FSTMIAS	r0!, {s24-s31}
+// Restore the VFP registers and return.
+		FMXR	FPSCR, r3
+		FLDMFDD	sp!, {d8-d15}
+		BX		lr
+}
+#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
+/*------------------ Elementary float ------------------*/
+#if defined(M3G_SOFT_FLOAT)
+#if defined (M3G_BUILD_ARM)
+/*!
+* \internal
+* \brief Floating point multiplication implementation for ARM.
+*/
+__asm M3Gfloat m3gMul(const M3Gfloat a,
+const M3Gfloat b)
+{
+/**
+* Extract the exponents of the multiplicands and add them
+* together. Flush to zero if either exponent or their sum
+* is zero.
+*/
+mov     r12, #0xff;
+ands    r2, r0, r12, lsl #23;   // exit if e1 == 0
+andnes  r3, r1, r12, lsl #23;   // exit if e2 == 0
+subne   r2, r2, #(127 << 23);
+addnes  r12, r2, r3;            // exit if e1+e2-127 <= 0
+movle   r0, #0;
+bxle    lr;
+/**
+* Determine the sign of the result. Note that the exponent
+* may have overflowed to the sign bit, and thus the result
+* may be an arbitrary negative value when it really should
+* be +Inf or -Inf.
+*/
+teq     r0, r1;
+orrmi   r12, r12, #0x80000000;
+/**
+* Multiply the mantissas. First shift the mantissas up to
+* unsigned 1.31 fixed point, adding the leading "1" bit at
+* the same time, and finally do a 32x32 -> 64 bit unsigned
+* multiplication. The result is in unsigned 2.62 fixed point,
+* representing the interval [1.0, 4.0).
+*/
+mov     r2, #0x80000000;
+orr     r0, r2, r0, lsl #8;
+orr     r1, r2, r1, lsl #8;
+umulls  r2, r3, r0, r1;
+/**
+* If the highest bit of the 64-bit result is set, then the
+* mantissa lies in [2.0, 4.0) and needs to be renormalized.
+* That is, the mantissa is shifted one bit to the right and
+* the exponent correspondingly increased by 1. Note that we
+* lose the leading "1" bit from the mantissa by adding it up
+* with the exponent.
+*/
+subpl   r12, r12, #(1 << 23);    // no overflow: exponent -= 1
+addpl   r0, r12, r3, lsr #7;     // no overflow: exponent += 1
+addmi   r0, r12, r3, lsr #8;     // overflow: exponent += 1
+bx      lr;
+}
+/*!
+* \internal
+* \brief Floating point addition implementation for ARM.
+*/
+__asm M3Gfloat m3gAdd(const M3Gfloat a,
+const M3Gfloat b)
+{
+/**
+* If the operands have opposite signs then this is not really
+* an addition but a subtraction. Subtraction is much slower,
+* so we have a separate code path for it, rather than trying
+* to save space by handling both in the same place.
+*/
+teq     r0, r1;
+bmi     _m3gSub;
+/**
+* Sort the operands such that the larger operand is in R0 and
+* the smaller in R1. The sign bits do not affect the ordering,
+* since they are known to be equal.
+*/
+subs    r2, r0, r1;
+submi   r0, r0, r2;
+addmi   r1, r1, r2;
+/**
+* Extract the exponent of the smaller operand into R2 and compute
+* the difference between the larger and smaller exponent into R3.
+* (Note that the sign bits cancel out in the subtraction.) The
+* exponent delta tells how many bits the mantissa of the smaller
+* operand must be shifted to the right in order to bring the
+* operands into equal scale.
+*/
+mov     r2, r1, lsr #23;
+rsb     r3, r2, r0, lsr #23;
+/**
+* Check if the exponent delta is bigger than 23 bits, or if the
+* smaller exponent is zero. In either case, exit the routine and
+* return the larger operand (which is already in R0). Note that
+* this means that subnormals are treated as zero.
+*/
+rsbs    r12, r3, #23;               // N set, V clear if R3 > 23
+tstpl   r2, #0xff;                  // execute only if R3 <= 23
+bxle    lr;                         // exit if Z set or N != V
+/**
+* Extract the mantissas and shift them up to unsigned 1.31 fixed
+* point, inserting the implied leading "1" bit at the same time.
+* Finally, align the decimal points and add up the mantissas.
+*/
+mov     r12, #0x80000000;
+orr     r0, r12, r0, lsl #8;
+orr     r1, r12, r1, lsl #8;
+adds    r0, r0, r1, lsr r3;
+/**
+* Compute the final exponent by adding up the smaller exponent
+* (R2), the exponent delta (R3), and the possible overflow bit
+* (carry flag). Note that in case of overflow, the leading "1"
+* has ended up in the carry flag and thus needs not be explicitly
+* discarded. Finally, put the mantissa together with the sign and
+* exponent.
+*/
+adc     r2, r2, r3;                 // r2 = smallExp + deltaExp + overflow
+movcc   r0, r0, lsl #1;             // no overflow: discard leading 1
+mov     r0, r0, lsr #9;
+orr     r0, r0, r2, lsl #23;
+bx      lr;
+_m3gSub
+/**
+* Sort the operands such that the one with larger magnitude is
+* in R0 and has the correct sign (the sign of the final result),
+* while the smaller operand is in R1 with an inverted sign bit.
+*/
+eor     r1, r1, #0x80000000;
+subs    r2, r0, r1;
+eormi   r2, r2, #0x80000000;
+submi   r0, r0, r2;
+addmi   r1, r1, r2;
+/**
+* Extract the exponent of the smaller operand into R2 and compute
+* the difference between the larger and smaller exponent into R3.
+* (Note that the sign bits cancel out in the subtraction.) The
+* exponent delta tells how many bits the mantissa of the smaller
+* operand must be shifted to the right in order to bring the
+* operands into equal scale.
+*/
+mov     r2, r1, lsr #23;
+rsbs    r3, r2, r0, lsr #23;
+/**
+* Check if the exponent delta is bigger than 31 bits, or if the
+* smaller exponent is zero. In either case, exit the routine and
+* return the larger operand (which is already in R0). Note that
+* this means that subnormals are treated as zero.
+*/
+rsbs    r12, r3, #31;               // N set, V clear if R3 > 31
+tstpl   r2, #0xff;                  // execute only if R3 <= 31
+bxle    lr;                         // exit if Z set or N != V
+/**
+* Extract the mantissas and shift them up to unsigned 1.31 fixed
+* point, inserting the implied leading "1" bit at the same time.
+* Then align the decimal points and subtract the smaller mantissa
+* from the larger one.
+*/
+mov     r12, #0x80000000;
+orr     r0, r12, r0, lsl #8;
+orr     r1, r12, r1, lsl #8;
+subs    r0, r0, r1, lsr r3;
+/**
+* We split the range of possible results into three categories:
+*
+*   1. [1.0, 2.0) ==> no renormalization needed (highest bit set)
+*   2. [0.5, 1.0) ==> only one left-shift needed
+*   3. (0.0, 0.5) ==> multiple left-shifts needed
+*   4. zero       ==> just return
+*
+* Cases 1 and 2 are handled in the main code path. Cases 3 and 4
+* are less common by far, so we branch to a separate code fragment
+* for those.
+*/
+movpls  r0, r0, lsl #1;             // Cases 2,3,4: shift left
+bpl     _m3gSubRenormalize;         // Cases 3 & 4: branch out
+/**
+* Now we have normalized the mantissa such that the highest bit
+* is set. Here we only need to adjust the exponent, if necessary,
+* and put the pieces together. Note that we lose the leading "1"
+* bit from the mantissa by adding it up with the exponent. We can
+* also do proper rounding (towards nearest) instead of truncation
+* (towards zero) at no extra cost!
+*/
+sbc     r3, r3, #1;                 // deltaExp -= 1 (Case 1) or 2 (Case 2)
+add     r2, r2, r3;                 // resultExp = smallExp + deltaExp
+movs    r0, r0, lsr #8;             // shift mantissa, keep leading "1"
+adc     r0, r0, r2, lsl #23;        // resultExp += 1, mantissa += carry
+bx      lr;
+/**
+* Separate code path for cases 3 and 4 (see above). The mantissa
+* has already been shifted up by one, but the exponent has not
+* been correspondingly decreased. We also know that the highest
+* bit is still not set, and that the carry flag is clear.
+*/
+_m3gSubRenormalize
+bxeq    lr;
+subcc   r3, r3, #2;
+movccs  r0, r0, lsl #2;
+/**
+* If the carry flag is still not set, i.e. there were more than
+* two leading zeros in the mantissa initially, loop until we
+* find the highest set bit.
+*/
+_m3gSubRenormalizeLoop
+subcc   r3, r3, #1;
+movccs  r0, r0, lsl #1;
+bcc     _m3gSubRenormalizeLoop;
+/**
+* Now the leading "1" is in the carry flag, so we can just add up
+* the exponent and mantissa as usual, doing proper rounding at
+* the same time. However, cases where the exponent goes negative
+* (that is, underflows) must be detected and flushed to zero.
+*/
+add     r3, r2, r3;
+movs    r0, r0, lsr #9;
+adc     r0, r0, r3, lsl #23;
+teq     r0, r2, lsl #23;
+movmi   r0, #0;
+bx      lr;
+}
+#else /* M3G_BUILD_ARM */
+/*!
+* \internal
+* \brief Floating point addition implementation
+*
+*/
+static M3G_INLINE M3Gfloat m3gFloatAdd(const M3Guint aBits,
+const M3Guint bBits)
+{
+M3Guint large, small, signMask;
+/* Early exits for underflow cases */
+large = (M3Gint)(aBits & ~SIGN_MASK);
+if (large <= 0x00800000) {
+return INT_AS_FLOAT(bBits);
+}
+small = (M3Gint)(bBits & ~SIGN_MASK);
+if (small <= 0x00800000) {
+return INT_AS_FLOAT(aBits);
+}
+/* Swap the numbers so that "large" really is larger; the unsigned
+* (or de-signed) bitmasks for floats are nicely monotonous, so we
+* can compare directly.  Also store the sign of the larger number
+* for future reference. */
+if (small > large) {
+M3Gint temp = small;
+small = large;
+large = temp;
+signMask = (bBits & SIGN_MASK);
+}
+else {
+signMask = (aBits & SIGN_MASK);
+}
+{
+M3Guint res, overflow;
+M3Guint resExp, expDelta;
+/* Store the larger exponent as our candidate result exponent,
+* and compute the difference between the exponents */
+resExp = (large >> 23);
+expDelta = resExp - (small >> 23);
+/* Take an early exit if the change would be insignificant;
+* this also guards against odd results from shifting by more
+* than 31 (undefined in C) */
+if (expDelta >= 24) {
+res = large | signMask;
+return INT_AS_FLOAT(res);
+}
+/* Convert the mantissas into shifted integers, and shift the
+* smaller number to the same scale with the larger one. */
+large = (large & MANTISSA_MASK) | LEADING_ONE;
+small = (small & MANTISSA_MASK) | LEADING_ONE;
+small >>= expDelta;
+M3G_ASSERT(large >= small);
+/* Check whether we're really adding or subtracting the
+* smaller number, and branch to slightly different routines
+* respectively */
+if (((aBits ^ bBits) & SIGN_MASK) == 0) {
+/* Matching signs; just add the numbers and check for
+* overflow, shifting to compensate as necessary. */
+res = large + small;
+overflow = (res >> 24);
+res >>= overflow;
+resExp += overflow;
+}
+else {
+/* Different signs, so let's subtract the smaller value;
+* also check that we're not subtracting a number from
+* itself (so we don't have to normalize a zero below) */
+if (small == large) {
+return 0.0f; /* x - x = 0 */
+}
+res = (large << 8) - (small << 8);
+/* Renormalize the number by shifting until we've got the
+* high bit in place */
+while ((res >> 24) == 0) {
+res <<= 8;
+resExp -= 8;
+}
+while ((res >> 31) == 0) {
+res <<= 1;
+--resExp;
+}
+res >>= 8;
+}
+/* Flush to zero in case of over/underflow of the exponent */
+if (resExp >= 255) {
+return 0.0f;
+}
+/* Compose the final number into "res"; note that we pull in
+* the sign of the original larger number, which should still
+* be valid */
+res &= MANTISSA_MASK;
+res |= (resExp << 23);
+res |= signMask;
+return INT_AS_FLOAT(res);
+}
+}
+/*!
+* \internal
+* \brief Floating point multiplication implementation
+*/
+static M3G_INLINE M3Gfloat m3gFloatMul(const M3Guint aBits,
+const M3Guint bBits)
+{
+M3Guint a, b;
+/* Early exits for underflow and multiplication by zero */
+a = (aBits & ~SIGN_MASK);
+if (a <= 0x00800000) {
+return 0.0f;
+}
+b = (bBits & ~SIGN_MASK);
+if (b <= 0x00800000) {
+return 0.0f;
+}
+{
+M3Guint res, exponent, overflow;
+/* Compute the exponent of the result, assuming the mantissas
+* don't overflow; then mask out the original exponents */
+exponent = (a >> 23) + (b >> 23) - 127;
+a &= MANTISSA_MASK;
+b &= MANTISSA_MASK;
+/* Compute the new mantissa from:
+*
+*   (1 + a)(1 + b) = ab + a + b + 1
+*
+* First shift the mantissas from 0.23 down to 0.16 for the
+* multiplication, then shift back to 0.23 for adding in the
+* "a + b + 1" part of the equation.  */
+res = (a >> 7) * (b >> 7);              /* "ab" at 0.32 */
+res = (res >> 9) + a + b + LEADING_ONE;
+/* Add the leading one, then normalize the result by checking
+* the overflow bit and dividing by two if necessary */
+overflow = (res >> 24);
+res >>= overflow;
+exponent += overflow;
+/* Flush to zero in case of over/underflow of the exponent */
+if (exponent >= 255) {
+return 0.0f;
+}
+/* Compose the final number into "res" */
+res &= MANTISSA_MASK;
+res |= (exponent << 23);
+res |= (aBits ^ bBits) & SIGN_MASK;
+return INT_AS_FLOAT(res);
+}
+}
+#endif /* M3G_BUILD_ARM */
+/*!
+* \internal
+* \brief Computes the signed fractional part of a floating point
+* number
+*
+* \param x  floating point value
+* \return x signed fraction of x in ]-1, 1[
+*/
+static M3Gfloat m3gFrac(M3Gfloat x)
+{
+/* Quick exit for -1 < x < 1 */
+if (m3gAbs(x) < 1.0f) {
+return x;
+}
+/* Shift the mantissa to the proper place, mask out the integer
+* part, and renormalize */
+{
+M3Guint ix = FLOAT_AS_UINT(x);
+M3Gint expo = ((ix >> 23) & 0xFF) - 127;
+M3G_ASSERT(expo >= 0);
+/* The decimal part will always be zero for large values with
+* exponents over 24 */
+if (expo >= 24) {
+return 0.f;
+}
+else {
+/* Shift the integer part out of the mantissa and see what
+* we have left */
+M3Guint base = (ix & MANTISSA_MASK) | LEADING_ONE;
+base = (base << expo) & MANTISSA_MASK;
+/* Quick exit (and guard against infinite looping) for
+* zero */
+if (base == 0) {
+return 0.f;
+}
+/* We now have an exponent of 0 (i.e. no shifting), but
+* must renormalize to get a set bit in place of the
+* hidden (implicit one) bit */
+expo = 0;
+while ((base >> 19) == 0) {
+base <<= 4;
+expo -= 4;
+}
+while ((base >> 23) == 0) {
+base <<= 1;
+--expo;
+}
+/* Compose the final number */
+ix =
+(base & MANTISSA_MASK) |
+((expo + 127) << 23) |
+(ix & SIGN_MASK);
+return INT_AS_FLOAT(ix);
+}
+}
+}
+#endif /* M3G_SOFT_FLOAT */
+#if defined(M3G_DEBUG)
+/*!
+* \internal
+* \brief Checks for NaN or infinity in a floating point input
+*/
+static void m3gValidateFloats(int n, float *p)
+{
+while (n-- > 0) {
+M3G_ASSERT(EXPONENT(*p) < 120);
+++p;
+}
+}
+#else
+#   define m3gValidateFloats(n, p)
+#endif
+/*------------------ Trigonometry and exp ----------*/
+#if defined(M3G_SOFT_FLOAT)
+/*!
+* \internal
+* \brief Sine for the first quadrant
+*
+* \param x floating point value in [0, PI/2]
+* \return sine of \c x
+*/
+static M3Gfloat m3gSinFirstQuadrant(M3Gfloat x)
+{
+M3Guint bits = FLOAT_AS_UINT(x);
+if (bits <= 0x3ba3d70au)    /* 0.005f */
+return x;
+else {
+static const M3Gfloat sinTermLut[4] = {
+-1.0f / (2*3),
+-1.0f / (4*5),
+-1.0f / (6*7),
+-1.0f / (8*9)
+};
+M3Gfloat xx = m3gSquare(x);
+M3Gfloat sinx = x;
+int i;
+for (i = 0; i < 4; ++i) {
+x    = m3gMul(x, m3gMul(xx, sinTermLut[i]));
+sinx = m3gAdd(sinx, x);
+}
+return sinx;
+}
+}
+#endif /* M3G_SOFT_FLOAT */
+#if defined(M3G_SOFT_FLOAT)
+/*!
+* \internal
+* \brief Computes sine for the first period
+*
+* \param x floating point value in [0, 2PI]
+* \return sine of x
+*/
+static M3Gfloat m3gSinFirstPeriod(const M3Gfloat x)
+{
+M3G_ASSERT(x >= 0 && x <= TWO_PI);
+if (x >= PI) {
+return m3gNegate(m3gSinFirstQuadrant(x >= ONE_AND_HALF_PI ?
+m3gSub(TWO_PI, x) :
+m3gSub(x, PI)));
+}
+return m3gSinFirstQuadrant((x >= HALF_PI) ? m3gSub(PI, x) : x);
+}
+#endif /* M3G_SOFT_FLOAT */
+/*------------- Float vs. int conversion helpers -------------*/
+/*!
+* \internal
+* \brief Scales and clamps a float to unsigned byte range
+*/
+static M3G_INLINE M3Gint m3gFloatToByte(const M3Gfloat a)
+{
+return m3gRoundToInt(m3gMul(255.f, m3gClampFloat(a, 0.f, 1.f)));
+}
+/*------------------ Vector helpers ------------------*/
+/*!
+* \internal
+* \brief Computes the norm of a floating-point 3-vector
+*/
+static M3G_INLINE M3Gfloat m3gNorm3(const M3Gfloat *v)
+{
+return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
+m3gSquare(v[2]));
+}
+/*!
+* \internal
+* \brief Computes the norm of a floating-point 4-vector
+*/
+static M3G_INLINE M3Gfloat m3gNorm4(const M3Gfloat *v)
+{
+return m3gAdd(m3gAdd(m3gSquare(v[0]), m3gSquare(v[1])),
+m3gAdd(m3gSquare(v[2]), m3gSquare(v[3])));
+}
+/*!
+* \internal
+* \brief Scales a floating-point 3-vector
+*/
+static M3G_INLINE void m3gScale3(M3Gfloat *v, M3Gfloat s)
+{
+v[0] = m3gMul(v[0], s);
+v[1] = m3gMul(v[1], s);
+v[2] = m3gMul(v[2], s);
+}
+/*!
+* \internal
+* \brief Scales a floating-point 4-vector
+*/
+static M3G_INLINE void m3gScale4(M3Gfloat *v, M3Gfloat s)
+{
+v[0] = m3gMul(v[0], s);
+v[1] = m3gMul(v[1], s);
+v[2] = m3gMul(v[2], s);
+v[3] = m3gMul(v[3], s);
+}
+/*------------------ Matrices ------------------*/
+/*!
+* \internal
+*/
+static M3G_INLINE M3Gbool m3gIsClassified(const Matrix *mtx)
+{
+M3G_ASSERT(mtx != NULL);
+return (M3Gbool) mtx->classified;
+}
+/*!
+* \internal
+* \brief Returns a classification for a single floating point number
+*/
+static M3G_INLINE M3Guint m3gElementClass(const M3Gfloat x)
+{
+if (IS_ZERO(x)) {
+return ELEM_ZERO;
+}
+else if (IS_ONE(x)) {
+return ELEM_ONE;
+}
+else if (IS_MINUS_ONE(x)) {
+return ELEM_MINUS_ONE;
+}
+return ELEM_ANY;
+}
+/*!
+* \internal
+* \brief Computes the classification mask of a matrix
+*
+* The mask is constructed from two bits per elements, with the lowest
+* two bits corresponding to the first element in the \c elem array of
+* the matrix.
+*/
+static void m3gClassify(Matrix *mtx)
+{
+M3Guint mask = 0;
+const M3Gfloat *p;
+int i;
+M3G_ASSERT(mtx != NULL);
+M3G_ASSERT(!m3gIsClassified(mtx));
+p = mtx->elem;
+for (i = 0; i < 16; ++i) {
+M3Gfloat elem = *p++;
+mask |= (m3gElementClass(elem) << (i*2));
+}
+mtx->mask = mask;
+mtx->classified = M3G_TRUE;
+}
+/*!
+* \internal
+* \brief Sets matrix classification directly
+*/
+static M3G_INLINE void m3gClassifyAs(M3Guint mask, Matrix *mtx)
+{
+M3G_ASSERT(mtx != NULL);
+mtx->mask = mask;
+mtx->classified = M3G_TRUE;
+mtx->complete = M3G_FALSE;
+}
+/*!
+* \internal
+* \brief Attempts to classify a matrix more precisely
+*
+* Tries to classify all "free" elements of a matrix into one of the
+* predefined constants to improve precision and performance in
+* subsequent calculations.
+*/
+static void m3gSubClassify(Matrix *mtx)
+{
+M3G_ASSERT_PTR(mtx);
+M3G_ASSERT(m3gIsClassified(mtx));
+{
+const M3Gfloat *p = mtx->elem;
+M3Guint inMask = mtx->mask;
+M3Guint outMask = inMask;
+int i;
+for (i = 0; i < 16; ++i, inMask >>= 2) {
+M3Gfloat elem = *p++;
+if ((inMask & 0x03) == ELEM_ANY) {
+outMask &= ~(0x03u << (i*2));
+outMask |= (m3gElementClass(elem) << (i*2));
+}
+}
+mtx->mask = outMask;
+}
+}
+/*!
+* \internal
+* \brief Fills in the implicit values for a classified matrix
+*/
+static void m3gFillClassifiedMatrix(Matrix *mtx)
+{
+int i;
+M3Guint mask;
+M3Gfloat *p;
+M3G_ASSERT(mtx != NULL);
+M3G_ASSERT(mtx->classified);
+M3G_ASSERT(!mtx->complete);
+mask = mtx->mask;
+p = mtx->elem;
+for (i = 0; i < 16; ++i, mask >>= 2) {
+unsigned elem = (mask & 0x03);
+switch (elem) {
+case ELEM_ZERO:         *p++ =  0.0f; break;
+case ELEM_ONE:          *p++ =  1.0f; break;
+case ELEM_MINUS_ONE:    *p++ = -1.0f; break;
+default:                ++p;
+}
+}
+mtx->complete = M3G_TRUE;
+}
+#if !defined(M3G_HW_FLOAT)
+/*!
+* \internal
+* \brief Performs one multiply-add of classified matrix elements
+*
+* \param amask element class of the first multiplicand
+* \param a     float value of the first multiplicand
+* \param bmask element class of the second multiplicand
+* \param b     float value of the second multiplicand
+* \param c     float value to add
+* \return a * b + c
+*
+* \notes inline, as only called from the matrix product function
+*/
+static M3G_INLINE M3Gfloat m3gClassifiedMadd(
+const M3Gbitmask amask, const M3Gfloat *pa,
+const M3Gbitmask bmask, const M3Gfloat *pb,
+const M3Gfloat c)
+{
+/* Check for zero product to reduce the switch cases below */
+if (amask == ELEM_ZERO || bmask == ELEM_ZERO) {
+return c;
+}
+/* Branch based on the classification of a */
+switch (amask) {
+case ELEM_ANY:
+if (bmask == ELEM_ONE) {
+return m3gAdd(*pa, c);      /*  a * 1 + c  =  a + c  */
+}
+if (bmask == ELEM_MINUS_ONE) {
+return m3gSub(c, *pa);      /*  a * -1 + c = -a + c  */
+}
+return m3gMadd(*pa, *pb, c);    /*  a * b + c            */
+case ELEM_ONE:
+if (bmask == ELEM_ONE) {
+return m3gAdd(c, 1.f);      /*  1 * 1 + c  = 1 + c   */
+}
+if (bmask == ELEM_MINUS_ONE) {
+return m3gSub(c, 1.f);      /*  1 * -1 + c = -1 + c  */
+}
+return m3gAdd(*pb, c);          /*  1 * b + c  =  b + c  */
+case ELEM_MINUS_ONE:
+if (bmask == ELEM_ONE) {
+return m3gSub(c, 1.f);      /* -1 * 1 + c  = -1 + c  */
+}
+if (bmask == ELEM_MINUS_ONE) {
+return m3gAdd(c, 1.f);      /* -1 * -1 + c =  1 + c  */
+}
+return m3gSub(c, *pb);          /* -1 * b + c  = -b + c  */
+default:
+M3G_ASSERT(M3G_FALSE);
+return 0.0f;
+}
+}
+#endif /*!defined(M3G_HW_FLOAT)*/
+/*!
+* \internal
+* \brief Computes a generic 4x4 matrix product
+*/
+static void m3gGenericMatrixProduct(Matrix *dst,
+const Matrix *left, const Matrix *right)
+{
+M3G_ASSERT(dst != NULL && left != NULL && right != NULL);
+{
+#       if defined(M3G_HW_FLOAT)
+if (!left->complete) {
+m3gFillClassifiedMatrix((Matrix*)left);
+}
+if (!right->complete) {
+m3gFillClassifiedMatrix((Matrix*)right);
+}
+#       else
+int row;
+const unsigned lmask = left->mask;
+const unsigned rmask = right->mask;
+#       endif
+#if defined(M3G_HW_FLOAT_VFPV2)
+		_m3gGenericMatrixProduct(dst, left, right);
+#else
+for (row = 0; row < 4; ++row) {
+int col;
+for (col = 0; col < 4; ++col) {
+int k;
+M3Gfloat a = 0;
+for (k = 0; k < 4; ++k) {
+M3Gint lidx = MELEM(row, k);
+M3Gint ridx = MELEM(k, col);
+#                   if defined(M3G_HW_FLOAT)
+a = m3gMadd(left->elem[lidx], right->elem[ridx], a);
+#                   else
+a = m3gClassifiedMadd((lmask >> (2 * lidx)) & 3,
+&left->elem[lidx],
+(rmask >> (2 * ridx)) & 3,
+&right->elem[ridx],
+a);
+#                   endif /*!M3G_HW_FLOAT*/
+}
+M44F(dst, row, col) = a;
+}
+}
+#endif /*!M3G_HW_FLOAT_VFPV2*/
+}
+dst->complete = M3G_TRUE;
+dst->classified = M3G_FALSE;
+}
+/*!
+* \internal
+* \brief Converts a float vector to 16-bit integers
+*
+* \param size   vector length
+* \param vec    pointer to float vector
+* \param scale  scale of output values, as the number of bits to
+*               shift left to get actual values
+* \param outVec output value vector
+*/
+static void m3gFloatVecToShort(M3Gint size, const M3Gfloat *vec,
+M3Gint scale, M3Gshort *outVec)
+{
+const M3Guint *vecInt = (const M3Guint*) vec;
+M3Gint i;
+for (i = 0; i < size; ++i) {
+M3Guint a = vecInt[i];
+if ((a & ~SIGN_MASK) < (1 << 23)) {
+*outVec++ = 0;
+}
+else {
+M3Gint shift =
+scale - ((M3Gint)((vecInt[i] >> 23) & 0xFFu) - (127 + 23));
+M3G_ASSERT(shift > 8); /* or the high bits will overflow */
+if (shift > 23) {
+*outVec++ = 0;
+}
+else {
+M3Gint out =
+(M3Gint) (((a & MANTISSA_MASK) | LEADING_ONE) >> shift);
+if (a >> 31) {
+out = -out;
+}
+M3G_ASSERT(m3gInRange(out, -32767, 32767));
+*outVec++ = (M3Gshort) out;
+}
+}
+}
+}
+/*----------------------------------------------------------------------
+* Internal functions
+*--------------------------------------------------------------------*/
+/*--------------------------------------------------------------------*/
+#if defined(M3G_SOFT_FLOAT)
+#if !defined (M3G_BUILD_ARM)
+static M3Gfloat m3gAdd(const M3Gfloat a, const M3Gfloat b)
+{
+return m3gFloatAdd(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
+}
+static M3Gfloat m3gMul(const M3Gfloat a, const M3Gfloat b)
+{
+return m3gFloatMul(FLOAT_AS_UINT(a), FLOAT_AS_UINT(b));
+}
+#endif /* M3G_BUILD_ARM */
+/*!
+* \internal
+* \brief Computes the reciprocal of the square root
+*
+* \param x a floating point value
+* \return 1 / square root of \c x
+*/
+static M3Gfloat m3gRcpSqrt(const M3Gfloat x)
+{
+/* Approximation followed by Newton-Raphson iteration a'la
+* "Floating-point tricks" by J. Blinn, but we iterate several
+* times to improve precision */
+M3Gint i = (M3G_FLOAT_ONE + (M3G_FLOAT_ONE >> 1))
+- (FLOAT_AS_UINT(x) >> 1);
+M3Gfloat y = INT_AS_FLOAT(i);
+for (i = 0; i < 3; ++i) {
+y = m3gMul(y, m3gSub(1.5f, m3gHalf(m3gMul(x, m3gSquare(y)))));
+}
+return y;
+}
+/*!
+* \internal
+* \brief Computes the square root
+*
+* \param x a floating point value
+* \return square root of \c x
+*/
+static M3Gfloat m3gSqrt(const M3Gfloat x)
+{
+/* Approximation followed by Newton-Raphson iteration a'la
+* "Floating-point tricks" by J. Blinn, but we iterate several
+* times to improve precision */
+M3Gint i = (FLOAT_AS_UINT(x) >> 1) + (M3G_FLOAT_ONE >> 1);
+M3Gfloat y = INT_AS_FLOAT(i);
+for (i = 0; i < 2; ++i) {
+y = m3gDiv(m3gAdd(m3gSquare(y), x), m3gDouble(y));
+}
+return y;
+}
+#endif /* M3G_SOFT_FLOAT */
+/*--------------------------------------------------------------------*/
+#if defined(M3G_SOFT_FLOAT)
+/*!
+* \internal
+*/
+static M3Gfloat m3gArcCos(const M3Gfloat x)
+{
+return (M3Gfloat) acos(x);
+}
+/*!
+* \internal
+*/
+static M3Gfloat m3gArcTan(const M3Gfloat y, const M3Gfloat x)
+{
+return (M3Gfloat) atan2(y, x);
+}
+/*!
+* \internal
+*/
+static M3Gfloat m3gCos(const M3Gfloat x)
+{
+return m3gSin(m3gAdd(x, HALF_PI));
+}
+/*!
+* \internal
+* \brief
+*/
+static M3Gfloat m3gSin(const M3Gfloat x)
+{
+M3Gfloat f = x;
+/* If x is greater than two pi, do a modulo operation to bring it
+* back in range for the internal sine function */
+if (m3gAbs(f) >= TWO_PI) {
+f = m3gMul (f, (1.f / TWO_PI));
+f = m3gFrac(f);
+f = m3gMul (f, TWO_PI);
+}
+/* Compute the result, negating both the input value and the
+* result if x was negative */
+{
+M3Guint i = FLOAT_AS_UINT(f);
+M3Guint neg = (i & SIGN_MASK);
+i ^= neg;
+f = m3gSinFirstPeriod(INT_AS_FLOAT(i));
+i = FLOAT_AS_UINT(f) ^ neg;
+return INT_AS_FLOAT(i);
+}
+}
+/*!
+* \internal
+*/
+static M3Gfloat m3gTan(const M3Gfloat x)
+{
+return (M3Gfloat) tan(x);
+}
+/*!
+* \internal
+*/
+static M3Gfloat m3gExp(const M3Gfloat a)
+{
+return (M3Gfloat) exp(a);
+}
+#endif /* M3G_SOFT_FLOAT */
+/*!
+* \brief Checks whether the bottom row of a matrix is 0 0 0 1
+*/
+static M3Gbool m3gIsWUnity(const Matrix *mtx)
+{
+M3G_ASSERT_PTR(mtx);
+if (!m3gIsClassified(mtx)) {
+return (IS_ZERO(M44F(mtx, 3, 0)) &&
+IS_ZERO(M44F(mtx, 3, 1)) &&
+IS_ZERO(M44F(mtx, 3, 2)) &&
+IS_ONE (M44F(mtx, 3, 3)));
+}
+else {
+return ((mtx->mask & 0xC0C0C0C0) == (ELEM_ONE << 30));
+}
+}
+/*!
+* \brief Makes a quaternion by exponentiating a quaternion logarithm
+*/
+static void m3gExpQuat(Quat *quat, const Vec3 *qExp)
+{
+M3Gfloat theta;
+M3G_ASSERT_PTR(quat);
+M3G_ASSERT_PTR(qExp);
+theta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(qExp->x),
+m3gSquare(qExp->y)),
+m3gSquare(qExp->z)));
+if (theta > EPSILON) {
+M3Gfloat s = m3gMul(m3gSin(theta), m3gRcp(theta));
+quat->x = m3gMul(qExp->x, s);
+quat->y = m3gMul(qExp->y, s);
+quat->z = m3gMul(qExp->z, s);
+quat->w = m3gCos(theta);
+}
+else {
+quat->x = quat->y = quat->z = 0.0f;
+quat->w = 1.0f;
+}
+}
+/*!
+* \brief Natural logarithm of a unit quaternion.
+*/
+static void m3gLogQuat(Vec3 *qLog, const Quat *quat)
+{
+M3Gfloat sinTheta = m3gSqrt(m3gNorm3((const float *) quat));
+if (sinTheta > EPSILON) {
+M3Gfloat s = m3gArcTan(sinTheta, quat->w) / sinTheta;
+qLog->x = m3gMul(s, quat->x);
+qLog->y = m3gMul(s, quat->y);
+qLog->z = m3gMul(s, quat->z);
+}
+else {
+qLog->x = qLog->y = qLog->z = 0.0f;
+}
+}
+/*!
+* \brief Make quaternion the "logarithmic difference" between two
+* other quaternions.
+*/
+static void m3gLogDiffQuat(Vec3 *logDiff,
+const Quat *from, const Quat *to)
+{
+Quat temp;
+temp.x = m3gNegate(from->x);
+temp.y = m3gNegate(from->y);
+temp.z = m3gNegate(from->z);
+temp.w =           from->w;
+m3gMulQuat(&temp, to);
+m3gLogQuat(logDiff, &temp);
+}
+/*!
+* \brief Rounds a float to the nearest integer
+*
+* Overflows are clamped to the maximum or minimum representable
+* value.
+*/
+static M3Gint m3gRoundToInt(const M3Gfloat a)
+{
+M3Guint base = FLOAT_AS_UINT(a);
+M3Gint signMask, expo;
+/* Decompose the number into sign, exponent, and base number */
+signMask = ((M3Gint) base >> 31);   /* -> 0 or 0xFFFFFFFF */
+expo = (M3Gint)((base >> 23) & 0xFF) - 127;
+/* First check for large values and return either the negative or
+* the positive maximum integer in case of overflow.  The overflow
+* check can be made on the exponent alone, as large floats are
+* spaced several integer values apart so that nothing will
+* overflow because of rounding later on */
+if (expo >= 31) {
+return (M3Gint)((1U << 31) - 1 + (((M3Guint) signMask) & 1));
+}
+/* Also check for underflow to avoid problems with shifting by
+* more than 31 */
+if (expo < -1) {
+return 0;
+}
+/* Mask out the sign and exponent bits, shift the base number so
+* that the lowest bit corresponds to one half, then add one
+* (half) and shift to round to the closest integer. */
+base = (base | LEADING_ONE) << 8;   /* shift mantissa to 1.31 */
+base =  base >> (30 - expo);        /* integer value as 31.1 */
+base = (base + 1) >> 1;             /* round to nearest 32.0 */
+/* Factor in the sign (negate if originally negative) and
+* return */
+return ((M3Gint) base ^ signMask) - signMask;
+}
+/*!
+* \brief Calculates ray-triangle intersection.
+*
+* http://www.acm.org/jgt/papers/MollerTrumbore97
+*/
+static M3Gbool m3gIntersectTriangle(const Vec3 *orig, const Vec3 *dir,
+const Vec3 *vert0, const Vec3 *vert1, const Vec3 *vert2,
+Vec3 *tuv, M3Gint cullMode)
+{
+Vec3 edge1, edge2, tvec, pvec, qvec;
+M3Gfloat det,inv_det;
+/* find vectors for two edges sharing vert0 */
+edge1 = *vert1;
+edge2 = *vert2;
+m3gSubVec3(&edge1, vert0);
+m3gSubVec3(&edge2, vert0);
+/* begin calculating determinant - also used to calculate U parameter */
+m3gCross(&pvec, dir, &edge2);
+/* if determinant is near zero, ray lies in plane of triangle */
+det = m3gDot3(&edge1, &pvec);
+if (cullMode == 0 && det <= 0) return M3G_FALSE;
+if (cullMode == 1 && det >= 0) return M3G_FALSE;
+if (det > -EPSILON && det < EPSILON)
+return M3G_FALSE;
+inv_det = m3gRcp(det);
+/* calculate distance from vert0 to ray origin */
+tvec = *orig;
+m3gSubVec3(&tvec, vert0);
+/* calculate U parameter and test bounds */
+tuv->y = m3gMul(m3gDot3(&tvec, &pvec), inv_det);
+if (tuv->y < 0.0f || tuv->y > 1.0f)
+return M3G_FALSE;
+/* prepare to test V parameter */
+m3gCross(&qvec, &tvec, &edge1);
+/* calculate V parameter and test bounds */
+tuv->z = m3gMul(m3gDot3(dir, &qvec), inv_det);
+if (tuv->z < 0.0f || m3gAdd(tuv->y, tuv->z) > 1.0f)
+return M3G_FALSE;
+/* calculate t, ray intersects triangle */
+tuv->x = m3gMul(m3gDot3(&edge2, &qvec), inv_det);
+return M3G_TRUE;
+}
+/*!
+* \brief Calculates ray-box intersection.
+*
+* http://www.siggraph.org/education/materials/HyperGraph/raytrace/rtinter3.htm
+*/
+#define XO	orig->x
+#define YO	orig->y
+#define ZO	orig->z
+#define XD	dir->x
+#define YD	dir->y
+#define ZD	dir->z
+#define XL	box->min[0]
+#define YL	box->min[1]
+#define ZL	box->min[2]
+#define XH	box->max[0]
+#define YH	box->max[1]
+#define ZH	box->max[2]
+/*!
+* \internal
+* \brief Ray - bounding box intersection
+*
+*/
+static M3Gbool m3gIntersectBox(const Vec3 *orig, const Vec3 *dir, const AABB *box)
+{
+	M3Gfloat tnear = M3G_MIN_NEGATIVE_FLOAT;
+	M3Gfloat tfar  = M3G_MAX_POSITIVE_FLOAT;
+	M3Gfloat t1, t2, temp;
+	/* X slab */
+	if(XD != 0) {
+		t1 = m3gSub(XL, XO) / XD;
+		t2 = m3gSub(XH, XO) / XD;
+		if(t1 > t2) {
+			temp = t1;
+			t1 = t2;
+			t2 = temp;
+		}
+		if(t1 > tnear) tnear = t1;
+		if(t2 < tfar) tfar = t2;
+		if(tnear > tfar) return M3G_FALSE;
+		if(tfar < 0) return M3G_FALSE;
+	}
+	else {
+		if(XO > XH || XO < XL) return M3G_FALSE;
+	}
+	/* Y slab */
+	if(YD != 0) {
+		t1 = m3gSub(YL, YO) / YD;
+		t2 = m3gSub(YH, YO) / YD;
+		if(t1 > t2) {
+			temp = t1;
+			t1 = t2;
+			t2 = temp;
+		}
+		if(t1 > tnear) tnear = t1;
+		if(t2 < tfar) tfar = t2;
+		if(tnear > tfar) return M3G_FALSE;
+		if(tfar < 0) return M3G_FALSE;
+	}
+	else {
+		if(YO > YH || YO < YL) return M3G_FALSE;
+	}
+	/* Z slab */
+	if(ZD != 0) {
+		t1 = m3gSub(ZL, ZO) / ZD;
+		t2 = m3gSub(ZH, ZO) / ZD;
+		if(t1 > t2) {
+			temp = t1;
+			t1 = t2;
+			t2 = temp;
+		}
+		if(t1 > tnear) tnear = t1;
+		if(t2 < tfar) tfar = t2;
+		if(tnear > tfar) return M3G_FALSE;
+		if(tfar < 0) return M3G_FALSE;
+	}
+	else {
+		if(ZO > ZH || ZO < ZL) return M3G_FALSE;
+	}
+	return M3G_TRUE;
+}
+/*!
+* \brief Calculates the intersection of two rectangles. Always fills
+* the intersection result.
+*
+* \param dst   result of the intersection
+* \param r1    rectangle 1
+* \param r2    rectangle 2
+*/
+static M3Gbool m3gIntersectRectangle(M3GRectangle *dst, M3GRectangle *r1, M3GRectangle *r2)
+{
+M3Gbool intersects = M3G_TRUE;
+M3Gint min, max;
+max = (r1->x) >= (r2->x) ? (r1->x) : (r2->x);
+min = (r1->x + r1->width) <= (r2->x + r2->width) ? (r1->x + r1->width) : (r2->x + r2->width);
+if ((min - max) < 0) intersects = M3G_FALSE;
+dst->x = max;
+dst->width = min - max;
+max = (r1->y) >= (r2->y) ? (r1->y) : (r2->y);
+min = (r1->y + r1->height) <= (r2->y + r2->height) ? (r1->y + r1->height) : (r2->y + r2->height);
+if ((min - max) < 0) intersects = M3G_FALSE;
+dst->y = max;
+dst->height = min - max;
+return intersects;
+}
+/*-------- float-to-int color conversions --------*/
+static M3Guint m3gAlpha1f(M3Gfloat a)
+{
+M3Guint alpha = (M3Guint) m3gFloatToByte(a);
+return (alpha << 24) | M3G_RGB_MASK;
+}
+static M3Guint m3gColor3f(M3Gfloat r, M3Gfloat g, M3Gfloat b)
+{
+return ((M3Guint) m3gFloatToByte(r) << 16)
+|  ((M3Guint) m3gFloatToByte(g) <<  8)
+|   (M3Guint) m3gFloatToByte(b)
+|   M3G_ALPHA_MASK;
+}
+static M3Guint m3gColor4f(M3Gfloat r, M3Gfloat g, M3Gfloat b, M3Gfloat a)
+{
+return ((M3Guint) m3gFloatToByte(r) << 16)
+|  ((M3Guint) m3gFloatToByte(g) <<  8)
+|   (M3Guint) m3gFloatToByte(b)
+|  ((M3Guint) m3gFloatToByte(a) << 24);
+}
+static void m3gFloatColor(M3Gint argb, M3Gfloat intensity, M3Gfloat *rgba)
+{
+/* NOTE we intentionally aim a bit high here -- some GL
+* implementations may round down instead of closest */
+const M3Gfloat oneOver255 = (M3Gfloat)(1.0001 / 255.0);
+	rgba[0] = (M3Gfloat)((argb >> 16) & 0xFF);
+	rgba[1] = (M3Gfloat)((argb >>  8) & 0xFF);
+	rgba[2] = (M3Gfloat)((argb      ) & 0xFF);
+	rgba[3] = (M3Gfloat)((argb >> 24) & 0xFF);
+m3gScale4(rgba, m3gMul(oneOver255, intensity));
+}
+/*!
+* \brief Converts the 3x3 submatrix of a matrix into fixed point
+*
+* The input matrix must be an affine one, i.e. the bottom row must be
+* 0 0 0 1.  The output matrix is guaranteed to be such that it can be
+* multiplied with a 16-bit 3-vector without overflowing, while using
+* the 32-bit range optimally.
+*
+* \param mtx  the original matrix
+* \param elem 9 shorts to hold the fixed point base numbers
+* \return floating point exponent for the values in \c elem
+*         (number of bits to shift left for actual values)
+*/
+static M3Gint m3gGetFixedPoint3x3Basis(const Matrix *mtx, M3Gshort *elem)
+{
+M3Gint outExp;
+M3Gint row, col;
+const M3Guint *m;
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*) mtx);
+}
+m = (const M3Guint*) mtx->elem;
+/* First, find the maximum exponent value in the whole matrix */
+outExp = 0;
+for (col = 0; col < 3; ++col) {
+for (row = 0; row < 3; ++row) {
+M3Gint element = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
+outExp = M3G_MAX(outExp, element);
+}
+}
+outExp >>= 23;
+/* Our candidate exponent is the maximum found plus 9, which is
+* guaranteed to shift the maximum unsigned 24-bit mantissa (which
+* always will have the high bit set) down to the signed 16-bit
+* range */
+outExp += 9;
+/* Now proceed to sum each row and see what's the actual smallest
+* exponent we can safely use without overflowing in a 16+16
+* matrix-vector multiplication; this will win us one bit
+* (doubling the precision) compared to the conservative approach
+* of just shifting everything down by 10 bits */
+for (row = 0; row < 3; ++row) {
+/* Sum the absolute values on this row */
+M3Gint rowSum = 0;
+for (col = 0; col < 3; ++col) {
+M3Gint a = (M3Gint)(m[MELEM(row, col)] & ~SIGN_MASK);
+M3Gint shift = outExp - (a >> 23);
+M3G_ASSERT(shift < 265);
+if (shift < 24) {
+rowSum += ((a & MANTISSA_MASK) | LEADING_ONE) >> shift;
+}
+}
+/* Now we have a 26-bit sum of the absolute values on this
+* row, and shift that down until we fit the target range of
+* [0, 65535].  Note that this still leaves *exactly* enough
+* space for adding in an arbitrary 16-bit translation vector
+* after multiplying with the matrix! */
+while (rowSum >= (1 << 16)) {
+rowSum >>= 1;
+++outExp;
+}
+}
+/* De-bias the exponent, but add in an extra 23 to account for the
+* decimal bits in the floating point mantissa values we started
+* with (we're returning the exponent as "bits to shift left to
+* get integers", so we're off by 23 from IEEE notation) */
+outExp = (outExp - 127) - 23;
+/* Finally, shift all the matrix elements to our final output
+* precision */
+for (col = 0; col < 3; ++col) {
+m3gFloatVecToShort(3, mtx->elem + MELEM(0, col), outExp, elem);
+elem += 3;
+}
+return outExp;
+}
+/*!
+* \brief Gets the translation component of a matrix as fixed point
+*
+* \param mtx  the matrix
+* \param elem 3 shorts to write the vector into
+* \return floating point exponent for the values in \c elem
+*         (number of bits to shift left for actual values)
+*/
+static M3Gint m3gGetFixedPointTranslation(const Matrix *mtx, M3Gshort *elem)
+{
+const M3Guint *m;
+M3G_ASSERT(m3gIsWUnity(mtx));
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*) mtx);
+}
+m = (const M3Guint*) &mtx->elem[MELEM(0, 3)];
+/* Find the maximum exponent, then scale down by 9 bits from that
+* to shift the unsigned 24-bit mantissas to the signed 16-bit
+* range */
+{
+M3Gint outExp;
+M3Guint maxElem = m[0] & ~SIGN_MASK;
+maxElem = M3G_MAX(maxElem, m[1] & ~SIGN_MASK);
+maxElem = M3G_MAX(maxElem, m[2] & ~SIGN_MASK);
+outExp = (M3Gint)(maxElem >> 23) - (127 + 23) + 9;
+m3gFloatVecToShort(3, mtx->elem + MELEM(0, 3), outExp, elem);
+return outExp;
+}
+}
+/*!
+* \internal
+* \brief Compute a bounding box enclosing two other boxes
+*
+* \param box   box to fit
+* \param a     first box to enclose or NULL
+* \param b     second box to enclose or NULL
+*
+* \note If both input boxes are NULL, the box is not modified.
+*/
+static void m3gFitAABB(AABB *box, const AABB *a, const AABB *b)
+{
+int i;
+M3G_ASSERT_PTR(box);
+if (a) {
+m3gValidateAABB(a);
+}
+if (b) {
+m3gValidateAABB(b);
+}
+if (a && b) {
+for (i = 0; i < 3; ++i) {
+box->min[i] = M3G_MIN(a->min[i], b->min[i]);
+box->max[i] = M3G_MAX(a->max[i], b->max[i]);
+}
+}
+else if (a) {
+*box = *a;
+}
+else if (b) {
+*box = *b;
+}
+}
+/*
+* \internal
+* \brief Transform an axis-aligned bounding box with a matrix
+*
+* This results in a box that encloses the transformed original box.
+*
+* Based on "Transforming Axis-Aligned Bounding Boxes" by Jim Arvo
+* from Graphics Gems.
+*
+* \note The bottom row of the matrix is ignored in the transformation.
+*/
+static void m3gTransformAABB(AABB *box, const Matrix *mtx)
+{
+M3Gfloat boxMin[3], boxMax[3];
+M3Gfloat newMin[3], newMax[3];
+m3gValidateAABB(box);
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*) mtx);
+}
+/* Get the original minimum and maximum extents as floats, and add
+* the translation as the base for the transformed box */
+{
+int i;
+for (i = 0; i < 3; ++i) {
+boxMin[i] = box->min[i];
+boxMax[i] = box->max[i];
+newMin[i] = newMax[i] = M44F(mtx, i, 3);
+}
+}
+/* Transform into the new minimum and maximum coordinates using
+* the upper left 3x3 part of the matrix */
+{
+M3Gint row, col;
+for (row = 0; row < 3; ++row) {
+for (col = 0; col < 3; ++col) {
+M3Gfloat a = m3gMul(M44F(mtx, row, col), boxMin[col]);
+M3Gfloat b = m3gMul(M44F(mtx, row, col), boxMax[col]);
+if (a < b) {
+newMin[row] = m3gAdd(newMin[row], a);
+newMax[row] = m3gAdd(newMax[row], b);
+}
+else {
+newMin[row] = m3gAdd(newMin[row], b);
+newMax[row] = m3gAdd(newMax[row], a);
+}
+}
+}
+}
+/* Write back into the bounding box */
+{
+int i;
+for (i = 0; i < 3; ++i) {
+box->min[i] = newMin[i];
+box->max[i] = newMax[i];
+}
+}
+m3gValidateAABB(box);
+}
+#if defined(M3G_DEBUG)
+/*!
+* \brief
+*/
+static void m3gValidateAABB(const AABB *aabb)
+{
+m3gValidateFloats(6, (float*) aabb);
+}
+#endif
+/*----------------------------------------------------------------------
+* Public functions
+*--------------------------------------------------------------------*/
+/*!
+* \brief Linear interpolation of vectors
+*
+* \param size     number of components
+* \param vec      output vector
+* \param s        interpolation factor
+* \param start    initial value
+* \param end      target value
+*/
+#if defined(M3G_HW_FLOAT_VFPV2)
+M3G_API __asm void m3gLerp(M3Gint size,
+				   M3Gfloat *vec,
+				   M3Gfloat s,
+				   const M3Gfloat *start, const M3Gfloat *end)
+{
+// r0 = size
+// r1 = *vec
+// r2 = s
+// r3 = *start
+// sp[0] = *end
+		CODE32
+/*
+M3Gfloat sCompl = 1.0 - s;
+for (i = 0; i < size; ++i) {
+vec[i] = sCompl*start[i] + s*end[i];
+}
+*/
+//
+// if size = 0, return
+//
+		CMP		r0, #0
+		BXEQ	lr
+		FMSR	s0, r2
+		MOV		r2, r3
+		LDR		r3, [sp]
+		FLDS	s1,=1.0
+		STMFD	sp!, {r4-r5}
+		FSUBS	s2, s1, s0			// sCompl = 1 - s
+		FMRX	r4, FPSCR
+		CMP		r0, #4
+		BGT		_m3gLerp_over4Loop
+//
+// if 1 < size <= 4
+//
+// set vector STRIDE = 1, LENGTH = 4
+		BIC		r12, r4, #0x00370000
+		ORR		r12, #(3<<16)
+		FMXR	FPSCR, r12
+		FLDMIAS	r2!, {s4-s7}		// load the start[i] values
+		FLDMIAS	r3!, {s8-s11}		// load the end[i] values
+		FMULS	s12, s4, s2			// [s12-s15]  = [sCompl*start[0] .. sCompl*start[3]]
+		FMACS	s12, s8, s0			// [s12-s15] += [    	s*end[0] ..        s*end[3]]
+		CMP		r0, #1
+		FSTS	s12, [r1]
+		FSTSGT	s13, [r1, #4]
+		CMP		r0, #3
+		FSTSGE	s14, [r1, #8]
+		FSTSGT	s15, [r1, #12]
+		FMXR	FPSCR, r4
+		LDMFD	sp!, {r4-r5}
+		BX		lr
+//
+// if size > 4, interpolate 8 values in one loop
+//
+_m3gLerp_over4Loop
+		FSTMFDD	sp!, {d8-d9}
+		MOVS	r5, r0, ASR #3			// size/8
+		SUB		r0, r0, r5, LSL #3		// tail length
+// set vector STRIDE = 1, LENGTH = 8
+		BIC		r12, r4, #0x00370000
+		ORR		r12, #(7<<16)
+		FMXR	FPSCR, r12
+_m3gLerp_alignedLoop
+		FLDMIASNE	r2!, {s8-s15}		// load the start[i] values
+		FLDMIASNE	r3!, {s16-s23}		// load the end[i] values
+		FMULSNE		s24, s8, s2			// [s16-s23]  = [ sCompl*start[0] sCompl*start[1] .. sCompl*start[7]]
+		FMACSNE		s24, s16, s0		// [s16-s23] += [		 s*end[0]        s*end[1] ..		s*end[7]]
+		FSTMIASNE	r1!, {s24-s31}
+		SUBSNE		r5, #1
+		BNE			_m3gLerp_alignedLoop
+// process the 0-7 remaining values in the tail
+		CMP			r0, #1
+		FLDMIASGE	r2!, {s8-s15}
+		FLDMIASGE	r3!, {s16-s23}
+		FMULSGE		s24, s8, s2
+		FMACSGE		s24, s16, s0
+		FSTSGE		s24, [r1]
+		FSTSGT		s25, [r1, #4]
+		CMP			r0, #3
+		FSTSGE		s26, [r1, #8]
+		FSTSGT		s27, [r1, #12]
+		CMP			r0, #5
+		FSTSGE		s28, [r1, #16]
+		FSTSGT		s29, [r1, #20]
+		CMP			r0, #7
+		FSTSEQ		s30, [r1, #24]
+		FMXR	FPSCR, r4
+		FLDMFDD	sp!, {d8-d9}
+		LDMFD	sp!, {r4-r5}
+		BX		lr
+}
+#else /* #if defined(M3G_HW_FLOAT_VFPV2) */
+M3G_API void m3gLerp(M3Gint size,
+M3Gfloat *vec,
+M3Gfloat s,
+const M3Gfloat *start, const M3Gfloat *end)
+{
+int i;
+M3Gfloat sCompl = m3gSub(1.f, s);
+for (i = 0; i < size; ++i) {
+vec[i] = m3gAdd(m3gMul(sCompl, start[i]), m3gMul(s, end[i]));
+}
+}
+#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
+/*!
+* \brief Hermite spline interpolation of vectors
+*
+* \param size      number of components
+* \param vec       output vector
+* \param s         interpolation factor
+* \param start     start value vector
+* \param end       end value vector
+* \param tStart    start tangent vector
+* \param tEnd      end tangent vector
+*/
+M3G_API void m3gHermite(M3Gint size,
+M3Gfloat *vec,
+M3Gfloat s,
+const M3Gfloat *start, const M3Gfloat *end,
+const M3Gfloat *tStart, const M3Gfloat *tEnd)
+{
+M3Gfloat s2 = m3gSquare(s);
+M3Gfloat s3 = m3gMul(s2, s);
+int i;
+for (i = 0; i < size; ++i) {
+vec[i] =
+m3gMadd(start[i],
+m3gAdd(m3gSub(m3gDouble(s3), m3gMul(3.f, s2)), 1.f),
+m3gMadd(end[i],
+m3gSub(m3gMul(3.f, s2), m3gDouble(s3)),
+m3gMadd(tStart[i],
+m3gAdd(m3gSub(s3, m3gDouble(s2)), s),
+m3gMul(tEnd[i],
+m3gSub(s3, s2)))));
+}
+/*  vec = ( 2*s3 - 3*s2 + 1) * start
++ (-2*s3 + 3*s2    ) * end
++ (   s3 - 2*s2 + s) * tStart
++ (   s3 -   s2    ) * tEnd;    */
+}
+/*--------------------------------------------------------------------*/
+/*!
+* \brief Sets a matrix to a copy of another matrix
+*/
+M3G_API void m3gCopyMatrix(Matrix *dst, const Matrix *src)
+{
+M3G_ASSERT(dst != NULL && src != NULL);
+*dst = *src;
+}
+/*!
+* \brief Vector addition
+*/
+M3G_API void m3gAddVec3(Vec3 *vec, const Vec3 *other)
+{
+vec->x = m3gAdd(vec->x, other->x);
+vec->y = m3gAdd(vec->y, other->y);
+vec->z = m3gAdd(vec->z, other->z);
+}
+/*!
+* \brief Vector addition
+*/
+M3G_API void m3gAddVec4(Vec4 *vec, const Vec4 *other)
+{
+vec->x = m3gAdd(vec->x, other->x);
+vec->y = m3gAdd(vec->y, other->y);
+vec->z = m3gAdd(vec->z, other->z);
+vec->w = m3gAdd(vec->w, other->w);
+}
+/*!
+* \brief Cross product of two 3D vectors expressed as 4D vectors
+*/
+M3G_API void m3gCross(Vec3 *dst, const Vec3 *a, const Vec3 *b)
+{
+dst->x = m3gSub(m3gMul(a->y, b->z), m3gMul(a->z, b->y));
+dst->y = m3gSub(m3gMul(a->z, b->x), m3gMul(a->x, b->z));
+dst->z = m3gSub(m3gMul(a->x, b->y), m3gMul(a->y, b->x));
+}
+/*!
+* \brief Dot product of two vectors
+*/
+M3G_API M3Gfloat m3gDot3(const Vec3 *a, const Vec3 *b)
+{
+M3Gfloat d;
+d = m3gMul(a->x, b->x);
+d = m3gMadd(a->y, b->y, d);
+d = m3gMadd(a->z, b->z, d);
+return d;
+}
+/*!
+* \brief Dot product of two vectors
+*/
+M3G_API M3Gfloat m3gDot4(const Vec4 *a, const Vec4 *b)
+{
+M3Gfloat d;
+d = m3gMul(a->x, b->x);
+d = m3gMadd(a->y, b->y, d);
+d = m3gMadd(a->z, b->z, d);
+d = m3gMadd(a->w, b->w, d);
+return d;
+}
+/*!
+* \brief
+*/
+M3G_API void m3gSetVec3(Vec3 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z)
+{
+M3G_ASSERT_PTR(v);
+v->x = x;
+v->y = y;
+v->z = z;
+}
+/*!
+* \brief
+*/
+M3G_API void m3gSetVec4(Vec4 *v, M3Gfloat x, M3Gfloat y, M3Gfloat z, M3Gfloat w)
+{
+M3G_ASSERT_PTR(v);
+v->x = x;
+v->y = y;
+v->z = z;
+v->w = w;
+}
+/*!
+* \brief Vector subtraction
+*/
+M3G_API void m3gSubVec3(Vec3 *vec, const Vec3 *other)
+{
+vec->x = m3gSub(vec->x, other->x);
+vec->y = m3gSub(vec->y, other->y);
+vec->z = m3gSub(vec->z, other->z);
+}
+/*!
+* \brief Vector subtraction
+*/
+M3G_API void m3gSubVec4(Vec4 *vec, const Vec4 *other)
+{
+vec->x = m3gSub(vec->x, other->x);
+vec->y = m3gSub(vec->y, other->y);
+vec->z = m3gSub(vec->z, other->z);
+vec->w = m3gSub(vec->w, other->w);
+}
+/*!
+* \brief Vector length
+*/
+M3G_API M3Gfloat m3gLengthVec3(const Vec3 *vec)
+{
+return m3gSqrt(m3gAdd(m3gAdd(m3gSquare(vec->x),
+m3gSquare(vec->y)),
+m3gSquare(vec->z)));
+}
+/*!
+* \brief Vector scaling
+*/
+M3G_API void m3gScaleVec3(Vec3 *vec, const M3Gfloat s)
+{
+vec->x = m3gMul(vec->x, s);
+vec->y = m3gMul(vec->y, s);
+vec->z = m3gMul(vec->z, s);
+}
+/*!
+* \brief Vector scaling
+*/
+M3G_API void m3gScaleVec4(Vec4 *vec, const M3Gfloat s)
+{
+vec->x = m3gMul(vec->x, s);
+vec->y = m3gMul(vec->y, s);
+vec->z = m3gMul(vec->z, s);
+vec->w = m3gMul(vec->w, s);
+}
+/*!
+* \brief Returns an angle-axis representation for a quaternion
+*
+* \note There are many, and this is not guaranteed to return a
+* particular one
+*/
+M3G_API void m3gGetAngleAxis(const Quat *quat, M3Gfloat *angle, Vec3 *axis)
+{
+M3Gfloat x, y, z, sinTheta;
+M3G_ASSERT_PTR(quat);
+M3G_ASSERT_PTR(angle);
+M3G_ASSERT_PTR(axis);
+x = quat->x;
+y = quat->y;
+z = quat->z;
+sinTheta = m3gSqrt(m3gAdd(m3gAdd(m3gSquare(x), m3gSquare(y)),
+m3gSquare(z)));
+if (sinTheta > EPSILON) {
+M3Gfloat ooSinTheta = m3gRcp(sinTheta);
+axis->x = m3gMul(x, ooSinTheta);
+axis->y = m3gMul(y, ooSinTheta);
+axis->z = m3gMul(z, ooSinTheta);
+}
+else {
+/* return a valid axis even for no rotation */
+axis->x = axis->y = 0.0f;
+axis->z = 1.0f;
+}
+*angle = m3gMul(2.0f * RAD2DEG, m3gArcCos(quat->w));
+}
+/*!
+* \brief Gets a single matrix column
+*/
+M3G_API void m3gGetMatrixColumn(const Matrix *mtx, M3Gint col, Vec4 *dst)
+{
+if ((col & ~3) == 0) {
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*)mtx);
+}
+dst->x = M44F(mtx, 0, col);
+dst->y = M44F(mtx, 1, col);
+dst->z = M44F(mtx, 2, col);
+dst->w = M44F(mtx, 3, col);
+}
+else {
+M3G_ASSERT(M3G_FALSE);
+}
+}
+/*!
+* \brief Returns the floating point values of a matrix as consecutive
+* columns
+*/
+M3G_API void m3gGetMatrixColumns(const Matrix *mtx, M3Gfloat *dst)
+{
+M3G_ASSERT(mtx != NULL && dst != NULL);
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*)mtx);
+}
+m3gCopy(dst, mtx->elem, sizeof(mtx->elem));
+}
+/*!
+* \brief Gets a single matrix row
+*/
+M3G_API void m3gGetMatrixRow(const Matrix *mtx, M3Gint row, Vec4 *dst)
+{
+if ((row & ~3) == 0) {
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*)mtx);
+}
+dst->x = M44F(mtx, row, 0);
+dst->y = M44F(mtx, row, 1);
+dst->z = M44F(mtx, row, 2);
+dst->w = M44F(mtx, row, 3);
+}
+else {
+M3G_ASSERT(M3G_FALSE);
+}
+}
+/*!
+* \brief Returns the floating point values of a matrix as consecutive
+* rows
+*/
+M3G_API void m3gGetMatrixRows(const Matrix *mtx, M3Gfloat *dst)
+{
+M3G_ASSERT(mtx != NULL && dst != NULL);
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*)mtx);
+}
+{
+int row;
+for (row = 0; row < 4; ++row) {
+*dst++ = mtx->elem[ 0 + row];
+*dst++ = mtx->elem[ 4 + row];
+*dst++ = mtx->elem[ 8 + row];
+*dst++ = mtx->elem[12 + row];
+}
+}
+}
+/*!
+* \brief Sets a matrix to identity
+*/
+M3G_API void m3gIdentityMatrix(Matrix *mtx)
+{
+M3G_ASSERT(mtx != NULL);
+m3gClassifyAs(MC_IDENTITY, mtx);
+}
+/*!
+* \brief Sets a quaternion to identity
+*/
+M3G_API void m3gIdentityQuat(Quat *quat)
+{
+M3G_ASSERT(quat != NULL);
+quat->x = quat->y = quat->z = 0.0f;
+quat->w = 1.0f;
+}
+/*!
+* \brief Inverts a matrix
+*/
+M3G_API M3Gbool m3gInvertMatrix(Matrix *mtx)
+{
+M3Gfloat *matrix;
+M3Gint i;
+M3Gfloat tmp[12];
+M3Gfloat src[16];
+M3Gfloat det;
+M3G_ASSERT(mtx != NULL);
+if (!m3gIsClassified(mtx)) {
+m3gClassify(mtx);
+}
+/* Quick exit for identity */
+if (mtx->mask == MC_IDENTITY) {
+return M3G_TRUE;
+}
+/* Look for other common cases; these require that we have valid
+* values in all the elements, so fill in the values first */
+{
+M3Guint mask = mtx->mask;
+if (!mtx->complete) {
+m3gFillClassifiedMatrix(mtx);
+}
+if ((mask | (0x3F << 24)) == MC_TRANSLATION) {
+M44F(mtx, 0, 3) = m3gNegate(M44F(mtx, 0, 3));
+M44F(mtx, 1, 3) = m3gNegate(M44F(mtx, 1, 3));
+M44F(mtx, 2, 3) = m3gNegate(M44F(mtx, 2, 3));
+mtx->mask = MC_TRANSLATION;
+return M3G_TRUE;
+}
+if ((mask | 0x300C03) == MC_SCALING) {
+if ((mask &  3       ) == 0 ||
+(mask & (3 << 10)) == 0 ||
+(mask & (3 << 20)) == 0) {
+return M3G_FALSE; /* zero scale for at least one axis */
+}
+M44F(mtx, 0, 0) = m3gRcp(M44F(mtx, 0, 0));
+M44F(mtx, 1, 1) = m3gRcp(M44F(mtx, 1, 1));
+M44F(mtx, 2, 2) = m3gRcp(M44F(mtx, 2, 2));
+return M3G_TRUE;
+}
+}
+/* Do a full 4x4 inversion as a last resort */
+	matrix = mtx->elem;
+/* transpose matrix */
+for (i = 0; i < 4; i++) {
+src[i] = matrix[i*4];
+src[i+4] = matrix[i*4+1];
+src[i+8] = matrix[i*4+2];
+src[i+12] = matrix[i*4+3];
+}
+/* calculate pairs for first 8 elements (cofactors) */
+tmp[0] = src[10]*src[15];
+tmp[1] = src[11]*src[14];
+tmp[2] = src[9]*src[15];
+tmp[3] = src[11]*src[13];
+tmp[4] = src[9]*src[14];
+tmp[5] = src[10]*src[13];
+tmp[6] = src[8]*src[15];
+tmp[7] = src[11]*src[12];
+tmp[8] = src[8]*src[14];
+tmp[9] = src[10]*src[12];
+tmp[10] = src[8]*src[13];
+tmp[11] = src[9]*src[12];
+/* calculate first 8 elements (cofactors) */
+matrix[0] = tmp[0]*src[5] + tmp[3]*src[6] + tmp[4]*src[7];
+matrix[0] -= tmp[1]*src[5] + tmp[2]*src[6] + tmp[5]*src[7];
+matrix[1] = tmp[1]*src[4] + tmp[6]*src[6] + tmp[9]*src[7];
+matrix[1] -= tmp[0]*src[4] + tmp[7]*src[6] + tmp[8]*src[7];
+matrix[2] = tmp[2]*src[4] + tmp[7]*src[5] + tmp[10]*src[7];
+matrix[2] -= tmp[3]*src[4] + tmp[6]*src[5] + tmp[11]*src[7];
+matrix[3] = tmp[5]*src[4] + tmp[8]*src[5] + tmp[11]*src[6];
+matrix[3] -= tmp[4]*src[4] + tmp[9]*src[5] + tmp[10]*src[6];
+matrix[4] = tmp[1]*src[1] + tmp[2]*src[2] + tmp[5]*src[3];
+matrix[4] -= tmp[0]*src[1] + tmp[3]*src[2] + tmp[4]*src[3];
+matrix[5] = tmp[0]*src[0] + tmp[7]*src[2] + tmp[8]*src[3];
+matrix[5] -= tmp[1]*src[0] + tmp[6]*src[2] + tmp[9]*src[3];
+matrix[6] = tmp[3]*src[0] + tmp[6]*src[1] + tmp[11]*src[3];
+matrix[6] -= tmp[2]*src[0] + tmp[7]*src[1] + tmp[10]*src[3];
+matrix[7] = tmp[4]*src[0] + tmp[9]*src[1] + tmp[10]*src[2];
+matrix[7] -= tmp[5]*src[0] + tmp[8]*src[1] + tmp[11]*src[2];
+/* calculate pairs for second 8 elements (cofactors) */
+tmp[0] = src[2]*src[7];
+tmp[1] = src[3]*src[6];
+tmp[2] = src[1]*src[7];
+tmp[3] = src[3]*src[5];
+tmp[4] = src[1]*src[6];
+tmp[5] = src[2]*src[5];
+tmp[6] = src[0]*src[7];
+tmp[7] = src[3]*src[4];
+tmp[8] = src[0]*src[6];
+tmp[9] = src[2]*src[4];
+tmp[10] = src[0]*src[5];
+tmp[11] = src[1]*src[4];
+/* calculate second 8 elements (cofactors) */
+matrix[8] = tmp[0]*src[13] + tmp[3]*src[14] + tmp[4]*src[15];
+matrix[8] -= tmp[1]*src[13] + tmp[2]*src[14] + tmp[5]*src[15];
+matrix[9] = tmp[1]*src[12] + tmp[6]*src[14] + tmp[9]*src[15];
+matrix[9] -= tmp[0]*src[12] + tmp[7]*src[14] + tmp[8]*src[15];
+matrix[10] = tmp[2]*src[12] + tmp[7]*src[13] + tmp[10]*src[15];
+matrix[10] -= tmp[3]*src[12] + tmp[6]*src[13] + tmp[11]*src[15];
+matrix[11] = tmp[5]*src[12] + tmp[8]*src[13] + tmp[11]*src[14];
+matrix[11] -= tmp[4]*src[12] + tmp[9]*src[13] + tmp[10]*src[14];
+matrix[12] = tmp[2]*src[10] + tmp[5]*src[11] + tmp[1]*src[9];
+matrix[12] -= tmp[4]*src[11] + tmp[0]*src[9] + tmp[3]*src[10];
+matrix[13] = tmp[8]*src[11] + tmp[0]*src[8] + tmp[7]*src[10];
+matrix[13] -= tmp[6]*src[10] + tmp[9]*src[11] + tmp[1]*src[8];
+matrix[14] = tmp[6]*src[9] + tmp[11]*src[11] + tmp[3]*src[8];
+matrix[14] -= tmp[10]*src[11] + tmp[2]*src[8] + tmp[7]*src[9];
+matrix[15] = tmp[10]*src[10] + tmp[4]*src[8] + tmp[9]*src[9];
+matrix[15] -= tmp[8]*src[9] + tmp[11]*src[10] + tmp[5]*src[8];
+/* calculate determinant */
+det = src[0]*matrix[0]+src[1]*matrix[1]+src[2]*matrix[2]+src[3]*matrix[3];
+/* matrix has no inverse */
+if (det == 0.0f) {
+return M3G_FALSE;
+}
+/* calculate matrix inverse */
+det = 1/det;
+for (i = 0; i < 16; i++) {
+matrix[i] *= det;
+}
+mtx->classified = M3G_FALSE;
+	return M3G_TRUE;
+}
+/*!
+* \brief Sets a matrix to the inverse of another matrix
+*/
+M3G_API M3Gbool m3gMatrixInverse(Matrix *mtx, const Matrix *other)
+{
+M3G_ASSERT(mtx != NULL && other != NULL);
+if (!m3gIsClassified(other)) {
+m3gClassify((Matrix*)other);
+}
+	m3gCopyMatrix(mtx, other);
+	return m3gInvertMatrix(mtx);
+}
+/*!
+* \brief Sets a matrix to the transpose of another matrix
+*/
+M3G_API void m3gMatrixTranspose(Matrix *mtx, const Matrix *other)
+{
+M3Gbyte i;
+M3G_ASSERT(mtx != NULL && other != NULL);
+if (!other->complete) {
+m3gFillClassifiedMatrix((Matrix *)other);
+}
+for (i = 0; i < 4; i++) {
+mtx->elem[i] = other->elem[i*4];
+mtx->elem[i+4] = other->elem[i*4+1];
+mtx->elem[i+8] = other->elem[i*4+2];
+mtx->elem[i+12] = other->elem[i*4+3];
+}
+mtx->classified = M3G_FALSE;
+mtx->complete = M3G_TRUE;
+}
+M3G_API M3Gbool m3gInverseTranspose(Matrix *mtx, const Matrix *other)
+{
+Matrix temp;
+if (!m3gMatrixInverse(&temp, other)) {
+return M3G_FALSE;
+}
+m3gMatrixTranspose(mtx, &temp);
+return M3G_TRUE;
+}
+/*!
+* \brief Sets a matrix to the product of two other matrices
+*
+* \note \c dst can not be either of \c left or \c right; if it is,
+* the results are undefined
+*/
+M3G_API void m3gMatrixProduct(Matrix *dst, const Matrix *left, const Matrix *right)
+{
+M3G_ASSERT_PTR(dst);
+M3G_ASSERT_PTR(left);
+M3G_ASSERT_PTR(right);
+M3G_ASSERT(dst != left && dst != right);
+/* Classify input matrices and take early exits for identities */
+if (!m3gIsClassified(left)) {
+m3gClassify((Matrix*)left);
+}
+if (left->mask == MC_IDENTITY) {
+m3gCopyMatrix(dst, right);
+return;
+}
+if (!m3gIsClassified(right)) {
+m3gClassify((Matrix*)right);
+}
+if (right->mask == MC_IDENTITY) {
+m3gCopyMatrix(dst, left);
+return;
+}
+/* Special quick paths for 3x4 matrices */
+if (m3gIsWUnity(left) && m3gIsWUnity(right)) {
+/* Translation? */
+if ((left->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
+if (left->mask != MC_TRANSLATION && !left->complete) {
+m3gFillClassifiedMatrix((Matrix*)left);
+}
+if (right->mask != MC_TRANSLATION && !right->complete) {
+m3gFillClassifiedMatrix((Matrix*)right);
+}
+m3gCopyMatrix(dst, right);
+M44F(dst, 0, 3) = m3gAdd(M44F(left, 0, 3), M44F(dst, 0, 3));
+M44F(dst, 1, 3) = m3gAdd(M44F(left, 1, 3), M44F(dst, 1, 3));
+M44F(dst, 2, 3) = m3gAdd(M44F(left, 2, 3), M44F(dst, 2, 3));
+dst->mask |= MC_TRANSLATION_PART;
+return;
+}
+if ((right->mask & ~MC_TRANSLATION_PART) == MC_IDENTITY) {
+Vec4 tvec;
+if (left->mask != MC_TRANSLATION && !left->complete) {
+m3gFillClassifiedMatrix((Matrix*)left);
+}
+if (right->mask != MC_TRANSLATION && !right->complete) {
+m3gFillClassifiedMatrix((Matrix*)right);
+}
+m3gCopyMatrix(dst, left);
+m3gGetMatrixColumn(right, 3, &tvec);
+m3gTransformVec4(dst, &tvec);
+M44F(dst, 0, 3) = tvec.x;
+M44F(dst, 1, 3) = tvec.y;
+M44F(dst, 2, 3) = tvec.z;
+dst->mask |= MC_TRANSLATION_PART;
+return;
+}
+}
+/* Compute product and set output classification */
+m3gGenericMatrixProduct(dst, left, right);
+}
+/*!
+* \brief Normalizes a quaternion
+*/
+M3G_API void m3gNormalizeQuat(Quat *q)
+{
+M3Gfloat norm;
+M3G_ASSERT_PTR(q);
+norm = m3gNorm4(&q->x);
+if (norm > EPSILON) {
+norm = m3gRcpSqrt(norm);
+m3gScale4(&q->x, norm);
+}
+else {
+m3gIdentityQuat(q);
+}
+}
+/*!
+* \brief Normalizes a three-vector
+*/
+M3G_API void m3gNormalizeVec3(Vec3 *v)
+{
+M3Gfloat norm;
+M3G_ASSERT_PTR(v);
+norm = m3gNorm3(&v->x);
+if (norm > EPSILON) {
+norm = m3gRcpSqrt(norm);
+m3gScale3(&v->x, norm);
+}
+else {
+m3gZero(v, sizeof(Vec3));
+}
+}
+/*!
+* \brief Normalizes a four-vector
+*/
+M3G_API void m3gNormalizeVec4(Vec4 *v)
+{
+M3Gfloat norm;
+M3G_ASSERT_PTR(v);
+norm = m3gNorm4(&v->x);
+if (norm > EPSILON) {
+norm = m3gRcpSqrt(norm);
+m3gScale4(&v->x, norm);
+}
+else {
+m3gZero(v, sizeof(Vec4));
+}
+}
+/*!
+* \brief Multiplies a matrix from the right with another matrix
+*/
+M3G_API void m3gPostMultiplyMatrix(Matrix *mtx, const Matrix *other)
+{
+Matrix temp;
+M3G_ASSERT_PTR(mtx);
+M3G_ASSERT_PTR(other);
+m3gCopyMatrix(&temp, mtx);
+m3gMatrixProduct(mtx, &temp, other);
+}
+/*!
+* \brief Multiplies a matrix from the left with another matrix
+*/
+M3G_API void m3gPreMultiplyMatrix(Matrix *mtx, const Matrix *other)
+{
+Matrix temp;
+M3G_ASSERT_PTR(mtx);
+M3G_ASSERT_PTR(other);
+m3gCopyMatrix(&temp, mtx);
+m3gMatrixProduct(mtx, other, &temp);
+}
+/*!
+* \brief Multiplies a matrix with a rotation matrix.
+*/
+M3G_API void m3gPostRotateMatrix(Matrix *mtx,
+M3Gfloat angle,
+M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+Quat q;
+m3gSetAngleAxis(&q, angle, ax, ay, az);
+m3gPostRotateMatrixQuat(mtx, &q);
+}
+/*!
+* \brief Multiplies a matrix with a translation matrix.
+*/
+M3G_API void m3gPostRotateMatrixQuat(Matrix *mtx, const Quat *quat)
+{
+Matrix temp;
+m3gQuatMatrix(&temp, quat);
+m3gPostMultiplyMatrix(mtx, &temp);
+}
+/*!
+* \brief Multiplies a matrix with a scale matrix.
+*/
+M3G_API void m3gPostScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
+{
+Matrix temp;
+m3gScalingMatrix(&temp, sx, sy, sz);
+m3gPostMultiplyMatrix(mtx, &temp);
+}
+/*!
+* \brief Multiplies a matrix with a translation (matrix).
+*/
+M3G_API void m3gPostTranslateMatrix(Matrix *mtx,
+M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
+{
+Matrix temp;
+m3gTranslationMatrix(&temp, tx, ty, tz);
+m3gPostMultiplyMatrix(mtx, &temp);
+}
+/*!
+* \brief Multiplies a matrix with a rotation matrix
+*/
+M3G_API void m3gPreRotateMatrix(Matrix *mtx,
+M3Gfloat angle,
+M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+Quat q;
+m3gSetAngleAxis(&q, angle, ax, ay, az);
+m3gPreRotateMatrixQuat(mtx, &q);
+}
+/*!
+* \brief Multiplies a matrix with a quaternion rotation
+*/
+M3G_API void m3gPreRotateMatrixQuat(Matrix *mtx, const Quat *quat)
+{
+Matrix temp;
+m3gQuatMatrix(&temp, quat);
+m3gPreMultiplyMatrix(mtx, &temp);
+}
+/*!
+* \brief Multiplies a matrix with a scale matrix.
+*/
+M3G_API void m3gPreScaleMatrix(Matrix *mtx, M3Gfloat sx, M3Gfloat sy, M3Gfloat sz)
+{
+Matrix temp;
+m3gScalingMatrix(&temp, sx, sy, sz);
+m3gPreMultiplyMatrix(mtx, &temp);
+}
+/*!
+* \brief Multiplies a matrix with a translation (matrix).
+*/
+M3G_API void m3gPreTranslateMatrix(Matrix *mtx,
+M3Gfloat tx, M3Gfloat ty, M3Gfloat tz)
+{
+Matrix temp;
+m3gTranslationMatrix(&temp, tx, ty, tz);
+m3gPreMultiplyMatrix(mtx, &temp);
+}
+/*!
+* \brief Converts a quaternion into a matrix
+*
+* The output is a matrix effecting the same rotation as the
+* quaternion passed as input
+*/
+M3G_API void m3gQuatMatrix(Matrix *mtx, const Quat *quat)
+{
+M3Gfloat qx = quat->x;
+M3Gfloat qy = quat->y;
+M3Gfloat qz = quat->z;
+M3Gfloat qw = quat->w;
+/* Quick exit for identity rotations */
+if (IS_ZERO(qx) && IS_ZERO(qy) && IS_ZERO(qz)) {
+m3gIdentityMatrix(mtx);
+return;
+}
+{
+/* Determine the rough type of the output matrix */
+M3Guint type = MC_SCALING_ROTATION;
+if (IS_ZERO(qz) && IS_ZERO(qy)) {
+type = MC_X_ROTATION;
+}
+else if (IS_ZERO(qz) && IS_ZERO(qx)) {
+type = MC_Y_ROTATION;
+}
+else if (IS_ZERO(qx) && IS_ZERO(qy)) {
+type = MC_Z_ROTATION;
+}
+m3gClassifyAs(type, mtx);
+/* Generate the non-constant parts of the matrix */
+{
+M3Gfloat wx, wy, wz, xx, yy, yz, xy, xz, zz;
+xx = m3gMul(qx, qx);
+xy = m3gMul(qx, qy);
+xz = m3gMul(qx, qz);
+yy = m3gMul(qy, qy);
+yz = m3gMul(qy, qz);
+zz = m3gMul(qz, qz);
+wx = m3gMul(qw, qx);
+wy = m3gMul(qw, qy);
+wz = m3gMul(qw, qz);
+if (type != MC_X_ROTATION) {
+M44F(mtx, 0, 0) = m3gSub(1.f, m3gDouble(m3gAdd(yy, zz)));
+M44F(mtx, 0, 1) =             m3gDouble(m3gSub(xy, wz));
+M44F(mtx, 0, 2) =             m3gDouble(m3gAdd(xz, wy));
+}
+if (type != MC_Y_ROTATION) {
+M44F(mtx, 1, 0) =             m3gDouble(m3gAdd(xy, wz));
+M44F(mtx, 1, 1) = m3gSub(1.f, m3gDouble(m3gAdd(xx, zz)));
+M44F(mtx, 1, 2) =             m3gDouble(m3gSub(yz, wx));
+}
+if (type != MC_Z_ROTATION) {
+M44F(mtx, 2, 0) =             m3gDouble(m3gSub(xz, wy));
+M44F(mtx, 2, 1) =             m3gDouble(m3gAdd(yz, wx));
+M44F(mtx, 2, 2) = m3gSub(1.f, m3gDouble(m3gAdd(xx, yy)));
+}
+}
+m3gSubClassify(mtx);
+}
+}
+/*!
+* \brief Generates a scaling matrix
+*/
+M3G_API void m3gScalingMatrix(
+Matrix *mtx,
+const M3Gfloat sx, const M3Gfloat sy, const M3Gfloat sz)
+{
+M3G_ASSERT_PTR(mtx);
+M44F(mtx, 0, 0) = sx;
+M44F(mtx, 1, 1) = sy;
+M44F(mtx, 2, 2) = sz;
+m3gClassifyAs(MC_SCALING, mtx);
+m3gSubClassify(mtx);
+}
+/*!
+* \brief Sets a quaternion to represent an angle-axis rotation
+*/
+M3G_API void m3gSetAngleAxis(Quat *quat,
+M3Gfloat angle,
+M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+m3gSetAngleAxisRad(quat, m3gMul(angle, M3G_DEG2RAD), ax, ay, az);
+}
+/*!
+* \brief Sets a quaternion to represent an angle-axis rotation
+*/
+M3G_API void m3gSetAngleAxisRad(Quat *quat,
+M3Gfloat angleRad,
+M3Gfloat ax, M3Gfloat ay, M3Gfloat az)
+{
+M3G_ASSERT_PTR(quat);
+if (!IS_ZERO(angleRad)) {
+M3Gfloat s;
+M3Gfloat halfAngle = m3gHalf(angleRad);
+s = m3gSin(halfAngle);
+{
+M3Gfloat sqrNorm = m3gMadd(ax, ax, m3gMadd(ay, ay, m3gMul(az, az)));
+if (sqrNorm < 0.995f || sqrNorm > 1.005f) {
+if (sqrNorm > EPSILON) {
+M3Gfloat ooNorm = m3gRcpSqrt(sqrNorm);
+ax = m3gMul(ax, ooNorm);
+ay = m3gMul(ay, ooNorm);
+az = m3gMul(az, ooNorm);
+}
+else {
+ax = ay = az = 0.0f;
+}
+}
+}
+quat->x = m3gMul(s, ax);
+quat->y = m3gMul(s, ay);
+quat->z = m3gMul(s, az);
+quat->w = m3gCos(halfAngle);
+}
+else {
+m3gIdentityQuat(quat);
+}
+}
+/*!
+* \brief Quaternion multiplication.
+*/
+M3G_API void m3gMulQuat(Quat *quat, const Quat *other)
+{
+Quat q;
+q = *quat;
+quat->w = m3gMul(q.w, other->w) - m3gMul(q.x, other->x) - m3gMul(q.y, other->y) - m3gMul(q.z, other->z);
+quat->x = m3gMul(q.w, other->x) + m3gMul(q.x, other->w) + m3gMul(q.y, other->z) - m3gMul(q.z, other->y);
+quat->y = m3gMul(q.w, other->y) - m3gMul(q.x, other->z) + m3gMul(q.y, other->w) + m3gMul(q.z, other->x);
+quat->z = m3gMul(q.w, other->z) + m3gMul(q.x, other->y) - m3gMul(q.y, other->x) + m3gMul(q.z, other->w);
+}
+/*!
+* \brief Makes this quaternion represent the rotation from one 3D
+* vector to another
+*/
+M3G_API void m3gSetQuatRotation(Quat *quat,
+const Vec3 *from, const Vec3 *to)
+{
+M3Gfloat cosAngle;
+M3G_ASSERT_PTR(quat);
+M3G_ASSERT_PTR(from);
+M3G_ASSERT_PTR(to);
+cosAngle = m3gDot3(from, to);
+if (cosAngle > (1.0f - EPSILON)) {  /* zero angle */
+m3gIdentityQuat(quat);
+return;
+}
+else if (cosAngle > (1.0e-3f - 1.0f)) { /* normal case */
+Vec3 axis;
+m3gCross(&axis, from, to);
+m3gSetAngleAxisRad(quat, m3gArcCos(cosAngle), axis.x, axis.y, axis.z);
+}
+else {
+/* Opposite vectors; must generate an arbitrary perpendicular
+* vector and use that as the rotation axis. Here, we try the
+* Z axis first, and if that seems too parallel to the
+* vectors, project the Y axis instead: Z is the only good
+* choice for Z-constrained rotations, and Y by definition
+* must be perpendicular to that. */
+Vec3 axis, temp;
+M3Gfloat s;
+axis.x = axis.y = axis.z = 0.0f;
+if (m3gAbs(from->z) < (1.0f - EPSILON)) {
+axis.z = 1.0f;
+}
+else {
+axis.y = 1.0f;
+}
+s = m3gDot3(&axis, from);
+temp = *from;
+m3gScaleVec3(&temp, s);
+m3gSubVec3(&axis, &temp);
+m3gSetAngleAxis(quat, 180.f, axis.x, axis.y, axis.z);
+}
+}
+/*!
+* \brief Sets the values of a matrix
+*
+*/
+M3G_API void m3gSetMatrixColumns(Matrix *mtx, const M3Gfloat *src)
+{
+M3G_ASSERT(mtx != NULL && src != NULL);
+m3gCopy(mtx->elem, src, sizeof(mtx->elem));
+mtx->classified = M3G_FALSE;
+mtx->complete = M3G_TRUE;
+}
+/*!
+* \brief Sets the values of a matrix
+*
+*/
+M3G_API void m3gSetMatrixRows(Matrix *mtx, const M3Gfloat *src)
+{
+M3G_ASSERT(mtx != NULL && src != NULL);
+{
+int row;
+for (row = 0; row < 4; ++row) {
+mtx->elem[ 0 + row] = *src++;
+mtx->elem[ 4 + row] = *src++;
+mtx->elem[ 8 + row] = *src++;
+mtx->elem[12 + row] = *src++;
+}
+}
+mtx->classified = M3G_FALSE;
+mtx->complete = M3G_TRUE;
+}
+/*!
+* \brief Transforms a 4-vector with a matrix
+*/
+#if defined(M3G_HW_FLOAT_VFPV2)
+__asm void _m3gTransformVec4(const Matrix *mtx, Vec4 *vec, M3Gint n)
+{
+		CODE32
+		FSTMFDD	sp!, {d8-d11}
+		FMRX	r3, FPSCR
+		BIC		r12, r3, #0x00370000
+		ORR		r12, #(3<<16)
+		FMXR	FPSCR, r12
+		CMP		r2, #4
+		FLDMIAS	r0, {s4-s19}		// [mtx0  mtx1 ..]
+		FLDMIAS	r1, {s0-s3}			// [vec.x  vec.y  vec.z  vec.w]
+		FMULS	s20, s4, s0			// [s20-s23]  = [v.x*mtx0  v.x*mtx1  v.x*mtx2  v.x*mtx3 ]
+		FMACS	s20, s8, s1			// [s20-s23] += [v.y*mtx4  v.y*mtx5  v.y*mtx6  v.y*mtx7 ]
+		FMACS	s20, s12, s2		// [s20-s23] += [v.z*mtx8  v.z*mtx9  v.z*mtx10 v.z*mtx11]
+		FMACS	s20, s16, s3		// [s20-s23] += [v.w*mtx12 v.w*mtx13 v.w*mtx14 v.w*mtx15]
+		FSTMIAS		r1!, {s20-s22}
+		FSTMIASEQ	r1, {s23}
+		FMXR	FPSCR, r3
+		FLDMFDD	sp!, {d8-d11}
+		BX		lr
+}
+#endif /* #if defined(M3G_HW_FLOAT_VFPV2) */
+M3G_API void m3gTransformVec4(const Matrix *mtx, Vec4 *vec)
+{
+M3Guint type;
+M3G_ASSERT(mtx != NULL && vec != NULL);
+if (!m3gIsClassified(mtx)) {
+m3gClassify((Matrix*)mtx);
+}
+type = mtx->mask;
+if (type == MC_IDENTITY) {
+return;
+}
+else {
+Vec4 v = *vec;
+int i;
+int n = m3gIsWUnity(mtx) ? 3 : 4;
+if (!mtx->complete) {
+m3gFillClassifiedMatrix((Matrix*)mtx);
+}
+#if	defined(M3G_HW_FLOAT_VFPV2)
+		_m3gTransformVec4(mtx, vec, n);
+#else
+for (i = 0; i < n; ++i) {
+M3Gfloat d = m3gMul(M44F(mtx, i, 0), v.x);
+d = m3gMadd(M44F(mtx, i, 1), v.y, d);
+d = m3gMadd(M44F(mtx, i, 2), v.z, d);
+d = m3gMadd(M44F(mtx, i, 3), v.w, d);
+(&vec->x)[i] = d;
+}
+#endif
+}
+}
+/*!
+* \brief Generates a translation matrix
+*/
+M3G_API void m3gTranslationMatrix(
+Matrix *mtx,
+const M3Gfloat tx, const M3Gfloat ty, const M3Gfloat tz)
+{
+M3G_ASSERT_PTR(mtx);
+M44F(mtx, 0, 3) = tx;
+M44F(mtx, 1, 3) = ty;
+M44F(mtx, 2, 3) = tz;
+m3gClassifyAs(MC_TRANSLATION, mtx);
+m3gSubClassify(mtx);
+}
+/*!
+* \brief Float vector assignment.
+*/
+M3G_API void m3gSetQuat(Quat *quat, const M3Gfloat *vec)
+{
+quat->x = vec[0];
+quat->y = vec[1];
+quat->z = vec[2];
+quat->w = vec[3];
+}
+/*!
+* \brief Slerp between quaternions q0 and q1, storing the result in quat.
+*/
+M3G_API void m3gSlerpQuat(Quat *quat,
+M3Gfloat s,
+const Quat *q0, const Quat *q1)
+{
+M3Gfloat s0, s1;
+M3Gfloat cosTheta = m3gDot4((const Vec4 *)q0, (const Vec4 *)q1);
+M3Gfloat oneMinusS = m3gSub(1.0f, s);
+if (cosTheta > EPSILON - 1.0f) {
+if (cosTheta < 1.0f - EPSILON) {
+M3Gfloat theta    = m3gArcCos(cosTheta);
+M3Gfloat sinTheta = m3gSin(theta);
+s0 = m3gSin(m3gMul(oneMinusS, theta)) / sinTheta;
+s1 = m3gSin(m3gMul(s, theta)) / sinTheta;
+}
+else {
+/* For quaternions very close to each other, use plain
+linear interpolation for numerical stability. */
+s0 = oneMinusS;
+s1 = s;
+}
+quat->x = m3gMadd(s0, q0->x, m3gMul(s1, q1->x));
+quat->y = m3gMadd(s0, q0->y, m3gMul(s1, q1->y));
+quat->z = m3gMadd(s0, q0->z, m3gMul(s1, q1->z));
+quat->w = m3gMadd(s0, q0->w, m3gMul(s1, q1->w));
+}
+else {
+/* Slerp is undefined if the two quaternions are (nearly)
+opposite, so we just pick an arbitrary plane of
+rotation here. */
+quat->x = -q0->y;
+quat->y =  q0->x;
+quat->z = -q0->w;
+quat->w =  q0->z;
+s0 = m3gSin(m3gMul(oneMinusS, HALF_PI));
+s1 = m3gSin(m3gMul(s, HALF_PI));
+quat->x = m3gMadd(s0, q0->x, m3gMul(s1, quat->x));
+quat->y = m3gMadd(s0, q0->y, m3gMul(s1, quat->y));
+quat->z = m3gMadd(s0, q0->z, m3gMul(s1, quat->z));
+}
+}
+/*!
+* \brief Interpolate quaternions using spline, or "squad" interpolation.
+*/
+M3G_API void m3gSquadQuat(Quat *quat,
+M3Gfloat s,
+const Quat *q0, const Quat *a, const Quat *b, const Quat *q1)
+{
+Quat temp0;
+Quat temp1;
+m3gSlerpQuat(&temp0, s, q0, q1);
+m3gSlerpQuat(&temp1, s, a, b);
+m3gSlerpQuat(quat, m3gDouble(m3gMul(s, m3gSub(1.0f, s))), &temp0, &temp1);
+}

changeset 0	5d03bc08d59c
child 33	25f95128741d