/* Metrowerks x86 Runtime Support Library 
 * Copyright  1995-2003 Metrowerks Corporation.  All rights reserved.
 *
 * $Date: 2005/02/25 15:03:06 $
 * $Revision: 1.1.2.1 $
 */

/*
 * TODO:
 * --	check the Intel compiler documentation so we don't miss any macros
 * --	allow the old (instruction-named) intrinsics via macros
 * --	_MM_TRANSPOSE4_PS?
 */

/*
 *	SSE intrinsics API support.  
 *
 *	Culled from Intel architecture docs, MSDN, and other web references.
 *
 *	Note, in Codewarrior, inline asm is as good as intrinsics
 *	so we implement the majority of functions as inlines.
 */

#ifndef _XMMINTRIN_H
#define _XMMINTRIN_H

#include "x86_prefix.h"
#include <mmintrin.h>

#if __cplusplus
extern "C" {
#endif

/*
	Direct use of the members is HIGHLY DISCOURAGED since it forces
	objects to memory (otherwise they will be allocated to XMM registers)
*/
/* 16-byte-aligned vector viewed as four floats. */
typedef __declspec(intrin_type) struct  __declspec(align(16)) __m128 {
    float       m128_f32[4];
} __m128;

/* 16-byte-aligned vector viewed as two doubles. */
typedef __declspec(intrin_type) struct __declspec(align(16)) __m128d {
    double       m128_f64[2];
} __m128d;

/*
 * 16-byte-aligned vector with overlapping integer views of 8/16/32/64 bits
 * (signed and unsigned) plus a four-float view.
 */
typedef __declspec(intrin_type) union __declspec(align(16)) __m128i {
    unsigned __int64    m128_u64[2];
    float               m128_f32[4];
    __int8              m128_i8[16];
    __int16             m128_i16[8];
    __int32             m128_i32[4];    
    __int64		        m128_i64[2];
    unsigned __int8     m128_u8[16];
    unsigned __int16    m128_u16[8];
    unsigned __int32    m128_u32[4];
} __m128i;

#pragma volatile_asm off

// compiler intrinsics

/* Broadcast forms: one scalar argument, vector result. */
extern __m128i _mm_set1_epi32 (int i);
extern __m128 _mm_set1_ps(float w);
extern __m128d _mm_set1_pd(double w);

/* Element-wise construction forms. */
extern __m128 _mm_set_ss(float w);
extern __m128 _mm_set_ps(float z, float y, float x, float w);
/*
 * NOTE(review): the two double-precision forms below are declared as
 * returning __m128 rather than __m128d -- confirm against the compiler's
 * built-in signatures before changing.
 */
extern __m128 _mm_set_sd(double w);
extern __m128 _mm_set_pd(double z, double y);
extern __m128i _mm_set_epi32(int i3, int i2, int i1, int i0);

/* Alternate spelling of _mm_set1_ps. */
#define _mm_set_ps1 _mm_set1_ps

// "intrinsics"

/*
 * Two-operand arithmetic/logical wrappers.  Each one applies a single
 * instruction with `a` as destination and `b` as source, then returns `a`.
 */

/* ADDPD: packed double-precision add. */
__inline __m128d _mm_add_pd(register __m128d a, register __m128d b)
{
	__asm ADDPD a,b;
	return a;
}

/* ADDPS: packed single-precision add. */
__inline __m128 _mm_add_ps(register __m128 a, register __m128 b)
{
	__asm ADDPS a,b;
	return a;
}

/* ADDSD: scalar double-precision add. */
__inline __m128d _mm_add_sd(register __m128d a, register __m128d b)
{
	__asm ADDSD a,b;
	return a;
}

/* ADDSS: scalar single-precision add. */
__inline __m128 _mm_add_ss(register __m128 a, register __m128 b)
{
	__asm ADDSS a,b;
	return a;
}

/* ANDNPD: bitwise (NOT a) AND b. */
__inline __m128d _mm_andnot_pd(register __m128d a, register __m128d b)
{
	__asm ANDNPD a,b;
	return a;
}

/* ANDNPS: bitwise (NOT a) AND b. */
__inline __m128 _mm_andnot_ps(register __m128 a, register __m128 b)
{
	__asm ANDNPS a,b;
	return a;
}

/* ANDPD: bitwise a AND b. */
__inline __m128d _mm_and_pd(register __m128d a, register __m128d b)
{
	__asm ANDPD a,b;
	return a;
}

/* ANDPS: bitwise a AND b. */
__inline __m128 _mm_and_ps(register __m128 a, register __m128 b)
{
	__asm ANDPS a,b;
	return a;
}

/* CLFLUSH on the address in `a`; the pointer is staged through EAX. */
__inline void _mm_clflush(register char* a)
{
	__asm MOV eax,a;
	__asm CLFLUSH [eax];
}

/*
 * Compare wrappers.  Each CMPxx instruction overwrites its destination with
 * a per-element mask.  For "greater" forms the ieeefp build swaps the
 * operands and uses the LT/LE predicate; otherwise the negated predicate
 * (NLE/NLT) is applied directly to a,b.
 */

/* CMPEQPD: a == b, packed doubles. */
__inline __m128d _mm_cmpeq_pd(register __m128d a, register __m128d b)
{
	__asm CMPEQPD a,b;
	return a;
}

/* CMPLTPD: a < b, packed doubles. */
__inline __m128d _mm_cmplt_pd(register __m128d a, register __m128d b)
{
	__asm CMPLTPD a,b;
	return a;
}

/* CMPLEPD: a <= b, packed doubles. */
__inline __m128d _mm_cmple_pd(register __m128d a, register __m128d b)
{
	__asm CMPLEPD a,b;
	return a;
}

/* a > b, packed doubles: (b < a) under ieeefp, else NOT (a <= b). */
__inline __m128d _mm_cmpgt_pd(register __m128d a, register __m128d b)
{
#if __option(ieeefp)
	register __m128d t;
	__asm MOVUPS t,b;
	__asm CMPLTPD t,a;
	return t;
#else
	__asm CMPNLEPD a,b;
	return a;
#endif
}

/* a >= b, packed doubles: (b <= a) under ieeefp, else NOT (a < b). */
__inline __m128d _mm_cmpge_pd(register __m128d a, register __m128d b)
{
#if __option(ieeefp)
	register __m128d t;
	__asm MOVUPS t,b;
	__asm CMPLEPD t,a;
	return t;
#else
	__asm CMPNLTPD a,b;
	return a;
#endif
}

/* CMPNEQPD: a != b, packed doubles. */
__inline __m128d _mm_cmpneq_pd(register __m128d a, register __m128d b)
{
	__asm CMPNEQPD a,b;
	return a;
}

/* CMPEQPS: a == b, packed floats. */
__inline __m128 _mm_cmpeq_ps(register __m128 a, register __m128 b)
{
	__asm CMPEQPS a,b;
	return a;
}

/* CMPLTPS: a < b, packed floats. */
__inline __m128 _mm_cmplt_ps(register __m128 a, register __m128 b)
{
	__asm CMPLTPS a,b;
	return a;
}

/* CMPLEPS: a <= b, packed floats. */
__inline __m128 _mm_cmple_ps(register __m128 a, register __m128 b)
{
	__asm CMPLEPS a,b;
	return a;
}

/* a > b, packed floats: (b < a) under ieeefp, else NOT (a <= b). */
__inline __m128 _mm_cmpgt_ps(register __m128 a, register __m128 b)
{
#if __option(ieeefp)
	register __m128 t;
	__asm MOVUPS t,b;
	__asm CMPLTPS t,a;
	return t;
#else
	__asm CMPNLEPS a,b;
	return a;
#endif
}

/* a >= b, packed floats: (b <= a) under ieeefp, else NOT (a < b). */
__inline __m128 _mm_cmpge_ps(register __m128 a, register __m128 b)
{
#if __option(ieeefp)
	register __m128 t;
	__asm MOVUPS t,b;
	__asm CMPLEPS t,a;
	return t;
#else
	__asm CMPNLTPS a,b;
	return a;
#endif
}

/* CMPNEQPS: a != b, packed floats. */
__inline __m128 _mm_cmpneq_ps(register __m128 a, register __m128 b)
{
	__asm CMPNEQPS a,b;
	return a;
}

/* CMPEQSD: a == b on the low double; the rest of `a` is returned as is. */
__inline __m128d _mm_cmpeq_sd(register __m128d a, register __m128d b)
{
	__asm CMPEQSD a,b;
	return a;
}

/* CMPLTSD: a < b on the low double. */
__inline __m128d _mm_cmplt_sd(register __m128d a, register __m128d b)
{
	__asm CMPLTSD a,b;
	return a;
}

/* CMPLESD: a <= b on the low double. */
__inline __m128d _mm_cmple_sd(register __m128d a, register __m128d b)
{
	__asm CMPLESD a,b;
	return a;
}

/*
 * a > b on the low double.  Returns the compare mask in the low element
 * and the upper element of `a` unchanged.
 *
 * Under ieeefp the compare is done as (b < a) in a temporary; the low
 * element is then merged back into `a` with MOVSD so the upper element
 * comes from `a`, exactly as in the non-ieeefp path (previously the upper
 * element of `b` was returned).
 */
__inline __m128d _mm_cmpgt_sd(register __m128d a, register __m128d b)
{
#if __option(ieeefp)
	register __m128d t;
	__asm MOVUPD t,b;
	__asm CMPLTSD t,a;
	__asm MOVSD a,t;
	return a;
#else
	__asm CMPNLESD a,b;
	return a;
#endif
}

/*
 * a >= b on the low double.  Same structure as _mm_cmpgt_sd: under ieeefp
 * the mask is computed as (b <= a) and merged into `a` so the upper
 * element is always taken from `a`.
 */
__inline __m128d _mm_cmpge_sd(register __m128d a, register __m128d b)
{
#if __option(ieeefp)
	register __m128d t;
	__asm MOVUPD t,b;
	__asm CMPLESD t,a;
	__asm MOVSD a,t;
	return a;
#else
	__asm CMPNLTSD a,b;
	return a;
#endif
}

/* CMPNEQSD: a != b on the low double. */
__inline __m128d _mm_cmpneq_sd(register __m128d a, register __m128d b)
{
	__asm CMPNEQSD a,b;
	return a;
}

/* CMPEQSS: a == b on the low float. */
__inline __m128 _mm_cmpeq_ss(register __m128 a, register __m128 b)
{
	__asm CMPEQSS a,b;
	return a;
}

/* CMPLTSS: a < b on the low float. */
__inline __m128 _mm_cmplt_ss(register __m128 a, register __m128 b)
{
	__asm CMPLTSS a,b;
	return a;
}

/* CMPLESS: a <= b on the low float. */
__inline __m128 _mm_cmple_ss(register __m128 a, register __m128 b)
{
	__asm CMPLESS a,b;
	return a;
}

/*
 * a > b on the low float.  Returns the compare mask in the low element and
 * the three upper elements of `a` unchanged.
 *
 * Under ieeefp the compare is done as (b < a) in a temporary; the low
 * element is then merged back into `a` with MOVSS so the upper elements
 * come from `a`, exactly as in the non-ieeefp path (previously the upper
 * elements of `b` were returned).
 */
__inline __m128 _mm_cmpgt_ss(register __m128 a, register __m128 b)
{
#if __option(ieeefp)
	register __m128 t;
	__asm MOVUPS t,b;
	__asm CMPLTSS t,a;
	__asm MOVSS a,t;
	return a;
#else
	__asm CMPNLESS a,b;
	return a;
#endif
}

/*
 * a >= b on the low float.  Same structure as _mm_cmpgt_ss: under ieeefp
 * the mask is computed as (b <= a) and merged into `a` so the upper
 * elements are always taken from `a`.
 */
__inline __m128 _mm_cmpge_ss(register __m128 a, register __m128 b)
{
#if __option(ieeefp)
	register __m128 t;
	__asm MOVUPS t,b;
	__asm CMPLESS t,a;
	__asm MOVSS a,t;
	return a;
#else
	__asm CMPNLTSS a,b;
	return a;
#endif
}

/* CMPNEQSS: a != b on the low float. */
__inline __m128 _mm_cmpneq_ss(register __m128 a, register __m128 b)
{
	__asm CMPNEQSS a,b;
	return a;
}

/*
 * COMISD/COMISS wrappers.  The compare sets the CPU flags and a SETcc
 * instruction then writes the boolean into `r`, which is pre-zeroed
 * before the compare.
 * NOTE(review): an unordered (NaN) compare sets ZF, PF and CF together, so
 * the E/B/BE forms would report 1 and the NE form 0 for NaN inputs --
 * confirm that this is the intended result.
 */

/* 1 when low(a) == low(b). */
__inline int _mm_comieq_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm COMISD a,b;
	__asm SETE r;
	return r;
}

/* 1 when low(a) < low(b). */
__inline int _mm_comilt_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm COMISD a,b;
	__asm SETB r;
	return r;
}

/* 1 when low(a) <= low(b). */
__inline int _mm_comile_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm COMISD a,b;
	__asm SETBE r;
	return r;
}

/* 1 when low(a) > low(b). */
__inline int _mm_comigt_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm COMISD a,b;
	__asm SETA r;
	return r;
}

/* 1 when low(a) >= low(b). */
__inline int _mm_comige_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm COMISD a,b;
	__asm SETAE r;
	return r;
}

/* 1 when low(a) != low(b). */
__inline int _mm_comineq_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm COMISD a,b;
	__asm SETNE r;
	return r;
}

/* 1 when low(a) == low(b), single precision. */
__inline int _mm_comieq_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm COMISS a,b;
	__asm SETE r;
	return r;
}

/* 1 when low(a) < low(b), single precision. */
__inline int _mm_comilt_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm COMISS a,b;
	__asm SETB r;
	return r;
}

/* 1 when low(a) <= low(b), single precision. */
__inline int _mm_comile_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm COMISS a,b;
	__asm SETBE r;
	return r;
}

/* 1 when low(a) > low(b), single precision. */
__inline int _mm_comigt_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm COMISS a,b;
	__asm SETA r;
	return r;
}

/* 1 when low(a) >= low(b), single precision. */
__inline int _mm_comige_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm COMISS a,b;
	__asm SETAE r;
	return r;
}

/* 1 when low(a) != low(b), single precision. */
__inline int _mm_comineq_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm COMISS a,b;
	__asm SETNE r;
	return r;
}

/*
 * Conversion wrappers.  One-argument forms convert into a fresh result
 * register; two-argument forms convert `b` into `a` and return `a`.
 */

/* CVTDQ2PD: 32-bit integers to doubles. */
__inline __m128d _mm_cvtepi32_pd(register __m128i a)
{
	register __m128d r;
	__asm CVTDQ2PD r,a;
	return r;
}

/* CVTDQ2PS: 32-bit integers to floats. */
__inline __m128 _mm_cvtepi32_ps(register __m128i a)
{
	register __m128 r;
	__asm CVTDQ2PS r,a;
	return r;
}

/* CVTPD2DQ: doubles to 32-bit integers. */
__inline __m128i _mm_cvtpd_epi32(register __m128d a)
{
	register __m128i r;
	__asm CVTPD2DQ r,a;
	return r;
}

/* CVTPD2PI: doubles to 32-bit integers in an MMX value. */
__inline __m64 _mm_cvtpd_pi32(register __m128d a)
{
	register __m64 r;
	__asm CVTPD2PI r,a;
	return r;
}

/* CVTPI2PD: 32-bit integers in an MMX value to doubles. */
__inline __m128d _mm_cvtpi32_pd(register __m64 a)
{
	register __m128d r;
	__asm CVTPI2PD r,a;
	return r;
}

/* CVTPI2PS: converts the MMX integers in `b` into `a`. */
__inline __m128 _mm_cvt_pi2ps(register __m128 a, register __m64 b)
{
	__asm CVTPI2PS a,b;
	return a;
}

/* Same instruction as _mm_cvt_pi2ps under a second name. */
__inline __m128 _mm_cvtpi32_ps(register __m128 a, register __m64 b)
{
	__asm CVTPI2PS a,b;
	return a;
}

/* CVTPS2DQ: floats to 32-bit integers. */
__inline __m128i _mm_cvtps_epi32(register __m128 a)
{
	register __m128i r;
	__asm CVTPS2DQ r,a;
	return r;
}

/* CVTPS2PD: floats to doubles. */
__inline __m128d _mm_cvtps_pd(register __m128 a)
{
	register __m128d r;
	__asm CVTPS2PD r,a;
	return r;
}

/* CVTPS2PI: floats to 32-bit integers in an MMX value. */
__inline __m64 _mm_cvt_ps2pi(register __m128 a)
{
	register __m64 r;
	__asm CVTPS2PI r,a;
	return r;
}

/* Same instruction as _mm_cvt_ps2pi under a second name. */
__inline __m64 _mm_cvtps_pi32(register __m128 a)
{
	register __m64 r;
	__asm CVTPS2PI r,a;
	return r;
}

/* CVTSD2SI: low double to int. */
__inline int _mm_cvtsd_si32(register __m128d a)
{
	register int r;
	__asm CVTSD2SI r,a;
	return r;
}

/* CVTSD2SS: low double of `b` converted into `a`. */
__inline __m128 _mm_cvtsd_ss(register __m128 a, register __m128d b)
{
	__asm CVTSD2SS a,b;
	return a;
}

/* CVTSI2SD: int `b` converted into `a`. */
__inline __m128d _mm_cvtsi32_sd(register __m128d a, register int b)
{
	__asm CVTSI2SD a,b;
	return a;
}

/* CVTSI2SS: int `b` converted into `a`. */
__inline __m128 _mm_cvt_si2ss(register __m128 a, register int b)
{
	__asm CVTSI2SS a,b;
	return a;
}

/* Same instruction as _mm_cvt_si2ss under a second name. */
__inline __m128 _mm_cvtsi32_ss(register __m128 a, register int b)
{
	__asm CVTSI2SS a,b;
	return a;
}

/* CVTSS2SD: low float of `b` converted into `a`. */
__inline __m128d _mm_cvtss_sd(register __m128d a, register __m128 b)
{
	__asm CVTSS2SD a,b;
	return a;
}

/* CVTSS2SI: low float to int. */
__inline int _mm_cvt_ss2si(register __m128 a)
{
	register int r;
	__asm CVTSS2SI r,a;
	return r;
}

/* Same instruction as _mm_cvt_ss2si under a second name. */
__inline int _mm_cvtss_si32(register __m128 a)
{
	register int r;
	__asm CVTSS2SI r,a;
	return r;
}

/* CVTTPD2DQ: doubles to 32-bit integers, truncating. */
__inline __m128i _mm_cvttpd_epi32(register __m128d a)
{
	register __m128i r;
	__asm CVTTPD2DQ r,a;
	return r;
}

/* CVTTPD2PI: doubles to MMX 32-bit integers, truncating. */
__inline __m64 _mm_cvttpd_pi32(register __m128d a)
{
	register __m64 r;
	__asm CVTTPD2PI r,a;
	return r;
}

/* CVTTPS2DQ: floats to 32-bit integers, truncating. */
__inline __m128i _mm_cvtt_epi32(register __m128 a)
{
	register __m128i r;
	__asm CVTTPS2DQ r,a;
	return r;
}

/* Intel's name for the CVTTPS2DQ intrinsic; the older name above is kept. */
#define _mm_cvttps_epi32 _mm_cvtt_epi32

/* CVTTPS2PI: floats to MMX 32-bit integers, truncating. */
__inline __m64 _mm_cvtt_ps2pi(register __m128 a)
{
	register __m64 r;
	__asm CVTTPS2PI r,a;
	return r;
}

/* Same instruction as _mm_cvtt_ps2pi under a second name. */
__inline __m64 _mm_cvttps_pi32(register __m128 a)
{
	register __m64 r;
	__asm CVTTPS2PI r,a;
	return r;
}

/* CVTTSD2SI: low double to int, truncating. */
__inline int _mm_cvttsd_si32(register __m128d a)
{
	register int r;
	__asm CVTTSD2SI r,a;
	return r;
}

/* CVTTSS2SI: low float to int, truncating. */
__inline int _mm_cvtt_ss2si(register __m128 a)
{
	register int r;
	__asm CVTTSS2SI r,a;
	return r;
}

/* Same instruction as _mm_cvtt_ss2si under a second name. */
__inline int _mm_cvttss_si32(register __m128 a)
{
	register int r;
	__asm CVTTSS2SI r,a;
	return r;
}

/* DIVPD: packed double-precision a / b. */
__inline __m128d _mm_div_pd(register __m128d a, register __m128d b)
{
	__asm DIVPD a,b;
	return a;
}

/* DIVPS: packed single-precision a / b. */
__inline __m128 _mm_div_ps(register __m128 a, register __m128 b)
{
	__asm DIVPS a,b;
	return a;
}

/* DIVSD: scalar double-precision a / b. */
__inline __m128d _mm_div_sd(register __m128d a, register __m128d b)
{
	__asm DIVSD a,b;
	return a;
}

/* DIVSS: scalar single-precision a / b. */
__inline __m128 _mm_div_ss(register __m128 a, register __m128 b)
{
	__asm DIVSS a,b;
	return a;
}

/* LDMXCSR: loads the MXCSR control/status register from `a`. */
__inline void _mm_setcsr(unsigned int a)
{
	__asm LDMXCSR a;
}

/* LFENCE: load fence. */
__inline void _mm_lfence()
{
	__asm LFENCE;
}

/*
 * MASKMOVDQU: byte-masked store of `a` under mask `b`.  The instruction
 * takes its destination address implicitly from EDI, so `c` is loaded
 * there first.
 */
__inline void _mm_maskmoveu_si128(register __m128i a, register __m128i b, register char* c)
{
	__asm mov edi, c
	__asm MASKMOVDQU a,b;
}

/* MAXPD: packed double-precision maximum. */
__inline __m128d _mm_max_pd(register __m128d a, register __m128d b)
{
	__asm MAXPD a,b;
	return a;
}

/* MAXPS: packed single-precision maximum. */
__inline __m128 _mm_max_ps(register __m128 a, register __m128 b)
{
	__asm MAXPS a,b;
	return a;
}

/* MAXSD: scalar double-precision maximum. */
__inline __m128d _mm_max_sd(register __m128d a, register __m128d b)
{
	__asm MAXSD a,b;
	return a;
}

/* MAXSS: scalar single-precision maximum. */
__inline __m128 _mm_max_ss(register __m128 a, register __m128 b)
{
	__asm MAXSS a,b;
	return a;
}

/* MFENCE: full memory fence. */
__inline void _mm_mfence()
{
	__asm MFENCE;
}

/* MINPD: packed double-precision minimum. */
__inline __m128d _mm_min_pd(register __m128d a, register __m128d b)
{
	__asm MINPD a,b;
	return a;
}

/* MINPS: packed single-precision minimum. */
__inline __m128 _mm_min_ps(register __m128 a, register __m128 b)
{
	__asm MINPS a,b;
	return a;
}

/* MINSD: scalar double-precision minimum. */
__inline __m128d _mm_min_sd(register __m128d a, register __m128d b)
{
	__asm MINSD a,b;
	return a;
}

/* MINSS: scalar single-precision minimum. */
__inline __m128 _mm_min_ss(register __m128 a, register __m128 b)
{
	__asm MINSS a,b;
	return a;
}

/*
 * Load/store/move wrappers.  Pointer arguments are first copied into a
 * register local (`p` or `q`) which is then used as the base of the
 * memory operand.
 */

/* Loads two doubles from `a`; note this uses the unaligned MOVUPD. */
__inline __m128d _mm_load_pd(register double* a)
{
	register void *p; //= (void *)a;
	register __m128d r;
	__asm MOV p,a;
	__asm MOVUPD r,[p];
	return r;
}

/* Stores two doubles to `a`; note this uses the unaligned MOVUPD. */
__inline void _mm_store_pd(register double* a, register __m128d b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVUPD [p],b;
}

/* Loads four floats from `a`; note this uses the unaligned MOVUPS. */
__inline __m128 _mm_load_ps(register float* a)
{
	register void *p; //= (void *)a;
	register __m128 r;
	__asm MOV p,a;
	__asm MOVUPS r,[p];
	return r;
}

/* Stores four floats to `a`; note this uses the unaligned MOVUPS. */
__inline void _mm_store_ps(register float* a, register __m128 b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVUPS [p],b;
}

/* MOVD: int into an XMM value. */
__inline __m128i _mm_cvtsi32_si128(register int a)
{
	register __m128i r;
	__asm MOVD r,a;
	return r;
}

/* MOVD: low 32 bits of an XMM value into an int. */
__inline int _mm_cvtsi128_si32(register __m128i a)
{
	register int r;
	__asm MOVD r,a;
	return r;
}

/* MOVDQA: aligned 128-bit integer load. */
__inline __m128i _mm_load_si128(register __m128i* a)
{
	register void *p; //= (void *)a;
	register __m128i r;
	__asm MOV p,a;
	__asm MOVDQA r,[p];
	return r;
}

/* MOVDQA: aligned 128-bit integer store. */
__inline void _mm_store_si128(register __m128i* a, register __m128i b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVDQA [p],b;
}

/* MOVDQU: unaligned 128-bit integer load. */
__inline __m128i _mm_loadu_si128(register __m128i* a)
{
	register void *p; //= (void *)a;
	register __m128i r;
	__asm MOV p,a;
	__asm MOVDQU r,[p];
	return r;
}

/* MOVDQU: unaligned 128-bit integer store. */
__inline void _mm_storeu_si128(register __m128i* a, register __m128i b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVDQU [p],b;
}

/* MOVDQ2Q: low 64 bits of an XMM value into an MMX value. */
__inline __m64 _mm_movepi64_pi64(register __m128i a)
{
	register __m64 r;
	__asm MOVDQ2Q r,a;
	return r;
}

/* MOVHLPS: high half of `b` into the low half of `a`. */
__inline __m128 _mm_movehl_ps(register __m128 a, register __m128 b)
{
	__asm MOVHLPS a,b;
	return a;
}

/* MOVHPD: loads the double at `b` into the high half of `a`. */
__inline __m128d _mm_loadh_pd(register __m128d a, register double* b)
{
	register void *q; //= (void *)b;
	__asm MOV q,b;
	__asm MOVHPD a,[q];
	return a;
}

/* MOVHPD: stores the high double of `b` to `a`. */
__inline void _mm_storeh_pd(register double* a, register __m128d b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVHPD [p],b;
}

/* MOVHPS: loads 64 bits at `b` into the high half of `a`. */
__inline __m128 _mm_loadh_pi(register __m128 a, register __m64* b)
{
	register void *q; //= (void *)b;
	__asm MOV q,b;
	__asm MOVHPS a,[q];
	return a;
}

/* MOVHPS: stores the high half of `b` to `a`. */
__inline void _mm_storeh_pi(register __m64* a, register __m128 b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVHPS [p],b;
}

/* MOVLPD: loads the double at `b` into the low half of `a`. */
__inline __m128d _mm_loadl_pd(register __m128d a, register double* b)
{
	register void *q; //= (void *)b;
	__asm MOV q,b;
	__asm MOVLPD a,[q];
	return a;
}

/* MOVLPD: stores the low double of `b` to `a`. */
__inline void _mm_storel_pd(register double* a, register __m128d b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVLPD [p],b;
}

/* MOVLPS: loads 64 bits at `b` into the low half of `a`. */
__inline __m128 _mm_loadl_pi(register __m128 a, register __m64* b)
{
	register void *q; //= (void *)b;
	__asm MOV q,b;
	__asm MOVLPS a,[q];
	return a;
}

/* MOVLPS: stores the low half of `b` to `a`. */
__inline void _mm_storel_pi(register __m64* a, register __m128 b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVLPS [p],b;
}

/* MOVLHPS: low half of `b` into the high half of `a`. */
__inline __m128 _mm_movelh_ps(register __m128 a, register __m128 b)
{
	__asm MOVLHPS a,b;
	return a;
}

/* MOVMSKPD: sign bits of the two doubles as an int mask. */
__inline int _mm_movemask_pd(register __m128d a)
{
	register int r;
	__asm MOVMSKPD r,a;
	return r;
}

/* MOVMSKPS: sign bits of the four floats as an int mask. */
__inline int _mm_movemask_ps(register __m128 a)
{
	register int r;
	__asm MOVMSKPS r,a;
	return r;
}

/* MOVNTDQ: non-temporal 128-bit integer store. */
__inline void _mm_stream_si128(register __m128i* a, register __m128i b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVNTDQ [p],b;
}

/* MOVNTPD: non-temporal store of two doubles. */
__inline void _mm_stream_pd(register double* a, register __m128d b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVNTPD [p],b;
}

/* MOVNTPS: non-temporal store of four floats. */
__inline void _mm_stream_ps(register float* a, register __m128 b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVNTPS [p],b;
}

/* MOVNTI: non-temporal store of an int. */
__inline void _mm_stream_si32(register int* a, register int b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVNTI [p],b;
}

/* MOVNTQ: non-temporal store of an MMX value. */
__inline void _mm_stream_pi(register __m64* a, register __m64 b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVNTQ [p],b;
}

/* MOVQ: loads 64 bits from `a` into the low half of the result. */
__inline __m128i _mm_loadl_epi64(register __m128i* a)
{
	register void *p; //= (void *)a;
	register __m128i r;
	__asm MOV p,a;
	__asm MOVQ r,[p];
	return r;
}

/* MOVQ: stores the low 64 bits of `b` to `a`. */
__inline void _mm_storel_epi64(register __m128i* a, register __m128i b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVQ [p],b;
}

/*
 * MOVQ xmm,xmm: copies the low 64 bits of `a` into `r` and clears the
 * upper 64 bits.  Returns `r` (the original returned the untouched input
 * `a`, discarding the result of the move).
 */
__inline __m128i _mm_move_epi64(register __m128i a)
{
	register __m128i r;
	__asm MOVQ r,a;
	return r;
}

/* MOVQ2DQ: MMX value into the low half of an XMM value. */
__inline __m128i _mm_movpi64_epi64(register __m64 a)
{
	register __m128i r;
	__asm MOVQ2DQ r,a;
	return r;
}

/* MOVSD: loads one double from `a` into the result. */
__inline __m128d _mm_load_sd(register double* a)
{
	register void *p; //= (void *)a;
	register __m128d r;
	__asm MOV p,a;
	__asm MOVSD r,qword ptr [p];
	return r;
}

/* MOVSD: stores the low double of `b` to `a`. */
__inline void _mm_store_sd(register double* a, register __m128d b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVSD qword ptr [p],b;
}

/* MOVSD: low double of `b` into `a`. */
__inline __m128d _mm_move_sd(register __m128d a, register __m128d b)
{
	__asm MOVSD a,b;
	return a;
}

/* MOVSS: loads one float from `a` into the result. */
__inline __m128 _mm_load_ss(register float* a)
{
	register void *p; //= (void *)a;
	register __m128 r;
	__asm MOV p,a;
	__asm MOVSS r,dword ptr [p];
	return r;
}

/* MOVSS: stores the low float of `b` to `a`. */
__inline void _mm_store_ss(register float* a, register __m128 b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVSS dword ptr [p],b;
}

/* MOVSS: low float of `b` into `a`. */
__inline __m128 _mm_move_ss(register __m128 a, register __m128 b)
{
	__asm MOVSS a,b;
	return a;
}

/* MOVUPD: unaligned load of two doubles. */
__inline __m128d _mm_loadu_pd(register double* a)
{
	register void *p; //= (void *)a;
	register __m128d r;
	__asm MOV p,a;
	__asm MOVUPD r,xmmword ptr [p];
	return r;
}

/* MOVUPD: unaligned store of two doubles. */
__inline void _mm_storeu_pd(register double* a, register __m128d b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVUPD xmmword ptr [p],b;
}

/* MOVUPS: unaligned load of four floats. */
__inline __m128 _mm_loadu_ps(register float* a)
{
	register void *p; //= (void *)a;
	register __m128 r;
	__asm MOV p,a;
	__asm MOVUPS r,xmmword ptr [p];
	return r;
}

/* MOVUPS: unaligned store of four floats. */
__inline void _mm_storeu_ps(register float* a, register __m128 b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a;
	__asm MOVUPS xmmword ptr [p],b;
}

/* MULPD: packed double-precision multiply. */
__inline __m128d _mm_mul_pd(register __m128d a, register __m128d b)
{
	__asm MULPD a,b;
	return a;
}

/* MULPS: packed single-precision multiply. */
__inline __m128 _mm_mul_ps(register __m128 a, register __m128 b)
{
	__asm MULPS a,b;
	return a;
}

/* MULSD: scalar double-precision multiply. */
__inline __m128d _mm_mul_sd(register __m128d a, register __m128d b)
{
	__asm MULSD a,b;
	return a;
}

/* MULSS: scalar single-precision multiply. */
__inline __m128 _mm_mul_ss(register __m128 a, register __m128 b)
{
	__asm MULSS a,b;
	return a;
}

/* ORPD: bitwise a OR b. */
__inline __m128d _mm_or_pd(register __m128d a, register __m128d b)
{
	__asm ORPD a,b;
	return a;
}

/* ORPS: bitwise a OR b. */
__inline __m128 _mm_or_ps(register __m128 a, register __m128 b)
{
	__asm ORPS a,b;
	return a;
}

/* PACKSSWB: 16-bit to 8-bit pack with signed saturation. */
__inline __m128i _mm_packs_epi16(register __m128i a, register __m128i b)
{
	__asm PACKSSWB a,b;
	return a;
}

/* PACKSSDW: 32-bit to 16-bit pack with signed saturation. */
__inline __m128i _mm_packs_epi32(register __m128i a, register __m128i b)
{
	__asm PACKSSDW a,b;
	return a;
}

/* PACKUSWB: 16-bit to 8-bit pack with unsigned saturation. */
__inline __m128i _mm_packus_epi16(register __m128i a, register __m128i b)
{
	__asm PACKUSWB a,b;
	return a;
}

/* PADDB: 8-bit add. */
__inline __m128i _mm_add_epi8(register __m128i a, register __m128i b)
{
	__asm PADDB a,b;
	return a;
}

/* PADDW: 16-bit add. */
__inline __m128i _mm_add_epi16(register __m128i a, register __m128i b)
{
	__asm PADDW a,b;
	return a;
}

/* PADDD: 32-bit add. */
__inline __m128i _mm_add_epi32(register __m128i a, register __m128i b)
{
	__asm PADDD a,b;
	return a;
}

/* PADDQ: 64-bit add. */
__inline __m128i _mm_add_epi64(register __m128i a, register __m128i b)
{
	__asm PADDQ a,b;
	return a;
}

/* PADDSB: 8-bit add with signed saturation. */
__inline __m128i _mm_adds_epi8(register __m128i a, register __m128i b)
{
	__asm PADDSB a,b;
	return a;
}

/* PADDSW: 16-bit add with signed saturation. */
__inline __m128i _mm_adds_epi16(register __m128i a, register __m128i b)
{
	__asm PADDSW a,b;
	return a;
}

/* PADDUSB: 8-bit add with unsigned saturation. */
__inline __m128i _mm_add_epu8(register __m128i a, register __m128i b)
{
	__asm PADDUSB a,b;
	return a;
}

/* PADDUSW: 16-bit add with unsigned saturation. */
__inline __m128i _mm_add_epu16(register __m128i a, register __m128i b)
{
	__asm PADDUSW a,b;
	return a;
}

/*
 * Intel's names for the saturating unsigned adds.  The names above are
 * kept so existing callers continue to build.
 */
#define _mm_adds_epu8 _mm_add_epu8
#define _mm_adds_epu16 _mm_add_epu16

/* PAND: bitwise a AND b. */
__inline __m128i _mm_and_si128(register __m128i a, register __m128i b)
{
	__asm PAND a,b;
	return a;
}

/* PANDN: bitwise (NOT a) AND b. */
__inline __m128i _mm_andnot_si128(register __m128i a, register __m128i b)
{
	__asm PANDN a,b;
	return a;
}

/* PAUSE: spin-wait hint. */
__inline void _mm_pause()
{
	__asm PAUSE;
}

/* PAVGB: unsigned 8-bit average. */
__inline __m128i _mm_avg_epu8(register __m128i a, register __m128i b)
{
	__asm PAVGB a,b;
	return a;
}

/* PAVGW: unsigned 16-bit average. */
__inline __m128i _mm_avg_epu16(register __m128i a, register __m128i b)
{
	__asm PAVGW a,b;
	return a;
}

/* PCMPEQB: 8-bit equality mask. */
__inline __m128i _mm_cmpeq_epi8(register __m128i a, register __m128i b)
{
	__asm PCMPEQB a,b;
	return a;
}

/* PCMPEQW: 16-bit equality mask. */
__inline __m128i _mm_cmpeq_epi16(register __m128i a, register __m128i b)
{
	__asm PCMPEQW a,b;
	return a;
}

/* PCMPEQD: 32-bit equality mask. */
__inline __m128i _mm_cmpeq_epi32(register __m128i a, register __m128i b)
{
	__asm PCMPEQD a,b;
	return a;
}

/* PCMPGTB: signed 8-bit a > b mask. */
__inline __m128i _mm_cmpgt_epi8(register __m128i a, register __m128i b)
{
	__asm PCMPGTB a,b;
	return a;
}

/* PCMPGTW: signed 16-bit a > b mask. */
__inline __m128i _mm_cmpgt_epi16(register __m128i a, register __m128i b)
{
	__asm PCMPGTW a,b;
	return a;
}

/* PCMPGTD: signed 32-bit a > b mask. */
__inline __m128i _mm_cmpgt_epi32(register __m128i a, register __m128i b)
{
	__asm PCMPGTD a,b;
	return a;
}

/*
 * PEXTRW: extracts the 16-bit element selected by `b`.
 * NOTE(review): the instruction takes the selector as an immediate, so
 * `b` presumably has to be a compile-time constant -- confirm.
 */
__inline int _mm_extract_epi16(register __m128i a, register int b)
{
	register int r;
	__asm PEXTRW r,a,b;
	return r;
}

/*
 * PINSRW: inserts the low 16 bits of `b` at the element selected by `c`.
 * NOTE(review): `c` is an immediate operand of the instruction -- see the
 * note on _mm_extract_epi16.
 */
__inline __m128i _mm_insert_epi16(register __m128i a, register int b, register int c)
{
	__asm PINSRW a,b,c;
	return a;
}

/* PMADDWD: 16-bit multiply, adjacent products summed to 32 bits. */
__inline __m128i _mm_madd_epi16(register __m128i a, register __m128i b)
{
	__asm PMADDWD a,b;
	return a;
}

/* PMAXSW: signed 16-bit maximum. */
__inline __m128i _mm_max_epi16(register __m128i a, register __m128i b)
{
	__asm PMAXSW a,b;
	return a;
}

/* PMAXUB: unsigned 8-bit maximum. */
__inline __m128i _mm_max_epu8(register __m128i a, register __m128i b)
{
	__asm PMAXUB a,b;
	return a;
}

/* PMINSW: signed 16-bit minimum. */
__inline __m128i _mm_min_epi16(register __m128i a, register __m128i b)
{
	__asm PMINSW a,b;
	return a;
}

/* PMINUB: unsigned 8-bit minimum. */
__inline __m128i _mm_min_epu8(register __m128i a, register __m128i b)
{
	__asm PMINUB a,b;
	return a;
}

/* PMOVMSKB: top bit of each byte as an int mask. */
__inline int _mm_movemask_epi8(register __m128i a)
{
	register int r;
	__asm PMOVMSKB r,a;
	return r;
}

/* PMULHUW: high 16 bits of the unsigned 16-bit products. */
__inline __m128i _mm_mulhi_epu16(register __m128i a, register __m128i b)
{
	__asm PMULHUW a,b;
	return a;
}

/* PMULHW: high 16 bits of the signed 16-bit products. */
__inline __m128i _mm_mulhi_epi16(register __m128i a, register __m128i b)
{
	__asm PMULHW a,b;
	return a;
}

/* PMULLW: low 16 bits of the 16-bit products. */
__inline __m128i _mm_mullo_epi16(register __m128i a, register __m128i b)
{
	__asm PMULLW a,b;
	return a;
}

/* PMULUDQ: unsigned 32x32 -> 64-bit multiply. */
__inline __m128i _mm_mul_epu32(register __m128i a, register __m128i b)
{
	__asm PMULUDQ a,b;
	return a;
}

/* POR: bitwise a OR b. */
__inline __m128i _mm_or_si128(register __m128i a, register __m128i b)
{
	__asm POR a,b;
	return a;
}

/*
 * Prefetch hint selectors for _mm_prefetch.
 * NOTE(review): these numeric values may not match other vendors'
 * headers -- callers should use the names, not literal numbers.
 */
#define _MM_HINT_T0		1
#define _MM_HINT_T1		2
#define _MM_HINT_T2		3
#define _MM_HINT_NTA	0

/*
 * Issues PREFETCHT0/T1/T2/NTA on address `a` according to hint `b`.
 * Any value other than the three T-hints selects PREFETCHNTA.
 */
__inline void _mm_prefetch(register char* a, register int b)
{
	register void *p; //= (void *)a;
	__asm MOV p,a
	// the if() should be optimized away
	if (b == _MM_HINT_T0)
		__asm PREFETCHT0 [p];
	else if (b == _MM_HINT_T1)
		__asm PREFETCHT1 [p];
	else if (b == _MM_HINT_T2)
		__asm PREFETCHT2 [p];
	else 
		__asm PREFETCHNTA [p];
}

/* PSADBW: sum of absolute differences of unsigned bytes. */
__inline __m128i _mm_sad_epu8(register __m128i a, register __m128i b)
{
	__asm PSADBW a,b;
	return a;
}

__inline __m128i _mm_shuffle_epi32(register __m128i a, register int b)
{
	__asm PSHUFD a,b;
	return a;
}

__inline __m128i _mm_shufflehi_epi16(register __m128i a, register int b)
{
	__asm PSHUFHW a,b;
	return a;
}

__inline __m128i _mm_shufflelo_epi16(register __m128i a, register int b)
{
	__asm PSHUFLW a,b;
	return a;
}

/*
 * Shift wrappers.  The "sll/sra/srl" forms take the count in an XMM value;
 * the "i" forms take an int, widen it into a 64-bit local `b0` and pass
 * that local as the count operand.
 */

/* PSLLW: 16-bit logical left shift by the count in `b`. */
__inline __m128i _mm_sll_epi16(register __m128i a, register __m128i b)
{
	__asm PSLLW a,b;
	return a;
}

/* PSLLW with an int count. */
__inline __m128i _mm_slli_epi16(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSLLW a,b0;
	return a;
}

/* PSLLD: 32-bit logical left shift by the count in `b`. */
__inline __m128i _mm_sll_epi32(register __m128i a, register __m128i b)
{
	__asm PSLLD a,b;
	return a;
}

/* PSLLD with an int count. */
__inline __m128i _mm_slli_epi32(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSLLD a,b0;
	return a;
}

/* PSLLQ: 64-bit logical left shift by the count in `b`. */
__inline __m128i _mm_sll_epi64(register __m128i a, register __m128i b)
{
	__asm PSLLQ a,b;
	return a;
}

__inline __m128i _mm_slli_epi64(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSLLQ a,b;
	return a;
}

/*
 * PSLLDQ: whole-register byte shift left.
 * NOTE(review): PSLLDQ only has an immediate-count form, so `b`
 * presumably must be a compile-time constant -- confirm.
 */
__inline __m128i _mm_slli_si128(register __m128i a, register int b)
{
	const long long b0 = b;
	__asm PSLLDQ a,b0;
	return a;
}

/* PSRAW: 16-bit arithmetic right shift by the count in `b`. */
__inline __m128i _mm_sra_epi16(register __m128i a, register __m128i b)
{
	__asm PSRAW a,b;
	return a;
}

/* PSRAW with an int count. */
__inline __m128i _mm_srai_epi16(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSRAW a,b0;
	return a;
}

/* PSRAD: 32-bit arithmetic right shift by the count in `b`. */
__inline __m128i _mm_sra_epi32(register __m128i a, register __m128i b)
{
	__asm PSRAD a,b;
	return a;
}

/* PSRAD with an int count. */
__inline __m128i _mm_srai_epi32(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSRAD a,b0;
	return a;
}

/* PSRLW: 16-bit logical right shift by the count in `b`. */
__inline __m128i _mm_srl_epi16(register __m128i a, register __m128i b)
{
	__asm PSRLW a,b;
	return a;
}

/* PSRLW with an int count. */
__inline __m128i _mm_srli_epi16(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSRLW a,b0;
	return a;
}

/* PSRLD: 32-bit logical right shift by the count in `b`. */
__inline __m128i _mm_srl_epi32(register __m128i a, register __m128i b)
{
	__asm PSRLD a,b;
	return a;
}

/* PSRLD with an int count. */
__inline __m128i _mm_srli_epi32(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSRLD a,b0;
	return a;
}

/* PSRLQ: 64-bit logical right shift by the count in `b`. */
__inline __m128i _mm_srl_epi64(register __m128i a, register __m128i b)
{
	__asm PSRLQ a,b;
	return a;
}

/* PSRLQ with an int count. */
__inline __m128i _mm_srli_epi64(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSRLQ a,b0;
	return a;
}

/*
 * PSRLDQ: whole-register byte shift right.
 * NOTE(review): PSRLDQ only has an immediate-count form, so `b`
 * presumably must be a compile-time constant -- confirm.
 */
__inline __m128i _mm_srli_si128(register __m128i a, register int b)
{
	long long b0 = b;
	__asm PSRLDQ a,b0;
	return a;
}

/* PSUBB: 8-bit subtract. */
__inline __m128i _mm_sub_epi8(register __m128i a, register __m128i b)
{
	__asm PSUBB a,b;
	return a;
}

/* PSUBW: 16-bit subtract. */
__inline __m128i _mm_sub_epi16(register __m128i a, register __m128i b)
{
	__asm PSUBW a,b;
	return a;
}

/* PSUBD: 32-bit subtract. */
__inline __m128i _mm_sub_epi32(register __m128i a, register __m128i b)
{
	__asm PSUBD a,b;
	return a;
}

/* PSUBQ: 64-bit subtract. */
__inline __m128i _mm_sub_epi64(register __m128i a, register __m128i b)
{
	__asm PSUBQ a,b;
	return a;
}

/* PSUBSB: 8-bit subtract with signed saturation. */
__inline __m128i _mm_subs_epi8(register __m128i a, register __m128i b)
{
	__asm PSUBSB a,b;
	return a;
}

/* PSUBSW: 16-bit subtract with signed saturation. */
__inline __m128i _mm_subs_epi16(register __m128i a, register __m128i b)
{
	__asm PSUBSW a,b;
	return a;
}

/* PSUBUSB: 8-bit subtract with unsigned saturation. */
__inline __m128i _mm_sub_epu8(register __m128i a, register __m128i b)
{
	__asm PSUBUSB a,b;
	return a;
}

/* PSUBUSW: 16-bit subtract with unsigned saturation. */
__inline __m128i _mm_sub_epu16(register __m128i a, register __m128i b)
{
	__asm PSUBUSW a,b;
	return a;
}

/*
 * Intel's names for the saturating unsigned subtracts.  The names above
 * are kept so existing callers continue to build.
 */
#define _mm_subs_epu8 _mm_sub_epu8
#define _mm_subs_epu16 _mm_sub_epu16

/* PUNPCKHBW: interleaves the high bytes of a and b. */
__inline __m128i _mm_unpackhi_epi8(register __m128i a, register __m128i b)
{
	__asm PUNPCKHBW a,b;
	return a;
}

/* PUNPCKHWD: interleaves the high 16-bit elements of a and b. */
__inline __m128i _mm_unpackhi_epi16(register __m128i a, register __m128i b)
{
	__asm PUNPCKHWD a,b;
	return a;
}

/* PUNPCKHDQ: interleaves the high 32-bit elements of a and b. */
__inline __m128i _mm_unpackhi_epi32(register __m128i a, register __m128i b)
{
	__asm PUNPCKHDQ a,b;
	return a;
}

/* PUNPCKHQDQ: combines the high 64-bit elements of a and b. */
__inline __m128i _mm_unpackhi_pi64(register __m128i a, register __m128i b)
{
	__asm PUNPCKHQDQ a,b;
	return a;
}

/* Intel's name for the PUNPCKHQDQ intrinsic; the name above is kept. */
#define _mm_unpackhi_epi64 _mm_unpackhi_pi64

/* PUNPCKLBW: interleaves the low bytes of a and b. */
__inline __m128i _mm_unpacklo_epi8(register __m128i a, register __m128i b)
{
	__asm PUNPCKLBW a,b;
	return a;
}

/* PUNPCKLWD: interleaves the low 16-bit elements of a and b. */
__inline __m128i _mm_unpacklo_epi16(register __m128i a, register __m128i b)
{
	__asm PUNPCKLWD a,b;
	return a;
}

/* PUNPCKLDQ: interleaves the low 32-bit elements of a and b. */
__inline __m128i _mm_unpacklo_epi32(register __m128i a, register __m128i b)
{
	__asm PUNPCKLDQ a,b;
	return a;
}

/* PUNPCKLQDQ: combines the low 64-bit elements of a and b. */
__inline __m128i _mm_unpacklo_pi64(register __m128i a, register __m128i b)
{
	__asm PUNPCKLQDQ a,b;
	return a;
}

/* Intel's name for the PUNPCKLQDQ intrinsic; the name above is kept. */
#define _mm_unpacklo_epi64 _mm_unpacklo_pi64

/* PXOR: bitwise a XOR b. */
__inline __m128i _mm_xor_si128(register __m128i a, register __m128i b)
{
	__asm PXOR a,b;
	return a;
}

/* RCPPS: packed single-precision reciprocal, computed in place on `a`. */
__inline __m128 _mm_rcp_ps(register __m128 a)
{
	__asm RCPPS a, a;
	return a;
}

/* RCPSS: scalar single-precision reciprocal, in place on `a`. */
__inline __m128 _mm_rcp_ss(register __m128 a)
{
	__asm RCPSS a, a;
	return a;
}

/* RSQRTPS: packed single-precision reciprocal square root, in place. */
__inline __m128 _mm_rsqrt_ps(register __m128 a)
{
	__asm RSQRTPS a, a;
	return a;
}

/* RSQRTSS: scalar single-precision reciprocal square root, in place. */
__inline __m128 _mm_rsqrt_ss(register __m128 a)
{
	__asm RSQRTSS a, a;
	return a;
}

/* SFENCE: store fence. */
__inline void _mm_sfence()
{
	__asm SFENCE;
}

/*
#define _mm_shuffle_ps(a, b, c) ({ __m128d a_ = a; __m128d b_ = b; int c_ = c; __asm { SHUFPD a_,b_,c_ } a_; })

#define _mm_shuffle_ss(a, b, c) ({ __m128 a_ = a; __m128 b_ = b; int c_ = c; __asm { SHUFPS a_,b_,c_ } a_; })
*/

/*
 * SHUFPD: selects doubles from `a` and `b` according to the immediate
 * selector `c`.  This is the name Intel's API uses for the SHUFPD
 * intrinsic; it was missing from this header.
 */
__inline __m128d _mm_shuffle_pd(register __m128d a, register __m128d b, register int c)
{
	__asm SHUFPD a,b,c;
	return a;
}

/*
 * Historic name for the SHUFPD wrapper, kept unchanged so existing callers
 * continue to build.
 * NOTE(review): in Intel's API _mm_shuffle_ps is the single-precision
 * (SHUFPS, __m128) shuffle; here that operation is provided by
 * _mm_shuffle_ss below.  New code should call _mm_shuffle_pd for doubles.
 */
__inline __m128d _mm_shuffle_ps(register __m128d a, register __m128d b, register int c)
{
	__asm SHUFPD a,b,c;
	return a;
}

/*
 * SHUFPS: selects floats from `a` and `b` according to the immediate
 * selector `c`.
 */
__inline __m128 _mm_shuffle_ss(register __m128 a, register __m128 b, register int c)
{
	__asm SHUFPS a,b,c;
	return a;
}

/* SQRTPD: packed double-precision square root, in place on `a`. */
__inline __m128d _mm_sqrt_pd(register __m128d a)
{
	__asm SQRTPD a, a;
	return a;
}

/* SQRTSD: scalar double-precision square root, in place on `a`. */
__inline __m128d _mm_sqrt_sd(register __m128d a)
{
	__asm SQRTSD a, a;
	return a;
}

/* SQRTPS: packed single-precision square root, in place on `a`. */
__inline __m128 _mm_sqrt_ps(register __m128 a)
{
	__asm SQRTPS a, a;
	return a;
}

/* SQRTSS: scalar single-precision square root, in place on `a`. */
__inline __m128 _mm_sqrt_ss(register __m128 a)
{
	__asm SQRTSS a, a;
	return a;
}

/*
 * STMXCSR: returns the current MXCSR control/status register.
 * STMXCSR can only write to memory, so the result local must not be a
 * `register` variable (mirrors _mm_setcsr, whose operand is likewise a
 * plain memory-resident parameter).
 */
__inline int _mm_getcsr()
{
	int r;
	__asm STMXCSR r;
	return r;
}

/* SUBPD: packed double-precision a - b. */
__inline __m128d _mm_sub_pd(register __m128d a, register __m128d b)
{
	__asm SUBPD a,b;
	return a;
}

/* SUBPS: packed single-precision a - b. */
__inline __m128 _mm_sub_ps(register __m128 a, register __m128 b)
{
	__asm SUBPS a,b;
	return a;
}

/* SUBSD: scalar double-precision a - b. */
__inline __m128d _mm_sub_sd(register __m128d a, register __m128d b)
{
	__asm SUBSD a,b;
	return a;
}

/* SUBSS: scalar single-precision a - b. */
__inline __m128 _mm_sub_ss(register __m128 a, register __m128 b)
{
	__asm SUBSS a,b;
	return a;
}

/*
 * UCOMISD/UCOMISS wrappers.  Same shape as the COMI wrappers: the compare
 * sets the CPU flags and SETcc writes the boolean into the pre-zeroed `r`.
 * NOTE(review): an unordered (NaN) compare sets ZF, PF and CF together, so
 * the E/B/BE forms would report 1 and the NE form 0 for NaN inputs --
 * confirm that this is the intended result.
 */

/* 1 when low(a) == low(b). */
__inline int _mm_ucomieq_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm UCOMISD a,b;
	__asm SETE r;
	return r;
}

/* 1 when low(a) < low(b). */
__inline int _mm_ucomilt_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm UCOMISD a,b;
	__asm SETB r;
	return r;
}

/* 1 when low(a) <= low(b). */
__inline int _mm_ucomile_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm UCOMISD a,b;
	__asm SETBE r;
	return r;
}

/* 1 when low(a) > low(b). */
__inline int _mm_ucomigt_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm UCOMISD a,b;
	__asm SETA r;
	return r;
}

/* 1 when low(a) >= low(b). */
__inline int _mm_ucomige_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm UCOMISD a,b;
	__asm SETAE r;
	return r;
}

/* 1 when low(a) != low(b). */
__inline int _mm_ucomineq_sd(register __m128d a, register __m128d b)
{
	register int r = 0;
	__asm UCOMISD a,b;
	__asm SETNE r;
	return r;
}

/* 1 when low(a) == low(b), single precision. */
__inline int _mm_ucomieq_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm UCOMISS a,b;
	__asm SETE r;
	return r;
}

/* 1 when low(a) < low(b), single precision. */
__inline int _mm_ucomilt_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm UCOMISS a,b;
	__asm SETB r;
	return r;
}

/* 1 when low(a) <= low(b), single precision. */
__inline int _mm_ucomile_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm UCOMISS a,b;
	__asm SETBE r;
	return r;
}

/* 1 when low(a) > low(b), single precision. */
__inline int _mm_ucomigt_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm UCOMISS a,b;
	__asm SETA r;
	return r;
}

/* 1 when low(a) >= low(b), single precision. */
__inline int _mm_ucomige_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm UCOMISS a,b;
	__asm SETAE r;
	return r;
}

/* 1 when low(a) != low(b), single precision. */
__inline int _mm_ucomineq_ss(register __m128 a, register __m128 b)
{
	register int r = 0;
	__asm UCOMISS a,b;
	__asm SETNE r;
	return r;
}

/* UNPCKHPD: combines the high doubles of a and b. */
__inline __m128d _mm_unpackhi_pd(register __m128d a, register __m128d b)
{
	__asm UNPCKHPD a,b;
	return a;
}

/* UNPCKHPS: interleaves the high floats of a and b. */
__inline __m128 _mm_unpackhi_ps(register __m128 a, register __m128 b)
{
	__asm UNPCKHPS a,b;
	return a;
}

/* UNPCKLPD: combines the low doubles of a and b. */
__inline __m128d _mm_unpacklo_pd(register __m128d a, register __m128d b)
{
	__asm UNPCKLPD a,b;
	return a;
}

/* UNPCKLPS: interleaves the low floats of a and b. */
__inline __m128 _mm_unpacklo_ps(register __m128 a, register __m128 b)
{
	__asm UNPCKLPS a,b;
	return a;
}

/* XORPD: bitwise a XOR b. */
__inline __m128d _mm_xor_pd(register __m128d a, register __m128d b)
{
	__asm XORPD a,b;
	return a;
}

/* XORPS: bitwise a XOR b. */
__inline __m128 _mm_xor_ps(register __m128 a, register __m128 b)
{
	__asm XORPS a,b;
	return a;
}

// Bits in the MXCSR.
#define _MM_EXCEPT_MASK       0x003f
#define _MM_EXCEPT_INVALID    0x0001
#define _MM_EXCEPT_DENORM     0x0002
#define _MM_EXCEPT_DIV_ZERO   0x0004
#define _MM_EXCEPT_OVERFLOW   0x0008
#define _MM_EXCEPT_UNDERFLOW  0x0010
#define _MM_EXCEPT_INEXACT    0x0020

#define _MM_MASK_MASK         0x1f80
#define _MM_MASK_INVALID      0x0080
#define _MM_MASK_DENORM       0x0100
#define _MM_MASK_DIV_ZERO     0x0200
#define _MM_MASK_OVERFLOW     0x0400
#define _MM_MASK_UNDERFLOW    0x0800
#define _MM_MASK_INEXACT      0x1000

#define _MM_ROUND_MASK        0x6000
#define _MM_ROUND_NEAREST     0x0000
#define _MM_ROUND_DOWN        0x2000
#define _MM_ROUND_UP          0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000

#define _MM_FLUSH_ZERO_MASK   0x8000
#define _MM_FLUSH_ZERO_ON     0x8000
#define _MM_FLUSH_ZERO_OFF    0x0000


#if 0
#pragma mark -
#endif

/*	Composites */

/* Reverse-order form of _mm_set_ps: forwards the arguments swapped end to end. */
__inline __m128 _mm_setr_ps(float z, float y, float x, float w)
{
	return _mm_set_ps(w,x,y,z);
}

/* Returns an all-zero vector by XORing a register with itself. */
__inline __m128 _mm_setzero_ps(void)
{
	register __m128 r;
	__asm xorps r,r
	return r;
}

/*
 * Builds a 128-bit value from two 64-bit MMX values: q0 becomes the low
 * 64 bits and q1 the high 64 bits.
 *
 * SSE2 path: each half is moved into the low quadword of its own XMM
 * register and PUNPCKLQDQ joins them (low = r = q0, high = s = q1).  The
 * previous SHUFPS sequence put q1 in the low half and additionally swapped
 * the two dwords inside each half.
 *
 * Fallback path: the halves are written to memory in low-to-high order and
 * loaded with one unaligned move.
 */
__inline __m128i _mm_set_epi64(register __m64 q1, register __m64 q0)
{
	register __m128i r,s;
#if __option(sse2)
	__asm movq2dq r, q0
	__asm movq2dq s, q1
	__asm punpcklqdq r, s
#else
	__m64 arr[2];
	arr[0] = q0; arr[1] = q1;
	__asm movups r, arr
#endif
	return r;
}

/*
 * Builds a vector from eight 16-bit values (i7 highest, i0 lowest) by
 * packing adjacent pairs into 32-bit words for _mm_set_epi32.
 *
 * Each half is masked to 16 bits before combining: a negative low half
 * would otherwise be sign-extended by integer promotion and overwrite the
 * high half with ones.  The shift is done on an unsigned value so a
 * negative high half does not left-shift a negative int.
 */
__inline __m128i _mm_set_epi16(short i7, short i6, short i5, short i4,
				short i3, short i2, short i1, short i0)
{
	return _mm_set_epi32(
		(int)(((unsigned int)(i7 & 0xffff) << 16) | (unsigned int)(i6 & 0xffff)),
		(int)(((unsigned int)(i5 & 0xffff) << 16) | (unsigned int)(i4 & 0xffff)),
		(int)(((unsigned int)(i3 & 0xffff) << 16) | (unsigned int)(i2 & 0xffff)),
		(int)(((unsigned int)(i1 & 0xffff) << 16) | (unsigned int)(i0 & 0xffff)));
}

/*
 * Builds a vector from sixteen 8-bit values (b15 highest, b0 lowest) by
 * packing adjacent pairs into 16-bit words for _mm_set_epi16.
 *
 * Each byte is masked to 8 bits before combining: a negative low byte
 * would otherwise be sign-extended by integer promotion and overwrite the
 * high byte with ones.
 */
__inline __m128i _mm_set_epi8 (char b15, char b14, char b13, char b12,
								char b11, char b10, char b9, char b8,
								char b7, char b6, char b5, char b4,
								char b3, char b2, char b1, char b0)
{
	return _mm_set_epi16(
		(short)(((b15 & 0xff) << 8) | (b14 & 0xff)),
		(short)(((b13 & 0xff) << 8) | (b12 & 0xff)),
		(short)(((b11 & 0xff) << 8) | (b10 & 0xff)),
		(short)(((b9 & 0xff) << 8) | (b8 & 0xff)),
		(short)(((b7 & 0xff) << 8) | (b6 & 0xff)),
		(short)(((b5 & 0xff) << 8) | (b4 & 0xff)),
		(short)(((b3 & 0xff) << 8) | (b2 & 0xff)),
		(short)(((b1 & 0xff) << 8) | (b0 & 0xff)));
}

__inline __m128i _mm_set1_epi64 (register __m64 q)
{
	register __m128i r;
	__asm movq r, q
	__asm punpcklqdq r,r;
	return r;
}

/*
 * Broadcasts the 16-bit value `w` to all eight elements by duplicating it
 * into both halves of a 32-bit word for _mm_set1_epi32.
 * The value is masked to 16 bits first so that a negative `w` is not
 * sign-extended over the upper half.
 */
__inline __m128i _mm_set1_epi16 (short w)
{
	unsigned int half = (unsigned int)(w & 0xffff);
	return _mm_set1_epi32((int)((half << 16) | half));
}

/*
 * Broadcasts the 8-bit value `b` to all sixteen elements by duplicating it
 * into both bytes of a 16-bit word for _mm_set1_epi16.
 * The value is masked to 8 bits first so that a negative `b` is not
 * sign-extended over the upper byte.
 */
__inline __m128i _mm_set1_epi8 (char b)
{
	int byte = b & 0xff;
	return _mm_set1_epi16((short)((byte << 8) | byte));
}

/*
 * "setr" forms: same as the matching "set" function with the argument
 * order reversed.
 */

/* Forwards to _mm_set_epi64 with the two halves swapped. */
__inline __m128i _mm_setr_epi64 (__m64 q0, __m64 q1)
{
	return _mm_set_epi64(q1, q0);
}

/* Forwards to _mm_set_epi32 with the four values reversed. */
__inline __m128i _mm_setr_epi32 (int i0, int i1, int i2, int i3)
{
	return _mm_set_epi32(i3, i2, i1, i0);
}

/* Forwards to _mm_set_epi16 with the eight values reversed. */
__inline __m128i _mm_setr_epi16 (short w0, short w1, short w2, short w3,
								short w4, short w5, short w6, short w7)
{
	return _mm_set_epi16(w7, w6, w5, w4, w3, w2, w1, w0);
}

/* Forwards to _mm_set_epi8 with the sixteen values reversed. */
__inline __m128i _mm_setr_epi8 (char b0, char b1, char b2, char b3,
								char b4, char b5, char b6, char b7,
								char b8, char b9, char b10, char b11,
								char b12, char b13, char b14, char b15)
{
	return _mm_set_epi8(b15, b14, b13, b12, b11, b10, b9, 
						b8,	b7, b6, b5, b4, b3, b2, b1, b0);
}								

/* Returns an all-zero integer vector by XORing a register with itself. */
__inline __m128i _mm_setzero_si128 (void)
{
	register __m128i r;
	__asm pxor r,r
	return r;
}


/*

#define _MM_SHUFFLE(i3,i2,i1,i0) ((i0)|((i1)<<2)|((i2)<<4)|((i3)<<6))

static __inline unsigned int
_MM_GET_EXCEPTION_STATE (void)
{
  return _mm_getcsr() & _MM_EXCEPT_MASK;
}

static __inline unsigned int
_MM_GET_EXCEPTION_MASK (void)
{
  return _mm_getcsr() & _MM_MASK_MASK;
}

static __inline unsigned int
_MM_GET_ROUNDING_MODE (void)
{
  return _mm_getcsr() & _MM_ROUND_MASK;
}

static __inline unsigned int
_MM_GET_FLUSH_ZERO_MODE (void)
static __inline void
_MM_SET_FLUSH_ZERO_MODE (unsigned int __mode)
static __inline void
_MM_SET_EXCEPTION_STATE(unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | __mask);
}
static __inline void
_MM_SET_EXCEPTION_MASK (unsigned int __mask)
{
  _mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | __mask);
}
*/

#pragma volatile_asm reset

#if __cplusplus
}
#endif

#endif	// _XMMINTRIN_H

/*
 * Change Log:
 * 030329 EJS	Initial checkin
 * 030605 EJS 	Use struct decls from original header instead of new keywords
 * 030619 EJS	Move __declspec(intrin_type) so this works in c++
 */
 