--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genericopenlibs/liboil/src/motovec/vec_memset.S Tue Aug 31 16:54:36 2010 +0300
@@ -0,0 +1,553 @@
+//------------------------------------------------------------------
+// file: vec_memset.S
+// AltiVec enabled version of memset and bzero and cacheable_memzero
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2002
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void *memset( void *ptr, int val, size_t len );
+// Copies val into each of len characters beginning at ptr.
+// - Harbison&Steele 4th ed
+// (despite val being an int, this memset assumes it is never
+// more than a byte. That seems to be correct from all the
+// memset functions I've seen but I don't know if ANSI allows
+//	 anything longer. Chuck Corley 12/21/02)
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
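+
+//------------------------------------------------------------------
+// For reference, a minimal C-level sketch of the semantics implemented
+// below (illustrative only; not assembled or compiled as part of this
+// file).  ISO/ANSI C specifies that val is converted to unsigned char,
+// so only its low-order byte is stored:
+//
+//   void *memset( void *ptr, int val, size_t len )
+//   {
+//       unsigned char *p = (unsigned char *)ptr;
+//       unsigned char  b = (unsigned char)val;
+//       while ( len-- )
+//           *p++ = b;
+//       return ptr;
+//   }
+//------------------------------------------------------------------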
+
+//------------------------------------------------------------------
+// extern void * bzero( char *ptr, int len);
+// Copies 0 into each of len characters at ptr.
+// - Harbison&Steele 4th ed
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
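+
+// For reference, a minimal C-level sketch (illustrative only); as
+// implemented below, bzero simply tail-branches into memset:
+//
+//   void *bzero( char *ptr, int len )
+//   {
+//       return memset( ptr, 0, (size_t)len );
+//   }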
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 02/09/03
+// Could benefit from changes added to memcpy
+// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
+//
+// This is beta quality code; users are encouraged to make it faster.
+// ASSUMPTIONS:
+// Code is highly likely to be in the cache; data is not (streaming data)
+// Zero fill could be quite likely.
+// Moving the fill byte from a GPR to a VR as below is faster than stw->lvebx via the stack
+
+#define VRSV 256 // VRSAVE spr
+// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
+#define MIN_VEC 16
+
+// Register usage
+#define Rt r0 // r0 when used as a temporary register
+
+#define DST r3 // entering: dest pointer; exiting: same dest pointer
+
+#define FILL r4 // entering: fill char then fill word
+
+#define BC r5 // entering: Byte_Count then remaining Byte_Count
+
+#define DBC r6	// dst + byte count
+
+#define BK r7 // BC - 1 +/- (n*16)
+
+#define Fsh r8 // fill byte shifted right one nibble
+
+#define DM1 r9	// dst - 1 for byte-by-byte backwards initially
+#define D r9 // (dst+16)[0:27] - dst[28:31]
+#define DNX r9 // (dst+n*16)[28:31]
+#define BL r9 // second byte_kount index pointer
+
+#define DR r10 // (dst+16)[0:27]
+#define QW r10 // number of cache lines
+
+#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
+
+#define RSV r12 // storage for VRSAVE register if used
+
+// Condition register use (not including temporary cr0)
+// cr0[2] = (FILL==0)?
+// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
+// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
+// then cr1[2] = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0; (DN right justified)
+// cr6[2] = (QW == 0)? 1 : 0;
+// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
+// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
+// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
+// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
+// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
+// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
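+
+// Rough, simplified C-level outline of the vector path that follows
+// (illustrative only; the helper names are hypothetical, byte counts
+// <= MIN_VEC never reach this path, and the assembly handles the
+// head/tail bookkeeping with the condition-register bits listed above):
+//
+//   void *memset_outline( void *ptr, int val, size_t len )
+//   {
+//       unsigned char *p   = (unsigned char *)ptr;
+//       unsigned char *end = p + len;
+//       unsigned char *first_qw = (unsigned char *)(((unsigned long)p + 16) & ~15UL);
+//       unsigned char *last_qw  = (unsigned char *)((unsigned long)(end - 1) & ~15UL);
+//       splat_fill_byte( (unsigned char)val );        /* build fill vector in v0     */
+//       store_head( p, first_qw );                    /* 1..16 bytes up to alignment */
+//       for ( p = first_qw; p < last_qw; p += 16 )    /* full quad words             */
+//           store_vector_16( p );                     /* stvx; dcba per cache line   */
+//       store_tail( last_qw, end );                   /* 1..16 bytes ending at end-1 */
+//       return ptr;
+//   }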
+
+// Conditionalize the use of dcba. It will help if the data is
+// not in cache and hurt if it is. Generally, except for small
+// benchmarks repeated many times, we assume data is not in cache
+// (data streaming) and using dcba is a performance boost.
+// We use dcba, which will no-op to non-cacheable memory, rather than
+// dcbz, which would cause an alignment exception.
+#ifndef NO_DCBA
+#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
+ // gcc and codewarrior and diab don't assemble dcba
+#define DCBK .long 0x7c033dec
+// dcba r3,r7 or dcba DST,BK
+#else
+#ifdef __ghs__
+.macro DCBK
+.long 0x7c033dec
+.endm
+#else
+#define DCBK dcba DST,BK
+#endif // __ghs__
+#endif // __GNUC__ or __MWERKS__
+#else
+#define DCBK nop
+#endif // NO_DCBA
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl memset
+memset:
+#else
+ .globl _vec_memset
+_vec_memset:
+#endif
+
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+ rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz Byte_set
+
+ blr
+
+v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+ beq+ enter_bzero // b if FILL==0
+
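+// Build the fill byte in v0 without a trip through memory: lvsl with a
+// nibble value as its "address" deposits that nibble in byte 0 of a VR,
+// so the upper nibble (via Fsh) and the lower nibble (via FILL) are
+// picked up separately, recombined with a shift-left-4 and OR, and then
+// splatted to all 16 bytes.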
+ lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR
+ vspltisb v1,4 // VPU Splat 0x4 to every byte
+
+ lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR
+
+ vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3]
+
+ vor v0,v0,v2 // VIU Form FILL byte in VR[0:7]
+
+ vspltb v0,v0,0 // VPU Splat the fill byte to all bytes
+enter_bzero:
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,Left_just // b if D0 is left justified
+
+ bns cr7,No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+No_B_fwd:
+ bne cr7,No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+No_H_fwd:
+ bng cr7,No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+No_W1_fwd:
+ bnl cr7,No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b No_W2_fwd
+
+Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,GT_4QW_fwd // b if >4 quad words left
+
+Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+Only_1W_fwd:
+ bng cr7,Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+Only_2W_fwd:
+ bne cr7,Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+Only_B_fwd:
+ bns cr7,All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b All_done_fwd
+
+Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
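+// The block below aligns the store stream to 32-byte cache lines:
+// GT_4QW_fwd stores one quad word at a time until the next store is
+// cache-line aligned, then B32_fwd allocates each line with DCBK (dcba)
+// and fills it with two quad-word stores, avoiding read-with-intent-
+// to-modify traffic for lines that will be written in their entirety.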
+GT_4QW_fwd:	// Do once if the next store is to the odd half of a cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+ bdnz B32_fwd // decrement counter for last QW store odd
+
+B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D11
+ addi BK,BK,16 // IU1 Increment byte count
+ bdz Nxt_loc_fwd // always decrement and branch to next instr
+
+Nxt_loc_fwd:
+ stvx v0,DST,BK // LSU Store 16 bytes at D12
+ addi BK,BK,16 // IU1 Increment byte count
+ bdnz B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,One_even_QW // b if there is one even and one odd QW to store
+ b Last_QW // b if last store is to even address
+
+// Come here with two more stores to do
+One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b Last_QW
+
+// End of memset in AltiVec
+
+#define BCz r4 // in bzero r4 enters with byte count
+
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl bzero
+bzero:
+#else
+ .globl vec_bzero
+vec_bzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+#ifdef LIBMOTOVEC
+ b memset
+#else
+ b _vec_memset
+#endif
+
+// cacheable_memzero employs dcbz to clear 32 bytes at a time of
+// cacheable memory. Like bzero, the second argument on entry is the
+// byte count. Using this on non-cacheable memory will generate an
+// alignment exception.
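+//
+// A minimal C-level sketch of the intent (illustrative only):
+//
+//   void *cacheable_memzero( void *ptr, int len )
+//   {
+//       /* same effect as memset(ptr, 0, len), except the cache-line
+//          aligned middle of the range is cleared with dcbz, one
+//          32-byte line at a time, so ptr must address cacheable
+//          memory */
+//       return memset( ptr, 0, (size_t)len );
+//   }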
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl cacheable_memzero
+cacheable_memzero:
+#else
+ .globl vec_cacheable_memzero
+vec_cacheable_memzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,c_v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+c_Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz c_Byte_set
+
+ blr
+
+c_v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,c_Left_just // b if D0 is left justified
+
+ bns cr7,c_No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+c_No_B_fwd:
+ bne cr7,c_No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+c_No_H_fwd:
+ bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+c_No_W1_fwd:
+ bnl cr7,c_No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b c_No_W2_fwd
+
+c_Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+c_No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,c_Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+c_QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,c_QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
+
+c_Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,c_Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+c_Only_1W_fwd:
+	bng	cr7,c_Only_2W_fwd	// b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+c_Only_2W_fwd:
+ bne cr7,c_Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+c_Only_B_fwd:
+ bns cr7,c_All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b c_All_done_fwd
+
+c_Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+c_All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
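+// Same cache-line alignment scheme as GT_4QW_fwd above, except each
+// aligned 32-byte line is cleared with a single dcbz in c_B32_fwd
+// rather than dcba plus vector stores.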
+c_GT_4QW_fwd:	// Do once if the next store is to the odd half of a cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+	bns	cr6,c_B32_fwd	// b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+	bdnz	c_B32_fwd	// decrement counter for last QW store odd
+
+c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ dcbz DST,BK // LSU zero whole cache line
+ bdz c_Nxt_loc_fwd // always decrement and branch to next instr
+
+c_Nxt_loc_fwd:
+ addi BK,BK,32 // IU1 Increment byte count
+	bdnz	c_B32_fwd	// b if there are at least two more QWs to do
+
+ bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
+ b c_Last_QW // b if last store is to even address
+
+// Come here with two more stores to do
+c_One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b c_Last_QW
+
+// End of cacheable_memzero in AltiVec