genericopenlibs/liboil/src/motovec/vec_memcpy.s
changeset 18 47c74d1534e1
equal deleted inserted replaced
0:e4d67989cc36 18:47c74d1534e1
       
     1 //------------------------------------------------------------------
       
     2 // file:  vec_memcpy.S
       
     3 //    AltiVec enabled version of memcpy and bcopy
       
     4 //------------------------------------------------------------------
       
     5 
       
     6 //------------------------------------------------------------------
       
     7 //	Copyright Motorola, Inc. 2003
       
     8 //	ALL RIGHTS RESERVED
       
     9 //
       
    10 //	You are hereby granted a copyright license to use, modify, and 
       
    11 //	distribute the SOFTWARE so long as this entire notice is retained 
       
    12 //	without alteration in any modified and/or redistributed versions, 
       
    13 //	and that such modified versions are clearly identified as such.  
       
    14 //	No licenses are granted by implication, estoppel or otherwise under 
       
    15 //	any patents or trademarks of Motorola, Inc.
       
    16 //
       
    17 //	The SOFTWARE is provided on an "AS IS" basis and without warranty.  
       
    18 //	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS 
       
    19 //	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED 
       
    20 //	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR 
       
    21 //	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH 
       
    22 //	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS 
       
    23 //	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. 
       
    24 //
       
    25 //	To the maximum extent permitted by applicable law, IN NO EVENT SHALL 
       
    26 //	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER 
       
    27 //	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF 
       
    28 //	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS 
       
    29 //	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR 
       
    30 //	INABILITY TO USE THE SOFTWARE.   Motorola assumes no responsibility 
       
    31 //	for the maintenance and support of the SOFTWARE.
       
    32 //------------------------------------------------------------------
       
    33 
       
    34 //------------------------------------------------------------------
       
    35 // extern  void * memcpy(void *dst, const void *src, size_t len);
       
    36 // Returns:
       
    37 //  void *dst
       
    38 //------------------------------------------------------------------
       
    39 
       
    40 //------------------------------------------------------------------
       
    41 // extern void * memmove( void *dst, const void *src, size_t len );
       
    42 //   Copies len characters from src to dst and returns the value of
       
    43 //   dst.  Works correctly for overlapping memory regions.
       
    44 //               - Harbison&Steele 4th ed (corrected as to return)
       
    45 // Returns:
       
    46 //  void *dst
       
    47 //------------------------------------------------------------------
       
    48 
       
    49 //------------------------------------------------------------------
       
    50 // extern  void * bcopy(const void *src, void *dst,  size_t len);
       
    51 // Returns:
       
    52 //  void *dst
       
    53 //------------------------------------------------------------------
       
    54 
       
    55 // memcpy and memmove are combined into one entry point here because of
       
    56 // the similarity of operation and need to create fool-proof code.
       
    57 // The following conditions determine what is "fool proof":
       
    58 //
       
    59 // if:                                          then single entry:
       
    60 // (DST-SRC)<0 && (SRC-DST)>=BC && BC>MIN_VEC    will b to v_memcpy
       
    61 // (DST-SRC)<0 && (SRC-DST)< BC && BC>MIN_VEC    must b to v_memcpy
       
    62 // (DST-SRC)<0                  && BC<MIN_VEC    copy fwd byte-by-byte
       
    63 // (DST-SRC)==0                 || BC==0         will just return
       
    64 // (DST-SRC)>0                  && BC<MIN_VEC    copy bkwd byte-by-byte
       
    65 // (DST-SRC)>0 && (DST-SRC)< BC && BC>MIN_VEC    must b to v_memmove
       
    66 // (DST-SRC)>0 && (SRC-DST)>=BC && BC>MIN_VEC    will b to v_memmove
       
    67 
       
    68 // If you call memmove (or vec_memmove) and |DST-SRC|>=BC,
       
    69 // this code will branch to v_memcpy anyway for maximum performance.
       
    70 
       
    71 // Revision History:
       
    72 //    Rev 0.0	Original                          Chuck Corley	02/03/03
       
    73 //              Can still add dst, 128B loop, and aligned option
       
    74 //    Rev 0.01  Fixed JY's seg-fault violation              CJC 02/17/03
       
    75 //    Rev 0.1   Added 128B loop and dst; cndtnlzd dcbz      CJC 02/18/03
       
    76 //              (Creating separate path for QW aligned didn't help much)
       
    77 //    Rev 0.11  Small code schdling; chngd dst for memmove  CJC 02/23/03
       
    78 //    Rev 0.20  Eliminated alternate entry and cleanup      CJC 02/27/03                   
       
    79 //    Rev 0.21  Improved loop branch targets for v_memcpy   CJC 03/01/03
       
    80 //    Rev 0.22  Experimented with dst (sent to H.)          CJC 03/02/03                   
       
    81 //    Rev 0.23  Substituted dcba for dcbz (sent to JY)      CJC 03/08/03                   
       
    82 //    Rev 0.24  Use two dst streams                         CJC 03/12/03
       
    83 //    Rev 0.25  Fix for all compilers, cleanup, and release with
       
    84 //              libmotovec.a rev 0.10                       CJC 03/14/03
       
    85 //    Rev 0.30  Fix for pre-empted destination (SNDF-DS)    CJC 04/02/03                   
       
    86 //
       
    87 //  Between Rev 0.25 and 0.30 the code was revised to store elements of
       
    88 //  source at destination when first and/or last vector are less than 16
       
    89 //  bytes. A reviewer at SNDF observed that loading the destination vector
       
    90 //  for merging exposed the "uninvolved" destination bytes to incoherency 
       
    91 //  if an interrupt pre-empted this routine and modified the "uninvolved"
       
    92 //  destination vector(s) while held in register for merging.  It seems
       
    93 //  like a low possibility but this revision is no longer subject to that
       
    94 //  possibility.  (It is also slightly faster than Rev 0.25.)
       
    95 //  This is beta quality code; users are encouraged to make it faster.
       
    96 //  ASSUMPTIONS:
       
    97 //     Code is highly likely to be in the cache; data is not (streaming data)
       
    98 
       
    99 #define VRSV 256	//	VRSAVE spr
       
   100 // Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
       
   101 #define MIN_VEC 16
       
   102 // Don't use Big_loop in v_memcpy for |dst-src|<= minimum overlap.
       
   103 #define MIN_OVL 128
       
   104 
       
   105 // Register usage
       
   106 #define Rt r0	// 	r0 when used as a temporary register	
       
   107 
       
   108 #define DST r3	// 	entering: dst pointer; exiting: same dst pointer
       
   109 
       
   110 #define SRC r4	// 	entering: src ptr; then end of src range index (SRC+BC) in memmove
       
   111 
       
   112 #define BC r5	//	entering: Byte_Count
       
   113 
       
   114 #define PCS r6	//  	save for partial checksum entering
       
   115 
       
   116 #define DMS r7	//      dst - src initially
       
   117 #define BK r7	//  	BC - 1 +/- (n*16)
       
   118 
       
   119 // Codewarrior will put an unwelcome space as "lbzu	r0,1(r7 )"
       
   120 // if you don't put the comment right after the r7.  CJC 030314
       
   121 #define SM1 r8//	src -1 for byte-by-byte forwards initially
       
   122 #define S r8	//	src[28:31]
       
   123 #define SMD r8	//      src[0:27]-dst[0:27]
       
   124 #define STR r8	//	data stream touch block & stride info for Big_loop
       
   125 
       
   126 #define DM1 r9//	dst -1 for byte-by-byte forwards initially
       
   127 #define D r9	//	dst[28:31]
       
   128 #define DNX r9	//	(dst+n*16)[28:31]
       
   129 #define BL r9	//	second byte_kount index pointer
       
   130 
       
   131 #define SBC r10//	src + byte count initially then src[28:31]
       
   132 #define BLK r10	//      temporary data stream touch block & stride info
       
   133 #define DR r10	//	(dst+16)[0:27]
       
   134 #define QW r10	//  	number of quad words (vectors)
       
   135 
       
   136 #define DBC r11//	dst + byte count initially
       
   137 #define BLL r11	//      temporary data stream touch block & stride info
       
   138 #define SBK r11	//	(src+byte_count-1)
       
   139 #define SBR r11	//	(src+byte_count-1)[0:27]
       
   140 #define DBK r11	//	(dst+byte_count-1) then (dst+byte_count-1)[28:31]
       
   141 #define BIG r11	//	QW/8 or 128 byte loop count
       
   142 #define SP8 r11	//      SRC + n*128 (8 QWs) for data streaming after first call
       
   143 
       
   144 #define RSV r12	//  	storage for VRSAVE register if used
       
   145 
       
   146 #define VS0   v0	//  	src vector for permuting
       
   147 
       
   148 #define VS1   v1	//  	src vector for permuting
       
   149 
       
   150 #define VP3   v2	// 	d - s permute register
       
   151 
       
   152 #define VPS0  v3	// 	permuted source vector to store
       
   153 
       
   154 #define VPS1  v4	//  	2nd permuted source vector to store
       
   155 
       
   156 #define VPS2  v5	//      additional permuted src in Big loop
       
   157 
       
   158 #define VS2   v6	//  	src vector for permuting
       
   159 #define VPS3  v6	//      additional permuted src in Big loop
       
   160 
       
   161 #define VS3   v7	//      additional src load in Big loop
       
   162 #define VPS4  v7	//      additional permuted src in Big loop
       
   163 
       
   164 #define VS4   v8	//      additional src load in Big loop
       
   165 #define VPS5  v8	//      additional permuted src in Big loop
       
   166 
       
   167 #define VS5   v9	//      additional src load in Big loop
       
   168 #define VPS6  v9	//      additional permuted src in Big loop
       
   169 
       
   170 #define VS6   v10	//      additional src load in Big loop
       
   171 #define VPS7  v10	//      additional permuted src in Big loop
       
   172 
       
   173 #define VS7   v11	//      additional src load in Big loop
       
   174 
       
   175 // Conditionalize the use of dcba.  It will help if the data is
       
   176 // not in cache and hurt if it is.  Generally, except for small
       
   177 // benchmarks repeated many times, we assume data is not in cache
       
   178 // (data streaming) and using dcbz is a performance boost.
       
   179 #ifndef NO_DCBA
       
   180 #if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
       
   181  // gcc and codewarrior and diab don't assemble dcba
       
   182 #define DCBK .long 0x7c033dec
       
   183 // dcba r3,r7    or    dcba DST,BK
       
   184 #define DCBL .long 0x7c034dec
       
   185 // dcba r3,r9     or    dcba DST,BL
       
   186 #else
       
   187 #ifdef __ghs__
       
   188 .macro DCBK
       
   189 .long 0x7c033dec
       
   190 .endm
       
   191 .macro DCBL
       
   192 .long 0x7c034dec
       
   193 .endm
       
   194 #else
       
   195 #define DCBK dcba DST,BK
       
   196 #define DCBL dcba DST,BL
       
   197 #endif  // __ghs__
       
   198 #endif  // __GNUC__ or __MWERKS__
       
   199 #else
       
   200 #define DCBK nop
       
   201 #define DCBL nop
       
   202 #endif  // NO_DCBA
       
   203 
       
   204 // Conditionalize the use of dst (data stream touch).  It will help
       
   205 // if the data is not in cache and hurt if it is (though not as badly
       
   206 // as dcbz).  Generally, except for small benchmarks repeated many times,
       
   207 // we assume data is not in cache (data streaming) and using dst is a
       
   208 // performance boost.
       
   209 #ifndef NO_DST
       
   210 #define STRM_B dst	SBC,BLL,0
       
   211 #define STRM_F dst	SRC,BLK,0
       
   212 #define STRM_1 dst	SP8,STR,1
       
   213 
       
   214 #else
       
   215 #define STRM_B	nop
       
   216 #define STRM_F	nop
       
   217 #define STRM_1	nop
       
   218 #endif
       
   219 
       
   220 //  Condition register use
       
   221 //      cr0[0:2] = (dst-src==0)? return: ((dst-src>0)? copy_bkwd, copy_fwd;);
       
   222 // then cr0[0:2] = (dst[28:31]-src[28:31]<0)? "shifting left", "shifting right";
       
   223 //      cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
       
   224 // then cr1[2]   = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
       
   225 // then cr1[2]   = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
       
   226 //      cr5[0,2] = (|DST-SRC|<=MIN_OVL)?1:0;  (Overlap too small for Big loop?)
       
   227 //      cr6[1,2] = (DST-SRC>=BC)?1:0;  (Okay for v_memmove to copy forward?)
       
   228 // then cr6[2]   = (QW == 0)? 1 : 0; (Any full vectors to move?)
       
   229 // then cr6[1]   = (QW > 4)? 1 : 0; (>4 vectors to move?)
       
   230 // then cr6[3]   = (third store[27] == 1)? 1: 0; (cache line alignment)
       
   231 // then cr6[3]   = (last store[27] == 1)? 1: 0; (last store odd?)
       
   232 //      cr7[2]   = (BC>MIN_VEC)?1:0;  (BC big enough to warrant vectors)
       
   233 // then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
       
   234 // then cr7[1]   = (QW > 14)? 1 : 0; (>14 vectors to move?)
       
   235 // then cr7[0:3] = (DST+BC)[0:27]  (How many bytes (iff <16) in last vector?)
       
   236 
       
   237 	.text
       
   238 #ifdef __MWERKS__
       
   239 	.align	32
       
   240 #else
       
   241 	.align	5
       
   242 #endif
       
   243 
       
   244 #ifdef LIBMOTOVEC
       
   245 	.globl	memmove     
       
   246 memmove:
       
   247 	nop			// IU1 Compilers forget first label
       
   248 	.globl	memcpy     
       
   249 memcpy:
       
   250 #else
       
   251 	.globl	vec_memmove     
       
   252 vec_memmove:
       
   253 	nop			// IU1 Only way I know to preserve both labels
       
   254 	.globl	_vec_memcpy     
       
   255 _vec_memcpy:
       
   256 #endif
       
   257 	subf.	DMS,SRC,DST	// IU1 Compute dst-src difference
       
   258 	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count moves
       
   259 	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
       
   260 
       
   261 	addi	SM1,SRC,-1	// IU1 Pre-bias and duplicate src for fwd
       
   262 	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
       
   263 	add	SBC,SRC,BC	// IU1 Pre-bias and duplicate src for bkwd
       
   264 	beqlr			// return if DST = SRC
       
   265 
       
   266 	add	DBC,DST,BC	// IU1 Pre-bias and duplicate destination
       
   267 	subf	Rt,DST,SRC	// IU1 Form |DST-SRC| if DST-SRC<0
       
   268 	beqlr	cr1		// return if BC = 0
       
   269 
       
   270 	bgt	Cpy_bkwd	// b if DST-SRC>0 (have to copy backward)
       
   271 	cmpi	cr5,0,Rt,MIN_OVL	// IU1 (|DST-SRC|>128)?1:0; for v_memcpy
       
   272 	bgt	cr7,v_memcpy	// b if BC>MIN_VEC (okay to copy vectors fwd)
       
   273 
       
   274 // Copy byte-by-byte forwards if DST-SRC<0 and BC<=MIN_VEC	
       
   275 	mtctr	BC		// i=BC; do ...;i--; while (i>0)
       
   276 Byte_cpy_fwd:
       
   277 	lbzu	Rt,1(SM1)	// LSU * ++(DST-1) = * ++(SRC-1)
       
   278 	stbu	Rt,1(DM1)	// LSU
       
   279 	bdnz	Byte_cpy_fwd
       
   280 
       
   281 	blr
       
   282 	nop			// IU1 Improve next label as branch target	
       
   283 Cpy_bkwd:
       
   284 	cmpi	cr5,0,DMS,MIN_OVL	// IU1 ((DST-SRC)>128)?1:0; for v_memcpy
       
   285 	cmp	cr6,0,DMS,BC	// IU1 cr6[1,2]=(DST-SRC>=BC)?1:0;
       
   286 	bgt	cr7,v_memmove	// b if BC>MIN_VEC (copy vectors bkwd)
       
   287 // Copy byte-by-byte backwards if DST-SRC>0 and BC<=MIN_VEC
       
   288 	mtctr	BC		// i=BC; do ...;i--; while (i>0)
       
   289 Byte_cpy_bwd:
       
   290 	lbzu	Rt,-1(SBC)	// LSU * --(DST+BC) = * --(SRC+BC)
       
   291 	stbu	Rt,-1(DBC)	// LSU Store it
       
   292 	bdnz	Byte_cpy_bwd
       
   293 	blr
       
   294 	
       
   295 #ifdef __MWERKS__
       
   296 	.align	16
       
   297 #else
       
   298 	.align	4
       
   299 #endif
       
   300 
       
//------------------------------------------------------------------
// v_memmove: AltiVec backward copy, reached when DST-SRC>0 and BC>MIN_VEC.
// Quad-word source loads are realigned to the destination with vperm,
// using the permute control built by lvsr from DMS = dst-src.  Partial
// first/last destination vectors are stored element-wise (stvebx /
// stvehx / stvewx) so bytes outside [DST,DST+BC) are never written
// (the Rev 0.30 fix).  If DST-SRC>=BC there is no destructive overlap
// and control branches to MC_entry in v_memcpy for a forward copy.
//------------------------------------------------------------------
    301 v_memmove:

    302 // Byte count < MIN_VEC bytes will have been copied by scalar code above,

    303 // so this will not deal with small block moves < MIN_VEC.

    304 

    305 // For systems using VRSAVE, define VRSAVE=1 when compiling.  For systems

    306 // that don't, make sure VRSAVE is undefined.

    307 #ifdef VRSAVE

    308 	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents

    309 #endif

    310 	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]

    311 	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]

    312 	bge	cr6,MC_entry	// b to v_memcpy if DST-SRC>=BC (fwd copy OK)

    313 

    314 #ifdef VRSAVE

    315 	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine

    316 #endif	

    317 	lis	BLL,0x010c	// IU1 Stream 12 blocks of 16 bytes

    318 	subf.	SMD,D,S		// IU1 if S-D<0 essentially shifting right

    319 

    320 #ifdef VRSAVE

    321 	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op

    322 #endif

    323 	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right

    324 	ori	BLL,BLL,0xffe0	// IU1 Stream stride -32B

    325 

    326 	STRM_B			// LSU Start data stream at SRC+BC

    327 	addi	SBK,SBC,-1	// IU1 Address of last src byte

    328 	bgt	Rt_shft		// Bytes from upper vector = (s-d>0)?s-d:16+s-d;

    329 	addi	SMD,SMD,16	// IU1 Save 16-(d-s)

    330 Rt_shft:

    331 

    332 	rlwinm	SBR,SBK,0,0,27	// IU1 (SRC+BC-1)[0:27]

    333 	addi	BK,BC,-1	// IU1 Initialize byte index

    334 

    335 	subf	Rt,SBR,SBC	// IU1 How many bytes in first source?

    336 	add	DBK,DST,BK	// IU1 Address of last dst byte

    337 	addi	DR,DST,16	// IU1 Address of second dst vector

    338 

    339 	subf.	SMD,Rt,SMD	// IU1 if bytes in 1st src>Bytes in 1st permute

    340 	rlwinm	Rt,DBK,0,28,31	// IU1 (DST+BC-1)[28:31]

    341 	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]

    342 

    343 // If there are more useful bytes in the upper vector of a permute pair than we

    344 // will get in the first permute, the first loaded vector needs to be in the

    345 // lower half of the permute pair.  The upper half is a don't care then.

    346 	blt	Get_bytes_rt	// b if shifting left (D-S>=0)

    347 

    348 	lvx	VS1,SRC,BK	// LSU Get SN load started

    349 // Comments numbering source and destination assume single path through the

    350 // code executing each instruction once.  For vec_memmove, an example would

    351 // be the call memmove(BASE+0x0F, BASE+0x2F, 82). N = 6 in that case.

    352 	addi	SRC,SRC,-16	// IU1 Decrement src base (to keep BK useful)

    353 

    354 Get_bytes_rt:	// Come here to get VS0 & Don't care what VS1 is	

    355 	lvx	VS0,SRC,BK	// LSU Get SN-1 (SN if D-S<0) in lower vector

    356 	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)

    357 	cmpi	cr7,0,Rt,0xF	// IU1 Is Dn right justified?

    358 

    359 	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?

    360 	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining

    361 	add	Rt,DST,BC	// IU1 Refresh the value of DST+BC

    362 

    363 	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

    364 	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-1 and SN to DN

    365 	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper

    366 	beq	cr7,Rt_just	// b if DN is right justified

    367 

// Last destination vector is partial: store it word/halfword/byte at a
// time, steered by (DST+BC)[28:31] loaded into cr7, so no byte past the
// end of the destination range is touched.
    368 	mtcrf	0x01,Rt		// IU2 Put final vector byte count in cr7

    369 	rlwinm	DBK,DBK,0,0,27	// IU1 Address of first byte of final vector

    370 	li	D,0		// IU1 Initialize an index pointer

    371 	bnl	cr7,Only_1W_bkwd	// b if there was only one or zero words to store

    372 

    373 	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three

    374 	addi	D,D,4		// IU1 increment index

    375 

    376 	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three

    377 	addi	D,D,4		// IU1 increment index

    378 Only_1W_bkwd:

    379 	bng	cr7,Only_2W_bkwd	// b if there were only two or zero words to store

    380 

    381 	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary

    382 	addi	D,D,4		// IU1 increment index

    383 Only_2W_bkwd:

    384 	bne	cr7,Only_B_bkwd	// b if there are no half words to store

    385 

    386 	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary

    387 	addi	D,D,2		// IU1 increment index

    388 Only_B_bkwd:

    389 	bns	cr7,All_done_bkwd	// b if there are no bytes to store

    390 

    391 	stvebx	VPS0,DBK,D	// LSU store one byte if necessary

    392 	b	All_done_bkwd

    393 

    394 Rt_just:	

    395 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN

    396 All_done_bkwd:

    397 	addi	BK,BK,-16	// IU1 Decrement destination byte count

    398 

    399 	ble	cr6,Last_load	// b if no Quad words to do

    400 	mtctr	QW		// IU2 for (i=0;i<=QW;i++)-execution serializing

    401 	cmpi	cr6,0,QW,4	// IU1 Check QW>4

// Simple one-vector-per-iteration backward loop; bdnzf 25 exits early
// (falls through) while cr6[1] says more than 4 quad words remain, so
// the larger software-pipelined GT_4QW/B32_bkwd path can take over.
    402 QW_loop:

    403 	lvx	VS0,SRC,BK	// LSU Get SN-2 (or SN-1 if ADJ==0)

    404 

    405 	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-2 and SN-1 to DN-1

    406 	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper

    407 

    408 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-1

    409 	addi	BK,BK,-16	// IU1 Decrement byte kount

    410 	bdnzf	25,QW_loop	// b if 4 or less quad words to do

    411 

    412 	add	DNX,DST,BK	// IU1 address of next store (DST+BC-1-16)

    413 	bgt	cr6,GT_4QW	// b if >4 quad words left

    414 

    415 Last_load:	// if D-S>=0, next load will be from same address as last

    416 	blt	No_ld_bkwd	// b if shifting right (S-D>=0)

    417 	addi	SRC,SRC,16	// IU1 recorrect source if it was decremented

    418 No_ld_bkwd:				

    419 	lvx	VS0,0,SRC	// LSU Get last source SN-6 (guaranteed S0)

    420 // Current 16 bytes is the last; we're done.

    421 	dss	0		// Data stream stop

    422 	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-6 and SN-5 to DN-6

    423 	subfic	D,DST,16	// IU1 How many bytes in first destination?

    424 	beq	cr1,Lt_just	// b if last destination is left justified

    425 

// First (lowest-addressed) destination vector is partial: store it
// byte/halfword/word at a time, steered by (16-DST[28:31]) in cr7.
    426 	mtcrf	0x01,D		// IU2 Put byte count remaining in cr7

    427 	li	D,0		// IU1 Initialize index pointer

    428 	bns	cr7,No_B_bkwd	// b if only even number of bytes to store

    429 

    430 	stvebx	VPS0,DST,D	// LSU store first byte at DST+0

    431 	addi	D,D,1		// IU1 increment index

    432 No_B_bkwd:

    433 	bne	cr7,No_H_bkwd	// b if only words to store

    434 	stvehx	VPS0,DST,D	// LSU store halfword at DST+0/1

    435 	addi	D,D,2		// IU1 increment index

    436 

    437 No_H_bkwd:

    438 	bng	cr7,No_W1_bkwd	// b if exactly zero or two words to store

    439 	stvewx	VPS0,DST,D	// LSU store word 1 of one or three

    440 	addi	D,D,4		// IU1 increment index

    441 

    442 No_W1_bkwd:

    443 	bnl	cr7,No_W2_bkwd	// b if there was only one word to store

    444 	stvewx	VPS0,DST,D	// LSU store word 1 of two or 2 of three

    445 	addi	D,D,4		// IU1 increment index

    446 

    447 	stvewx	VPS0,DST,D	// LSU store word 2 of two or 3 of three

    448 	b	No_W2_bkwd

    449 

    450 Lt_just:

    451 	stvx	VPS0,0,DST	// LSU Store 16 bytes at final dst addr D0

    452 No_W2_bkwd:

    453 #ifdef VRSAVE

    454 	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	

    455 #endif

    456 	blr			// Return destination address from entry

    457 

// Alignment pre-loop: iterate until the next store lands on the even
// (low-addressed) half of a cache line, so the main B32_bkwd loop can
// work on 32-byte-aligned pairs; cr6[3] tracks bit 27 of the next
// store address via mtcrf.
    458 GT_4QW:	// Do once if next store is to even half of cache line, else twice

    459 

    460 	lvx	VS0,SRC,BK	// LSU Get SN-3 (or SN-2)

    461 	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+BC-1)[27]==1)?1:0;

    462 	

    463 	vperm	VPS0,VS0,VS1,VP3	// VPU Align SN-3 and SN-2 to Dn-2

    464 	vor	VS1,VS0,VS0	// VIU1 Move lower vector to upper

    465 	addi	DNX,DNX,-16	// IU1 Prepare to update cr6 next loop

    466 

    467 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-2

    468 	vor	VS3,VS0,VS0	// VIU Make a copy of lower vector

    469 	addi	BK,BK,-16	// IU1 Decrement byte count by 16

    470 	bdnzt	27,GT_4QW	// b if next store is to upper (odd) half of CL

    471 // At this point next store will be to even address.

    472 

    473 	lis	STR,0x102	// IU1 Stream 2 blocks of 16 bytes

    474 	mtcrf	0x02,DST	// IU2 cr6[3]=(DST[27]==1)?1:0; (DST odd?)

    475 	addi	BL,BK,-16	// IU1 Create an alternate byte count - 16

    476 

    477 	ori	STR,STR,0xffe0	// IU1 Stream stride -32B

    478 	addi	SP8,SRC,-64	// IU1 Starting address for data stream touch

    479 	bso	cr6,B32_bkwd	// b if DST[27] == 1; i.e, final store is odd

    480 

// Deliberate branch-to-next-instruction: consumes one CTR count to
// account for the final odd quad word without a separate code path.
    481 	bdnz	B32_bkwd	// decrement counter for last odd QW store

    482 B32_bkwd:	// Should be at least 2 stores remaining and next 2 are cache aligned

    483 	lvx	VS2,SRC,BK	// LSU Get SN-4 (or SN-3)

    484 	addi	SP8,SP8,-32	// IU1 Next starting address for data stream touch

    485 

    486 	lvx	VS1,SRC,BL	// LSU Get SN-5 (or SN-4)

    487 	vperm	VPS0,VS2,VS3,VP3	// VPU Align SN-4 and SN-3 to DN-3

    488 

    489 	STRM_1			// LSU Stream 64 byte blocks ahead of loads

    490 

    491 	DCBL			// LSU allocate next cache line

    492 

    493 	vperm	VPS1,VS1,VS2,VP3	// VPU Align SN-5 and SN-4 to DN-4

    494 	vor	VS3,VS1,VS1	// VIU1 Move SN-5 to SN-3

    495 

    496 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at DN-3

    497 	addi	BK,BL,-16	// IU1 Decrement byte count

    498 	bdz	Nxt_loc_bkwd	// always decrement and branch to next instr		

    499 

    500 Nxt_loc_bkwd:

    501 	stvx	VPS1,DST,BL	// LSU Store 16 bytes at DN-4

    502 	addi	BL,BK,-16	// IU1 Decrement alternate byte count

    503 	bdnz	B32_bkwd	// b if there are at least two more QWs to do

    504 

    505 	bns	cr6,One_odd_QW	// b if there was one more odd QW to store

    506 	b	Last_load

    507 

    508 // Come here with two more loads and two stores to do

    509 One_odd_QW:

    510 	lvx	VS1,SRC,BK	// LSU Get SN-6 (or SN-5)

    511 

    512 	vperm	VPS1,VS1,VS3,VP3	// VPU Align SN-6 and SN-5 to DN-5

    513 

    514 	stvx	VPS1,DST,BK	// LSU Store 16 bytes at DN-5

    515 

    516 	b	Last_load

    517 

    518 // End of memmove in AltiVec
       
   519 
       
   520 #ifdef __MWERKS__
       
   521 	.align	16
       
   522 #else
       
   523 	.align	4
       
   524 #endif
       
   525 v_memcpy:
       
   526 // Byte count < MIN_VEC bytes will have been copied by scalar code above,
       
   527 // so this will not deal with small block moves < MIN_VEC.
       
   528 
       
   529 #ifdef VRSAVE
       
   530 	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
       
   531 #endif
       
   532 	rlwinm	S,SRC,0,28,31	// IU1 Save src address bits s[28:31]
       
   533 	rlwinm	D,DST,0,28,31	// IU1 D = dst[28:31]
       
   534 
       
   535 MC_entry:	// enter here from memmove if DST-SRC>=BC; this should be faster
       
   536 #ifdef VRSAVE
       
   537 	oris	Rt,RSV,0xfff0	// IU1 Or in registers used by this routine
       
   538 #endif	
       
   539 	lis	BLK,0x010c	// IU1 Stream 12 blocks of 16 bytes
       
   540 
       
   541 	subf.	S,S,D		// IU1 if D-S<0 essentially shifting left
       
   542 
       
   543 #ifdef VRSAVE
       
   544 	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
       
   545 #endif
       
   546 	lvsr	VP3,0,DMS	// LSU Permute vector for dst - src shft right
       
   547 	ori	BLK,BLK,32	// IU1 Stream stride 32B
       
   548 
       
   549 	STRM_F			// LSU Start data stream 0 at SRC
       
   550 	addi	DR,DST,16	// IU1 Address of second dst vector
       
   551 	addi	DBK,DBC,-1	// IU1 Address of last dst byte
       
   552 
       
   553 // If D-S<0 we are "kinda" shifting left with the right shift permute vector
       
   554 // loaded to VP3 and we need both S0 and S1 to permute.  If D-S>=0 then the
       
   555 // first loaded vector needs to be in the upper half of the permute pair and
       
   556 // the lower half is a don't care then.
       
   557 	bge	Ld_bytes_rt	// b if shifting right (D-S>=0)
       
   558 
       
   559 	lvx	VS0,0,SRC	// LSU Get S0 load started
       
   560 // Comments numbering source and destination assume single path through the
       
   561 // code executing each instruction once.  For vec_memcpy, an example would
       
   562 // be the call memcpy(BASE+0x1E, BASE+0x1F, 259). N = 16 in that case.
       
   563 	addi	SRC,SRC,16	// IU1 Increment src base (to keep BK useful)
       
   564 
       
   565 Ld_bytes_rt:	// Come here to get VS1 & Don't care what VS0 is	
       
   566 	lvx	VS1,0,SRC	// LSU Get S1 (or S0 if D-S>=0) in upper vector
       
   567 	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
       
   568 	cmpi	cr1,0,D,0	// IU1 Is D0 left justified?
       
   569 
       
   570 	subf	Rt,DST,DR	// IU1 How many bytes in first destination?
       
   571 	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
       
   572 	li	BK,0		// IU1 Initialize byte kount index
       
   573 
       
   574 	mtcrf	0x01,Rt		// IU2 Put bytes in 1st dst in cr7
       
   575 	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining
       
   576 	vperm	VPS0,VS0,VS1,VP3	// VPU Align S0 and S1 to D0
       
   577 
       
   578 	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
       
   579 	beq	cr1,Left_just	// b if D0 is left justified
       
   580 
       
   581 	bns	cr7,No_B_fwd	// b if only even number of bytes to store
       
   582 
       
   583 	stvebx	VPS0,DST,BK	// LSU store first byte at DST+0
       
   584 	addi	BK,BK,1		// IU1 increment index
       
   585 No_B_fwd:
       
   586 	bne	cr7,No_H_fwd	// b if only words to store
       
   587 
       
   588 	stvehx	VPS0,DST,BK	// LSU store halfword at DST+0/1
       
   589 	addi	BK,BK,2		// IU1 increment index
       
   590 No_H_fwd:
       
   591 	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store
       
   592 
       
   593 	stvewx	VPS0,DST,BK	// LSU store word 1 of one or three
       
   594 	addi	BK,BK,4		// IU1 increment index
       
   595 
       
   596 No_W1_fwd:
       
   597 	bnl	cr7,No_W2_fwd	// b if there was only one word to store
       
   598 	stvewx	VPS0,DST,BK	// LSU store word 1 of two or 2 of three
       
   599 	addi	BK,BK,4		// IU1 increment index
       
   600 
       
   601 	stvewx	VPS0,DST,BK	// LSU store word 2 of two or 3 of three
       
   602 	b	No_W2_fwd
       
   603 
       
   604 Left_just:	
       
   605 	stvx	VPS0,0,DST	// LSU Store 16 bytes at D0
       
   606 No_W2_fwd:
       
   607 	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
       
   608 	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
       
   609 
       
   610 	li	BK,16		// IU1 Re-initialize byte kount index
       
   611 	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
       
   612 	cmpi	cr7,0,QW,14	// IU1 Check QW>14
       
   613 	ble	cr6,Last_ld_fwd	// b if no Quad words to do
       
   614 
       
   615 	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
       
   616 	cmpi	cr6,0,QW,4	// IU1 Check QW>4
       
   617 QW_fwd_loop:
       
   618 	lvx	VS1,SRC,BK	// LSU Get S2 (or S1)
       
   619 
       
   620 	vperm	VPS0,VS0,VS1,VP3	// VPU Align S1 and S2 to D1
       
   621 	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
       
   622 
       
   623 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D1(+n*16 where n<4)
       
   624 	addi	BK,BK,16	// IU1 Increment byte kount index
       
   625 	bdnzf	25,QW_fwd_loop	// b if 4 or less quad words to do
       
   626 
       
   627 	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
       
   628 	addi	QW,QW,-1	// IU1 One more QW stored by now
       
   629 	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left
       
   630 
       
   631 Last_ld_fwd:	// Next 16 bytes is the last; we're done.
       
   632 	add	DBC,DST,BC	// IU1 Recompute address of last dst byte + 1
       
   633 	add	SBC,SRC,BC	// IU1 Recompute address of last src byte + 1
       
   634 	bge	No_ld_fwd	// b if shifting right (D-S>=0)
       
   635 
       
   636 	addi	SBC,SBC,-16	// IU1 if D-S>=0 we didn't add 16 to src
       
   637 No_ld_fwd:
       
   638 	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7
       
   639 	addi	DBK,DBC,-1	// IU1 Recompute address of last dst byte
       
   640 	addi	Rt,SBC,-1	// IU1 Recompute address of last src byte
       
   641 
       
   642 // If D-S<0 we have already loaded all the source vectors.
       
   643 // If D-S>=0 then the first loaded vector went to the upper half of the permute
       
   644 // pair and we need one more vector.  (This may be a duplicate.)
       
   645 
       
   646 	lvx	VS1,0,Rt	// LSU Get last source S14 (guaranteed SN)
       
   647 
       
   648 #ifndef NO_DST				
       
   649 	dss	0		// Data stream 0 stop
       
   650 
       
   651 	dss	1		// Data stream 1 stop
       
   652 #endif
       
   653 	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D14
       
   654 	beq	cr1,Rt_just_fwd	// b if last destination is right justified
       
   655 
       
   656 	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
       
   657 	li	D,0		// IU1 Initialize index pointer
       
   658 	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store
       
   659 
       
   660 	stvewx	VPS0,DBK,D	// LSU store word 1 of two or three
       
   661 	addi	D,D,4		// IU1 increment index
       
   662 
       
   663 	stvewx	VPS0,DBK,D	// LSU store word 2 of two or three
       
   664 	addi	D,D,4		// IU1 increment index
       
   665 Only_1W_fwd:
       
   666 	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store
       
   667 
       
   668 	stvewx	VPS0,DBK,D	// LSU store word 3 of three if necessary
       
   669 	addi	D,D,4		// IU1 increment index
       
   670 Only_2W_fwd:
       
   671 	bne	cr7,Only_B_fwd	// b if there are no half words to store
       
   672 
       
   673 	stvehx	VPS0,DBK,D	// LSU store one halfword if necessary
       
   674 	addi	D,D,2		// IU1 increment index
       
   675 Only_B_fwd:
       
   676 	bns	cr7,All_done_fwd	// b if there are no bytes to store
       
   677 
       
   678 	stvebx	VPS0,DBK,D	// LSU store one byte if necessary
       
   679 	b	All_done_fwd
       
   680 
       
   681 Rt_just_fwd:
       
   682 
       
   683 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D14
       
   684 All_done_fwd:
       
   685 #ifdef VRSAVE
       
   686 	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	
       
   687 #endif
       
   688 	blr			// Return destination address from entry
       
   689 #ifdef __MWERKS__
       
   690 	.align	16
       
   691 #else
       
   692 	.align	4
       
   693 #endif
       
   694 GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice
       
   695 
       
   696 	lvx	VS1,SRC,BK	// LSU Get S3 (or S2)
       
   697 	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
       
   698 	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
       
   699 	
       
   700 	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
       
   701 	addi	Rt,QW,-2	// IU1 Insure at least 2 QW left after big loop
       
   702 
       
   703 	vperm	VPS0,VS0,VS1,VP3	// VPU Align S2 and S3 to D2
       
   704 	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
       
   705 
       
   706 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D2
       
   707 	addi	BK,BK,16	// IU1 Increment byte count by 16
       
   708 	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL
       
   709 // At this point next store will be to even address.
       
   710 
       
   711 	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
       
   712 	lis	STR,0x104	// IU1 Stream 4 blocks of 16 bytes
       
   713 	addi	BL,BK,16	// IU1 Create an alternate byte kount + 32
       
   714 
       
   715 	ori	STR,STR,32	// IU1 Stream stride 32B
       
   716 #ifndef NO_BIG_LOOP
       
   717 	rlwinm	BIG,Rt,29,3,31	// IU1 QW/8 big loops to do
       
   718 
       
   719 	rlwinm	Rt,Rt,0,0,28	// IU1 How many QWs will be done in big loop
       
   720 	bgt	cr7,Big_loop	// b if QW > 14
       
   721 #endif
       
   722 No_big_loop:
       
   723 // We need the ctr register to reflect an even byte count before entering
       
   724 // the next block - faster to decrement than to reload.
       
   725 
       
   726 	addi	SP8,SRC,256	// IU1 Starting address for data stream touch
       
   727 	xoris	STR,STR,0x6	// IU1 Reset stream to 2 blocks of 16 bytes
       
   728 	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even
       
   729 
       
   730 	bdnz	B32_fwd		// decrement counter for last QW store odd
       
   731 
       
   732 B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
       
   733 	lvx	VS1,SRC,BK	// LSU Get S12
       
   734 	addi	SP8,SP8,32	// IU1 Next starting address for data stream touch
       
   735 
       
   736 	lvx	VS2,SRC,BL	// LSU Get S13
       
   737 	vperm	VPS1,VS0,VS1,VP3	// VPU Align S11 and S12 to D11
       
   738 
       
   739 	STRM_1			// LSU Stream 64 byte blocks ahead of loads
       
   740 
       
   741 	DCBK			// LSU then Kill instead of RWITM
       
   742 
       
   743 	vperm	VPS0,VS1,VS2,VP3	// VPU Align S12 and S13 to D12
       
   744 	vor	VS0,VS2,VS2	// VIU1 Move S13 to S11
       
   745 
       
   746 	stvx	VPS1,DST,BK	// LSU Store 16 bytes at D11
       
   747 	addi	BK,BL,16	// IU1 Increment byte count
       
   748 	bdz	Nxt_loc_fwd	// always decrement and branch to next instr		
       
   749 
       
   750 Nxt_loc_fwd:
       
   751 	stvx	VPS0,DST,BL	// LSU Store 16 bytes at D12
       
   752 	addi	BL,BK,16	// IU1 Increment alternate byte count
       
   753 	bdnz	B32_fwd		// b if there are at least two more QWs to do
       
   754 
       
   755 	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
       
   756 	b	Last_ld_fwd	// b if last store is to even address
       
   757 
       
   758 // Come here with two more loads and two stores to do
       
   759 One_even_QW:
       
   760 	lvx	VS1,SRC,BK	// LSU Get S14 (or S13 if D-S>=0)
       
   761 
       
   762 	vperm	VPS0,VS0,VS1,VP3	// VPU Align S13 and S14 to D13
       
   763 	vor	VS0,VS1,VS1	// VIU1 Move upper vector to lower
       
   764 
       
   765 	stvx	VPS0,DST,BK	// LSU Store 16 bytes at D13
       
   766 	addi	BK,BK,16	// IU1 Increment byte count
       
   767 
       
   768 	b	Last_ld_fwd
       
   769 
       
   770 #ifdef __MWERKS__
       
   771 	.align	16
       
   772 #else
       
   773 	.align	4
       
   774 #endif
       
   775 Big_loop:
       
   776 	subf	QW,Rt,QW	// IU1 Should be 2-7 QWs left after big loop
       
   777 	blt	cr5,No_big_loop	// b back if |DST-SRC|<128; Big_loop won't work.
       
   778 	mtctr	BIG		// IU2 loop for as many 128B loops as possible
       
   779 	addi	SP8,SRC,256	// IU1 Starting address for data stream touch
       
   780 
       
   781 Loop_of_128B:	// Come here with QW>=10 and next store even; VS0 last load
       
   782 	lvx	VS1,SRC,BK	// LSU Get S4 (or S3 if D-S>=0)
       
   783 	addi	BL,BK,32	// IU1 Increment Byte_Kount+16 by 32	
       
   784 	addi	SP8,SP8,128	// IU1 increment address for data stream touch
       
   785 
       
   786 	lvx	VS3,SRC,BL	// LSU Get S6 (or S5)
       
   787 	addi	BL,BL,32	// IU1 Increment Byte_Kount+48 by 32	
       
   788 
       
   789 	lvx	VS5,SRC,BL	// LSU Get S8 (or S7)
       
   790 	addi	BL,BL,32	// IU1 Increment Byte_Kount+80 by 32	
       
   791 
       
   792 	lvx	VS7,SRC,BL	// LSU Get S10 (or S9)
       
   793 	addi	BL,BK,16	// IU1 Increment Byte_Kount+16 by 16	
       
   794 
       
   795 	lvx	VS2,SRC,BL	// LSU Get S5 (or S4)
       
   796 	addi	BL,BL,32	// IU1 Increment Byte_Kount+32 by 32	
       
   797 
       
   798 	lvx	VS4,SRC,BL	// LSU Get S7 (or S6)
       
   799 	addi	BL,BL,32	// IU1 Increment Byte_Kount+64 by 32	
       
   800 	
       
   801 	lvx	VS6,SRC,BL	// LSU Get S9 (or S8)
       
   802 	addi	BL,BL,32	// IU1 Increment Byte_Kount+96 by 32	
       
   803 	vperm	VPS0,VS0,VS1,VP3	// VPU
       
   804 
       
   805 	lvx	VS0,SRC,BL	// LSU Get S11 (or S10)
       
   806 	vperm	VPS1,VS1,VS2,VP3	// VPU
       
   807 
       
   808 	STRM_1			// LSU Stream 4 32B blocks, stride 32B
       
   809 
       
   810 	DCBK			// LSU then Kill instead of RWITM
       
   811 
       
   812 	stvx	VPS0,DST,BK	// LSU Store D3
       
   813 	addi	BK,BK,16	// IU1 Increment Byte_Kount+16 by 16	
       
   814 	vperm	VPS2,VS2,VS3,VP3	// VPU
       
   815 
       
   816 	stvx	VPS1,DST,BK	// LSU Store D4
       
   817 	addi	BK,BK,16	// IU1 Increment Byte_Kount+32 by 16	
       
   818 	vperm	VPS3,VS3,VS4,VP3	// VPU
       
   819 
       
   820 	DCBK			// LSU then Kill instead of RWITM
       
   821 
       
   822 	stvx	VPS2,DST,BK	// LSU Store D5
       
   823 	addi	BK,BK,16	// IU1 Increment Byte_Kount+48 by 16	
       
   824 	vperm	VPS4,VS4,VS5,VP3	// VPU
       
   825 
       
   826 	stvx	VPS3,DST,BK	// LSU Store D6
       
   827 	addi	BK,BK,16	// IU1 Increment Byte_Kount+64 by 16	
       
   828 	vperm	VPS5,VS5,VS6,VP3	// VPU
       
   829 
       
   830 	DCBK			// LSU then Kill instead of RWITM
       
   831 
       
   832 	stvx	VPS4,DST,BK	// LSU Store D7
       
   833 	addi	BK,BK,16	// IU1 Increment Byte_Kount+80 by 16	
       
   834 	vperm	VPS6,VS6,VS7,VP3	// VPU
       
   835 
       
   836 	stvx	VPS5,DST,BK	// LSU Store D8
       
   837 	addi	BK,BK,16	// IU1 Increment Byte_Kount+96 by 16	
       
   838 	vperm	VPS7,VS7,VS0,VP3	// VPU
       
   839 
       
   840 	DCBK			// LSU then Kill instead of RWITM
       
   841 
       
   842 	stvx	VPS6,DST,BK	// LSU Store D9
       
   843 	addi	BK,BK,16	// IU1 Increment Byte_Kount+112 by 16	
       
   844 
       
   845 	stvx	VPS7,DST,BK	// LSU Store D10
       
   846 	addi	BK,BK,16	// IU1 Increment Byte_Kount+128 by 16	
       
   847 	bdnz	Loop_of_128B	// b if ctr > 0 (QW/8 still > 0)
       
   848 
       
   849 	mtctr	QW		// IU1 Restore QW remaining to counter
       
   850 	addi	BL,BK,16	// IU1 Create an alternate byte kount + 16
       
   851 	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even
       
   852 
       
   853 	bdnz	B32_fwd		// b and decrement counter for last QW store odd
       
   854 				// One of the above branches should have taken
       
   855 
       
   856 // End of memcpy in AltiVec
       
   857 
       
   858 // bcopy works like memcpy, but the source and destination operands are reversed.
       
   859 // Following will just reverse the operands and branch to memcpy.
       
   860 
       
   861 #ifdef LIBMOTOVEC
       
   862 	.globl	bcopy     
       
   863 bcopy:
       
   864 #else
       
   865 	.globl	vec_bcopy     
       
   866 vec_bcopy:
       
   867 #endif
       
   868 	mr	Rt,DST		// save bcopy's 1st arg (r3 = source) in temp Rt
       
   869 	mr	DST,SRC		// bcopy's 2nd arg (dst) into r3, where memcpy expects dst
       
   870 	mr	SRC,Rt		// saved source into r4, completing the r3/r4 swap
       
   871 #ifdef LIBMOTOVEC
       
   872 	b	memcpy		// tail-call memcpy with swapped args in r3 and r4	
       
   873 #else
       
   874 	b	_vec_memcpy	// tail-call vec_memcpy with swapped args in r3 and r4	
       
   875 #endif