genericopenlibs/liboil/src/motovec/vec_memset.s
changeset 71 28ccaba883f4
parent 67 a1e347446159
child 72 403e7f6ed6c5
equal deleted inserted replaced
67:a1e347446159 71:28ccaba883f4
     1 //------------------------------------------------------------------
       
     2 // file:  vec_memset.S
       
     3 //    AltiVec enabled version of memset and bzero and cacheable_memzero
       
     4 //------------------------------------------------------------------
       
     5 
       
     6 //------------------------------------------------------------------
       
     7 //	Copyright Motorola, Inc. 2002
       
     8 //	ALL RIGHTS RESERVED
       
     9 //
       
    10 //	You are hereby granted a copyright license to use, modify, and 
       
    11 //	distribute the SOFTWARE so long as this entire notice is retained 
       
    12 //	without alteration in any modified and/or redistributed versions, 
       
    13 //	and that such modified versions are clearly identified as such.  
       
    14 //	No licenses are granted by implication, estoppel or otherwise under 
       
    15 //	any patents or trademarks of Motorola, Inc.
       
    16 //
       
    17 //	The SOFTWARE is provided on an "AS IS" basis and without warranty.  
       
    18 //	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS 
       
    19 //	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED 
       
    20 //	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR 
       
    21 //	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH 
       
    22 //	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS 
       
    23 //	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS. 
       
    24 //
       
    25 //	To the maximum extent permitted by applicable law, IN NO EVENT SHALL 
       
    26 //	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER 
       
    27 //	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF 
       
    28 //	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS 
       
    29 //	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR 
       
    30 //	INABILITY TO USE THE SOFTWARE.   Motorola assumes no responsibility 
       
    31 //	for the maintenance and support of the SOFTWARE.
       
    32 //------------------------------------------------------------------
       
    33 
       
    34 //------------------------------------------------------------------
       
    35 // extern void *memset( void *ptr, int val, size_t len );
       
    36 //   Copies val into each of len characters beginning at ptr.
       
    37 //                                       - Harbison&Steele 4th ed
       
    38 //    (despite val being an int, this memset assumes it is never
       
    39 //     more than a byte.  That seems to be correct from all the
       
    40 //     memset functions I've seen but I don't know if ANSI allows
       
    41 //     anything longer.     Chuck Corley  12/21/02) 
       
    42 // Returns:
       
    43 //  void * ptr
       
    44 //------------------------------------------------------------------
       
    45 
       
    46 //------------------------------------------------------------------
       
    47 // extern void * bzero( char *ptr, int len);   
       
    48 //   Copies 0 into each of len characters at ptr.
       
    49 //                                       - Harbison&Steele 4th ed
       
    50 // Returns:
       
    51 //  void * ptr
       
    52 //------------------------------------------------------------------
       
    53 
       
    54 // Revision History:
       
    55 //    Rev 0.0	Original                        Chuck Corley	02/09/03
       
    56 //              Could benefit from changes added to memcpy
       
    57 //    Rev 0.1	Revised per memcpy Rev 0.30     Chuck Corley	05/01/03
       
    58 //
       
    59 //  This is beta quality code; users are encouraged to make it faster.
       
    60 //  ASSUMPTIONS:
       
    61 //     Code is highly likely to be in the cache; data is not (streaming data)
       
    62 //     Zero fill could be quite likely.
       
    63 //     Moving fill byte from GPR to VR as below faster than stw->lvebx via stack
       
    64 
       
    65 #define VRSV 256	//	VRSAVE spr
       
    66 // Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
       
    67 #define MIN_VEC 16
       
    68 
       
    69 // Register usage
       
    70 #define Rt r0	// 	r0 when used as a temporary register	
       
    71 
       
    72 #define DST r3	// 	entering: dest pointer; exiting: same dest pointer
       
    73 
       
    74 #define FILL r4	// 	entering: fill char then fill word
       
    75 
       
    76 #define BC r5	//	entering: Byte_Count then remaining Byte_Count
       
    77 
       
    78 #define DBC r6//	dst + byte count
       
    79 
       
    80 #define BK r7	//  	BC - 1 +/- (n*16)
       
    81 
       
    82 #define Fsh r8	//	fill byte shifted right one nibble
       
    83 
       
    84 #define DM1 r9//	dst -1 for byte-by-byte backwards initially
       
    85 #define D r9	//	(dst+16)[0:27] - dst[28:31]
       
    86 #define DNX r9	//	(dst+n*16)[28:31]
       
    87 #define BL r9	//	second byte_kount index pointer
       
    88 
       
    89 #define DR r10	//	(dst+16)[0:27]
       
    90 #define QW r10	//  	number of cache lines
       
    91 
       
    92 #define DBK r11	//	(dst+byte_count-1) then (dst+byte_count-1)[28:31]
       
    93 
       
    94 #define RSV r12	//  	storage for VRSAVE register if used
       
    95 
       
    96 //  Condition register use (not including temporary cr0)
       
    97 //      cr0[2]   = (FILL==0)?
       
    98 //      cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
       
    99 // then cr1[2]   = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
       
   100 // then cr1[2]   = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
       
   101 //      cr6[2]   = (QW == 0)? 1 : 0;
       
   102 // then cr6[1]   = (QW > 4)? 1 : 0; (>4 vectors to move?)
       
   103 // then cr6[3]   = (third store[27] == 1)? 1: 0; (cache line alignment)
       
   104 // then cr6[3]   = (last store[27] == 1)? 1: 0; (last store odd?)
       
   105 //      cr7[2]   = (BC>MIN_VEC)?1:0;  (BC big enough to warrant vectors)
       
   106 // then cr7[0:3] = (DST+16)[0:27]-DST  (How many bytes (iff <16) in first vector?)
       
   107 // then cr7[0:3] = (DST+BC)[0:27]  (How many bytes (iff <16) in last vector?)
       
   108 
       
   109 // Conditionalize the use of dcba.  It will help if the data is
       
   110 // not in cache and hurt if it is.  Generally, except for small
       
   111 // benchmarks repeated many times, we assume data is not in cache
       
   112 // (data streaming) and using dcba is a performance boost.
       
   113 // We use dcba which will noop to non-cacheable memory rather than
       
   114 // dcbz which will cause an alignment exception.
       
   115 #ifndef NO_DCBA
       
   116 #if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
       
   117  // gcc and codewarrior and diab don't assemble dcba
       
   118 #define DCBK .long 0x7c033dec
       
   119 // 0x7c033dec is the hand-assembled encoding of: dcba r3,r7 (dcba DST,BK)
       
   120 #else
       
   121 #ifdef __ghs__
       
   122 .macro DCBK
       
   123 .long 0x7c033dec
       
   124 .endm
       
   125 #else
       
   126 #define DCBK dcba DST,BK
       
   127 #endif  // __ghs__
       
   128 #endif  // __GNUC__ or __MWERKS__ or _DIAB_TOOL
       
   129 #else
       
   130 #define DCBK nop
       
   131 #endif  // NO_DCBA
       
   132 
       
   133 	.text
       
   134 #ifdef __MWERKS__
       
   135 	.align	32
       
   136 #else
       
   137 	.align	5
       
   138 #endif
       
   139 
       
   140 #ifdef LIBMOTOVEC
       
   141 	.globl	memset     
       
   142 memset:
       
   143 #else
       
   144 	.globl	_vec_memset     
       
   145 _vec_memset:
       
   146 #endif
       
   147 // In: r3 = DST, r4 = FILL (only the low byte is used), r5 = BC.  r3 is never written.
       
   148 	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
       
   149 	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count
       
   150 	rlwinm.	Fsh,FILL,28,28,3 // IU1 Rotate fill byte right one nibble; cr0[eq] = (fill byte == 0)
       
   151 // NOTE(review): cmpi is a signed compare, so BC >= 0x80000000 takes the byte loop below.
       
   152 	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
       
   153 	addi	DR,DST,16	// IU1 Address of second dst vector
       
   154 	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
       
   155 	bgt	cr7,v_memset	// b if BC>MIN_VEC
       
   156 // Scalar path: 0 <= BC <= MIN_VEC, one byte store per iteration.
       
   157 	mtctr	BC		// CTR = BC: for (i=1;i<=BC;i++)
       
   158 	beqlr	cr1		// return if BC = 0
       
   159 Byte_set:
       
   160 	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
       
   161 	bdnz	Byte_set	// loop until CTR reaches zero
       
   162 
       
   163 	blr			// r3 still holds the entry DST
       
   164 
       
   165 v_memset:
       
   166 // Byte counts <= MIN_VEC bytes will have been set by scalar code above,
       
   167 // so this will not deal with small block sets <= MIN_VEC.
       
   168 
       
   169 // For systems using VRSAVE, define VRSAVE=1 when compiling.  For systems
       
   170 // that don't, make sure VRSAVE is undefined.
       
   171 #ifdef VRSAVE
       
   172 	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
       
   173 #endif
       
   174 	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27], i.e. (DST+16) rounded down to 16 bytes
       
   175 	addi	DBK,DBC,-1	// IU1 Address of last dst byte
       
   176 
       
   177 #ifdef VRSAVE
       
   178 	oris	Rt,RSV,0xe000	// IU1 Or in registers used by this routine (v0-v2)
       
   179 #endif
       
   180 	subf	D,DST,DR	// IU1 How many bytes in first destination? (1..16)
       
   181 	li	BK,0		// IU1 Initialize byte kount index
       
   182 
       
   183 #ifdef VRSAVE
       
   184 	mtspr	VRSV,Rt	// IU2 Save in VRSAVE before first vec op
       
   185 #endif
       
   186 	vxor	v0,v0,v0	// VIU Clear v0
       
   187 	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
       
   188 	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
       
   189 	beq+	enter_bzero	// b if FILL==0 (cr0 set by rlwinm. at entry)
       
   190 // Build the fill byte in v0 from its two nibbles without going through memory.
       
   191 	lvsl	v0,0,Fsh	// LSU Move upper nibble to byte 0 of VR
       
   192 	vspltisb	v1,4	// VPU Splat 0x4 to every byte
       
   193 
       
   194 	lvsl	v2,0,FILL	// LSU Move lower nibble to byte 0 of VR
       
   195 
       
   196 	vslb	v0,v0,v1	// VIU Move upper nibble to VR[0:3]
       
   197 
       
   198 	vor	v0,v0,v2	// VIU Form FILL byte in VR[0:7]
       
   199 
       
   200 	vspltb	v0,v0,0		// VPU Splat the fill byte to all bytes
       
   201 enter_bzero:
       
   202 	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7 (lt=8, gt=4, eq=2, so=1)
       
   203 	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining (byte span >> 4)
       
   204 	beq	cr1,Left_just	// b if D0 is left justified
       
   205 
       
   206 	bns	cr7,No_B_fwd	// b if only even number of bytes to store
       
   207 
       
   208 	stvebx	v0,DST,BK	// LSU store first byte at DST+0
       
   209 	addi	BK,BK,1		// IU1 increment index
       
   210 No_B_fwd:
       
   211 	bne	cr7,No_H_fwd	// b if only words to store
       
   212 
       
   213 	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
       
   214 	addi	BK,BK,2		// IU1 increment index
       
   215 No_H_fwd:
       
   216 	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store
       
   217 
       
   218 	stvewx	v0,DST,BK	// LSU store word 1 of one or three
       
   219 	addi	BK,BK,4		// IU1 increment index
       
   220 
       
   221 No_W1_fwd:
       
   222 	bnl	cr7,No_W2_fwd	// b if there was only one word to store
       
   223 	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
       
   224 	addi	BK,BK,4		// IU1 increment index
       
   225 
       
   226 	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
       
   227 	b	No_W2_fwd
       
   228 
       
   229 Left_just:	
       
   230 	stvx	v0,0,DST	// LSU Store 16 bytes at D0
       
   231 No_W2_fwd:
       
   232 	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
       
   233 	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
       
   234 
       
   235 	li	BK,16		// IU1 Re-initialize byte kount index
       
   236 	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
       
   237 	ble	cr6,Last_QW	// b if no Quad words to do
       
   238 
       
   239 	mtctr	QW		// IU2 CTR = QW, the count of full-vector stores still to issue
       
   240 	cmpi	cr6,0,QW,4	// IU1 Check QW>4
       
   241 
       
   242 QW_loop:
       
   243 	stvx	v0,DST,BK	// LSU Store 16 fill bytes
       
   244 	addi	BK,BK,16	// IU1 Increment byte kount index
       
   245 	bdnzf	25,QW_loop	// dec CTR; loop only while CTR!=0 and QW<=4 (cr6[gt] clear)
       
   246 
       
   247 	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
       
   248 	addi	QW,QW,-1	// IU1 One more QW stored by now
       
   249 	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left
       
   250 
       
   251 Last_QW:	// Next vector is the last; we're done.
       
   252 	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7 (lt=8, gt=4, eq=2, so=1)
       
   253 
       
   254 	beq	cr1,Rt_just_fwd	// b if last destination is right justified
       
   255 
       
   256 	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
       
   257 	li	BL,0		// IU1 Initialize index pointer
       
   258 	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store
       
   259 
       
   260 	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
       
   261 	addi	BL,BL,4		// IU1 increment index
       
   262 
       
   263 	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
       
   264 	addi	BL,BL,4		// IU1 increment index
       
   265 Only_1W_fwd:
       
   266 	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store
       
   267 
       
   268 	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
       
   269 	addi	BL,BL,4		// IU1 increment index
       
   270 Only_2W_fwd:
       
   271 	bne	cr7,Only_B_fwd	// b if there are no half words to store
       
   272 
       
   273 	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
       
   274 	addi	BL,BL,2		// IU1 increment index
       
   275 Only_B_fwd:
       
   276 	bns	cr7,All_done_fwd	// b if there are no bytes to store
       
   277 
       
   278 	stvebx	v0,DBK,BL	// LSU store one byte if necessary
       
   279 	b	All_done_fwd
       
   280 
       
   281 Rt_just_fwd:
       
   282 // Last byte sits at offset 0xF of its vector: one full 16-byte store finishes the job.
       
   283 	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
       
   284 All_done_fwd:
       
   285 #ifdef VRSAVE
       
   286 	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	
       
   287 #endif
       
   288 	blr			// Return destination address from entry
       
   289 
       
   290 #ifdef __MWERKS__
       
   291 	.align	16
       
   292 #else
       
   293 	.align	4
       
   294 #endif
       
   295 GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice
       
   296 // Entered with more than 4 QWs requested; DNX = DST+BK = address of the next store.
       
   297 	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
       
   298 	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
       
   299 	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
       
   300 
       
   301 	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
       
   302 	addi	BK,BK,16	// IU1 Increment byte count by 16
       
   303 	bdnzf	27,GT_4QW_fwd	// dec CTR; repeat while CTR!=0 and the store just done had addr bit 27 clear
       
   304 
       
   305 	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
       
   306 
       
   307 	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e, final store is even
       
   308 
       
   309 // We need the ctr register to reflect an even byte count before entering
       
   310 // the next block - faster to decrement than to reload.
       
   311 	bdnz	B32_fwd		// decrement counter for last QW store odd (both outcomes reach B32_fwd)
       
   312 
       
   313 B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
       
   314 	DCBK			// LSU then Kill instead of RWITM
       
   315 
       
   316 	stvx	v0,DST,BK	// LSU Store 16 bytes at D11
       
   317 	addi	BK,BK,16	// IU1 Increment byte count
       
   318 	bdz	Nxt_loc_fwd	// always decrement and branch to next instr		
       
   319 
       
   320 Nxt_loc_fwd:
       
   321 	stvx	v0,DST,BK	// LSU Store 16 bytes at D12
       
   322 	addi	BK,BK,16	// IU1 Increment byte count
       
   323 	bdnz	B32_fwd		// b if there are at least two more QWs to do
       
   324 
       
   325 	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
       
   326 	b	Last_QW		// b if last store is to even address
       
   327 
       
   328 // Come here with one more full store to do before the final vector
       
   329 One_even_QW:
       
   330 	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
       
   331 	addi	BK,BK,16	// IU1 Increment byte count
       
   332 
       
   333 	b	Last_QW
       
   334 
       
   335 // End of memset in AltiVec
       
   336 
       
   337 #define BCz r4		// in bzero r4 enters with byte count
       
   338 
       
   339 #ifdef __MWERKS__
       
   340 	.align	32
       
   341 #else
       
   342 	.align	5
       
   343 #endif
       
   344 
       
   345 #ifdef LIBMOTOVEC
       
   346 	.globl	bzero     
       
   347 bzero:
       
   348 #else
       
   349 	.globl	vec_bzero     
       
   350 vec_bzero:
       
   351 #endif
       
   352 // In: r3 = DST, r4 = byte count.  Rearranges the arguments and tail-branches to memset.
       
   353 	mr	BC,BCz		// IU1 second argument (r4) is the byte count here, not FILL
       
   354 	li	FILL,0		// IU1 for bzero FILL=0 (must follow the mr: FILL and BCz are both r4)
       
   355 #ifdef LIBMOTOVEC
       
   356 	b	memset     
       
   357 #else
       
   358 	b	_vec_memset     
       
   359 #endif
       
   360 
       
   361 // cacheable_memzero will employ dcbz to clear 32 bytes at a time
       
   362 // of cacheable memory. Like bzero, second entering argument will be BC.
       
   363 // Using this for non-cacheable memory will generate an alignment exception.
       
   364 
       
   365 	.text
       
   366 #ifdef __MWERKS__
       
   367 	.align	32
       
   368 #else
       
   369 	.align	5
       
   370 #endif
       
   371 
       
   372 #ifdef LIBMOTOVEC
       
   373 	.globl	cacheable_memzero     
       
   374 cacheable_memzero:
       
   375 #else
       
   376 	.globl	vec_cacheable_memzero     
       
   377 vec_cacheable_memzero:
       
   378 #endif
       
   379 // In: r3 = DST, r4 = byte count.  r3 is never written.
       
   380 	mr	BC,BCz		// IU1 second argument (r4) is the byte count here, not FILL
       
   381 	li	FILL,0		// IU1 for bzero FILL=0 (must follow the mr: FILL and BCz are both r4)
       
   382 	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
       
   383 
       
   384 	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count
       
   385 
       
   386 	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
       
   387 	addi	DR,DST,16	// IU1 Address of second dst vector
       
   388 	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
       
   389 	bgt	cr7,c_v_memset	// b if BC>MIN_VEC
       
   390 // Scalar path: 0 <= BC <= MIN_VEC, one zero byte stored per iteration.
       
   391 	mtctr	BC		// CTR = BC: for (i=1;i<=BC;i++)
       
   392 	beqlr	cr1		// return if BC = 0
       
   393 c_Byte_set:
       
   394 	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
       
   395 	bdnz	c_Byte_set	// loop until CTR reaches zero
       
   396 
       
   397 	blr			// r3 still holds the entry DST
       
   398 
       
   399 c_v_memset:
       
   400 // Byte counts <= MIN_VEC bytes will have been set by scalar code above,
       
   401 // so this will not deal with small block sets <= MIN_VEC.
       
   402 
       
   403 // For systems using VRSAVE, define VRSAVE=1 when compiling.  For systems
       
   404 // that don't, make sure VRSAVE is undefined.
       
   405 #ifdef VRSAVE
       
   406 	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
       
   407 #endif
       
   408 	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27], i.e. (DST+16) rounded down to 16 bytes
       
   409 	addi	DBK,DBC,-1	// IU1 Address of last dst byte
       
   410 
       
   411 #ifdef VRSAVE
       
   412 	oris	Rt,RSV,0x8000	// IU1 Or in registers used by this routine (v0 only)
       
   413 #endif
       
   414 	subf	D,DST,DR	// IU1 How many bytes in first destination? (1..16)
       
   415 	li	BK,0		// IU1 Initialize byte kount index
       
   416 
       
   417 #ifdef VRSAVE
       
   418 	mtspr	VRSV,Rt	// IU2 Save in VRSAVE before first vec op
       
   419 #endif
       
   420 	vxor	v0,v0,v0	// VIU Clear v0 (the zero fill pattern)
       
   421 	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)
       
   422 	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
       
   423 
       
   424 	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7 (lt=8, gt=4, eq=2, so=1)
       
   425 	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining (byte span >> 4)
       
   426 	beq	cr1,c_Left_just	// b if D0 is left justified
       
   427 
       
   428 	bns	cr7,c_No_B_fwd	// b if only even number of bytes to store
       
   429 
       
   430 	stvebx	v0,DST,BK	// LSU store first byte at DST+0
       
   431 	addi	BK,BK,1		// IU1 increment index
       
   432 c_No_B_fwd:
       
   433 	bne	cr7,c_No_H_fwd	// b if only words to store
       
   434 
       
   435 	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
       
   436 	addi	BK,BK,2		// IU1 increment index
       
   437 c_No_H_fwd:
       
   438 	bng	cr7,c_No_W1_fwd	// b if exactly zero or two words to store
       
   439 
       
   440 	stvewx	v0,DST,BK	// LSU store word 1 of one or three
       
   441 	addi	BK,BK,4		// IU1 increment index
       
   442 
       
   443 c_No_W1_fwd:
       
   444 	bnl	cr7,c_No_W2_fwd	// b if there was only one word to store
       
   445 	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
       
   446 	addi	BK,BK,4		// IU1 increment index
       
   447 
       
   448 	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
       
   449 	b	c_No_W2_fwd
       
   450 
       
   451 c_Left_just:	
       
   452 	stvx	v0,0,DST	// LSU Store 16 bytes at D0
       
   453 c_No_W2_fwd:
       
   454 	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
       
   455 	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?
       
   456 
       
   457 	li	BK,16		// IU1 Re-initialize byte kount index
       
   458 	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
       
   459 	ble	cr6,c_Last_QW	// b if no Quad words to do
       
   460 
       
   461 	mtctr	QW		// IU2 CTR = QW, the count of full-vector stores still to issue
       
   462 	cmpi	cr6,0,QW,4	// IU1 Check QW>4
       
   463 
       
   464 c_QW_loop:
       
   465 	stvx	v0,DST,BK	// LSU Store 16 fill bytes
       
   466 	addi	BK,BK,16	// IU1 Increment byte kount index
       
   467 	bdnzf	25,c_QW_loop	// dec CTR; loop only while CTR!=0 and QW<=4 (cr6[gt] clear)
       
   468 
       
   469 	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
       
   470 	addi	QW,QW,-1	// IU1 One more QW stored by now
       
   471 	bgt	cr6,c_GT_4QW_fwd	// b if >4 quad words left
       
   472 
       
   473 c_Last_QW:	// Next vector is the last; we're done.
       
   474 	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7 (lt=8, gt=4, eq=2, so=1)
       
   475 
       
   476 	beq	cr1,c_Rt_just_fwd	// b if last destination is right justified
       
   477 
       
   478 	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
       
   479 	li	BL,0		// IU1 Initialize index pointer
       
   480 	bnl	cr7,c_Only_1W_fwd	// b if there was only one or zero words to store
       
   481 
       
   482 	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
       
   483 	addi	BL,BL,4		// IU1 increment index
       
   484 
       
   485 	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
       
   486 	addi	BL,BL,4		// IU1 increment index
       
   487 c_Only_1W_fwd:
       
   488 	bng	cr7,c_Only_2W_fwd	// b if there were only two or zero words to store (stay in this routine)
       
   489 
       
   490 	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
       
   491 	addi	BL,BL,4		// IU1 increment index
       
   492 c_Only_2W_fwd:
       
   493 	bne	cr7,c_Only_B_fwd	// b if there are no half words to store
       
   494 
       
   495 	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
       
   496 	addi	BL,BL,2		// IU1 increment index
       
   497 c_Only_B_fwd:
       
   498 	bns	cr7,c_All_done_fwd	// b if there are no bytes to store
       
   499 
       
   500 	stvebx	v0,DBK,BL	// LSU store one byte if necessary
       
   501 	b	c_All_done_fwd
       
   502 
       
   503 c_Rt_just_fwd:
       
   504 // Last byte sits at offset 0xF of its vector: one full 16-byte store finishes the job.
       
   505 	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
       
   506 c_All_done_fwd:
       
   507 #ifdef VRSAVE
       
   508 	mtspr	VRSV,RSV	// IU1 Restore VRSAVE	
       
   509 #endif
       
   510 	blr			// Return destination address from entry
       
   511 
       
   512 #ifdef __MWERKS__
       
   513 	.align	16
       
   514 #else
       
   515 	.align	4
       
   516 #endif
       
   517 c_GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice
       
   518 // Entered with more than 4 QWs requested; DNX = DST+BK = address of the next store.
       
   519 	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
       
   520 	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
       
   521 	addi	DNX,DNX,16	// IU1 Update cr6 for next loop
       
   522 
       
   523 	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
       
   524 	addi	BK,BK,16	// IU1 Increment byte count by 16
       
   525 	bdnzf	27,c_GT_4QW_fwd	// dec CTR; repeat while CTR!=0 and the store just done had addr bit 27 clear
       
   526 
       
   527 	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
       
   528 
       
   529 	bns	cr6,c_B32_fwd	// b if DST[27] == 0; i.e, final store is even
       
   530 
       
   531 // We need the ctr register to reflect an even byte count before entering
       
   532 // the next block - faster to decrement than to reload.
       
   533 	bdnz	c_B32_fwd	// decrement counter for last QW store odd (both outcomes reach c_B32_fwd)
       
   534 
       
   535 c_B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
       
   536 	dcbz	DST,BK		// LSU zero whole cache line (32 bytes assumed, matching the BK step below)
       
   537 	bdz	c_Nxt_loc_fwd	// always decrement and branch to next instr		
       
   538 
       
   539 c_Nxt_loc_fwd:
       
   540 	addi	BK,BK,32	// IU1 Increment byte count by one cache line
       
   541 	bdnz	c_B32_fwd	// b if there are at least two more QWs to do (stay in the dcbz loop)
       
   542 
       
   543 	bso	cr6,c_One_even_QW	// b if there is one even and one odd QW to store
       
   544 	b	c_Last_QW		// b if last store is to even address
       
   545 
       
   546 // Come here with one more full store to do before the final vector
       
   547 c_One_even_QW:
       
   548 	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
       
   549 	addi	BK,BK,16	// IU1 Increment byte count
       
   550 
       
   551 	b	c_Last_QW
       
   552 
       
   553 // End of cacheable_memzero in AltiVec