//------------------------------------------------------------------
// file: vec_memset.S
//    AltiVec enabled version of memset and bzero and cacheable_memzero
//------------------------------------------------------------------

//------------------------------------------------------------------
// Copyright Motorola, Inc. 2002
// ALL RIGHTS RESERVED
//
// You are hereby granted a copyright license to use, modify, and
// distribute the SOFTWARE so long as this entire notice is retained
// without alteration in any modified and/or redistributed versions,
// and that such modified versions are clearly identified as such.
// No licenses are granted by implication, estoppel or otherwise under
// any patents or trademarks of Motorola, Inc.
//
// The SOFTWARE is provided on an "AS IS" basis and without warranty.
// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
// for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void *memset( void *ptr, int val, size_t len );
//   Copies val into each of len characters beginning at ptr.
//   - Harbison&Steele 4th ed
//   (despite val being an int, this memset assumes it is never
//    more than a byte. That seems to be correct from all the
//    memset functions I've seen but I don't know if ANSI allows
//    anything longer. Chuck Corley 12/21/02)
// Returns:
//   void * ptr
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void * bzero( char *ptr, int len);
//   Copies 0 into each of len characters at ptr.
//   - Harbison&Steele 4th ed
// Returns:
//   void * ptr
//------------------------------------------------------------------

// Revision History:
//   Rev 0.0  Original                         Chuck Corley  02/09/03
//            Could benefit from changes added to memcpy
//   Rev 0.1  Revised per memcpy Rev 0.30      Chuck Corley  05/01/03
//
// This is beta quality code; users are encouraged to make it faster.
// ASSUMPTIONS:
//   Code is highly likely to be in the cache; data is not (streaming data)
//   Zero fill could be quite likely.
//   Moving fill byte from GPR to VR as below faster than stw->lvebx via stack
|
#define VRSV 256	// VRSAVE spr number
// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
#define MIN_VEC 16

// Register usage
#define Rt	r0	// r0 when used as a temporary register

#define DST	r3	// entering: dest pointer; exiting: same dest pointer

#define FILL	r4	// entering: fill char then fill word

#define BC	r5	// entering: Byte_Count then remaining Byte_Count

#define DBC	r6	// dst + byte count

#define BK	r7	// BC - 1 +/- (n*16)

#define Fsh	r8	// fill byte shifted right one nibble

// NOTE: r9 is reused under four names at non-overlapping phases of the
// algorithm — only one meaning is live at a time.
#define DM1	r9	// dst - 1 for byte-by-byte backwards initially
#define D	r9	// (dst+16)[0:27] - dst[28:31]
#define DNX	r9	// (dst+n*16)[28:31]
#define BL	r9	// second byte_kount index pointer

// NOTE: r10 likewise serves two sequential roles.
#define DR	r10	// (dst+16)[0:27]
#define QW	r10	// number of cache lines

#define DBK	r11	// (dst+byte_count-1) then (dst+byte_count-1)[28:31]

#define RSV	r12	// storage for VRSAVE register if used

// Condition register use (not including temporary cr0)
// cr0[2] = (FILL==0)?
// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
//    then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
//    then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
// cr6[2] = (QW == 0)? 1 : 0;
//    then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
//    then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
//    then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
//    then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
//    then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)

// Conditionalize the use of dcba. It will help if the data is
// not in cache and hurt if it is. Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming) and using dcba is a performance boost.
// We use dcba which will noop to non-cacheable memory rather than
// dcbz which will cause an alignment exception.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc and codewarrior and diab don't assemble dcba;
// emit the opcode directly: 0x7c033dec == dcba r3,r7 (i.e. dcba DST,BK)
#define DCBK .long 0x7c033dec
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
#else
#define DCBK dcba DST,BK
#endif // __ghs__
#endif // __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif // NO_DCBA
|
132 |
|
.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	memset
memset:
#else
	.globl	_vec_memset
_vec_memset:
#endif

// Entry: DST=r3 (returned unchanged), FILL=r4 (only the low byte is used),
// BC=r5. The rlwinm. below records (fill nibble == 0) in cr0[EQ], which is
// consumed much later at "beq+ enter_bzero".
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	cmpi	cr1,0,BC,0		// IU1 Eliminate zero byte count
	rlwinm.	Fsh,FILL,28,28,3	// IU1 Is fill byte zero? and shift

	addi	DM1,DST,-1		// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16		// IU1 Address of second dst vector
	add	DBC,DST,BC		// IU1 Address of last dst byte + 1
	bgt	cr7,v_memset		// b if BC>MIN_VEC

// Scalar path: small counts are filled byte-by-byte.
	mtctr	BC			// for (i=1;i<=BC;i++)
	beqlr	cr1			// return if BC = 0
Byte_set:
	stbu	FILL,1(DM1)		// LSU * ++(DST-1) = FILL
	bdnz	Byte_set

	blr

v_memset:
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE when compiling. For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV		// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27		// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1		// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0xe000		// IU1 Or in registers used by this routine (v0-v2)
#endif
	subf	D,DST,DR		// IU1 How many bytes in first destination?
	li	BK,0			// IU1 Initialize byte kount index

#ifdef VRSAVE
	mtspr	VRSV,Rt			// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0		// VIU Clear v0
	subf	QW,DR,DBK		// IU1 Bytes of full vectors to move (-16)
	cmpi	cr1,0,D,16		// IU1 Is D0 left justified?
	beq+	enter_bzero		// b if FILL==0 (cr0 from entry); v0 is already zero

// Build the splatted fill byte in v0 without a trip through memory:
// lvsl with the nibble value as the "address" deposits that value in VR byte 0.
	lvsl	v0,0,Fsh		// LSU Move upper nibble to byte 0 of VR
	vspltisb	v1,4		// VPU Splat 0x4 to every byte

	lvsl	v2,0,FILL		// LSU Move lower nibble to byte 0 of VR

	vslb	v0,v0,v1		// VIU Move upper nibble to VR[0:3]

	vor	v0,v0,v2		// VIU Form FILL byte in VR[0:7]

	vspltb	v0,v0,0			// VPU Splat the fill byte to all bytes
enter_bzero:
	mtcrf	0x01,D			// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31		// IU1 Quad words remaining
	beq	cr1,Left_just		// b if D0 is left justified

// Unaligned head: store byte/halfword/words until 16-byte aligned.
	bns	cr7,No_B_fwd		// b if only even number of bytes to store

	stvebx	v0,DST,BK		// LSU store first byte at DST+0
	addi	BK,BK,1			// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd		// b if only words to store

	stvehx	v0,DST,BK		// LSU store halfword at DST+0/1
	addi	BK,BK,2			// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd		// b if exactly zero or two words to store

	stvewx	v0,DST,BK		// LSU store word 1 of one or three
	addi	BK,BK,4			// IU1 increment index

No_W1_fwd:
	bnl	cr7,No_W2_fwd		// b if there was only one word to store
	stvewx	v0,DST,BK		// LSU store word 1 of two or 2 of three
	addi	BK,BK,4			// IU1 increment index

	stvewx	v0,DST,BK		// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:
	stvx	v0,0,DST		// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31		// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0		// IU1 Any full vectors to move?

	li	BK,16			// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF		// IU1 Is DN right justified?
	ble	cr6,Last_QW		// b if no Quad words to do

	mtctr	QW			// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4		// IU1 Check QW>4

QW_loop:
	stvx	v0,DST,BK		// LSU Store 16 fill bytes
	addi	BK,BK,16		// IU1 Increment byte kount index
	bdnzf	25,QW_loop		// b if 4 or less quad words to do

	add	DNX,DST,BK		// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1		// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd		// b if >4 quad words left

Last_QW:	// Next vector is the last; we're done.
	mtcrf	0x01,DBC		// IU2 Put final vector byte count in cr7

	beq	cr1,Rt_just_fwd		// b if last destination is right justified

// Unaligned tail: store words/halfword/byte of the final partial vector.
	rlwinm	DBK,DBK,0,0,27		// IU1 Round to QW addr of last byte
	li	BL,0			// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd		// b if there was only one or zero words to store

	stvewx	v0,DBK,BL		// LSU store word 1 of two or three
	addi	BL,BL,4			// IU1 increment index

	stvewx	v0,DBK,BL		// LSU store word 2 of two or three
	addi	BL,BL,4			// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd		// b if there were only two or zero words to store

	stvewx	v0,DBK,BL		// LSU store word 3 of three if necessary
	addi	BL,BL,4			// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd		// b if there are no half words to store

	stvehx	v0,DBK,BL		// LSU store one halfword if necessary
	addi	BL,BL,2			// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL		// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:

	stvx	v0,DST,BK		// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV		// IU1 Restore VRSAVE
#endif
	blr				// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice

	addi	QW,QW,-1		// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX		// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16		// IU1 Update cr6 for next loop

	stvx	v0,DST,BK		// LSU Store 16 bytes at D2
	addi	BK,BK,16		// IU1 Increment byte count by 16
	bdnzf	27,GT_4QW_fwd		// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK		// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,B32_fwd		// b if DST[27] == 0; i.e, final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
	bdnz	B32_fwd			// decrement counter for last QW store odd

B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	DCBK				// LSU then Kill instead of RWITM

	stvx	v0,DST,BK		// LSU Store 16 bytes at D11
	addi	BK,BK,16		// IU1 Increment byte count
	bdz	Nxt_loc_fwd		// always decrement and branch to next instr

Nxt_loc_fwd:
	stvx	v0,DST,BK		// LSU Store 16 bytes at D12
	addi	BK,BK,16		// IU1 Increment byte count
	bdnz	B32_fwd			// b if there are at least two more QWs to do

	bso	cr6,One_even_QW		// b if there is one even and one odd QW to store
	b	Last_QW			// b if last store is to even address

// Come here with two more loads and two stores to do
One_even_QW:
	stvx	v0,DST,BK		// LSU Store 16 bytes at D13
	addi	BK,BK,16		// IU1 Increment byte count

	b	Last_QW

// End of memset in AltiVec
|
336 |
|
#define BCz r4	// in bzero r4 enters with byte count

#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	bzero
bzero:
#else
	.globl	vec_bzero
vec_bzero:
#endif

// bzero(ptr, len): tail-call memset(ptr, 0, len).
// NOTE: BCz and FILL are both r4, so the mr MUST precede the li;
// swapping these two instructions would destroy the byte count.
	mr	BC,BCz			// IU1 arg[2] is BC here, not FILL
	li	FILL,0			// IU1 for bzero FILL=0
#ifdef LIBMOTOVEC
	b	memset
#else
	b	_vec_memset
#endif
|
360 |
|
// cacheable_memzero will employ dcbz to clear 32 bytes at a time
// of cacheable memory. Like bzero, second entering argument will be BC.
// Using this for non-cacheable memory will generate an alignment exception.

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	cacheable_memzero
cacheable_memzero:
#else
	.globl	vec_cacheable_memzero
vec_cacheable_memzero:
#endif

// Entry: DST=r3 (returned unchanged), BCz=r4 = byte count.
// NOTE: BCz and FILL are both r4, so the mr must precede the li.
	mr	BC,BCz			// IU1 arg[2] is BC here, not FILL
	li	FILL,0			// IU1 for bzero FILL=0
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count

	cmpi	cr1,0,BC,0		// IU1 Eliminate zero byte count

	addi	DM1,DST,-1		// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16		// IU1 Address of second dst vector
	add	DBC,DST,BC		// IU1 Address of last dst byte + 1
	bgt	cr7,c_v_memset		// b if BC>MIN_VEC

// Scalar path: small counts are zeroed byte-by-byte.
	mtctr	BC			// for (i=1;i<=BC;i++)
	beqlr	cr1			// return if BC = 0
c_Byte_set:
	stbu	FILL,1(DM1)		// LSU * ++(DST-1) = FILL
	bdnz	c_Byte_set

	blr

c_v_memset:
// Byte count < MIN_VEC bytes will have been set by scalar code above,
// so this will not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE when compiling. For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV		// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27		// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1		// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0x8000		// IU1 Or in registers used by this routine (v0 only)
#endif
	subf	D,DST,DR		// IU1 How many bytes in first destination?
	li	BK,0			// IU1 Initialize byte kount index

#ifdef VRSAVE
	mtspr	VRSV,Rt			// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0		// VIU Clear v0 (the zero fill vector)
	subf	QW,DR,DBK		// IU1 Bytes of full vectors to move (-16)
	cmpi	cr1,0,D,16		// IU1 Is D0 left justified?

	mtcrf	0x01,D			// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31		// IU1 Quad words remaining
	beq	cr1,c_Left_just		// b if D0 is left justified

// Unaligned head: store byte/halfword/words until 16-byte aligned.
	bns	cr7,c_No_B_fwd		// b if only even number of bytes to store

	stvebx	v0,DST,BK		// LSU store first byte at DST+0
	addi	BK,BK,1			// IU1 increment index
c_No_B_fwd:
	bne	cr7,c_No_H_fwd		// b if only words to store

	stvehx	v0,DST,BK		// LSU store halfword at DST+0/1
	addi	BK,BK,2			// IU1 increment index
c_No_H_fwd:
	bng	cr7,c_No_W1_fwd		// b if exactly zero or two words to store

	stvewx	v0,DST,BK		// LSU store word 1 of one or three
	addi	BK,BK,4			// IU1 increment index

c_No_W1_fwd:
	bnl	cr7,c_No_W2_fwd		// b if there was only one word to store
	stvewx	v0,DST,BK		// LSU store word 1 of two or 2 of three
	addi	BK,BK,4			// IU1 increment index

	stvewx	v0,DST,BK		// LSU store word 2 of two or 3 of three
	b	c_No_W2_fwd

c_Left_just:
	stvx	v0,0,DST		// LSU Store 16 bytes at D0
c_No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31		// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0		// IU1 Any full vectors to move?

	li	BK,16			// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF		// IU1 Is DN right justified?
	ble	cr6,c_Last_QW		// b if no Quad words to do

	mtctr	QW			// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4		// IU1 Check QW>4

c_QW_loop:
	stvx	v0,DST,BK		// LSU Store 16 fill bytes
	addi	BK,BK,16		// IU1 Increment byte kount index
	bdnzf	25,c_QW_loop		// b if 4 or less quad words to do

	add	DNX,DST,BK		// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1		// IU1 One more QW stored by now
	bgt	cr6,c_GT_4QW_fwd	// b if >4 quad words left

c_Last_QW:	// Next vector is the last; we're done.
	mtcrf	0x01,DBC		// IU2 Put final vector byte count in cr7

	beq	cr1,c_Rt_just_fwd	// b if last destination is right justified

// Unaligned tail: store words/halfword/byte of the final partial vector.
	rlwinm	DBK,DBK,0,0,27		// IU1 Round to QW addr of last byte
	li	BL,0			// IU1 Initialize index pointer
	bnl	cr7,c_Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL		// LSU store word 1 of two or three
	addi	BL,BL,4			// IU1 increment index

	stvewx	v0,DBK,BL		// LSU store word 2 of two or three
	addi	BL,BL,4			// IU1 increment index
c_Only_1W_fwd:
	bng	cr7,c_Only_2W_fwd	// b if there were only two or zero words to store
					// (fix: was "Only_2W_fwd" — memset's label)

	stvewx	v0,DBK,BL		// LSU store word 3 of three if necessary
	addi	BL,BL,4			// IU1 increment index
c_Only_2W_fwd:
	bne	cr7,c_Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL		// LSU store one halfword if necessary
	addi	BL,BL,2			// IU1 increment index
c_Only_B_fwd:
	bns	cr7,c_All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL		// LSU store one byte if necessary
	b	c_All_done_fwd

c_Rt_just_fwd:

	stvx	v0,DST,BK		// LSU Store 16 bytes at D14
c_All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV		// IU1 Restore VRSAVE
#endif
	blr				// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
c_GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice

	addi	QW,QW,-1		// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX		// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16		// IU1 Update cr6 for next loop

	stvx	v0,DST,BK		// LSU Store 16 bytes at D2
	addi	BK,BK,16		// IU1 Increment byte count by 16
	bdnzf	27,c_GT_4QW_fwd		// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK		// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,c_B32_fwd		// b if DST[27] == 0; i.e, final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.
	bdnz	c_B32_fwd		// decrement counter for last QW store odd
					// (fix: was "B32_fwd", which branched into
					// memset's dcba/stvx loop and bypassed dcbz)

c_B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	dcbz	DST,BK			// LSU zero whole 32-byte cache line
	bdz	c_Nxt_loc_fwd		// always decrement and branch to next instr

c_Nxt_loc_fwd:
	addi	BK,BK,32		// IU1 Increment byte count (ctr decremented
					// twice per line: one bdz + one bdnz per 32B)
	bdnz	c_B32_fwd		// b if there are at least two more QWs to do
					// (fix: was "B32_fwd" — memset's loop)

	bso	cr6,c_One_even_QW	// b if there is one even and one odd QW to store
	b	c_Last_QW		// b if last store is to even address

// Come here with two more loads and two stores to do
c_One_even_QW:
	stvx	v0,DST,BK		// LSU Store 16 bytes at D13
	addi	BK,BK,16		// IU1 Increment byte count

	b	c_Last_QW

// End of cacheable_memzero in AltiVec
|