diff -r a1e347446159 -r 28ccaba883f4 genericopenlibs/liboil/src/motovec/vec_memset.S
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genericopenlibs/liboil/src/motovec/vec_memset.S Wed Oct 13 19:39:18 2010 +0530
@@ -0,0 +1,553 @@
+//------------------------------------------------------------------
+// file: vec_memset.S
+// AltiVec enabled version of memset and bzero and cacheable_memzero
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2002
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void *memset( void *ptr, int val, size_t len );
+// Copies val into each of len characters beginning at ptr.
+// - Harbison&Steele 4th ed
+// (despite val being an int, this memset assumes it is never
+// more than a byte. That seems to be correct from all the
+// memset functions I've seen but I don't know if ANSI allows
+// anything longer. Chuck Corley 12/21/02)
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void * bzero( char *ptr, int len);
+// Copies 0 into each of len characters at ptr.
+// - Harbison&Steele 4th ed
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 02/09/03
+// Could benefit from changes added to memcpy
+// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
+//
+// This is beta quality code; users are encouraged to make it faster.
+// ASSUMPTIONS:
+// Code is highly likely to be in the cache; data is not (streaming data)
+// Zero fill could be quite likely.
+// Moving the fill byte from GPR to VR as below is faster than stw->lvebx via stack
+
+#define VRSV 256 // VRSAVE spr
+// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
+#define MIN_VEC 16
+
+// Register usage
+#define Rt r0 // r0 when used as a temporary register
+
+#define DST r3 // entering: dest pointer; exiting: same dest pointer
+
+#define FILL r4 // entering: fill char then fill word
+
+#define BC r5 // entering: Byte_Count then remaining Byte_Count
+
+#define DBC r6 // dst + byte count
+
+#define BK r7 // BC - 1 +/- (n*16)
+
+#define Fsh r8 // fill byte shifted right one nibble
+
+#define DM1 r9 // dst -1 for byte-by-byte backwards initially
+#define D r9 // (dst+16)[0:27] - dst[28:31]
+#define DNX r9 // (dst+n*16)[28:31]
+#define BL r9 // second byte_kount index pointer
+
+#define DR r10 // (dst+16)[0:27]
+#define QW r10 // number of cache lines
+
+#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
+
+#define RSV r12 // storage for VRSAVE register if used
+
+// Condition register use (not including temporary cr0)
+// cr0[2] = (FILL==0)?
+// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
+// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
+// then cr1[2] = ((DBK = DST+BC-1)[28:31] = 0xF)? 1 : 0; (DN right justified)
+// cr6[2] = (QW == 0)? 1 : 0;
+// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
+// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
+// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
+// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
+// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
+// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
+
+// Conditionalize the use of dcba. It will help if the data is
+// not in cache and hurt if it is. Generally, except for small
+// benchmarks repeated many times, we assume data is not in cache
+// (data streaming) and using dcba is a performance boost.
+// We use dcba which will noop to non-cacheable memory rather than
+// dcbz which will cause an alignment exception.
+#ifndef NO_DCBA
+#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
+ // gcc and codewarrior and diab don't assemble dcba
+#define DCBK .long 0x7c033dec
+// dcba r3,r7 or dcba DST,BK
+#else
+#ifdef __ghs__
+.macro DCBK
+.long 0x7c033dec
+.endm
+#else
+#define DCBK dcba DST,BK
+#endif // __ghs__
+#endif // __GNUC__ or __MWERKS__
+#else
+#define DCBK nop
+#endif // NO_DCBA
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl memset
+memset:
+#else
+ .globl _vec_memset
+_vec_memset:
+#endif
+
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+ rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz Byte_set
+
+ blr
+
+v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+ beq+ enter_bzero // b if FILL==0
+
+ lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR
+ vspltisb v1,4 // VPU Splat 0x4 to every byte
+
+ lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR
+
+ vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3]
+
+ vor v0,v0,v2 // VIU Form FILL byte in VR[0:7]
+
+ vspltb v0,v0,0 // VPU Splat the fill byte to all bytes
+enter_bzero:
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,Left_just // b if D0 is left justified
+
+ bns cr7,No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+No_B_fwd:
+ bne cr7,No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+No_H_fwd:
+ bng cr7,No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+No_W1_fwd:
+ bnl cr7,No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b No_W2_fwd
+
+Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,GT_4QW_fwd // b if >4 quad words left
+
+Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+Only_1W_fwd:
+ bng cr7,Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+Only_2W_fwd:
+ bne cr7,Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+Only_B_fwd:
+ bns cr7,All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b All_done_fwd
+
+Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+ bns cr6,B32_fwd // b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+ bdnz B32_fwd // decrement counter for last QW store odd
+
+B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D11
+ addi BK,BK,16 // IU1 Increment byte count
+ bdz Nxt_loc_fwd // always decrement and branch to next instr
+
+Nxt_loc_fwd:
+ stvx v0,DST,BK // LSU Store 16 bytes at D12
+ addi BK,BK,16 // IU1 Increment byte count
+ bdnz B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,One_even_QW // b if there is one even and one odd QW to store
+ b Last_QW // b if last store is to even address
+
+// Come here with two more stores to do
+One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b Last_QW
+
+// End of memset in AltiVec
+
+#define BCz r4 // in bzero r4 enters with byte count
+
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl bzero
+bzero:
+#else
+ .globl vec_bzero
+vec_bzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+#ifdef LIBMOTOVEC
+ b memset
+#else
+ b _vec_memset
+#endif
+
+// cacheable_memzero will employ dcbz to clear 32 bytes at a time
+// of cacheable memory. Like bzero, second entering argument will be BC.
+// Using this for non-cacheable memory will generate an alignment exception.
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl cacheable_memzero
+cacheable_memzero:
+#else
+ .globl vec_cacheable_memzero
+vec_cacheable_memzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,c_v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+c_Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz c_Byte_set
+
+ blr
+
+c_v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,c_Left_just // b if D0 is left justified
+
+ bns cr7,c_No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+c_No_B_fwd:
+ bne cr7,c_No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+c_No_H_fwd:
+ bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+c_No_W1_fwd:
+ bnl cr7,c_No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b c_No_W2_fwd
+
+c_Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+c_No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,c_Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+c_QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,c_QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
+
+c_Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,c_Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+c_Only_1W_fwd:
+ bng cr7,c_Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+c_Only_2W_fwd:
+ bne cr7,c_Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+c_Only_B_fwd:
+ bns cr7,c_All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b c_All_done_fwd
+
+c_Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+c_All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
+c_GT_4QW_fwd: // Do once if nxt st is to odd half of cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+ bns cr6,c_B32_fwd // b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+ bdnz c_B32_fwd // decrement counter for last QW store odd
+
+c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ dcbz DST,BK // LSU zero whole cache line
+ bdz c_Nxt_loc_fwd // always decrement and branch to next instr
+
+c_Nxt_loc_fwd:
+ addi BK,BK,32 // IU1 Increment byte count
+ bdnz c_B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
+ b c_Last_QW // b if last store is to even address
+
+// Come here with two more stores to do
+c_One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b c_Last_QW
+
+// End of cacheable_memzero in AltiVec
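
For reference, the memset path in the patch above works in three stages once the byte count exceeds MIN_VEC: a scalar stbu loop for small counts, element stores (stvebx/stvehx/stvewx) for the bytes up to the first 16-byte boundary, aligned stvx stores for the body (with dcba per 32-byte cache line where enabled), and element stores again for the trailing remainder. The C sketch below models that head/body/tail flow only; it is illustrative, not part of the patch, and the function and variable names (model_memset, head) are placeholders rather than identifiers from the source.

#include <stddef.h>
#include <stdint.h>

#define MIN_VEC 16

/* Rough scalar model of the flow used by the AltiVec memset above. */
void *model_memset(void *ptr, int val, size_t len)
{
    unsigned char *dst = (unsigned char *)ptr;
    unsigned char fill = (unsigned char)val; /* only the low byte of val is used */
    size_t i = 0;

    if (len <= MIN_VEC) {                    /* small blocks: byte-at-a-time loop */
        for (; i < len; i++)
            dst[i] = fill;
        return ptr;
    }

    /* Head: 0..15 bytes up to the first 16-byte boundary
       (done with byte/halfword/word element stores in the assembly). */
    size_t head = (size_t)(-(uintptr_t)dst & 15);
    for (; i < head; i++)
        dst[i] = fill;

    /* Body: whole aligned 16-byte blocks (stvx, plus dcba per 32-byte line). */
    for (; i + 16 <= len; i += 16)
        for (size_t j = 0; j < 16; j++)
            dst[i + j] = fill;

    /* Tail: remaining 0..15 bytes, again element stores in the assembly. */
    for (; i < len; i++)
        dst[i] = fill;

    return ptr;
}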