| author | andy simpson <andrews@symbian.org> |
| date | Wed, 16 Jun 2010 08:14:03 +0100 |
| branch | GCC_SURGE |
| changeset 36 | 6a60b9d459b4 |
| parent 18 | 47c74d1534e1 |
| permissions | -rw-r--r-- |
//------------------------------------------------------------------
// file: vec_memset.S
//    AltiVec enabled version of memset and bzero and cacheable_memzero
//------------------------------------------------------------------

//------------------------------------------------------------------
//	Copyright Motorola, Inc. 2002
//	ALL RIGHTS RESERVED
//
//	You are hereby granted a copyright license to use, modify, and
//	distribute the SOFTWARE so long as this entire notice is retained
//	without alteration in any modified and/or redistributed versions,
//	and that such modified versions are clearly identified as such.
//	No licenses are granted by implication, estoppel or otherwise under
//	any patents or trademarks of Motorola, Inc.
//
//	The SOFTWARE is provided on an "AS IS" basis and without warranty.
//	To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
//	ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
//	WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
//	PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
//	REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
//	THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
//
//	To the maximum extent permitted by applicable law, IN NO EVENT SHALL
//	MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
//	(INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
//	BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
//	INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
//	INABILITY TO USE THE SOFTWARE.  Motorola assumes no responsibility
//	for the maintenance and support of the SOFTWARE.
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void *memset( void *ptr, int val, size_t len );
//   Copies val into each of len characters beginning at ptr.
//   - Harbison & Steele, 4th ed.
//   (Despite val being an int, this memset assumes it is never
//   more than a byte. That is consistent with all the memset
//   implementations I've seen, but I don't know if ANSI allows
//   anything longer. Chuck Corley 12/21/02)
// Returns:
//   void * ptr
//------------------------------------------------------------------

//------------------------------------------------------------------
// extern void *bzero( char *ptr, int len );
//   Copies 0 into each of len characters at ptr.
//   - Harbison & Steele, 4th ed.
// Returns:
//   void * ptr
//------------------------------------------------------------------

// Revision History:
//   Rev 0.0  Original                     Chuck Corley  02/09/03
//            Could benefit from changes added to memcpy
//   Rev 0.1  Revised per memcpy Rev 0.30  Chuck Corley  05/01/03
//
// This is beta-quality code; users are encouraged to make it faster.

// ASSUMPTIONS:
//   Code is highly likely to be in the cache; data is not (streaming data).
//   Zero fill is likely to be common.
//   Moving the fill byte from a GPR to a VR as below is faster than
//   going through the stack via stw -> lvebx.

#define VRSV 256	// VRSAVE spr

// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
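// Illustrative C sketch of the overall control flow (names are for
// exposition only, not a public API; the vector path is summarized in
// the trailing comment):
//
//	void *sketch_memset(void *ptr, int val, size_t len)
//	{
//	    unsigned char *p = ptr;
//	    if (len <= 16) {                   // MIN_VEC: scalar Byte_set loop
//	        while (len--)
//	            *p++ = (unsigned char)val;
//	        return ptr;
//	    }
//	    // vector path (v_memset): partial byte/halfword/word stores up
//	    // to the next 16-byte boundary, one stvx per full quad word
//	    // (with dcba a cache line at a time when profitable), then
//	    // partial stores for the last, possibly short, vector.
//	    return ptr;
//	}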
#define MIN_VEC 16

// Register usage
#define Rt   r0		// r0 when used as a temporary register
#define DST  r3		// entering: dest pointer; exiting: same dest pointer
#define FILL r4		// entering: fill char then fill word
#define BC   r5		// entering: Byte_Count then remaining Byte_Count
#define DBC  r6		// dst + byte count
#define BK   r7		// BC - 1 +/- (n*16)
#define Fsh  r8		// fill byte shifted right one nibble
#define DM1  r9		// dst - 1 for byte-by-byte backwards initially
#define D    r9		// (dst+16)[0:27] - dst[28:31]
#define DNX  r9		// (dst+n*16)[28:31]
#define BL   r9		// second byte_kount index pointer
#define DR   r10	// (dst+16)[0:27]
#define QW   r10	// number of cache lines
#define DBK  r11	// (dst+byte_count-1) then (dst+byte_count-1)[28:31]
#define RSV  r12	// storage for VRSAVE register if used

// Condition register use (not including temporary cr0):
// cr0[2]   = (FILL==0)?
// cr1[0,2] = (BC == 0)? 1 : 0;  (nothing to move)
//    then cr1[2] = (DST[28:31] == 0)? 1 : 0;  (D0 left justified)
//    then cr1[2] = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0;  (DN right justified)
// cr6[2]   = (QW == 0)? 1 : 0;
//    then cr6[1] = (QW > 4)? 1 : 0;  (>4 vectors to move?)
//    then cr6[3] = (third store[27] == 1)? 1 : 0;  (cache line alignment)
//    then cr6[3] = (last store[27] == 1)? 1 : 0;  (last store odd?)
// cr7[2]   = (BC > MIN_VEC)? 1 : 0;  (BC big enough to warrant vectors)
//    then cr7[0:3] = (DST+16)[0:27] - DST  (how many bytes (iff <16) in first vector?)
//    then cr7[0:3] = (DST+BC)[0:27]        (how many bytes (iff <16) in last vector?)

// Conditionalize the use of dcba. It will help if the data is
// not in cache and hurt if it is. Generally, except for small
// benchmarks repeated many times, we assume data is not in cache
// (data streaming), so using dcba is a performance boost.
// We use dcba, which no-ops to non-cacheable memory, rather than
// dcbz, which would cause an alignment exception.
#ifndef NO_DCBA
#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
// gcc, CodeWarrior, and Diab don't assemble dcba
#define DCBK .long 0x7c033dec	// dcba r3,r7  i.e. dcba DST,BK
#else
#ifdef __ghs__
.macro DCBK
.long 0x7c033dec
.endm
#else
#define DCBK dcba DST,BK
#endif	// __ghs__
#endif	// __GNUC__ or __MWERKS__
#else
#define DCBK nop
#endif	// NO_DCBA

	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	memset
memset:
#else
	.globl	_vec_memset
_vec_memset:
#endif
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	cmpi	cr1,0,BC,0		// IU1 Eliminate zero byte count
	rlwinm.	Fsh,FILL,28,28,3	// IU1 Is fill byte zero? and shift

	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16	// IU1 Address of second dst vector
	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
	bgt	cr7,v_memset	// b if BC>MIN_VEC

	mtctr	BC		// for (i=1;i<=BC;i++)
	beqlr	cr1		// return if BC = 0
Byte_set:
	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
	bdnz	Byte_set

	blr

v_memset:
// Byte counts < MIN_VEC bytes will have been set by the scalar code above,
// so this path does not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0xe000	// IU1 Or in registers used by this routine
#endif
	subf	D,DST,DR	// IU1 How many bytes in first destination?
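// Worked example of the head-alignment math (illustrative values, not
// from the original source): DST = 0x1003 gives DR = (DST+16) & ~0xF =
// 0x1010, so D = DR - DST = 13. mtcrf 0x01,D (below) moves D[28:31] =
// 0b1101 into cr7, and the bit tests that follow store 1 byte (1-bit
// set), skip the halfword (2-bit clear), then store one word plus two
// more words (4- and 8-bits set): 1 + 4 + 8 = 13 bytes, leaving
// DST+BK 16-byte aligned for the stvx loop.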
	li	BK,0		// IU1 Initialize byte kount index
#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0	// VIU Clear v0
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)

	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?
	beq+	enter_bzero	// b if FILL==0

	lvsl	v0,0,Fsh	// LSU Move upper nibble to byte 0 of VR
	vspltisb v1,4		// VPU Splat 0x4 to every byte

	lvsl	v2,0,FILL	// LSU Move lower nibble to byte 0 of VR

	vslb	v0,v0,v1	// VIU Move upper nibble to VR[0:3]

	vor	v0,v0,v2	// VIU Form FILL byte in VR[0:7]

	vspltb	v0,v0,0		// VPU Splat the fill byte to all bytes
enter_bzero:
	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining

	beq	cr1,Left_just	// b if D0 is left justified

	bns	cr7,No_B_fwd	// b if only even number of bytes to store

	stvebx	v0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
No_B_fwd:
	bne	cr7,No_H_fwd	// b if only words to store

	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
No_H_fwd:
	bng	cr7,No_W1_fwd	// b if exactly zero or two words to store

	stvewx	v0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index
No_W1_fwd:
	bnl	cr7,No_W2_fwd	// b if there was only one word to store

	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
	b	No_W2_fwd

Left_just:
	stvx	v0,0,DST	// LSU Store 16 bytes at D0
No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	ble	cr6,Last_QW	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
QW_loop:
	stvx	v0,DST,BK	// LSU Store 16 fill bytes
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,QW_loop	// b if 4 or fewer quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,GT_4QW_fwd	// b if >4 quad words left

Last_QW:	// Next vector is the last; we're done.
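// The tail mirrors the head: mtcrf 0x01,DBC below moves
// (DST+BC)[28:31] into cr7, and the same word/halfword/byte bit tests
// finish the final, possibly partial, quad word. When the last byte
// ends exactly on a 16-byte boundary (the cr1 test against 0xF above),
// a single aligned stvx at Rt_just_fwd suffices.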
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7

	beq	cr1,Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	BL,0		// IU1 Initialize index pointer
	bnl	cr7,Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
	addi	BL,BL,4		// IU1 increment index

	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
	addi	BL,BL,4		// IU1 increment index
Only_1W_fwd:
	bng	cr7,Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
	addi	BL,BL,4		// IU1 increment index
Only_2W_fwd:
	bne	cr7,Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
	addi	BL,BL,2		// IU1 increment index
Only_B_fwd:
	bns	cr7,All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL	// LSU store one byte if necessary
	b	All_done_fwd

Rt_just_fwd:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice

	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop

	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,GT_4QW_fwd	// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.

	bdnz	B32_fwd		// decrement counter for last QW store odd

B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	DCBK			// LSU then Kill instead of RWITM
	stvx	v0,DST,BK	// LSU Store 16 bytes at D11
	addi	BK,BK,16	// IU1 Increment byte count
	bdz	Nxt_loc_fwd	// always decrement and branch to next instr

Nxt_loc_fwd:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D12
	addi	BK,BK,16	// IU1 Increment byte count
	bdnz	B32_fwd		// b if there are at least two more QWs to do

	bso	cr6,One_even_QW	// b if there is one even and one odd QW to store
	b	Last_QW		// b if last store is to even address

// Come here with two more stores to do
One_even_QW:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	Last_QW

// End of memset in AltiVec

#define BCz r4		// in bzero, r4 enters with the byte count

#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	bzero
bzero:
#else
	.globl	vec_bzero
vec_bzero:
#endif
	mr	BC,BCz		// IU1 arg[2] is BC here, not FILL
	li	FILL,0		// IU1 for bzero FILL=0
#ifdef LIBMOTOVEC
	b	memset
#else
	b	_vec_memset
#endif

// cacheable_memzero employs dcbz to clear 32 bytes of cacheable memory
// at a time. Like bzero, the second entering argument is BC.
// Using this on non-cacheable memory will generate an alignment exception.
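// A minimal usage sketch (hypothetical caller; the prototype is
// inferred from bzero's and is not declared in this file):
//
//	extern void *vec_cacheable_memzero(void *ptr, int len);
//
//	char buf[4096];				/* must be in cacheable memory */
//	vec_cacheable_memzero(buf, sizeof buf);	/* dcbz clears 32 B per line */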
	.text
#ifdef __MWERKS__
	.align	32
#else
	.align	5
#endif

#ifdef LIBMOTOVEC
	.globl	cacheable_memzero
cacheable_memzero:
#else
	.globl	vec_cacheable_memzero
vec_cacheable_memzero:
#endif
	mr	BC,BCz		// IU1 arg[2] is BC here, not FILL
	li	FILL,0		// IU1 for bzero FILL=0
	cmpi	cr7,0,BC,MIN_VEC	// IU1 Check for minimum byte count
	cmpi	cr1,0,BC,0	// IU1 Eliminate zero byte count

	addi	DM1,DST,-1	// IU1 Pre-bias and duplicate destination
	addi	DR,DST,16	// IU1 Address of second dst vector
	add	DBC,DST,BC	// IU1 Address of last dst byte + 1
	bgt	cr7,c_v_memset	// b if BC>MIN_VEC

	mtctr	BC		// for (i=1;i<=BC;i++)
	beqlr	cr1		// return if BC = 0
c_Byte_set:
	stbu	FILL,1(DM1)	// LSU * ++(DST-1) = FILL
	bdnz	c_Byte_set

	blr

c_v_memset:
// Byte counts < MIN_VEC bytes will have been set by the scalar code above,
// so this path does not deal with small block sets < MIN_VEC.

// For systems using VRSAVE, define VRSAVE=1 when compiling. For systems
// that don't, make sure VRSAVE is undefined.
#ifdef VRSAVE
	mfspr	RSV,VRSV	// IU2 Get current VRSAVE contents
#endif
	rlwinm	DR,DR,0,0,27	// IU1 (DST+16)[0:27]
	addi	DBK,DBC,-1	// IU1 Address of last dst byte

#ifdef VRSAVE
	oris	Rt,RSV,0x8000	// IU1 Or in registers used by this routine
#endif
	subf	D,DST,DR	// IU1 How many bytes in first destination?

	li	BK,0		// IU1 Initialize byte kount index
#ifdef VRSAVE
	mtspr	VRSV,Rt		// IU2 Save in VRSAVE before first vec op
#endif
	vxor	v0,v0,v0	// VIU Clear v0
	subf	QW,DR,DBK	// IU1 Bytes of full vectors to move (-16)

	cmpi	cr1,0,D,16	// IU1 Is D0 left justified?

	mtcrf	0x01,D		// IU2 Put bytes in 1st dst in cr7
	rlwinm	QW,QW,28,4,31	// IU1 Quad words remaining

	beq	cr1,c_Left_just	// b if D0 is left justified

	bns	cr7,c_No_B_fwd	// b if only even number of bytes to store

	stvebx	v0,DST,BK	// LSU store first byte at DST+0
	addi	BK,BK,1		// IU1 increment index
c_No_B_fwd:
	bne	cr7,c_No_H_fwd	// b if only words to store

	stvehx	v0,DST,BK	// LSU store halfword at DST+0/1
	addi	BK,BK,2		// IU1 increment index
c_No_H_fwd:
	bng	cr7,c_No_W1_fwd	// b if exactly zero or two words to store

	stvewx	v0,DST,BK	// LSU store word 1 of one or three
	addi	BK,BK,4		// IU1 increment index
c_No_W1_fwd:
	bnl	cr7,c_No_W2_fwd	// b if there was only one word to store

	stvewx	v0,DST,BK	// LSU store word 1 of two or 2 of three
	addi	BK,BK,4		// IU1 increment index

	stvewx	v0,DST,BK	// LSU store word 2 of two or 3 of three
	b	c_No_W2_fwd

c_Left_just:
	stvx	v0,0,DST	// LSU Store 16 bytes at D0
c_No_W2_fwd:
	rlwinm	Rt,DBK,0,28,31	// IU1 (DBK = DST+BC-1)[28:31]
	cmpi	cr6,0,QW,0	// IU1 Any full vectors to move?

	li	BK,16		// IU1 Re-initialize byte kount index
	cmpi	cr1,0,Rt,0xF	// IU1 Is DN right justified?
	ble	cr6,c_Last_QW	// b if no Quad words to do

	mtctr	QW		// IU2 for (i=0;i<=QW;i++)
	cmpi	cr6,0,QW,4	// IU1 Check QW>4
c_QW_loop:
	stvx	v0,DST,BK	// LSU Store 16 fill bytes
	addi	BK,BK,16	// IU1 Increment byte kount index
	bdnzf	25,c_QW_loop	// b if 4 or fewer quad words to do

	add	DNX,DST,BK	// IU1 address of next store (DST+32 if QW>4)
	addi	QW,QW,-1	// IU1 One more QW stored by now
	bgt	cr6,c_GT_4QW_fwd	// b if >4 quad words left

c_Last_QW:	// Next vector is the last; we're done.
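// From here the tail is the same word/halfword/byte cleanup as
// memset's Last_QW sequence, using the c_-prefixed labels below.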
	mtcrf	0x01,DBC	// IU2 Put final vector byte count in cr7

	beq	cr1,c_Rt_just_fwd	// b if last destination is right justified

	rlwinm	DBK,DBK,0,0,27	// IU1 Round to QW addr of last byte
	li	BL,0		// IU1 Initialize index pointer
	bnl	cr7,c_Only_1W_fwd	// b if there was only one or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 1 of two or three
	addi	BL,BL,4		// IU1 increment index

	stvewx	v0,DBK,BL	// LSU store word 2 of two or three
	addi	BL,BL,4		// IU1 increment index
c_Only_1W_fwd:
	bng	cr7,c_Only_2W_fwd	// b if there were only two or zero words to store

	stvewx	v0,DBK,BL	// LSU store word 3 of three if necessary
	addi	BL,BL,4		// IU1 increment index
c_Only_2W_fwd:
	bne	cr7,c_Only_B_fwd	// b if there are no half words to store

	stvehx	v0,DBK,BL	// LSU store one halfword if necessary
	addi	BL,BL,2		// IU1 increment index
c_Only_B_fwd:
	bns	cr7,c_All_done_fwd	// b if there are no bytes to store

	stvebx	v0,DBK,BL	// LSU store one byte if necessary
	b	c_All_done_fwd

c_Rt_just_fwd:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D14
c_All_done_fwd:
#ifdef VRSAVE
	mtspr	VRSV,RSV	// IU1 Restore VRSAVE
#endif
	blr			// Return destination address from entry

#ifdef __MWERKS__
	.align	16
#else
	.align	4
#endif
c_GT_4QW_fwd:	// Do once if nxt st is to odd half of cache line, else twice

	addi	QW,QW,-1	// IU1 Keeping track of QWs stored
	mtcrf	0x02,DNX	// IU2 cr6[3]=((DST+32)[27]==1)?1:0;
	addi	DNX,DNX,16	// IU1 Update cr6 for next loop

	stvx	v0,DST,BK	// LSU Store 16 bytes at D2
	addi	BK,BK,16	// IU1 Increment byte count by 16
	bdnzf	27,c_GT_4QW_fwd	// b if next store is to lower (even) half of CL

	mtcrf	0x02,DBK	// IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)

	bns	cr6,c_B32_fwd	// b if DST[27] == 0; i.e., final store is even

// We need the ctr register to reflect an even byte count before entering
// the next block - faster to decrement than to reload.

	bdnz	c_B32_fwd	// decrement counter for last QW store odd

c_B32_fwd:	// Should be at least 2 stores remaining and next 2 are cache aligned
	dcbz	DST,BK		// LSU zero whole cache line
	bdz	c_Nxt_loc_fwd	// always decrement and branch to next instr

c_Nxt_loc_fwd:
	addi	BK,BK,32	// IU1 Increment byte count
	bdnz	c_B32_fwd	// b if there are at least two more QWs to do

	bso	cr6,c_One_even_QW	// b if there is one even and one odd QW to store
	b	c_Last_QW	// b if last store is to even address

// Come here with two more stores to do
c_One_even_QW:
	stvx	v0,DST,BK	// LSU Store 16 bytes at D13
	addi	BK,BK,16	// IU1 Increment byte count

	b	c_Last_QW

// End of cacheable_memzero in AltiVec
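// A minimal C smoke test for the non-LIBMOTOVEC entry points
// (hypothetical harness, not part of this file):
//
//	#include <stddef.h>
//	extern void *_vec_memset(void *ptr, int val, size_t len);
//	extern void *vec_bzero(char *ptr, int len);
//
//	int main(void)
//	{
//	    char buf[100];
//	    _vec_memset(buf, 0xAB, sizeof buf);	/* every byte -> 0xAB */
//	    vec_bzero(buf, sizeof buf);		/* every byte -> 0x00 */
//	    return 0;
//	}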