--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/genericopenlibs/liboil/src/motovec/vec_memset.S Tue Aug 31 16:54:36 2010 +0300
@@ -0,0 +1,553 @@
+//------------------------------------------------------------------
+// file: vec_memset.S
+// AltiVec enabled version of memset and bzero and cacheable_memzero
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// Copyright Motorola, Inc. 2002
+// ALL RIGHTS RESERVED
+//
+// You are hereby granted a copyright license to use, modify, and
+// distribute the SOFTWARE so long as this entire notice is retained
+// without alteration in any modified and/or redistributed versions,
+// and that such modified versions are clearly identified as such.
+// No licenses are granted by implication, estoppel or otherwise under
+// any patents or trademarks of Motorola, Inc.
+//
+// The SOFTWARE is provided on an "AS IS" basis and without warranty.
+// To the maximum extent permitted by applicable law, MOTOROLA DISCLAIMS
+// ALL WARRANTIES WHETHER EXPRESS OR IMPLIED, INCLUDING IMPLIED
+// WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR
+// PURPOSE AND ANY WARRANTY AGAINST INFRINGEMENT WITH
+// REGARD TO THE SOFTWARE (INCLUDING ANY MODIFIED VERSIONS
+// THEREOF) AND ANY ACCOMPANYING WRITTEN MATERIALS.
+//
+// To the maximum extent permitted by applicable law, IN NO EVENT SHALL
+// MOTOROLA BE LIABLE FOR ANY DAMAGES WHATSOEVER
+// (INCLUDING WITHOUT LIMITATION, DAMAGES FOR LOSS OF
+// BUSINESS PROFITS, BUSINESS INTERRUPTION, LOSS OF BUSINESS
+// INFORMATION, OR OTHER PECUNIARY LOSS) ARISING OF THE USE OR
+// INABILITY TO USE THE SOFTWARE. Motorola assumes no responsibility
+// for the maintenance and support of the SOFTWARE.
+//------------------------------------------------------------------
+
+//------------------------------------------------------------------
+// extern void *memset( void *ptr, int val, size_t len );
+// Copies val into each of len characters beginning at ptr.
+// - Harbison&Steele 4th ed
+// (despite val being an int, this memset assumes it is never
+// more than a byte. That seems to be correct from all the
+// memset functions I've seen but I don't know if ANSI allows
+//	 anything longer. Chuck Corley 12/21/02)
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
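+
+//------------------------------------------------------------------
+// For reference, a minimal C-level sketch of the semantics implemented
+// below (illustrative only; not assembled or compiled as part of this
+// file).  ISO/ANSI C specifies that val is converted to unsigned char,
+// so only its low-order byte is stored:
+//
+//   void *memset( void *ptr, int val, size_t len )
+//   {
+//       unsigned char *p = (unsigned char *)ptr;
+//       unsigned char  b = (unsigned char)val;
+//       while ( len-- )
+//           *p++ = b;
+//       return ptr;
+//   }
+//------------------------------------------------------------------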
+
+//------------------------------------------------------------------
+// extern void * bzero( char *ptr, int len);
+// Copies 0 into each of len characters at ptr.
+// - Harbison&Steele 4th ed
+// Returns:
+// void * ptr
+//------------------------------------------------------------------
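+
+// For reference, a minimal C-level sketch (illustrative only); as
+// implemented below, bzero simply tail-branches into memset:
+//
+//   void *bzero( char *ptr, int len )
+//   {
+//       return memset( ptr, 0, (size_t)len );
+//   }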
+
+// Revision History:
+// Rev 0.0 Original Chuck Corley 02/09/03
+// Could benefit from changes added to memcpy
+// Rev 0.1 Revised per memcpy Rev 0.30 Chuck Corley 05/01/03
+//
+// This is beta quality code; users are encouraged to make it faster.
+// ASSUMPTIONS:
+// Code is highly likely to be in the cache; data is not (streaming data)
+// Zero fill could be quite likely.
+// Moving the fill byte from a GPR to a VR as below is faster than stw->lvebx via the stack
+
+#define VRSV 256 // VRSAVE spr
+// Don't use vectors for BC <= MIN_VEC. Works only if MIN_VEC >= 16 bytes.
+#define MIN_VEC 16
+
+// Register usage
+#define Rt r0 // r0 when used as a temporary register
+
+#define DST r3 // entering: dest pointer; exiting: same dest pointer
+
+#define FILL r4 // entering: fill char then fill word
+
+#define BC r5 // entering: Byte_Count then remaining Byte_Count
+
+#define DBC r6	// dst + byte count
+
+#define BK r7 // BC - 1 +/- (n*16)
+
+#define Fsh r8 // fill byte shifted right one nibble
+
+#define DM1 r9	// dst - 1 for byte-by-byte backwards initially
+#define D r9 // (dst+16)[0:27] - dst[28:31]
+#define DNX r9 // (dst+n*16)[28:31]
+#define BL r9 // second byte_kount index pointer
+
+#define DR r10 // (dst+16)[0:27]
+#define QW r10 // number of cache lines
+
+#define DBK r11 // (dst+byte_count-1) then (dst+byte_count-1)[28:31]
+
+#define RSV r12 // storage for VRSAVE register if used
+
+// Condition register use (not including temporary cr0)
+// cr0[2] = (FILL==0)?
+// cr1[0,2] = (BC == 0)? 1 : 0; (nothing to move)
+// then cr1[2] = (DST[28:31] == 0)? 1 : 0; (D0 left justified)
+// then cr1[2] = ((DBK = DST+BC-1)[28:31] == 0xF)? 1 : 0; (DN right justified)
+// cr6[2] = (QW == 0)? 1 : 0;
+// then cr6[1] = (QW > 4)? 1 : 0; (>4 vectors to move?)
+// then cr6[3] = (third store[27] == 1)? 1: 0; (cache line alignment)
+// then cr6[3] = (last store[27] == 1)? 1: 0; (last store odd?)
+// cr7[2] = (BC>MIN_VEC)?1:0; (BC big enough to warrant vectors)
+// then cr7[0:3] = (DST+16)[0:27]-DST (How many bytes (iff <16) in first vector?)
+// then cr7[0:3] = (DST+BC)[0:27] (How many bytes (iff <16) in last vector?)
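+
+// Rough, simplified C-level outline of the vector path that follows
+// (illustrative only; the helper names are hypothetical, byte counts
+// <= MIN_VEC never reach this path, and the assembly handles the
+// head/tail bookkeeping with the condition-register bits listed above):
+//
+//   void *memset_outline( void *ptr, int val, size_t len )
+//   {
+//       unsigned char *p   = (unsigned char *)ptr;
+//       unsigned char *end = p + len;
+//       unsigned char *first_qw = (unsigned char *)(((unsigned long)p + 16) & ~15UL);
+//       unsigned char *last_qw  = (unsigned char *)((unsigned long)(end - 1) & ~15UL);
+//       splat_fill_byte( (unsigned char)val );        /* build fill vector in v0     */
+//       store_head( p, first_qw );                    /* 1..16 bytes up to alignment */
+//       for ( p = first_qw; p < last_qw; p += 16 )    /* full quad words             */
+//           store_vector_16( p );                     /* stvx; dcba per cache line   */
+//       store_tail( last_qw, end );                   /* 1..16 bytes ending at end-1 */
+//       return ptr;
+//   }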
+
+// Conditionalize the use of dcba. It will help if the data is
+// not in cache and hurt if it is. Generally, except for small
+// benchmarks repeated many times, we assume data is not in cache
+// (data streaming) and using dcba is a performance boost.
+// We use dcba, which will no-op to non-cacheable memory, rather than
+// dcbz, which would cause an alignment exception.
+#ifndef NO_DCBA
+#if defined(__GNUC__) || defined(__MWERKS__) || defined(_DIAB_TOOL)
+ // gcc and codewarrior and diab don't assemble dcba
+#define DCBK .long 0x7c033dec
+// dcba r3,r7 or dcba DST,BK
+#else
+#ifdef __ghs__
+.macro DCBK
+.long 0x7c033dec
+.endm
+#else
+#define DCBK dcba DST,BK
+#endif // __ghs__
+#endif // __GNUC__ or __MWERKS__
+#else
+#define DCBK nop
+#endif // NO_DCBA
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl memset
+memset:
+#else
+ .globl _vec_memset
+_vec_memset:
+#endif
+
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+ rlwinm. Fsh,FILL,28,28,3 // IU1 Is fill byte zero? and shift
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz Byte_set
+
+ blr
+
+v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0xe000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+ beq+ enter_bzero // b if FILL==0
+
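+// Build the fill byte in v0 without a trip through memory: lvsl with a
+// nibble value as its "address" deposits that nibble in byte 0 of a VR,
+// so the upper nibble (via Fsh) and the lower nibble (via FILL) are
+// picked up separately, recombined with a shift-left-4 and OR, and then
+// splatted to all 16 bytes.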
+ lvsl v0,0,Fsh // LSU Move upper nibble to byte 0 of VR
+ vspltisb v1,4 // VPU Splat 0x4 to every byte
+
+ lvsl v2,0,FILL // LSU Move lower nibble to byte 0 of VR
+
+ vslb v0,v0,v1 // VIU Move upper nibble to VR[0:3]
+
+ vor v0,v0,v2 // VIU Form FILL byte in VR[0:7]
+
+ vspltb v0,v0,0 // VPU Splat the fill byte to all bytes
+enter_bzero:
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,Left_just // b if D0 is left justified
+
+ bns cr7,No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+No_B_fwd:
+ bne cr7,No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+No_H_fwd:
+ bng cr7,No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+No_W1_fwd:
+ bnl cr7,No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b No_W2_fwd
+
+Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,GT_4QW_fwd // b if >4 quad words left
+
+Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+Only_1W_fwd:
+ bng cr7,Only_2W_fwd // b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+Only_2W_fwd:
+ bne cr7,Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+Only_B_fwd:
+ bns cr7,All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b All_done_fwd
+
+Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
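+// The block below aligns the store stream to 32-byte cache lines:
+// GT_4QW_fwd stores one quad word at a time until the next store is
+// cache-line aligned, then B32_fwd allocates each line with DCBK (dcba)
+// and fills it with two quad-word stores, avoiding read-with-intent-
+// to-modify traffic for lines that will be written in their entirety.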
+GT_4QW_fwd:	// Do once if the next store is to the odd half of a cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+	bns	cr6,B32_fwd	// b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+ bdnz B32_fwd // decrement counter for last QW store odd
+
+B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ DCBK // LSU then Kill instead of RWITM
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D11
+ addi BK,BK,16 // IU1 Increment byte count
+ bdz Nxt_loc_fwd // always decrement and branch to next instr
+
+Nxt_loc_fwd:
+ stvx v0,DST,BK // LSU Store 16 bytes at D12
+ addi BK,BK,16 // IU1 Increment byte count
+ bdnz B32_fwd // b if there are at least two more QWs to do
+
+ bso cr6,One_even_QW // b if there is one even and one odd QW to store
+ b Last_QW // b if last store is to even address
+
+// Come here with two more stores to do
+One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b Last_QW
+
+// End of memset in AltiVec
+
+#define BCz r4 // in bzero r4 enters with byte count
+
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl bzero
+bzero:
+#else
+ .globl vec_bzero
+vec_bzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+#ifdef LIBMOTOVEC
+ b memset
+#else
+ b _vec_memset
+#endif
+
+// cacheable_memzero employs dcbz to clear 32 bytes at a time of
+// cacheable memory. Like bzero, the second argument on entry is the
+// byte count. Using this on non-cacheable memory will generate an
+// alignment exception.
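+//
+// A minimal C-level sketch of the intent (illustrative only):
+//
+//   void *cacheable_memzero( void *ptr, int len )
+//   {
+//       /* same effect as memset(ptr, 0, len), except the cache-line
+//          aligned middle of the range is cleared with dcbz, one
+//          32-byte line at a time, so ptr must address cacheable
+//          memory */
+//       return memset( ptr, 0, (size_t)len );
+//   }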
+
+ .text
+#ifdef __MWERKS__
+ .align 32
+#else
+ .align 5
+#endif
+
+#ifdef LIBMOTOVEC
+ .globl cacheable_memzero
+cacheable_memzero:
+#else
+ .globl vec_cacheable_memzero
+vec_cacheable_memzero:
+#endif
+
+ mr BC,BCz // IU1 arg[2] is BC here, not FILL
+ li FILL,0 // IU1 for bzero FILL=0
+ cmpi cr7,0,BC,MIN_VEC // IU1 Check for minimum byte count
+
+ cmpi cr1,0,BC,0 // IU1 Eliminate zero byte count
+
+ addi DM1,DST,-1 // IU1 Pre-bias and duplicate destination
+ addi DR,DST,16 // IU1 Address of second dst vector
+ add DBC,DST,BC // IU1 Address of last dst byte + 1
+ bgt cr7,c_v_memset // b if BC>MIN_VEC
+
+ mtctr BC // for (i=1;i<=BC;i++)
+ beqlr cr1 // return if BC = 0
+c_Byte_set:
+ stbu FILL,1(DM1) // LSU * ++(DST-1) = FILL
+ bdnz c_Byte_set
+
+ blr
+
+c_v_memset:
+// Byte count < MIN_VEC bytes will have been set by scalar code above,
+// so this will not deal with small block sets < MIN_VEC.
+
+// For systems using VRSAVE, define VRSAVE when compiling. For systems
+// that don't, make sure VRSAVE is undefined.
+#ifdef VRSAVE
+ mfspr RSV,VRSV // IU2 Get current VRSAVE contents
+#endif
+ rlwinm DR,DR,0,0,27 // IU1 (DST+16)[0:27]
+ addi DBK,DBC,-1 // IU1 Address of last dst byte
+
+#ifdef VRSAVE
+ oris Rt,RSV,0x8000 // IU1 Or in registers used by this routine
+#endif
+ subf D,DST,DR // IU1 How many bytes in first destination?
+ li BK,0 // IU1 Initialize byte kount index
+
+#ifdef VRSAVE
+ mtspr VRSV,Rt // IU2 Save in VRSAVE before first vec op
+#endif
+ vxor v0,v0,v0 // VIU Clear v0
+ subf QW,DR,DBK // IU1 Bytes of full vectors to move (-16)
+ cmpi cr1,0,D,16 // IU1 Is D0 left justified?
+
+ mtcrf 0x01,D // IU2 Put bytes in 1st dst in cr7
+ rlwinm QW,QW,28,4,31 // IU1 Quad words remaining
+ beq cr1,c_Left_just // b if D0 is left justified
+
+ bns cr7,c_No_B_fwd // b if only even number of bytes to store
+
+ stvebx v0,DST,BK // LSU store first byte at DST+0
+ addi BK,BK,1 // IU1 increment index
+c_No_B_fwd:
+ bne cr7,c_No_H_fwd // b if only words to store
+
+ stvehx v0,DST,BK // LSU store halfword at DST+0/1
+ addi BK,BK,2 // IU1 increment index
+c_No_H_fwd:
+ bng cr7,c_No_W1_fwd // b if exactly zero or two words to store
+
+ stvewx v0,DST,BK // LSU store word 1 of one or three
+ addi BK,BK,4 // IU1 increment index
+
+c_No_W1_fwd:
+ bnl cr7,c_No_W2_fwd // b if there was only one word to store
+ stvewx v0,DST,BK // LSU store word 1 of two or 2 of three
+ addi BK,BK,4 // IU1 increment index
+
+ stvewx v0,DST,BK // LSU store word 2 of two or 3 of three
+ b c_No_W2_fwd
+
+c_Left_just:
+ stvx v0,0,DST // LSU Store 16 bytes at D0
+c_No_W2_fwd:
+ rlwinm Rt,DBK,0,28,31 // IU1 (DBK = DST+BC-1)[28:31]
+ cmpi cr6,0,QW,0 // IU1 Any full vectors to move?
+
+ li BK,16 // IU1 Re-initialize byte kount index
+ cmpi cr1,0,Rt,0xF // IU1 Is DN right justified?
+ ble cr6,c_Last_QW // b if no Quad words to do
+
+ mtctr QW // IU2 for (i=0;i<=QW;i++)
+ cmpi cr6,0,QW,4 // IU1 Check QW>4
+
+c_QW_loop:
+ stvx v0,DST,BK // LSU Store 16 fill bytes
+ addi BK,BK,16 // IU1 Increment byte kount index
+ bdnzf 25,c_QW_loop // b if 4 or less quad words to do
+
+ add DNX,DST,BK // IU1 address of next store (DST+32 if QW>4)
+ addi QW,QW,-1 // IU1 One more QW stored by now
+ bgt cr6,c_GT_4QW_fwd // b if >4 quad words left
+
+c_Last_QW: // Next vector is the last; we're done.
+ mtcrf 0x01,DBC // IU2 Put final vector byte count in cr7
+
+ beq cr1,c_Rt_just_fwd // b if last destination is right justified
+
+ rlwinm DBK,DBK,0,0,27 // IU1 Round to QW addr of last byte
+ li BL,0 // IU1 Initialize index pointer
+ bnl cr7,c_Only_1W_fwd // b if there was only one or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 1 of two or three
+ addi BL,BL,4 // IU1 increment index
+
+ stvewx v0,DBK,BL // LSU store word 2 of two or three
+ addi BL,BL,4 // IU1 increment index
+c_Only_1W_fwd:
+	bng	cr7,c_Only_2W_fwd	// b if there were only two or zero words to store
+
+ stvewx v0,DBK,BL // LSU store word 3 of three if necessary
+ addi BL,BL,4 // IU1 increment index
+c_Only_2W_fwd:
+ bne cr7,c_Only_B_fwd // b if there are no half words to store
+
+ stvehx v0,DBK,BL // LSU store one halfword if necessary
+ addi BL,BL,2 // IU1 increment index
+c_Only_B_fwd:
+ bns cr7,c_All_done_fwd // b if there are no bytes to store
+
+ stvebx v0,DBK,BL // LSU store one byte if necessary
+ b c_All_done_fwd
+
+c_Rt_just_fwd:
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D14
+c_All_done_fwd:
+#ifdef VRSAVE
+ mtspr VRSV,RSV // IU1 Restore VRSAVE
+#endif
+ blr // Return destination address from entry
+
+#ifdef __MWERKS__
+ .align 16
+#else
+ .align 4
+#endif
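+// Same cache-line alignment scheme as GT_4QW_fwd above, except each
+// aligned 32-byte line is cleared with a single dcbz in c_B32_fwd
+// rather than dcba plus vector stores.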
+c_GT_4QW_fwd:	// Do once if the next store is to the odd half of a cache line, else twice
+
+ addi QW,QW,-1 // IU1 Keeping track of QWs stored
+ mtcrf 0x02,DNX // IU2 cr6[3]=((DST+32)[27]==1)?1:0;
+ addi DNX,DNX,16 // IU1 Update cr6 for next loop
+
+ stvx v0,DST,BK // LSU Store 16 bytes at D2
+ addi BK,BK,16 // IU1 Increment byte count by 16
+ bdnzf 27,c_GT_4QW_fwd // b if next store is to lower (even) half of CL
+
+ mtcrf 0x02,DBK // IU2 cr6[3]=((last store)[27]==1)?1:0; (odd?)
+
+	bns	cr6,c_B32_fwd	// b if DST[27] == 0; i.e., final store is even
+
+// We need the ctr register to reflect an even byte count before entering
+// the next block - faster to decrement than to reload.
+	bdnz	c_B32_fwd	// decrement counter for last QW store odd
+
+c_B32_fwd: // Should be at least 2 stores remaining and next 2 are cache aligned
+ dcbz DST,BK // LSU zero whole cache line
+ bdz c_Nxt_loc_fwd // always decrement and branch to next instr
+
+c_Nxt_loc_fwd:
+ addi BK,BK,32 // IU1 Increment byte count
+	bdnz	c_B32_fwd	// b if there are at least two more QWs to do
+
+ bso cr6,c_One_even_QW // b if there is one even and one odd QW to store
+ b c_Last_QW // b if last store is to even address
+
+// Come here with two more stores to do
+c_One_even_QW:
+ stvx v0,DST,BK // LSU Store 16 bytes at D13
+ addi BK,BK,16 // IU1 Increment byte count
+
+ b c_Last_QW
+
+// End of cacheable_memzero in AltiVec