kernel/eka/common/arm/cmem.cia
changeset 0 a41df078684a
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/kernel/eka/common/arm/cmem.cia	Mon Oct 19 15:55:17 2009 +0100
@@ -0,0 +1,1086 @@
+// Copyright (c) 1995-2009 Nokia Corporation and/or its subsidiary(-ies).
+// All rights reserved.
+// This component and the accompanying materials are made available
+// under the terms of the License "Eclipse Public License v1.0"
+// which accompanies this distribution, and is available
+// at the URL "http://www.eclipse.org/legal/epl-v10.html".
+//
+// Initial Contributors:
+// Nokia Corporation - initial contribution.
+//
+// Contributors:
+//
+// Description:
+// e32\common\arm\cmem.cia
+// 
+//
+
+#include "../common.h"
+#include <e32cia.h>
+#if defined(__REPLACE_GENERIC_UTILS)
+#include "replacement_utils.h"
+#endif
+
+#if defined(__MEM_MACHINE_CODED__)
+
+#ifndef USE_REPLACEMENT_MEMSET
+
+#if defined(_DEBUG)
+
+#ifdef __STANDALONE_NANOKERNEL__
+
+#define ARM_ASSERT_MULTIPLE_OF_FOUR(rt1, panicfunc)	\
+	asm("tst "#rt1", #3"); \
+	asm("ldrne "#rt1", ["#rt1"]")
+
+#else	// __STANDALONE_NANOKERNEL__
+GLDEF_C void PanicEWordMoveLengthNotMultipleOf4();
+GLDEF_C void PanicEWordMoveSourceNotAligned();
+GLDEF_C void PanicEWordMoveTargetNotAligned();
+
+#define ARM_ASSERT_MULTIPLE_OF_FOUR(rt1, panicfunc)	\
+	asm("tst "#rt1", #3"); \
+	asm("bne " panicfunc )
+
+#endif	// __STANDALONE_NANOKERNEL__
+
+#else	// _DEBUG
+
+#define ARM_ASSERT_MULTIPLE_OF_FOUR(rt1, panicfunc)
+
+#endif	//_DEBUG
+
+
+// See header file e32cmn.h for the in-source documentation.
+extern "C" EXPORT_C __NAKED__ TAny* memclr(TAny* /*aTrg*/, unsigned int /*aLength*/)
+	{
+	KMEMCLRHOOK
+	asm("mov r2, #0 ");
+	asm("b fill ");
+	}
+
+// See header file e32cmn.h for the in-source documentation.
+extern "C" EXPORT_C __NAKED__ TAny* memset(TAny* /*aTrg*/, TInt /*aValue*/, unsigned int /*aLength*/)
+    {
+    KMEMSETHOOK
+    asm("   mov		 r3, r2 ");				/* length into r3 */
+    asm("   and      r2,r1,#255");			/* fill value into r2 */
+	asm("	mov		 r1, r3 ");				/* length into r1 */
+
+    asm("fill:");
+    asm("   cmp      r1,#8");
+	asm("   bls      small_fill");			// only taken ~20% of the time
+
+    asm("   stmfd    sp!,{r0,r4-r9,lr}");
+	asm("   movs     r3, r0, lsl #30 ");	// Check if word aligned
+	asm("   orr      r2,r2,r2,lsl #8");
+    asm("   orr      r2,r2,r2,lsl #16");
+	asm("   bne		 unaligned_fill ");
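+	// r2 now holds the fill byte replicated across all four byte lanes
+	// (e.g. 0x5A -> 0x5A5A5A5A), i.e. w = b | (b << 8); w |= w << 16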
+		
+	// Align destination address to 32 byte boundary if possible
+	
+	asm("word_aligned_fill: ");
+    asm("   mov      r4,r2");
+    asm("   mov      r5,r2");
+    asm("   mov      r6,r2");
+	asm("   movs     r3, r0, lsl #27 ");
+    asm("   beq      aligned_fill ");
+    asm("   rsb      r3, r3, #0 ");				// calculate fill length necessary for alignment
+	asm("   cmp      r1, r3, lsr #27 ");		// compare with remaining length
+	asm("   blo		 smaller_fill ");			// skip alignment if remaining length is smaller
+	asm("   msr      cpsr_f, r3 ");				// put length bits 4, 3, 2 into N, Z, C flags
+    asm("   strcs    r2, [r0], #4 ");			// align to 8 byte boundary
+    asm("   stmeqia  r0!, {r2, r4} ");			// align to 16 byte boundary
+    asm("   stmmiia  r0!, {r2, r4-r6} ");		// align to 32 byte boundary
+	asm("   sub      r1, r1, r3, lsr #27 ");	// adjust remaining length
+
+    asm("aligned_fill:");
+	asm("   cmp		 r1, #64 ");
+	asm("   bhs		 big_fill ");
+
+	// Fill 0-63 bytes
+	
+    asm("smaller_fill:");
+    asm("   movs     r1, r1, lsl #26");
+	asm("	beq		 mem_fill_end ");
+    asm("   msr      cpsr_flg, r1 ");
+    asm("   stmmiia  r0!,{r2,r4-r6}");	// Fill 32
+    asm("   stmmiia  r0!,{r2,r4-r6}");
+    asm("   stmeqia  r0!,{r2,r4-r6}");	// Fill 16
+    asm("   stmcsia  r0!,{r2,r4}");		// Fill 8
+    asm("   strvs    r2,[r0],#4");		// Fill 4
+	asm("   movs	 r1, r1, lsl #4 ");
+	asm("	bne		 smallest_fill ");
+	asm("mem_fill_end: ");
+	__POPRET("r0,r4-r9,");
+
+	// Fill last 1-3 bytes
+	
+    asm("smallest_fill: ");
+    asm("   msr      cpsr_flg,r1");
+    asm("   strmih   r2,[r0],#2");  	// Fill 2
+    asm("   streqb   r2,[r0],#1");  	// Fill 1
+	__POPRET("r0,r4-r9,");
+
+	// Fill loop for length >= 64
+	
+	asm("big_fill: ");
+	asm("   mov      r3,r2");
+    asm("   mov      r7,r2");
+    asm("   mov      r8,r2");
+    asm("   mov      r9,r2");
+    asm("   movs     ip,r1,lsr #8");	// Number of 256 byte blocks to fill
+	asm("   beq		 medium_fill ");
+    asm("fill_256_bytes_loop:");
+    asm("   stmia    r0!,{r2-r9}");		// Fill 256 bytes
+    asm("   stmia    r0!,{r2-r9}");
+    asm("   stmia    r0!,{r2-r9}");
+    asm("   stmia    r0!,{r2-r9}");
+    asm("   stmia    r0!,{r2-r9}");
+    asm("   stmia    r0!,{r2-r9}");
+    asm("   stmia    r0!,{r2-r9}");
+    asm("   stmia    r0!,{r2-r9}");
+    asm("   subs     ip,ip,#1");
+    asm("   bne      fill_256_bytes_loop");
+	asm("medium_fill: ");
+    asm("   movs     ip,r1,lsl #24");
+    asm("   msr      cpsr_flg,ip");	
+    asm("   stmmiia  r0!,{r2-r9}");		// Fill 128
+    asm("   stmmiia  r0!,{r2-r9}");  
+    asm("   stmmiia  r0!,{r2-r9}"); 
+    asm("   stmmiia  r0!,{r2-r9}"); 
+    asm("   stmeqia  r0!,{r2-r9}");		// Fill 64
+    asm("   stmeqia  r0!,{r2-r9}"); 
+	asm("   and		 r1, r1, #63 ");
+	asm("   b 		 smaller_fill");
+
+	// Word-align destination address, length >= 8
+	
+	asm("unaligned_fill: ");
+    asm("   rsb      r3, r3, #0 ");				// calculate fill length necessary for alignment
+    asm("   msr      cpsr_flg, r3");
+	asm("   streqb   r2, [r0], #1 ");			// align to 2 byte boundary
+    asm("   strmih   r2, [r0], #2 ");			// align to 4 byte boundary
+	asm("   sub      r1, r1, r3, lsr #30 ");
+	asm("	b		 word_aligned_fill ");
+
+	// Fill for length <= 8
+	
+	asm("small_fill: ");
+	asm("	mov		 r3, r0 ");				/* r3=dest */
+	asm("   adr      ip, small_fill_end ");
+	asm("   sub		 pc, ip, r1, lsl #2 ");
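+	// Computed branch: each STRB below is one 4-byte instruction, so jumping
+	// to small_fill_end - (r1 << 2) executes exactly r1 byte stores (r1 <= 8)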
+    asm("   strb     r2, [r3], #1");
+    asm("   strb     r2, [r3], #1");
+    asm("   strb     r2, [r3], #1");
+    asm("   strb     r2, [r3], #1");
+    asm("   strb     r2, [r3], #1");
+    asm("   strb     r2, [r3], #1");
+    asm("   strb     r2, [r3], #1");
+    asm("   strb     r2, [r3], #1");
+	asm("small_fill_end: ");
+	__JUMP(,lr);
+
+#ifdef __EABI__
+	// The AEABI swaps the order of arg2 and arg3, which saves an instruction
+	// when calling 'memset' from 'memclr'.
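+	// For reference, the AEABI helper takes (dest, size, value), roughly
+	//   __aeabi_memset(dest, n, c)
+	// whereas ISO C memset takes (dest, value, size), i.e. memset(s, c, n)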
+	asm(".global __aeabi_memset8 ");
+	asm("__aeabi_memset8: 		 ");
+	asm(".global __aeabi_memset4 ");
+	asm("__aeabi_memset4: 		 ");
+	asm(".global __aeabi_memset  ");
+	asm("__aeabi_memset: 		 ");
+    asm("   and      r2, r2, #255");
+	asm("	b		 fill		 ");
+#endif
+    }
+
+#endif  // USE_REPLACEMENT_MEMSET
+
+#ifndef USE_REPLACEMENT_MEMCPY
+
+// See header file e32cmn.h for the in-source documentation.
+
+extern "C" EXPORT_C __NAKED__ TAny* wordmove(TAny* /*aTrg*/, const TAny* /*aSrc*/, unsigned int /*aLength*/)
+//
+// Assumes aTrg, aSrc and aLength are all word aligned (multiples of four)
+//
+    {
+	ARM_ASSERT_MULTIPLE_OF_FOUR(r0, CSM_Z30PanicEWordMoveTargetNotAlignedv);
+	ARM_ASSERT_MULTIPLE_OF_FOUR(r1, CSM_Z30PanicEWordMoveSourceNotAlignedv);
+	ARM_ASSERT_MULTIPLE_OF_FOUR(r2, CSM_Z34PanicEWordMoveLengthNotMultipleOf4v);
+
+	// Mask length to a multiple of four bytes to avoid memory or register
+	// corruption in the special cases below.
+	asm("bic r2,r2,#3");
+
+	// Length <= 24 in ~90% of cases; however, more than 16 bytes can only be
+	// copied in 4 instructions if the LDM instruction restores Thumb state
+	// when loading the PC.
+#ifdef __CPU_ARM_LDR_PC_SETS_TBIT
+	asm("cmp r2, #24 ");
+#else
+	asm("cmp r2, #16 ");
+#endif
+	PLD(1);
+	asm("addls pc, pc, r2, lsl #2 ");		// take branch depending on size
+	asm("b 9f ");							// too big
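+	// Dispatch: each case below is four instructions (16 bytes); PC reads as
+	// the address of the ADD plus 8 (the 0-word case), and r2 is a multiple
+	// of 4, so adding r2 << 2 lands on the case that copies r2/4 words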
+
+	// 0 words
+	__JUMP(,lr);
+	__JUMP(,lr);
+	__JUMP(,lr);
+	__JUMP(,lr);
+
+	// 1 word
+	asm("ldr ip, [r1] ");
+	asm("str ip, [r0] ");
+	__JUMP(,lr);
+	__JUMP(,lr);
+
+	// 2 words
+	asm("ldmia r1, {r2,r3}");
+	asm("stmia r0, {r2,r3}");
+	__JUMP(,lr);
+	__JUMP(,lr);
+
+	// 3 words
+	asm("ldmia r1, {r2,r3,ip}");
+	asm("stmia r0, {r2,r3,ip}");
+	__JUMP(,lr);
+	__JUMP(,lr);
+
+	// 4 words
+	asm("ldmia r1, {r1,r2,r3,ip}");
+	asm("stmia r0, {r1,r2,r3,ip}");
+	__JUMP(,lr);
+	__JUMP(,lr);
+
+#ifdef __CPU_ARM_LDR_PC_SETS_TBIT
+	// 5 words
+	asm("stmfd sp!, {lr}");
+	asm("ldmia r1, {r1,r2,r3,ip,lr}");
+	asm("stmia r0, {r1,r2,r3,ip,lr}");
+	asm("ldmfd sp!, {pc}");
+
+	// 6 words
+	asm("stmfd sp!, {r4,lr}");
+	asm("ldmia r1, {r1,r2,r3,r4,ip,lr}");
+	asm("stmia r0, {r1,r2,r3,r4,ip,lr}");
+	asm("ldmfd sp!, {r4,pc}");
+#endif
+
+	asm("9: ");
+    asm("subs r3, r0, r1 ");				// r3 = dest - source
+	__JUMP(eq,lr);							// return if source = dest
+    asm("stmfd sp!, {r0,r4-r11,lr} ");
+	asm("cmphi r2, r3 ");					// if dest>source, compare length with dest-source
+    asm("bls mem_move_fore ");				// if dest<source or length<=dest-source do forwards aligned copy
+    asm("add r0, r0, r2 ");
+    asm("add r1, r1, r2 ");
+    asm("b mem_move_back ");				// Backwards aligned copy
+    }
+
+
+
+
+// See header file e32cmn.h for the in-source documentation.
+extern "C" EXPORT_C __NAKED__ TAny* memmove(TAny* /*aTrg*/, const TAny* /*aSrc*/, unsigned int /*aLength*/)
+	{
+	KMEMMOVEHOOK
+	// fall through
+	}
+
+
+
+// See header file e32cmn.h for the in-source documentation.
+extern "C" EXPORT_C __NAKED__ TAny* memcpy(TAny* /*aTrg*/, const TAny* /*aSrc*/, unsigned int /*aLength*/)
+    {
+    KMEMCPYHOOK
+//
+// Check for zero length or source and target being the same
+//
+    asm("	cmp		r2, #0 ");				// zero length?
+    asm("	subnes	r3, r0, r1 ");			// if not, r3 = dest-source
+	__JUMP(eq,lr);							// if zero length or dest=source, nothing to do
+	asm("	cmphi	r2, r3 ");				// if dest>source compare length to dest-source
+	asm("	movhi	r3, #0 ");				// if dest>source and length>dest-source need to go backwards - set r3=0
+//
+//	If <16 bytes, just do byte moves
+//
+    asm("	cmp		r2,	#15 ");
+	asm("	bhi		main_copy ");
+
+	asm("	ldrb	r12, [r0] ");			// read dest so it's in cache - avoid lots of single accesses to external memory
+	asm("	sub		r12, r0, #1 ");
+	asm("	ldrb	r12, [r12, r2] ");		// read dest+length-1
+	asm("	cmp		r3, #0 ");
+	asm("	beq		small_copy_back ");		// r3=0 means go backwards
+
+	asm("small_copy_fwd: ");
+	asm("	mov		r3, r0 ");
+	asm("	adr		r12, small_copy_fwd_end ");
+	asm("	sub		pc, r12, r2, lsl #3 ");
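+	// Computed branch: each LDRB/STRB pair below is 8 bytes of code, so jumping
+	// to small_copy_fwd_end - (r2 << 3) copies exactly r2 bytes (r2 <= 15)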
+
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("	ldrb	r12, [r1], #1 ");
+	asm("	strb	r12, [r3], #1 ");
+	asm("small_copy_fwd_end: ");
+	__JUMP(,lr);
+
+	asm("small_copy_back: ");
+	asm("	add		r3, r0, r2 ");
+	asm("	add		r1, r1, r2 ");
+	asm("	adr		r12, small_copy_back_end ");
+	asm("	sub		pc, r12, r2, lsl #3 ");
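+	// Same computed-branch trick as small_copy_fwd, but working back from the
+	// ends of the buffers: small_copy_back_end - (r2 << 3) copies r2 bytes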
+
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("	ldrb	r12, [r1, #-1]! ");
+	asm("	strb	r12, [r3, #-1]! ");
+	asm("small_copy_back_end: ");
+	__JUMP(,lr);
+
+	
+	asm("main_copy: ");
+	PLD(1);											// preload first two cache lines
+	PLD_ioff(1, 32);
+	asm("	stmfd	sp!, {r0,r4-r11,lr} ");			// r0 == dest, r1 == src, r2 == len
+	asm("	cmp		r3, #0 ");
+	asm("	beq		copy_back ");					// we must go backwards
+    asm("   movs	r3, r0, lsl #30 ");				// check destination word aligned
+	asm("   bne		dest_unaligned_fore ");
+
+//
+// Normal copy forwards. r0 should point to end address on exit
+// Destination now word-aligned; if source is also word-aligned, do aligned copy.
+//	
+	asm("dest_aligned_fore: ");
+    asm("   ands	r12, r1, #3 ");		// r12=alignment of source
+    asm("   bne		copy_fwd_nonaligned ");
+
+//
+// We are now word aligned, at least 13 bytes to do
+//
+	
+    asm("mem_move_fore:");
+//
+// superalign
+//
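+// ("superalign" here means advancing the destination to a 32-byte boundary
+// so the bulk loop below can use full 8-register LDM/STM bursts)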
+    asm("	movs	r4, r0, lsl #27 ");		 		 		// destination alignment into r4
+	asm("	beq		f_al_already_aligned ");				// fast path
+	asm("	rsb		r4, r4, #0 ");							// bytes required to align destination to 32
+	asm("	cmp		r2, r4, lsr #27 ");						// check that many remaining
+	asm("	blo		its_smaller_fore ");					// if too short, just stick with word alignment
+	asm("	msr		cpsr_flg, r4 ");		 		 		// destination alignment into N, Z, C flags
+															// do word moves to align destination
+	asm("	ldrcs	lr, [r1], #4 ");						// C flag == 1 word (we are already word aligned)
+	asm("	ldmeqia	r1!, {r3,r9} ");						// Z flag == 2 words
+	asm("	ldmmiia	r1!, {r5-r8} ");						// N flag == 4 words, destination now 32 byte aligned
+	asm("	sub		r2, r2, r4, lsr #27 ");		 			// adjust length
+	asm("	strcs	lr, [r0], #4 ");						// destination now 8 byte aligned
+	asm("	stmeqia	r0!, {r3,r9} ");						// destination now 16 byte aligned
+	asm("	stmmiia	r0!, {r5-r8} ");						// destination now 32 byte aligned	
+
+	asm("f_al_already_aligned: ");
+	asm("	cmp		r2, #64 ");
+	asm("	bhs		large_copy_fore ");
+//
+// Less than 64 bytes to go...
+//	
+    asm("its_smaller_fore:");
+    asm("	movs	ip, r2, lsl #26 ");		// length bits 5, 4, 3, 2 into N, Z, C, V
+	asm("	beq		mem_copy_end ");		// skip if remaining length zero
+    asm("	msr		cpsr_flg, ip ");
+    asm("	ldmmiia	r1!, {r3-r10} ");
+    asm("	stmmiia	r0!, {r3-r10} ");		// copy 32	
+    asm("	ldmeqia	r1!, {r3-r6} ");
+    asm("	ldmcsia	r1!, {r7-r8} ");
+    asm("	ldrvs	r9, [r1], #4 ");
+    asm("	stmeqia	r0!, {r3-r6} ");		// copy 16
+    asm("	stmcsia	r0!, {r7-r8} ");		// copy 8
+    asm("	strvs	r9, [r0], #4 ");		// copy 4
+
+    asm("	movs	ip, r2, lsl #30 ");	
+	asm("	bne		smallest_copy_fore ");
+	
+	asm("mem_copy_end: ");
+	__POPRET("r0,r4-r11,");
+
+	
+//
+// Less than 4 bytes to go...
+//
+	
+	asm("smallest_copy_fore: ");
+    asm("	msr		cpsr_flg, ip ");
+    asm("	ldrmih	r3, [r1], #2 ");
+    asm("	ldreqb	r4, [r1], #1 ");
+    asm("	strmih	r3, [r0], #2 ");		// copy 2
+    asm("	streqb	r4, [r0], #1 ");		// copy 1
+	__POPRET("r0,r4-r11,");
+
+	
+//
+// Do byte moves if necessary to word-align destination
+//
+	asm("dest_unaligned_fore: ");
+	asm("	rsb		r3, r3, #0 ");
+	asm("	msr		cpsr_flg, r3 ");
+	asm("	ldrmib	r4, [r1], #1 ");				// move bytes to align destination
+	asm("	ldrmib	r5, [r1], #1 ");
+	asm("	ldreqb	r6, [r1], #1 ");
+	asm("	sub		r2, r2, r3, lsr #30 ");			// adjust length, at least 13 bytes remaining
+	asm("	strmib	r4, [r0], #1 ");
+	asm("	strmib	r5, [r0], #1 ");
+	asm("	streqb	r6, [r0], #1 ");
+	asm("   b		dest_aligned_fore ");
+
+	
+//
+//	Large copy, length >= 64
+//
+	
+	asm("large_copy_fore: ");
+	asm("	movs	ip, r2, lsr #6 ");						// ip = number of 64 blocks to copy
+	asm("1: ");
+	PLD_ioff(1, 32);
+	PLD_ioff(1, 64);
+    asm("	ldmia	r1!, {r3-r10} ");		// Copy 64
+    asm("	stmia	r0!, {r3-r10} "); 
+    asm("	ldmia	r1!, {r3-r10} ");
+    asm("	subs	ip, ip, #1 ");
+    asm("	stmia	r0!, {r3-r10} "); 	
+	asm("	bne		1b ");		
+	asm("	and		r2, r2, #63 ");
+	asm("	b		its_smaller_fore ");
+
+	
+//
+// Forward unaligned copy
+//	
+	
+	asm("copy_fwd_nonaligned:");
+//
+// superalign
+//	
+	asm("	bic		r1, r1, #3 ");					// align source
+	asm("	ldr		r11, [r1], #4 ");				// get first word
+	asm("	mov		r12, r12, lsl #3 ");			// r12 = 8*source alignment
+	asm("	ands	r4, r0, #31 ");					// destination alignment into r4
+	asm("	beq		medium_unal_copy ");			// skip if already aligned
+	asm("	rsb		r4, r4, #32 ");					// r4 = bytes to align dest to 32
+	asm("	cmp		r2, r4 ");						// check if length big enough to align to 32
+	asm("	blo		copy_fwd_remainder ");			// skip if too small
+	asm("	sub		r2, r2, r4 ");					// adjust length
+	asm("	rsb		r3, r12, #32 ");				// r3 = 32 - 8*source alignment
+
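+	// Each destination word is assembled from the high part of the previous
+	// source word (shifted down by r12 bits) and the low part of the next
+	// source word (shifted up by r3 = 32 - r12 bits)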
+	asm("1: ");
+	asm("	mov		r5, r11, lsr r12 ");			// r5 = part of previous source word required to make destination word
+	asm("	ldr		r11, [r1], #4 ");				// get next word
+	asm("	subs	r4, r4, #4 ");					// 4 bytes less to do
+	asm("	orr		r5, r5, r11, lsl r3 ");			// form next destination word
+	asm("	str		r5, [r0], #4 ");				// and store it
+	asm("	bne		1b ");							// loop until destination 32 byte aligned
+
+	asm("medium_unal_copy: ");						// destination now aligned to 32 bytes
+	asm("	movs	lr, r2, lsr #5 ");				// lr=number of 32-byte blocks
+	asm("	beq		copy_fwd_remainder ");			// skip if length < 32
+
+	asm("	cmp		r12, #16 ");
+	asm("	beq		copy_fwd_nonaligned_2 ");		// branch if source = 2 mod 4
+	asm("	bhi		copy_fwd_nonaligned_3 ");		// branch if source = 3 mod 4, else source = 1 mod 4
+
+// source = 1 mod 4
+	asm("copy_fwd_nonaligned_1: ");
+	asm("	mov		r3, r11, lsr #8 ");
+	asm("	ldmia	r1!, {r4-r11} ");
+	PLD_ioff(1, 32);
+	asm("	subs	lr, lr, #1 ");
+	asm("	orr		r3, r3, r4, lsl #24 ");
+	asm("	mov		r4, r4, lsr #8 ");
+	asm("	orr		r4, r4, r5, lsl #24 ");
+	asm("	mov		r5, r5, lsr #8 ");
+	asm("	orr		r5, r5, r6, lsl #24 ");
+	asm("	mov		r6, r6, lsr #8 ");
+	asm("	orr		r6, r6, r7, lsl #24 ");
+	asm("	mov		r7, r7, lsr #8 ");
+	asm("	orr		r7, r7, r8, lsl #24 ");
+	asm("	mov		r8, r8, lsr #8 ");
+	asm("	orr		r8, r8, r9, lsl #24 ");
+	asm("	mov		r9, r9, lsr #8 ");
+	asm("	orr		r9, r9, r10, lsl #24 ");
+	asm("	mov		r10, r10, lsr #8 ");
+	asm("	orr		r10, r10, r11, lsl #24 ");
+	asm("	stmia	r0!, {r3-r10} ");
+	asm("	bne		copy_fwd_nonaligned_1 ");
+	asm("	b		copy_fwd_remainder ");
+
+// source = 2 mod 4
+	asm("copy_fwd_nonaligned_2: ");
+	asm("	mov		r3, r11, lsr #16 ");
+	asm("	ldmia	r1!, {r4-r11} ");
+	PLD_ioff(1, 32);
+	asm("	subs	lr, lr, #1 ");
+	asm("	orr		r3, r3, r4, lsl #16 ");
+	asm("	mov		r4, r4, lsr #16 ");
+	asm("	orr		r4, r4, r5, lsl #16 ");
+	asm("	mov		r5, r5, lsr #16 ");
+	asm("	orr		r5, r5, r6, lsl #16 ");
+	asm("	mov		r6, r6, lsr #16 ");
+	asm("	orr		r6, r6, r7, lsl #16 ");
+	asm("	mov		r7, r7, lsr #16 ");
+	asm("	orr		r7, r7, r8, lsl #16 ");
+	asm("	mov		r8, r8, lsr #16 ");
+	asm("	orr		r8, r8, r9, lsl #16 ");
+	asm("	mov		r9, r9, lsr #16 ");
+	asm("	orr		r9, r9, r10, lsl #16 ");
+	asm("	mov		r10, r10, lsr #16 ");
+	asm("	orr		r10, r10, r11, lsl #16 ");
+	asm("	stmia	r0!, {r3-r10} ");
+	asm("	bne		copy_fwd_nonaligned_2 ");
+	asm("	b		copy_fwd_remainder ");
+
+// source = 3 mod 4
+	asm("copy_fwd_nonaligned_3: ");
+	asm("	mov		r3, r11, lsr #24 ");
+	asm("	ldmia	r1!, {r4-r11} ");
+	PLD_ioff(1, 32);
+	asm("	subs	lr, lr, #1 ");
+	asm("	orr		r3, r3, r4, lsl #8 ");
+	asm("	mov		r4, r4, lsr #24 ");
+	asm("	orr		r4, r4, r5, lsl #8 ");
+	asm("	mov		r5, r5, lsr #24 ");
+	asm("	orr		r5, r5, r6, lsl #8 ");
+	asm("	mov		r6, r6, lsr #24 ");
+	asm("	orr		r6, r6, r7, lsl #8 ");
+	asm("	mov		r7, r7, lsr #24 ");
+	asm("	orr		r7, r7, r8, lsl #8 ");
+	asm("	mov		r8, r8, lsr #24 ");
+	asm("	orr		r8, r8, r9, lsl #8 ");
+	asm("	mov		r9, r9, lsr #24 ");
+	asm("	orr		r9, r9, r10, lsl #8 ");
+	asm("	mov		r10, r10, lsr #24 ");
+	asm("	orr		r10, r10, r11, lsl #8 ");
+	asm("	stmia	r0!, {r3-r10} ");
+	asm("	bne		copy_fwd_nonaligned_3 ");
+
+// <32 bytes to go, source alignment could be 1, 2 or 3 mod 4
+// r12 = 8 * (source mod 4)
+	asm("copy_fwd_remainder: ");
+	asm("	ands	r4, r2, #0x1c ");			// r4 = 4*number of words left
+	asm("	beq		2f ");						// skip if none
+	asm("	rsb		r3, r12, #32 ");			// r3 = 32 - 8*source alignment
+
+	asm("1: ");
+	asm("	mov		r5, r11, lsr r12 ");		// r5 = part of previous source word required to make destination word
+	asm("	ldr		r11, [r1], #4 ");			// get next word
+	asm("	subs	r4, r4, #4 ");				// 4 bytes less to do
+	asm("	orr		r5, r5, r11, lsl r3 ");		// form next destination word
+	asm("	str		r5, [r0], #4 ");			// and store it
+	asm("	bne		1b ");						// loop until destination 32 byte aligned
+
+	asm("2: ");
+	asm("	sub		r1, r1, #4 ");
+	asm("	add		r1, r1, r12, lsr #3 ");		// r1 = real unaligned source address
+	asm("	tst		r2, #2 ");					// 2 bytes left?
+	asm("	ldrneb	r5, [r1], #1 ");			// copy 2
+	asm("	strneb	r5, [r0], #1 ");
+	asm("	ldrneb	r5, [r1], #1 ");
+	asm("	strneb	r5, [r0], #1 ");
+	asm("	tst		r2, #1 ");					// 1 byte left?
+	asm("	ldrneb	r5, [r1], #1 ");			// copy 1
+	asm("	strneb	r5, [r0], #1 ");
+	__POPRET("r0,r4-r11,");
+
+	
+//
+// Source is before destination and they overlap, so need to copy backwards
+//
+	
+    asm("copy_back:");
+	asm("	add		r0, r0, r2 ");				// r0=last dest address+1
+	asm("	add		r1, r1, r2 ");				// r1=last source address+1
+	PLD_noff(1, 33);							// preload last two cache lines
+	PLD_noff(1, 1);
+
+    asm("	movs	r3, r0, lsl #30 ");			// check destination word aligned
+	asm("	bne		dest_unaligned_back ");
+	
+	asm("dest_aligned_back: ");
+	asm("	ands	r12, r1, #3 ");					// r12=alignment of source
+    asm("	bne		copy_back_nonaligned ");
+
+//
+// Backwards copying, addresses both word aligned, at least 13 bytes to go
+//
+	
+    asm("mem_move_back:");
+//
+// superalign
+// 
+	asm("	movs	r4, r0, lsl #27 ");					// bytes required to align destination to 32
+	asm("	beq		bal_already_aligned ");				// skip if already aligned to 32
+	asm("	cmp		r2, r4, lsr #27 ");					// check that many remaining
+	asm("	blo		its_smaller_back ");				// if too short, just stick with word alignment
+	asm("	msr		cpsr_flg, r4 ");		 		 	// destination alignment into N, Z, C flags
+														// do word moves to align destination
+	asm("	ldrcs	lr, [r1, #-4]! ");					// C flag == 1 word (we are already word aligned)
+	asm("	ldmeqdb	r1!, {r3,r9} ");					// Z flag == 2 words
+	asm("	ldmmidb	r1!, {r5-r8} ");
+	asm("	sub		r2, r2, r4, lsr #27 ");		 		// adjust length
+	asm("	strcs	lr, [r0, #-4]! ");					// destination now 8 byte aligned
+	asm("	stmeqdb	r0!, {r3,r9} ");					// destination now 16 byte aligned
+	asm("	stmmidb	r0!, {r5-r8} ");					// N flag == 4 words, destination now 32 byte aligned
+
+	asm("bal_already_aligned: ");
+	asm("	cmp		r2, #64 ");
+	asm("	bhs		large_copy_back ");
+//
+// Less than 64 bytes to go
+//
+    asm("its_smaller_back: ");
+    asm("	movs	ip, r2, lsl #26 ");		// length bits 5, 4, 3, 2 into N, Z, C, V
+	asm("	beq		mem_copy_end2 ");		// skip if remaining length zero
+    asm("	msr		cpsr_flg, ip ");
+    asm("	ldmmidb	r1!, {r3-r10} ");
+    asm("	stmmidb	r0!, {r3-r10} ");		// copy 32
+    asm("	ldmeqdb	r1!, {r3-r6} ");
+    asm("	ldmcsdb	r1!, {r7,r8} ");
+    asm("   ldrvs	r9, [r1, #-4]! ");
+    asm("	stmeqdb	r0!, {r3-r6} ");		// copy 16
+    asm("	stmcsdb	r0!, {r7,r8} ");		// copy 8
+    asm("   strvs	r9, [r0, #-4]! ");		// copy 4
+	
+    asm("	movs	ip, r2, lsl #30 ");
+	asm("	bne		smallest_copy_back ");
+
+	asm("mem_copy_end2: ");
+	__POPRET("r0,r4-r11,");
+
+	
+//
+// Less than 4 bytes to go...
+//
+	
+	asm("smallest_copy_back: ");
+    asm("	msr		cpsr_flg, ip ");
+    asm("	ldrmih	r3, [r1, #-2]! ");
+    asm("	ldreqb	r4, [r1, #-1]! ");
+    asm("	strmih	r3, [r0, #-2]! ");		// copy 2
+    asm("	streqb	r4, [r0, #-1]! ");		// copy 1
+	__POPRET("r0,r4-r11,");
+	
+
+//
+// Do byte moves if necessary to word-align destination
+//
+	asm("dest_unaligned_back: ");
+	asm("	msr		cpsr_flg, r3 ");				// destination alignment in r3 into N,Z flags
+	asm("	ldrmib	r4, [r1, #-1]! ");				// do byte moves to align destination
+	asm("	ldrmib	r5, [r1, #-1]! ");
+	asm("	ldreqb	r6, [r1, #-1]! ");
+	asm("	sub		r2, r2, r3, lsr #30 ");			// adjust length, at least 13 bytes remaining
+	asm("	strmib	r4, [r0, #-1]! ");
+	asm("	strmib	r5, [r0, #-1]! ");
+	asm("	streqb	r6, [r0, #-1]! ");
+	asm("	b		dest_aligned_back ");
+
+
+//
+//	Large backwards copy, length >= 64
+//	
+
+	asm("large_copy_back: ");
+    asm("	movs	ip, r2, lsr #6 ");
+	asm("1: ");
+	PLD_noff(1, 65);
+	PLD_noff(1, 33);
+    asm("	ldmdb	r1!, {r3-r10} ");		// Copy 64
+    asm("	stmdb	r0!, {r3-r10} "); 
+    asm("	ldmdb	r1!, {r3-r10} ");
+    asm("	subs	ip, ip, #1 ");
+    asm("	stmdb	r0!, {r3-r10} "); 
+	asm("	bne		1b ");		
+	asm("	and		r2, r2, #63 ");
+	asm("	b		its_smaller_back ");
+
+//
+// Backwards unaligned copy
+//	
+
+	asm("copy_back_nonaligned: ");
+//
+// superalign
+//
+	asm("	bic		r1, r1, #3 ");					// align source
+	asm("	ldr		r3, [r1] ");					// get first word
+	asm("	mov		r12, r12, lsl #3 ");			// r12 = 8*source alignment
+	asm("	ands	r4, r0, #31 ");					// r4 = bytes to align dest to 32
+	asm("	beq		bunal_already_aligned ");		// skip if already aligned
+	asm("	cmp		r2, r4 ");						// check if length big enough to align to 32
+	asm("	blo		copy_back_remainder ");			// skip if too small
+	asm("	sub		r2, r2, r4 ");					// adjust length
+	asm("	rsb		r6, r12, #32 ");				// r6 = 32 - 8*source alignment
+
+	asm("1: ");
+	asm("	mov		r5, r3, lsl r6 ");				// r5 = part of previous source word required to make destination word
+	asm("	ldr		r3, [r1, #-4]! ");				// get next word
+	asm("	subs	r4, r4, #4 ");					// 4 bytes less to do
+	asm("	orr		r5, r5, r3, lsr r12 ");			// form next destination word
+	asm("	str		r5, [r0, #-4]! ");				// and store it
+	asm("	bne		1b ");							// loop until destination 32 byte aligned
+
+	asm("bunal_already_aligned: ");					// destination now aligned to 32 bytes
+	asm("	movs	lr, r2, lsr #5 ");				// lr=number of 32-byte blocks
+	asm("	beq		copy_back_remainder ");			// skip if length < 32
+
+	asm("	cmp		r12, #16 ");
+	asm("	beq		copy_back_nonaligned_2 ");		// branch if source = 2 mod 4
+	asm("	bhi		copy_back_nonaligned_3 ");		// branch if source = 3 mod 4, else source = 1 mod 4
+
+// source = 1 mod 4
+	asm("copy_back_nonaligned_1: ");
+	asm("	mov		r11, r3, lsl #24 ");
+	asm("	ldmdb	r1!, {r3-r10} ");
+	PLD_noff(1, 64);
+	asm("	orr		r11, r11, r10, lsr #8 ");
+	asm("	mov		r10, r10, lsl #24 ");
+	asm("	orr		r10, r10, r9, lsr #8 ");
+	asm("	mov		r9, r9, lsl #24 ");
+	asm("	orr		r9, r9, r8, lsr #8 ");
+	asm("	mov		r8, r8, lsl #24 ");
+	asm("	orr		r8, r8, r7, lsr #8 ");
+	asm("	mov		r7, r7, lsl #24 ");
+	asm("	orr		r7, r7, r6, lsr #8 ");
+	asm("	mov		r6, r6, lsl #24 ");
+	asm("	orr		r6, r6, r5, lsr #8 ");
+	asm("	mov		r5, r5, lsl #24 ");
+	asm("	orr		r5, r5, r4, lsr #8 ");
+	asm("	mov		r4, r4, lsl #24 ");
+	asm("	orr		r4, r4, r3, lsr #8 ");
+	asm("	stmdb	r0!, {r4-r11} ");
+	asm("	subs	lr, lr, #1 ");
+	asm("	bne		copy_back_nonaligned_1 ");
+	asm("	b		copy_back_remainder ");
+
+// source = 2 mod 4
+	asm("copy_back_nonaligned_2: ");
+	asm("	mov		r11, r3, lsl #16 ");
+	asm("	ldmdb	r1!, {r3-r10} ");
+	PLD_noff(1, 64);
+	asm("	orr		r11, r11, r10, lsr #16 ");
+	asm("	mov		r10, r10, lsl #16 ");
+	asm("	orr		r10, r10, r9, lsr #16 ");
+	asm("	mov		r9, r9, lsl #16 ");
+	asm("	orr		r9, r9, r8, lsr #16 ");
+	asm("	mov		r8, r8, lsl #16 ");
+	asm("	orr		r8, r8, r7, lsr #16 ");
+	asm("	mov		r7, r7, lsl #16 ");
+	asm("	orr		r7, r7, r6, lsr #16 ");
+	asm("	mov		r6, r6, lsl #16 ");
+	asm("	orr		r6, r6, r5, lsr #16 ");
+	asm("	mov		r5, r5, lsl #16 ");
+	asm("	orr		r5, r5, r4, lsr #16 ");
+	asm("	mov		r4, r4, lsl #16 ");
+	asm("	orr		r4, r4, r3, lsr #16 ");
+	asm("	stmdb	r0!, {r4-r11} ");
+	asm("	subs	lr, lr, #1 ");
+	asm("	bne		copy_back_nonaligned_2 ");
+	asm("	b		copy_back_remainder ");
+
+// source = 3 mod 4
+	asm("copy_back_nonaligned_3: ");
+	asm("	mov		r11, r3, lsl #8 ");
+	asm("	ldmdb	r1!, {r3-r10} ");
+	PLD_noff(1, 64);
+	asm("	orr		r11, r11, r10, lsr #24 ");
+	asm("	mov		r10, r10, lsl #8 ");
+	asm("	orr		r10, r10, r9, lsr #24 ");
+	asm("	mov		r9, r9, lsl #8 ");
+	asm("	orr		r9, r9, r8, lsr #24 ");
+	asm("	mov		r8, r8, lsl #8 ");
+	asm("	orr		r8, r8, r7, lsr #24 ");
+	asm("	mov		r7, r7, lsl #8 ");
+	asm("	orr		r7, r7, r6, lsr #24 ");
+	asm("	mov		r6, r6, lsl #8 ");
+	asm("	orr		r6, r6, r5, lsr #24 ");
+	asm("	mov		r5, r5, lsl #8 ");
+	asm("	orr		r5, r5, r4, lsr #24 ");
+	asm("	mov		r4, r4, lsl #8 ");
+	asm("	orr		r4, r4, r3, lsr #24 ");
+	asm("	stmdb	r0!, {r4-r11} ");
+	asm("	subs	lr, lr, #1 ");
+	asm("	bne		copy_back_nonaligned_3 ");
+
+// <32 bytes to go, source alignment could be 1, 2 or 3 mod 4
+// r12 = 8 * (source mod 4)
+	asm("copy_back_remainder: ");
+	asm("	ands	r4, r2, #0x1c ");			// r4 = 4*number of words left
+	asm("	beq		2f ");						// skip if none
+	asm("	rsb		r6, r12, #32 ");			// r6 = 32 - 8*source alignment
+
+	asm("1: ");
+	asm("	mov		r5, r3, lsl r6 ");			// r5 = part of previous source word required to make destination word
+	asm("	ldr		r3, [r1, #-4]! ");			// get next word
+	asm("	subs	r4, r4, #4 ");				// 4 bytes less to do
+	asm("	orr		r5, r5, r3, lsr r12 ");		// form next destination word
+	asm("	str		r5, [r0, #-4]! ");			// and store it
+	asm("	bne		1b ");						// loop until destination 32 byte aligned
+
+	asm("2: ");
+	asm("	add		r1, r1, r12, lsr #3 ");		// r1 = real unaligned source address
+	asm("	tst		r2, #2 ");					// 2 bytes left?
+	asm("	ldrneb	r3, [r1, #-1]! ");			// copy 2
+	asm("	strneb	r3, [r0, #-1]! ");
+	asm("	ldrneb	r3, [r1, #-1]! ");
+	asm("	strneb	r3, [r0, #-1]! ");
+	asm("	tst		r2, #1 ");					// 1 byte left?
+	asm("	ldrneb	r3, [r1, #-1]! ");			// copy 1
+	asm("	strneb	r3, [r0, #-1]! ");
+	__POPRET("r0,r4-r11,");
+    }
+
+#endif  // USE_REPLACEMENT_MEMCPY
+
+
+#ifndef __KERNEL_MODE__
+#ifdef __GCC32__ 
+/**
+Compares a block of data at one specified location with a block of data at 
+another specified location.
+
+The comparison proceeds on a byte-for-byte basis; the result is based on
+the difference of the first pair of bytes that disagree.
+
+The data at the two locations are equal if they have the same length and content. 
+Where the lengths are different and the shorter section of data is the same 
+as the first part of the longer section of data, the shorter is considered 
+to be less than the longer.
+
+@param aLeft   A pointer to the first (or left) block of 8 bit data
+               to be compared.
+@param aLeftL  The length of the first (or left) block of data to be compared,  
+               i.e. the number of bytes.
+@param aRight  A pointer to the second (or right) block of 8 bit data to be 
+               compared.
+@param aRightL The length of the second (or right) block of data to be compared 
+               i.e. the number of bytes.
+               
+@return Positive, if the first (or left) block of data is greater than the 
+        second (or right) block of data.
+        Negative, if the first (or left) block of data is less than the
+        second (or right) block of data.
+        Zero, if both the first (or left) and second (or right) blocks of data
+        have the same length and the same content.
+*/
+EXPORT_C __NAKED__ TInt Mem::Compare(const TUint8* /*aLeft*/, TInt /*aLeftL*/, const TUint8* /*aRight*/, TInt /*aRightL*/)
+	{
+	// fall through
+	}
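+// Illustrative call (hypothetical caller, not part of this file), assuming
+// 'left' and 'right' are 8-bit descriptors:
+//   TInt r = Mem::Compare(left.Ptr(), left.Length(), right.Ptr(), right.Length());
+//   // r < 0, r == 0 or r > 0 as documented above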
+#endif
+#endif
+
+
+
+// See header file e32cmn.h for the in-source documentation.
+extern "C" EXPORT_C __NAKED__ TInt memcompare(const TUint8* /*aLeft*/, TInt /*aLeftL*/, const TUint8* /*aRight*/, TInt /*aRightL*/)
+//
+// Compares until the smaller of the two lengths is reached.
+// If a byte difference is encountered, returns (left byte - right byte);
+// otherwise returns (leftLen - rightLen).
+//
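+// For example, comparing "abc" (length 3) with "abd" (length 3) returns
+// 'c'-'d' < 0, while "ab" (length 2) against "abc" (length 3) matches over
+// the first 2 bytes and returns 2 - 3 = -1.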
+    {
+
+    asm("   stmfd    sp!,{r4,r5,r6,lr}");
+    asm("   mov      r4,r0");
+//
+// Get the shorter of the two lengths, and check for zero length
+//
+    asm("   cmp      r1,r3");
+    asm("   mov      r6,r1");
+    asm("   movge    r6,r3");
+    asm("   cmp      r6,#0");
+    asm("   beq      compare_done");
+    asm("   cmp      r6,#16");
+//
+// Check for aligned buffers for faster comparing if more than 16 bytes
+//
+    asm("   andge    r0,r4,#3");
+    asm("   andge    r5,r2,#3");
+    asm("   addlt    r0,r5,#1");
+    asm("   cmp      r0,r5");
+    asm("   beq      aligned_compare");
+//
+// Get aLeft+Min(aLeftL,aRightL)
+//
+    asm("   add      r6,r4,r6");
+
+    asm("compare_loop:");
+    asm("   ldrb     r0,[r4],#1");
+    asm("   ldrb     r5,[r2],#1");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+    asm("   cmp      r4,r6");
+    asm("   beq      compare_done");
+
+    asm("   ldrb     r0,[r4],#1");
+    asm("   ldrb     r5,[r2],#1");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+    asm("   cmp      r4,r6");
+    asm("   beq      compare_done");
+
+    asm("   ldrb     r0,[r4],#1");
+    asm("   ldrb     r5,[r2],#1");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+    asm("   cmp      r4,r6");
+    asm("   beq      compare_done");
+
+    asm("   ldrb     r0,[r4],#1");
+    asm("   ldrb     r5,[r2],#1");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+    asm("   cmp      r4,r6");
+    asm("   bne      compare_loop");
+//
+// Return difference of lengths
+//
+    asm("compare_done:");
+    asm("   sub      r0,r1,r3");
+
+    asm("compare_exit:");
+	__POPRET("r4-r6,");
+//
+// Compare byte at a time until word aligned...
+//
+    asm("aligned_compare:");
+//
+// Get number of bytes to compare before word alignment reached...and jump to appropriate point
+//
+    asm("   mov      ip,r6");
+    asm("   add      r6,r4,r6");
+    asm("   subs     r0,r0,#1");
+    asm("   movmi    r0,#3");
+    asm("   rsb      r5,r0,#3");
+    asm("   sub      ip,ip,r5");
+    asm("   mov      ip,ip,lsr #2");
+	asm("   add      pc,pc,r0,asl #4");
+    asm("   b        compare_done"); // Never executed
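+// Each block below is four instructions (16 bytes); r0 holds (alignment - 1),
+// or 3 if already word aligned, so this jump enters at the block that compares
+// exactly the bytes remaining before a word boundary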
+//
+// Jump here if alignment is 1. Do not use more than 4 instructions without altering above relative jump
+//
+    asm("   ldrb     r0,[r4],#1");
+    asm("   ldrb     r5,[r2],#1");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+//
+// Jump here if alignment is 2. Do not use more than 4 instructions without altering above relative jump
+//
+    asm("   ldrb     r0,[r4],#1");
+    asm("   ldrb     r5,[r2],#1");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+//
+// Jump here if alignment is 3. Do not use more than 4 instructions without altering above relative jump
+//
+    asm("   ldrb     r0,[r4],#1");
+    asm("   ldrb     r5,[r2],#1");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+//
+// Must now be word aligned
+//
+    asm("aligned_compare_loop:");
+    asm("   ldr      r0,[r4],#4");
+    asm("   ldr      r5,[r2],#4");
+    asm("   eors     r0,r0,r5");
+    asm("   bne      word_different");
+    asm("   subs     ip,ip,#1");
+    asm("   bne      aligned_compare_loop");
+//
+// Less than 4 bytes to go...
+//
+    asm("   cmp      r4,r6");
+    asm("   bne      compare_loop");
+    asm("   sub      r0,r1,r3");
+	__POPRET("r4-r6,");
+//
+// A difference encountered while word comparing, find out which byte it was
+//
+    asm("word_different:");
+    asm("   ldrb     r0,[r4,#-4]");
+    asm("   ldrb     r5,[r2,#-4]");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+    asm("   ldrb     r0,[r4,#-3]");
+    asm("   ldrb     r5,[r2,#-3]");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+    asm("   ldrb     r0,[r4,#-2]");
+    asm("   ldrb     r5,[r2,#-2]");
+    asm("   subs     r0,r0,r5");
+	asm("bne compare_exit ");
+//
+// This must be the different byte...
+//
+    asm("   ldrb     r0,[r4,#-1]");
+    asm("   ldrb     r5,[r2,#-1]");
+    asm("   sub      r0,r0,r5");
+	__POPRET("r4-r6,");
+    }
+#endif
+