arm: build arch memset/memcpy in Thumb2 mode

Resynchronize memcpy/memset with kernel 3.17 and build them in
Thumb2 mode (unified syntax). Those assembler files can be built
and linked in ARM mode too, however when calling them from Thumb2
built code, the stack got corrupted and the copy did not succeed
(the exact details have not been traced back). However, the Linux
kernel builds those files in Thumb2 mode. Hence U-Boot should
build them in Thumb2 mode too when CONFIG_SYS_THUMB_BUILD is set.

To build the files without warning, some assembler instructions
had to be replaced with their UAL compliant variant (thanks
Jeroen for this input).

To build the file in Thumb2 mode the implicit-it=always option need
to be set to generate Thumb2 compliant IT instructions where needed.
We add this option to the general AFLAGS when building for Thumb2.

Reviewed-by: Simon Glass <sjg@chromium.org>
Tested-by: Simon Glass <sjg@chromium.org>
Signed-off-by: Stefan Agner <stefan@agner.ch>
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index f655256..eeaf003 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -10,9 +10,14 @@
  *  published by the Free Software Foundation.
  */
 
+#include <linux/linkage.h>
 #include <asm/assembler.h>
 
+#ifdef CONFIG_SYS_THUMB_BUILD
+#define W(instr)	instr.w
+#else
 #define W(instr)	instr
+#endif
 
 #define LDR1W_SHIFT	0
 #define STR1W_SHIFT	0
@@ -30,7 +35,7 @@
 	.endm
 
 	.macro ldr1b ptr reg cond=al abort
-	ldr\cond\()b \reg, [\ptr], #1
+	ldrb\cond\() \reg, [\ptr], #1
 	.endm
 
 	.macro str1w ptr reg abort
@@ -42,7 +47,7 @@
 	.endm
 
 	.macro str1b ptr reg cond=al abort
-	str\cond\()b \reg, [\ptr], #1
+	strb\cond\() \reg, [\ptr], #1
 	.endm
 
 	.macro enter reg1 reg2
@@ -56,10 +61,12 @@
 	.text
 
 /* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-
-.globl memcpy
-memcpy:
-
+	.syntax unified
+#ifdef CONFIG_SYS_THUMB_BUILD
+	.thumb
+	.thumb_func
+#endif
+ENTRY(memcpy)
 		cmp	r0, r1
 		moveq	pc, lr
 
@@ -79,7 +86,7 @@
 
 	CALGN(	ands	ip, r0, #31		)
 	CALGN(	rsb	r3, ip, #32		)
-	CALGN(	sbcnes	r4, r3, r2		)  @ C is always set here
+	CALGN(	sbcsne	r4, r3, r2		)  @ C is always set here
 	CALGN(	bcs	2f			)
 	CALGN(	adr	r4, 6f			)
 	CALGN(	subs	r2, r2, r3		)  @ C gets set
@@ -178,7 +185,7 @@
 
 	CALGN(	ands	ip, r0, #31		)
 	CALGN(	rsb	ip, ip, #32		)
-	CALGN(	sbcnes	r4, ip, r2		)  @ C is always set here
+	CALGN(	sbcsne	r4, ip, r2		)  @ C is always set here
 	CALGN(	subcc	r2, r2, ip		)
 	CALGN(	bcc	15f			)
 
@@ -193,24 +200,24 @@
 
 12:	PLD(	pld	[r1, #124]		)
 13:		ldr4w	r1, r4, r5, r6, r7, abort=19f
-		mov	r3, lr, pull #\pull
+		mov	r3, lr, lspull #\pull
 		subs	r2, r2, #32
 		ldr4w	r1, r8, r9, ip, lr, abort=19f
-		orr	r3, r3, r4, push #\push
-		mov	r4, r4, pull #\pull
-		orr	r4, r4, r5, push #\push
-		mov	r5, r5, pull #\pull
-		orr	r5, r5, r6, push #\push
-		mov	r6, r6, pull #\pull
-		orr	r6, r6, r7, push #\push
-		mov	r7, r7, pull #\pull
-		orr	r7, r7, r8, push #\push
-		mov	r8, r8, pull #\pull
-		orr	r8, r8, r9, push #\push
-		mov	r9, r9, pull #\pull
-		orr	r9, r9, ip, push #\push
-		mov	ip, ip, pull #\pull
-		orr	ip, ip, lr, push #\push
+		orr	r3, r3, r4, lspush #\push
+		mov	r4, r4, lspull #\pull
+		orr	r4, r4, r5, lspush #\push
+		mov	r5, r5, lspull #\pull
+		orr	r5, r5, r6, lspush #\push
+		mov	r6, r6, lspull #\pull
+		orr	r6, r6, r7, lspush #\push
+		mov	r7, r7, lspull #\pull
+		orr	r7, r7, r8, lspush #\push
+		mov	r8, r8, lspull #\pull
+		orr	r8, r8, r9, lspush #\push
+		mov	r9, r9, lspull #\pull
+		orr	r9, r9, ip, lspush #\push
+		mov	ip, ip, lspull #\pull
+		orr	ip, ip, lr, lspush #\push
 		str8w	r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
 		bge	12b
 	PLD(	cmn	r2, #96			)
@@ -221,10 +228,10 @@
 14:		ands	ip, r2, #28
 		beq	16f
 
-15:		mov	r3, lr, pull #\pull
+15:		mov	r3, lr, lspull #\pull
 		ldr1w	r1, lr, abort=21f
 		subs	ip, ip, #4
-		orr	r3, r3, lr, push #\push
+		orr	r3, r3, lr, lspush #\push
 		str1w	r0, r3, abort=21f
 		bgt	15b
 	CALGN(	cmp	r2, #0			)
@@ -241,3 +248,24 @@
 17:		forward_copy_shift	pull=16	push=16
 
 18:		forward_copy_shift	pull=24	push=8
+
+
+/*
+ * Abort preamble and completion macros.
+ * If a fixup handler is required then those macros must surround it.
+ * It is assumed that the fixup code will handle the private part of
+ * the exit macro.
+ */
+
+	.macro	copy_abort_preamble
+19:	ldmfd	sp!, {r5 - r9}
+	b	21f
+20:	ldmfd	sp!, {r5 - r8}
+21:
+	.endm
+
+	.macro	copy_abort_end
+	ldmfd	sp!, {r4, pc}
+	.endm
+
+ENDPROC(memcpy)