arm: Add a 64-bit division routine to the private library

This is missing, with causes lldiv() to fail on boards with use the private
libgcc. Add the missing routine.

Code is available for using the CLZ instruction but it is not enabled at
present.

This comes from coreboot version 4.0.

Signed-off-by: Simon Glass <sjg@chromium.org>
diff --git a/arch/arm/lib/_uldivmod.S b/arch/arm/lib/_uldivmod.S
new file mode 100644
index 0000000..426c2f2
--- /dev/null
+++ b/arch/arm/lib/_uldivmod.S
@@ -0,0 +1,245 @@
+/*
+ * Copyright 2010, Google Inc.
+ *
+ * Brought in from coreboot uldivmod.S
+ *
+ * SPDX-License-Identifier:     GPL-2.0
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+/* We don't use Thumb instructions for now */
+#define ARM(x...)	x
+#define THUMB(x...)
+
+/*
+ * A, Q = r0 + (r1 << 32)
+ * B, R = r2 + (r3 << 32)
+ * A / B = Q ... R
+ */
+
+A_0	.req	r0
+A_1	.req	r1
+B_0	.req	r2
+B_1	.req	r3
+C_0	.req	r4
+C_1	.req	r5
+D_0	.req	r6
+D_1	.req	r7
+
+Q_0	.req	r0
+Q_1	.req	r1
+R_0	.req	r2
+R_1	.req	r3
+
+THUMB(
+TMP	.req	r8
+)
+
+ENTRY(__aeabi_uldivmod)
+	stmfd	sp!, {r4, r5, r6, r7, THUMB(TMP,) lr}
+	@ Test if B == 0
+	orrs	ip, B_0, B_1		@ Z set -> B == 0
+	beq	L_div_by_0
+	@ Test if B is power of 2: (B & (B - 1)) == 0
+	subs	C_0, B_0, #1
+	sbc	C_1, B_1, #0
+	tst	C_0, B_0
+	tsteq	B_1, C_1
+	beq	L_pow2
+	@ Test if A_1 == B_1 == 0
+	orrs	ip, A_1, B_1
+	beq	L_div_32_32
+
+L_div_64_64:
+/* CLZ only exists in ARM architecture version 5 and above. */
+#ifdef HAVE_CLZ
+	mov	C_0, #1
+	mov	C_1, #0
+	@ D_0 = clz A
+	teq	A_1, #0
+	clz	D_0, A_1
+	clzeq	ip, A_0
+	addeq	D_0, D_0, ip
+	@ D_1 = clz B
+	teq	B_1, #0
+	clz	D_1, B_1
+	clzeq	ip, B_0
+	addeq	D_1, D_1, ip
+	@ if clz B - clz A > 0
+	subs	D_0, D_1, D_0
+	bls	L_done_shift
+	@ B <<= (clz B - clz A)
+	subs	D_1, D_0, #32
+	rsb	ip, D_0, #32
+	movmi	B_1, B_1, lsl D_0
+ARM(	orrmi	B_1, B_1, B_0, lsr ip	)
+THUMB(	lsrmi	TMP, B_0, ip		)
+THUMB(	orrmi	B_1, B_1, TMP		)
+	movpl	B_1, B_0, lsl D_1
+	mov	B_0, B_0, lsl D_0
+	@ C = 1 << (clz B - clz A)
+	movmi	C_1, C_1, lsl D_0
+ARM(	orrmi	C_1, C_1, C_0, lsr ip	)
+THUMB(	lsrmi	TMP, C_0, ip		)
+THUMB(	orrmi	C_1, C_1, TMP		)
+	movpl	C_1, C_0, lsl D_1
+	mov	C_0, C_0, lsl D_0
+L_done_shift:
+	mov	D_0, #0
+	mov	D_1, #0
+	@ C: current bit; D: result
+#else
+	@ C: current bit; D: result
+	mov	C_0, #1
+	mov	C_1, #0
+	mov	D_0, #0
+	mov	D_1, #0
+L_lsl_4:
+	cmp	B_1, #0x10000000
+	cmpcc	B_1, A_1
+	cmpeq	B_0, A_0
+	bcs	L_lsl_1
+	@ B <<= 4
+	mov	B_1, B_1, lsl #4
+	orr	B_1, B_1, B_0, lsr #28
+	mov	B_0, B_0, lsl #4
+	@ C <<= 4
+	mov	C_1, C_1, lsl #4
+	orr	C_1, C_1, C_0, lsr #28
+	mov	C_0, C_0, lsl #4
+	b	L_lsl_4
+L_lsl_1:
+	cmp	B_1, #0x80000000
+	cmpcc	B_1, A_1
+	cmpeq	B_0, A_0
+	bcs	L_subtract
+	@ B <<= 1
+	mov	B_1, B_1, lsl #1
+	orr	B_1, B_1, B_0, lsr #31
+	mov	B_0, B_0, lsl #1
+	@ C <<= 1
+	mov	C_1, C_1, lsl #1
+	orr	C_1, C_1, C_0, lsr #31
+	mov	C_0, C_0, lsl #1
+	b	L_lsl_1
+#endif
+L_subtract:
+	@ if A >= B
+	cmp	A_1, B_1
+	cmpeq	A_0, B_0
+	bcc	L_update
+	@ A -= B
+	subs	A_0, A_0, B_0
+	sbc	A_1, A_1, B_1
+	@ D |= C
+	orr	D_0, D_0, C_0
+	orr	D_1, D_1, C_1
+L_update:
+	@ if A == 0: break
+	orrs	ip, A_1, A_0
+	beq	L_exit
+	@ C >>= 1
+	movs	C_1, C_1, lsr #1
+	movs	C_0, C_0, rrx
+	@ if C == 0: break
+	orrs	ip, C_1, C_0
+	beq	L_exit
+	@ B >>= 1
+	movs	B_1, B_1, lsr #1
+	mov	B_0, B_0, rrx
+	b	L_subtract
+L_exit:
+	@ Note: A, B & Q, R are aliases
+	mov	R_0, A_0
+	mov	R_1, A_1
+	mov	Q_0, D_0
+	mov	Q_1, D_1
+	ldmfd	sp!, {r4, r5, r6, r7, THUMB(TMP,) pc}
+
+L_div_32_32:
+	@ Note:	A_0 &	r0 are aliases
+	@	Q_1	r1
+	mov	r1, B_0
+	bl	__aeabi_uidivmod
+	mov	R_0, r1
+	mov	R_1, #0
+	mov	Q_1, #0
+	ldmfd	sp!, {r4, r5, r6, r7, THUMB(TMP,) pc}
+
+L_pow2:
+#ifdef HAVE_CLZ
+	@ Note: A, B and Q, R are aliases
+	@ R = A & (B - 1)
+	and	C_0, A_0, C_0
+	and	C_1, A_1, C_1
+	@ Q = A >> log2(B)
+	@ Note: B must not be 0 here!
+	clz	D_0, B_0
+	add	D_1, D_0, #1
+	rsbs	D_0, D_0, #31
+	bpl	L_1
+	clz	D_0, B_1
+	rsb	D_0, D_0, #31
+	mov	A_0, A_1, lsr D_0
+	add	D_0, D_0, #32
+L_1:
+	movpl	A_0, A_0, lsr D_0
+ARM(	orrpl	A_0, A_0, A_1, lsl D_1	)
+THUMB(	lslpl	TMP, A_1, D_1		)
+THUMB(	orrpl	A_0, A_0, TMP		)
+	mov	A_1, A_1, lsr D_0
+	@ Mov back C to R
+	mov	R_0, C_0
+	mov	R_1, C_1
+	ldmfd	sp!, {r4, r5, r6, r7, THUMB(TMP,) pc}
+#else
+	@ Note: A, B and Q, R are aliases
+	@ R = A & (B - 1)
+	and	C_0, A_0, C_0
+	and	C_1, A_1, C_1
+	@ Q = A >> log2(B)
+	@ Note: B must not be 0 here!
+	@ Count the leading zeroes in B.
+	mov	D_0, #0
+	orrs	B_0, B_0, B_0
+	@ If B is greater than 1 << 31, divide A and B by 1 << 32.
+	moveq	A_0, A_1
+	moveq	A_1, #0
+	moveq	B_0, B_1
+	@ Count the remaining leading zeroes in B.
+	movs	B_1, B_0, lsl #16
+	addeq	D_0, #16
+	moveq	B_0, B_0, lsr #16
+	tst	B_0, #0xff
+	addeq	D_0, #8
+	moveq	B_0, B_0, lsr #8
+	tst	B_0, #0xf
+	addeq	D_0, #4
+	moveq	B_0, B_0, lsr #4
+	tst	B_0, #0x3
+	addeq	D_0, #2
+	moveq	B_0, B_0, lsr #2
+	tst	B_0, #0x1
+	addeq	D_0, #1
+	@ Shift A to the right by the appropriate amount.
+	rsb	D_1, D_0, #32
+	mov	Q_0, A_0, lsr D_0
+	orr	Q_0, A_1, lsl D_1
+	mov	Q_1, A_1, lsr D_0
+	@ Move C to R
+	mov	R_0, C_0
+	mov	R_1, C_1
+	ldmfd	sp!, {r4, r5, r6, r7, THUMB(TMP,) pc}
+#endif
+
+L_div_by_0:
+	bl	__div0
+	@ As wrong as it could be
+	mov	Q_0, #0
+	mov	Q_1, #0
+	mov	R_0, #0
+	mov	R_1, #0
+	ldmfd	sp!, {r4, r5, r6, r7, THUMB(TMP,) pc}
+ENDPROC(__aeabi_uldivmod)