x86: Use loops instead of memcpy/memset in board_init_f

Provides a small speed increase and prepares for a fully relocatable image.
The downside is that TEXT_BASE, bss, load address, etc. must ALL be aligned on
a 4-byte boundary, which is not such a terrible restriction as everything
is already 4-byte aligned anyway.
diff --git a/arch/i386/lib/board.c b/arch/i386/lib/board.c
index 3e89ef4..9c2f77f 100644
--- a/arch/i386/lib/board.c
+++ b/arch/i386/lib/board.c
@@ -53,7 +53,7 @@
 extern ulong __rel_dyn_start;
 extern ulong __rel_dyn_end;
 extern ulong __bss_start;
-extern ulong __bss_size;
+extern ulong __bss_end;
 
 const char version_string[] =
 	U_BOOT_VERSION" (" U_BOOT_DATE " - " U_BOOT_TIME ")";
@@ -172,18 +172,22 @@
 {
 	void *text_start = &__text_start;
 	void *data_end = &__data_end;
-	Elf32_Rel *rel_dyn_start = (Elf32_Rel *)&__rel_dyn_start;
-	Elf32_Rel *rel_dyn_end = (Elf32_Rel *)&__rel_dyn_end;
+	void *rel_dyn_start = &__rel_dyn_start;
+	void *rel_dyn_end = &__rel_dyn_end;
 	void *bss_start = &__bss_start;
-	ulong bss_size = (ulong)&__bss_size;
+	void *bss_end = &__bss_end;
 
-	ulong uboot_size;
+	ulong *dst_addr;
+	ulong *src_addr;
+	ulong *end_addr;
+
 	void *dest_addr;
 	ulong rel_offset;
-	Elf32_Rel *re;
+	Elf32_Rel *re_src;
+	Elf32_Rel *re_end;
 
-	uboot_size = (ulong)data_end - (ulong)text_start;
-	dest_addr  = (void *)gdp - (uboot_size + (ulong)bss_size);
+	/* Calculate destination RAM Address and relocation offset */
+	dest_addr  = (void *)gdp - (bss_end - text_start);
 	rel_offset = text_start - dest_addr;
 
 	/* First stage CPU initialization */
@@ -195,18 +199,29 @@
 		hang();
 
 	/* Copy U-Boot into RAM */
-	memcpy(dest_addr, text_start, uboot_size);
+	dst_addr = (ulong *)dest_addr;
+	src_addr = (ulong *)text_start;
+	end_addr = (ulong *)data_end;
+
+	while (src_addr < end_addr)
+		*dst_addr++ = *src_addr++;
 
 	/* Clear BSS */
-	memset(bss_start - rel_offset,	0, bss_size);
+	dst_addr = (ulong *)(bss_start - rel_offset);
+	end_addr = (ulong *)(bss_end - rel_offset);
+
+	while (dst_addr < end_addr)
+		*dst_addr++ = 0x00000000;
 
 	/* Perform relocation adjustments */
-	for (re = rel_dyn_start; re < rel_dyn_end; re++)
-	{
-		if (re->r_offset >= TEXT_BASE)
-			if (*(ulong *)re->r_offset >= TEXT_BASE)
-				*(ulong *)(re->r_offset - rel_offset) -= (Elf32_Addr)rel_offset;
-	}
+	re_src = (Elf32_Rel *)rel_dyn_start;
+	re_end = (Elf32_Rel *)rel_dyn_end;
+
+	do {
+		if (re_src->r_offset >= TEXT_BASE)
+			if (*(Elf32_Addr *)(re_src->r_offset - rel_offset) >= TEXT_BASE)
+				*(Elf32_Addr *)(re_src->r_offset - rel_offset) -= rel_offset;
+	} while (re_src++ < re_end);
 
 	((gd_t *)gdp)->reloc_off = rel_offset;
 	((gd_t *)gdp)->flags |= GD_FLG_RELOC;