sh: Fix OF_SEPARATE support

If the OF_SEPARATE is enabled, the DT is appended past the _end symbol.
The current code however clears BSS very early, which overwrites the DT
blob with zeroes. Moreover, the early code relocates U-Boot into RAM to
the correct location, but does not relocate the DT.

This patch adds code to relocate the DT and avoids clearing BSS too
early, thus addressing both problems with OF_SEPARATE on SH.

Signed-off-by: Marek Vasut <marek.vasut+renesas@gmail.com>
Cc: Nobuhiro Iwamatsu <iwamatsu@nigauri.org>
diff --git a/arch/sh/cpu/u-boot.lds b/arch/sh/cpu/u-boot.lds
index 7b225a6..47302da 100644
--- a/arch/sh/cpu/u-boot.lds
+++ b/arch/sh/cpu/u-boot.lds
@@ -75,6 +75,7 @@
 
 	PROVIDE (__init_end = .);
 	PROVIDE (reloc_dst_end = .);
+	PROVIDE (_end = .);
 
 	PROVIDE (bss_start = .);
 	PROVIDE (__bss_start = .);
diff --git a/arch/sh/lib/start.S b/arch/sh/lib/start.S
index f5350b9..f9f26d3 100644
--- a/arch/sh/lib/start.S
+++ b/arch/sh/lib/start.S
@@ -22,6 +22,17 @@
 	mov.l	._reloc_dst, r4
 	add	#(_start-1b), r5
 	mov.l	._reloc_dst_end, r6
+#ifdef CONFIG_OF_SEPARATE
+	mov.l	._reloc_size, r0
+	add	r5, r0
+	add	#4, r0
+	mov.l	@r0, r0
+	swap.b	r0, r0
+	swap.w	r0, r0
+	swap.b	r0, r0
+	add	#4, r0
+	add	r0, r6
+#endif
 
 2:	mov.l	@r5+, r1
 	mov.l	r1, @r4
@@ -29,6 +40,7 @@
 	cmp/hs	r6, r4
 	bf	2b
 
+#ifndef CONFIG_OF_SEPARATE
 	mov.l	._bss_start, r4
 	mov.l	._bss_end, r5
 	mov	#0, r1
@@ -37,6 +49,7 @@
 	add	#4, r4
 	cmp/hs	r5, r4
 	bf	3b
+#endif
 
 	mov.l	._gd_init, r13		/* global data */
 	mov.l	._stack_init, r15	/* stack */
@@ -53,6 +66,7 @@
 ._lowlevel_init:	.long	(lowlevel_init - (100b + 4))
 ._reloc_dst:		.long	_start
 ._reloc_dst_end:	.long	reloc_dst_end
+._reloc_size:		.long	(_end - _start)
 ._bss_start:		.long	bss_start
 ._bss_end:		.long	bss_end
 ._gd_init:		.long	(_start - GENERATED_GBL_DATA_SIZE)