PXA: Rework start.S to be closer to other ARMs

The start.S on PXA was very obscure. Rework it so that it is once again close
to the arm1136 start.S and the start.S files of the other ARM CPUs.

Signed-off-by: Marek Vasut <marek.vasut@gmail.com>
Cc: Albert ARIBAUD <albert.u.boot@aribaud.net>

V2: Don't compile in relocation support if building SPL
diff --git a/arch/arm/cpu/pxa/cpu.c b/arch/arm/cpu/pxa/cpu.c
index df351c7..c48b2ef 100644
--- a/arch/arm/cpu/pxa/cpu.c
+++ b/arch/arm/cpu/pxa/cpu.c
@@ -328,3 +328,19 @@
 	writel(readl(CKEN) | CKEN14_I2C, CKEN);
 #endif
 }
+
+void reset_cpu(ulong ignored) __attribute__((noreturn));
+/* Soft-reset the CPU via an OS-timer watchdog timeout; the argument is unused. */
+void reset_cpu(ulong ignored)
+{
+	uint32_t tmp;
+
+	setbits_le32(OWER, OWER_WME);	/* set WME: watchdog enable */
+	/* Let match register 3 hit 0x1000 counts after the current OSCR value. */
+	tmp = readl(OSCR);
+	tmp += 0x1000;
+	writel(tmp, OSMR3);
+	/* Nothing more to do: spin here, this function never returns. */
+	for (;;)
+		;
+}
diff --git a/arch/arm/cpu/pxa/start.S b/arch/arm/cpu/pxa/start.S
index 6191a73..88a4cc2 100644
--- a/arch/arm/cpu/pxa/start.S
+++ b/arch/arm/cpu/pxa/start.S
@@ -1,14 +1,20 @@
 /*
- *  armboot - Startup Code for XScale
+ *  armboot - Startup Code for XScale CPU-core
  *
  *  Copyright (C) 1998	Dan Malek <dmalek@jlc.net>
  *  Copyright (C) 1999	Magnus Damm <kieraypc01.p.y.kie.era.ericsson.se>
  *  Copyright (C) 2000	Wolfgang Denk <wd@denx.de>
  *  Copyright (C) 2001	Alex Zuepke <azu@sysgo.de>
+ *  Copyright (C) 2001	Marius Groger <mag@sysgo.de>
+ *  Copyright (C) 2002	Alex Zupke <azu@sysgo.de>
+ *  Copyright (C) 2002	Gary Jennejohn <garyj@denx.de>
  *  Copyright (C) 2002	Kyle Harris <kharris@nexus-tech.net>
- *  Copyright (C) 2003	Robert Schwebel <r.schwebel@pengutronix.de>
  *  Copyright (C) 2003	Kai-Uwe Bloem <kai-uwe.bloem@auerswald.de>
- *  Copyright (c) 2010	Marek Vasut <marek.vasut@gmail.com>
+ *  Copyright (C) 2003	Kshitij <kshitij@ti.com>
+ *  Copyright (C) 2003	Richard Woodruff <r-woodruff2@ti.com>
+ *  Copyright (C) 2003	Robert Schwebel <r.schwebel@pengutronix.de>
+ *  Copyright (C) 2004	Texas Instruments <r-woodruff2@ti.com>
+ *  Copyright (C) 2010	Marek Vasut <marek.vasut@gmail.com>
  *
  * See file CREDITS for list of people who contributed to this
  * project.
@@ -32,15 +38,6 @@
 #include <asm-offsets.h>
 #include <config.h>
 #include <version.h>
-#include <asm/arch/pxa-regs.h>
-
-/* takes care the CP15 update has taken place */
-.macro CPWAIT reg
-mrc  p15,0,\reg,c2,c0,0
-mov  \reg,\reg
-sub  pc,pc,#4
-.endm
-
 .globl _start
 _start: b	reset
 #ifdef CONFIG_SPL_BUILD
@@ -77,26 +74,38 @@
 _not_used:		.word not_used
 _irq:			.word irq
 _fiq:			.word fiq
+_pad:			.word 0x12345678 /* now 16*4=64 */
 #endif	/* CONFIG_SPL_BUILD */
+.global _end_vect
+_end_vect:
 
 	.balignl 16,0xdeadbeef
-
-
 /*
+ *************************************************************************
+ *
  * Startup Code (reset vector)
  *
- * do important init only if we don't start from RAM!
- * - relocate armboot to RAM
- * - setup stack
- * - jump to second stage
+ * do important init only if we don't start from memory!
+ * setup Memory and board specific bits prior to relocation.
+ * relocate armboot to ram
+ * setup stack
+ *
+ *************************************************************************
  */
 
 .globl _TEXT_BASE
 _TEXT_BASE:
+#ifdef	CONFIG_SPL_BUILD
+	.word	CONFIG_SPL_TEXT_BASE
+#else
 	.word	CONFIG_SYS_TEXT_BASE
+#endif
 
 /*
  * These are defined in the board-specific linker script.
+ * Subtracting _start from them lets the linker put their
+ * relative position in the executable instead of leaving
+ * them null.
  */
 .globl _bss_start_ofs
 _bss_start_ofs:
@@ -120,9 +129,8 @@
 .globl FIQ_STACK_START
 FIQ_STACK_START:
 	.word 0x0badc0de
-#endif /* CONFIG_USE_IRQ */
+#endif
 
-#ifndef CONFIG_SPL_BUILD
 /* IRQ stack memory (calculated at run-time) + 8 bytes */
 .globl IRQ_STACK_START_IN
 IRQ_STACK_START_IN:
@@ -141,95 +149,19 @@
 	orr	r0,r0,#0xd3
 	msr	cpsr,r0
 
-	/*
-	 * Enable MMU to use DCache as DRAM
-	 */
-	/* Domain access -- enable for all CPs */
-	ldr	r0, =0x0000ffff
-	mcr	p15, 0, r0, c3, c0, 0
-
-	/* Point TTBR to MMU table */
-	ldr	r0, =mmu_table
-	adr	r2, _start
-	orr	r0, r2
-	mcr	p15, 0, r0, c2, c0, 0
-
-/* !!! Hereby, check if the code is running from SRAM !!! */
-/* If the code is running from SRAM, alias SRAM to 0x0 to simulate NOR. The code
- * is linked to 0x0 too, so this makes things easier. */
-	cmp	r2, #0x5c000000
-
-	ldreq	r1, [r0]
-	orreq	r1, r2
-	streq	r1, [r0]
-
-	/* Kick in MMU, ICache, DCache, BTB */
-	mrc	p15, 0, r0, c1, c0, 0
-	bic	r0, #0x1b00
-	bic	r0, #0x0087
-	orr	r0, #0x1800
-	orr	r0, #0x0005
-	mcr	p15, 0, r0, c1, c0, 0
-	CPWAIT	r0
-
-	/* Unlock Icache, Dcache */
-	mcr	p15, 0, r0, c9, c1, 1
-	mcr	p15, 0, r0, c9, c2, 1
-
-	/* Flush Icache, Dcache, BTB */
-	mcr	p15, 0, r0, c7, c7, 0
-
-	/* Unlock I-TLB, D-TLB */
-	mcr	p15, 0, r0, c10, c4, 1
-	mcr	p15, 0, r0, c10, c8, 1
-
-	/* Flush TLB */
-	mcr	p15, 0, r0, c8, c7, 0
-	/* Allocate 4096 bytes of Dcache as RAM */
-
-	/* Drain pending loads and stores */
-	mcr	p15, 0, r0, c7, c10, 4
-
-	mov	r4, #0x00
-	mov	r5, #0x00
-	mov	r2, #0x01
-	mcr	p15, 0, r0, c9, c2, 0
-	CPWAIT	r0
-
-	/* 128 lines reserved (128 x 32bytes = 4096 bytes total) */
-	mov	r0, #128
-	mov	r1, #0xa0000000
-alloc:
-	mcr	p15, 0, r1, c7, c2, 5
-	/* Drain pending loads and stores */
-	mcr	p15, 0, r0, c7, c10, 4
-	strd	r4, [r1], #8
-	strd	r4, [r1], #8
-	strd	r4, [r1], #8
-	strd	r4, [r1], #8
-	subs	r0, #0x01
-	bne	alloc
-	/* Drain pending loads and stores */
-	mcr	p15, 0, r0, c7, c10, 4
-	mov	r2, #0x00
-	mcr	p15, 0, r2, c9, c2, 0
-	CPWAIT	r0
-
-	/* Jump to 0x0 ( + offset) if running from SRAM */
-	adr	r0, zerojmp
-	bic	r0, #0x5c000000
-	mov	pc, r0
-zerojmp:
+#ifndef CONFIG_SKIP_LOWLEVEL_INIT
+	bl  cpu_init_crit
+#endif
 
 /* Set stackpointer in internal RAM to call board_init_f */
 call_board_init_f:
 	ldr	sp, =(CONFIG_SYS_INIT_SP_ADDR)
 	bic	sp, sp, #7 /* 8-byte alignment for ABI compliance */
-	ldr	r0,=0x00000000
+	ldr	r0, =0x00000000
 	bl	board_init_f
 
 /*------------------------------------------------------------------------------*/
-
+#ifndef CONFIG_SPL_BUILD
 /*
  * void relocate_code (addr_sp, gd, addr_moni)
  *
@@ -254,13 +186,11 @@
 	ldr	r3, _bss_start_ofs
 	add	r2, r0, r3		/* r2 <- source end address	    */
 
-	stmfd sp!, {r0-r12}
 copy_loop:
-	ldmia	r0!, {r3-r5, r7-r11}	/* copy from source address [r0]    */
-	stmia	r1!, {r3-r5, r7-r11}	/* copy to   target address [r1]    */
+	ldmia	r0!, {r9-r10}		/* copy from source address [r0]    */
+	stmia	r1!, {r9-r10}		/* copy to   target address [r1]    */
 	cmp	r0, r2			/* until source end address [r2]    */
 	blo	copy_loop
-	ldmfd sp!, {r0-r12}
 
 #ifndef CONFIG_SPL_BUILD
 	/*
@@ -275,13 +205,13 @@
 	ldr	r3, _rel_dyn_end_ofs	/* r3 <- rel dyn end ofs */
 	add	r3, r3, r0		/* r3 <- rel dyn end in FLASH */
 fixloop:
-	ldr	r0, [r2]	/* r0 <- location to fix up, IN FLASH! */
-	add	r0, r9		/* r0 <- location to fix up in RAM */
+	ldr	r0, [r2]		/* r0 <- location to fix up, IN FLASH! */
+	add	r0, r0, r9		/* r0 <- location to fix up in RAM */
 	ldr	r1, [r2, #4]
 	and	r7, r1, #0xff
-	cmp	r7, #23		/* relative fixup? */
+	cmp	r7, #23			/* relative fixup? */
 	beq	fixrel
-	cmp	r7, #2		/* absolute fixup? */
+	cmp	r7, #2			/* absolute fixup? */
 	beq	fixabs
 	/* ignore unknown type of fixup */
 	b	fixnext
@@ -298,10 +228,10 @@
 	add	r1, r1, r9
 fixnext:
 	str	r1, [r0]
-	add	r2, r2, #8	/* each rel.dyn entry is 8 bytes */
+	add	r2, r2, #8		/* each rel.dyn entry is 8 bytes */
 	cmp	r2, r3
 	blo	fixloop
-#endif	/* #ifndef CONFIG_SPL_BUILD */
+#endif
 
 clear_bss:
 #ifndef CONFIG_SPL_BUILD
@@ -322,15 +252,16 @@
  * We are done. Do not return, instead branch to second part of board
  * initialization, now running from RAM.
  */
-#ifdef CONFIG_ONENAND_IPL
-	ldr     r0, _start_oneboot_ofs
+#ifdef CONFIG_ONENAND_SPL
+	ldr     r0, _onenand_boot_ofs
 	mov	pc, r0
 
-_start_oneboot_ofs
-	: .word start_oneboot
+_onenand_boot_ofs:
+	.word onenand_boot
 #else
+jump_2_ram:
 	ldr	r0, _board_init_r_ofs
-	adr	r1, _start
+	ldr     r1, _TEXT_BASE
 	add	lr, r0, r1
 	add	lr, lr, r9
 	/* setup parameters for board_init_r */
@@ -341,7 +272,7 @@
 
 _board_init_r_ofs:
 	.word board_init_r - _start
-#endif	/* CONFIG_ONENAND_IPL */
+#endif
 
 _rel_dyn_start_ofs:
 	.word __rel_dyn_start - _start
@@ -349,43 +280,50 @@
 	.word __rel_dyn_end - _start
 _dynsym_start_ofs:
 	.word __dynsym_start - _start
-
-#else /* CONFIG_SPL_BUILD */
-
-/****************************************************************************/
-/*									    */
-/* the actual reset code for OneNAND IPL				    */
-/*									    */
-/****************************************************************************/
-
-#ifndef	CONFIG_PXA27X
-#error OneNAND IPL is not supported on PXA25x and 26x due to lack of SRAM
 #endif
-
-reset:
-	/* Set CPU to SVC32 mode */
-	mrs	r0,cpsr
-	bic	r0,r0,#0x1f
-	orr	r0,r0,#0x13
-	msr	cpsr,r0
-
-	/* Point stack at the end of SRAM and leave 32 words for abort-stack */
-	ldr	sp, =0x5c03ff80
+/*
+ *************************************************************************
+ *
+ * CPU_init_critical registers
+ *
+ * invalidate the I/D caches, the BTB and the TLB
+ * turn off MMU and D-cache, turn on I-cache and alignment checking
+ *
+ *************************************************************************
+ */
+#ifndef CONFIG_SKIP_LOWLEVEL_INIT
+cpu_init_crit:
+	/*
+	 * invalidate caches and TLB (r0 is clobbered by this routine)
+	 */
+	mov	r0, #0
+	mcr	p15, 0, r0, c7, c7, 0	/* Invalidate I+D+BTB caches */
+	mcr	p15, 0, r0, c8, c7, 0	/* Invalidate Unified TLB */
 
-	/* Start OneNAND IPL */
-	ldr	pc, =start_oneboot
+	/*
+	 * read-modify-write the CP15 control register (c1)
+	 */
+	mrc	p15, 0, r0, c1, c0, 0
+	bic	r0, r0, #0x00002300	@ clear bits 13, 9:8 (--V- --RS)
+	bic	r0, r0, #0x00000087	@ clear bits 7, 2:0 (B--- -CAM)
+	orr	r0, r0, #0x00000002	@ set bit 1 (A) Align
+	orr	r0, r0, #0x00001000	@ set bit 12 (I) I-Cache
+	mcr	p15, 0, r0, c1, c0, 0
 
-#endif /* CONFIG_SPL_BUILD */
+	mov	pc, lr		/* back to my caller */
+#endif /* CONFIG_SKIP_LOWLEVEL_INIT */
 
 #ifndef CONFIG_SPL_BUILD
-/****************************************************************************/
-/*									    */
-/* Interrupt handling							    */
-/*									    */
-/****************************************************************************/
-
-/* IRQ stack frame							    */
-
+/*
+ *************************************************************************
+ *
+ * Interrupt handling
+ *
+ *************************************************************************
+ */
+@
+@ IRQ stack frame.
+@
 #define S_FRAME_SIZE	72
 
 #define S_OLD_R0	68
@@ -409,37 +347,36 @@
 #define S_R0		0
 
 #define MODE_SVC 0x13
+#define I_BIT	 0x80
 
-	/* use bad_save_user_regs for abort/prefetch/undef/swi ...	    */
+/*
+ * use bad_save_user_regs for abort/prefetch/undef/swi ...
+ * use irq_save_user_regs / irq_restore_user_regs for IRQ/FIQ handling
+ */
 
 	.macro	bad_save_user_regs
-	sub	sp, sp, #S_FRAME_SIZE
-	stmia	sp, {r0 - r12}			/* Calling r0-r12	    */
-	add	r8, sp, #S_PC
+	sub	sp, sp, #S_FRAME_SIZE		@ carve out a frame on current user stack
+	stmia	sp, {r0 - r12}			@ Save user registers (now in svc mode) r0-r12
 
-	ldr	r2, IRQ_STACK_START_IN
-	ldmia	r2, {r2 - r4}			/* get pc, cpsr, old_r0	    */
-	add	r0, sp, #S_FRAME_SIZE		/* restore sp_SVC	    */
+	ldr	r2, IRQ_STACK_START_IN		@ set base 2 words into abort stack
+	ldmia	r2, {r2 - r3}			@ get values for "aborted" pc and cpsr (into parm regs)
+	add	r0, sp, #S_FRAME_SIZE		@ grab pointer to old stack
 
 	add	r5, sp, #S_SP
 	mov	r1, lr
-	stmia	r5, {r0 - r4}			/* save sp_SVC, lr_SVC, pc, cpsr, old_r */
-	mov	r0, sp
+	stmia	r5, {r0 - r3}			@ save sp_SVC, lr_SVC, pc, cpsr
+	mov	r0, sp				@ save current stack into r0 (param register)
 	.endm
 
-
-	/* use irq_save_user_regs / irq_restore_user_regs for		     */
-	/* IRQ/FIQ handling						     */
-
 	.macro	irq_save_user_regs
 	sub	sp, sp, #S_FRAME_SIZE
-	stmia	sp, {r0 - r12}			/* Calling r0-r12	     */
-	add	r8, sp, #S_PC
-	stmdb	r8, {sp, lr}^			/* Calling SP, LR	     */
-	str	lr, [r8, #0]			/* Save calling PC	     */
+	stmia	sp, {r0 - r12}			@ Calling r0-r12
+	add	r8, sp, #S_PC			@ !!!! R8 NEEDS to be saved !!!! a reserved stack spot would be good.
+	stmdb	r8, {sp, lr}^			@ Calling SP, LR
+	str	lr, [r8, #0]			@ Save calling PC
 	mrs	r6, spsr
-	str	r6, [r8, #4]			/* Save CPSR		     */
-	str	r0, [r8, #8]			/* Save OLD_R0		     */
+	str	r6, [r8, #4]			@ Save CPSR
+	str	r0, [r8, #8]			@ Save OLD_R0
 	mov	r0, sp
 	.endm
 
@@ -452,16 +389,28 @@
 	.endm
 
 	.macro get_bad_stack
-	ldr	r13, IRQ_STACK_START_IN		@ setup our mode stack
+	ldr	r13, IRQ_STACK_START_IN		@ setup our mode stack (enter in banked mode)
 
-	str	lr, [r13]			@ save caller lr / spsr
-	mrs	lr, spsr
-	str	lr, [r13, #4]
+	str	lr, [r13]			@ save caller lr in position 0 of saved stack
+	mrs	lr, spsr			@ get the spsr
+	str	lr, [r13, #4]			@ save spsr in position 1 of saved stack
 
 	mov	r13, #MODE_SVC			@ prepare SVC-Mode
-	msr	spsr_c, r13
-	mov	lr, pc
-	movs	pc, lr
+	@ msr	spsr_c, r13
+	msr	spsr, r13			@ switch modes, make sure moves will execute
+	mov	lr, pc				@ capture return pc
+	movs	pc, lr				@ jump to next instruction & switch modes.
+	.endm
+
+	@ SWI entry: store lr and spsr at IRQ_STACK_START_IN; r0, lr and sp are preserved
+	.macro get_bad_stack_swi
+	str	r0, [r13, #-4]!			@ push r0: scratch slot on the current stack
+	ldr	r0, IRQ_STACK_START_IN		@ get data regions start
+	str	lr, [r0]			@ save caller lr in position 0 of saved stack
+	mrs	lr, spsr			@ get the spsr (in lr, r0 must keep the base)
+	str	lr, [r0, #4]			@ save spsr in position 1 of saved stack
+	ldr	lr, [r0]			@ restore lr, bad_save_user_regs reads it
+	ldr	r0, [r13], #4			@ restore r0 and pop the stack entry
+	.endm
 
 	.macro get_irq_stack			@ setup IRQ stack
@@ -471,21 +420,17 @@
 	.macro get_fiq_stack			@ setup FIQ stack
 	ldr	sp, FIQ_STACK_START
 	.endm
-#endif	/* CONFIG_SPL_BUILD
-
-
-/****************************************************************************/
-/*									    */
-/* exception handlers							    */
-/*									    */
-/****************************************************************************/
+#endif	/* CONFIG_SPL_BUILD */
 
+/*
+ * exception handlers
+ */
 #ifdef CONFIG_SPL_BUILD
 	.align	5
 do_hang:
-	ldr	sp, _TEXT_BASE			/* use 32 words abort stack */
+	ldr	sp, _TEXT_BASE			/* use 32 words abort stack */
 	bl	hang				/* hang and never return */
-#else
+#else	/* !CONFIG_SPL_BUILD */
 	.align	5
 undefined_instruction:
 	get_bad_stack
@@ -494,7 +439,7 @@
 
 	.align	5
 software_interrupt:
-	get_bad_stack
+	get_bad_stack_swi
 	bad_save_user_regs
 	bl	do_software_interrupt
 
@@ -528,11 +473,12 @@
 	.align	5
 fiq:
 	get_fiq_stack
-	irq_save_user_regs		/* someone ought to write a more    */
-	bl	do_fiq			/* effiction fiq_save_user_regs	    */
+	/* someone ought to write a more efficient fiq_save_user_regs */
+	irq_save_user_regs
+	bl	do_fiq
 	irq_restore_user_regs
 
-#else /* !CONFIG_USE_IRQ */
+#else
 
 	.align	5
 irq:
@@ -545,63 +491,7 @@
 	get_bad_stack
 	bad_save_user_regs
 	bl	do_fiq
-#endif	/* CONFIG_SPL_BUILD */
-#endif /* CONFIG_USE_IRQ */
-
-/****************************************************************************/
-/*									    */
-/* Reset function: the PXA250 doesn't have a reset function, so we have to  */
-/* perform a watchdog timeout for a soft reset.				    */
-/*									    */
-/****************************************************************************/
-/* Operating System Timer */
-.align	5
-.globl reset_cpu
-
-	/* FIXME: this code is PXA250 specific. How is this handled on	    */
-	/*	  other XScale processors?				    */
-
-reset_cpu:
-
-	/* We set OWE:WME (watchdog enable) and wait until timeout happens  */
 
-	ldr	r0, =OWER
-	ldr	r1, [r0]
-	orr	r1, r1, #0x0001			/* bit0: WME		    */
-	str	r1, [r0]
-
-	/* OS timer does only wrap every 1165 seconds, so we have to set    */
-	/* the match register as well.					    */
-
-	ldr	r0, =OSCR
-	ldr	r1, [r0]			/* read OS timer	    */
-	add	r1, r1, #0x800			/* let OSMR3 match after    */
-	add	r1, r1, #0x800			/* 4096*(1/3.6864MHz)=1ms   */
-	ldr	r0, =OSMR3
-	str	r1, [r0]
-
-reset_endless:
-
-	b	reset_endless
-
-#ifndef CONFIG_SPL_BUILD
-.section .mmudata, "a"
-	.align	14
-	.globl	mmu_table
-mmu_table:
-	/* 0x00000000 - 0xa0000000 : 1:1, uncached mapping */
-	.set	__base, 0
-	.rept	0xa00
-	.word	(__base << 20) | 0xc12
-	.set	__base, __base + 1
-	.endr
-
-	/* 0xa0000000 - 0xa0100000 : 1:1, cached mapping */
-	.word	(0xa00 << 20) | 0x1c1e
-
-	.set	__base, 0xa01
-	.rept	0x1000 - 0xa01
-	.word	(__base << 20) | 0xc12
-	.set	__base, __base + 1
-	.endr
+#endif
+	.align 5
 #endif	/* CONFIG_SPL_BUILD */