Merge branch 'master' of git://git.denx.de/u-boot-sunxi
diff --git a/arch/arm/cpu/armv8/Makefile b/arch/arm/cpu/armv8/Makefile
index 28ba786..e780afc 100644
--- a/arch/arm/cpu/armv8/Makefile
+++ b/arch/arm/cpu/armv8/Makefile
@@ -26,3 +26,4 @@
 obj-$(CONFIG_ARCH_ZYNQMP) += zynqmp/
 obj-$(CONFIG_TARGET_HIKEY) += hisilicon/
 obj-$(CONFIG_ARMV8_PSCI) += psci.o
+obj-$(CONFIG_ARCH_SUNXI) += lowlevel_init.o
diff --git a/arch/arm/cpu/armv8/cpu.c b/arch/arm/cpu/armv8/cpu.c
index 5dcb5e2..28a27f7 100644
--- a/arch/arm/cpu/armv8/cpu.c
+++ b/arch/arm/cpu/armv8/cpu.c
@@ -17,6 +17,20 @@
 #include <asm/secure.h>
 #include <linux/compiler.h>
 
+/*
+ * sdelay() - simple spin loop.
+ *
+ * Will delay execution by roughly (@loops * 2) cycles.
+ * This is necessary to be used before timers are accessible.
+ *
+ * A value of "0" will results in 2^64 loops.
+ */
+void sdelay(unsigned long loops)
+{
+	__asm__ volatile ("1:\n" "subs %0, %0, #1\n"
+			  "b.ne 1b" : "=r" (loops) : "0"(loops) : "cc");
+}
+
 int cleanup_before_linux(void)
 {
 	/*
diff --git a/arch/arm/cpu/armv8/lowlevel_init.S b/arch/arm/cpu/armv8/lowlevel_init.S
new file mode 100644
index 0000000..189e35f
--- /dev/null
+++ b/arch/arm/cpu/armv8/lowlevel_init.S
@@ -0,0 +1,44 @@
+/*
+ * A lowlevel_init function that sets up the stack to call a C function to
+ * perform further init.
+ *
+ * SPDX-License-Identifier:	GPL-2.0+
+ */
+
+#include <asm-offsets.h>
+#include <config.h>
+#include <linux/linkage.h>
+
+ENTRY(lowlevel_init)
+	/*
+	 * Setup a temporary stack. Global data is not available yet.
+	 */
+#if defined(CONFIG_SPL_BUILD) && defined(CONFIG_SPL_STACK)
+	ldr	w0, =CONFIG_SPL_STACK
+#else
+	ldr	w0, =CONFIG_SYS_INIT_SP_ADDR
+#endif
+	bic	sp, x0, #0xf	/* 16-byte alignment for ABI compliance */
+
+	/*
+	 * Save the old LR(passed in x29) and the current LR to stack
+	 */
+	stp	x29, x30, [sp, #-16]!
+
+	/*
+	 * Call the very early init function. This should do only the
+	 * absolute bare minimum to get started. It should not:
+	 *
+	 * - set up DRAM
+	 * - use global_data
+	 * - clear BSS
+	 * - try to start a console
+	 *
+	 * For boards with SPL this should be empty since SPL can do all of
+	 * this init in the SPL board_init_f() function which is called
+	 * immediately after this.
+	 */
+	bl	s_init
+	ldp	x29, x30, [sp]
+	ret
+ENDPROC(lowlevel_init)
diff --git a/arch/arm/cpu/armv8/start.S b/arch/arm/cpu/armv8/start.S
index 4f5f6d8..140609d 100644
--- a/arch/arm/cpu/armv8/start.S
+++ b/arch/arm/cpu/armv8/start.S
@@ -19,8 +19,6 @@
 
 .globl	_start
 _start:
-	b	reset
-
 #ifdef CONFIG_ENABLE_ARM_SOC_BOOT0_HOOK
 /*
  * Various SoCs need something special and SoC-specific up front in
@@ -28,7 +26,8 @@
  * use it here.
  */
 #include <asm/arch/boot0.h>
-ARM_SOC_BOOT0_HOOK
+#else
+	b	reset
 #endif
 
 	.align 3
diff --git a/arch/arm/include/asm/arch-bcm235xx/boot0.h b/arch/arm/include/asm/arch-bcm235xx/boot0.h
index 7e72882..a747bd3 100644
--- a/arch/arm/include/asm/arch-bcm235xx/boot0.h
+++ b/arch/arm/include/asm/arch-bcm235xx/boot0.h
@@ -4,12 +4,6 @@
  * SPDX-License-Identifier:	GPL-2.0+
  */
 
-#ifndef __BOOT0_H
-#define __BOOT0_H
-
 /* BOOT0 header information */
-#define ARM_SOC_BOOT0_HOOK	\
-	.word	0xbabeface;	\
+	.word	0xbabeface
 	.word	_end - _start
-
-#endif /* __BOOT0_H */
diff --git a/arch/arm/include/asm/arch-bcm281xx/boot0.h b/arch/arm/include/asm/arch-bcm281xx/boot0.h
index 7e72882..a747bd3 100644
--- a/arch/arm/include/asm/arch-bcm281xx/boot0.h
+++ b/arch/arm/include/asm/arch-bcm281xx/boot0.h
@@ -4,12 +4,6 @@
  * SPDX-License-Identifier:	GPL-2.0+
  */
 
-#ifndef __BOOT0_H
-#define __BOOT0_H
-
 /* BOOT0 header information */
-#define ARM_SOC_BOOT0_HOOK	\
-	.word	0xbabeface;	\
+	.word	0xbabeface
 	.word	_end - _start
-
-#endif /* __BOOT0_H */
diff --git a/arch/arm/include/asm/arch-sunxi/boot0.h b/arch/arm/include/asm/arch-sunxi/boot0.h
index ea5675e..9c6d82d 100644
--- a/arch/arm/include/asm/arch-sunxi/boot0.h
+++ b/arch/arm/include/asm/arch-sunxi/boot0.h
@@ -4,11 +4,36 @@
  * SPDX-License-Identifier:	GPL-2.0+
  */
 
-#ifndef __BOOT0_H
-#define __BOOT0_H
-
+#if defined(CONFIG_RESERVE_ALLWINNER_BOOT0_HEADER) && !defined(CONFIG_SPL_BUILD)
 /* reserve space for BOOT0 header information */
-#define ARM_SOC_BOOT0_HOOK	\
+	b	reset
 	.space	1532
-
-#endif /* __BOOT0_H */
+#elif defined(CONFIG_ARM_BOOT_HOOK_RMR)
+/*
+ * Switch into AArch64 if needed.
+ * Refer to arch/arm/mach-sunxi/rmr_switch.S for the original source.
+ */
+	tst     x0, x0                  // this is "b #0x84" in ARM
+	b       reset
+	.space  0x7c
+	.word	0xe59f1024	// ldr     r1, [pc, #36] ; 0x170000a0
+	.word	0xe59f0024	// ldr     r0, [pc, #36] ; CONFIG_*_TEXT_BASE
+	.word	0xe5810000	// str     r0, [r1]
+	.word	0xf57ff04f	// dsb     sy
+	.word	0xf57ff06f	// isb     sy
+	.word	0xee1c0f50	// mrc     15, 0, r0, cr12, cr0, {2} ; RMR
+	.word	0xe3800003	// orr     r0, r0, #3
+	.word	0xee0c0f50	// mcr     15, 0, r0, cr12, cr0, {2} ; RMR
+	.word	0xf57ff06f	// isb     sy
+	.word	0xe320f003	// wfi
+	.word	0xeafffffd	// b       @wfi
+	.word	0x017000a0	// writeable RVBAR mapping address
+#ifdef CONFIG_SPL_BUILD
+	.word	CONFIG_SPL_TEXT_BASE
+#else
+	.word   CONFIG_SYS_TEXT_BASE
+#endif
+#else
+/* normal execution */
+	b	reset
+#endif
diff --git a/arch/arm/include/asm/arch-sunxi/clock_sun6i.h b/arch/arm/include/asm/arch-sunxi/clock_sun6i.h
index be9fcfd..3f87672 100644
--- a/arch/arm/include/asm/arch-sunxi/clock_sun6i.h
+++ b/arch/arm/include/asm/arch-sunxi/clock_sun6i.h
@@ -322,6 +322,7 @@
 #define CCM_DRAMCLK_CFG_DIV0_MASK	(0xf << 8)
 #define CCM_DRAMCLK_CFG_SRC_PLL5	(0x0 << 20)
 #define CCM_DRAMCLK_CFG_SRC_PLL6x2	(0x1 << 20)
+#define CCM_DRAMCLK_CFG_SRC_PLL11	(0x1 << 20) /* A64 only */
 #define CCM_DRAMCLK_CFG_SRC_MASK	(0x3 << 20)
 #define CCM_DRAMCLK_CFG_UPD		(0x1 << 16)
 #define CCM_DRAMCLK_CFG_RST		(0x1 << 31)
diff --git a/arch/arm/include/asm/arch-sunxi/cpu.h b/arch/arm/include/asm/arch-sunxi/cpu.h
index 73583ed..6f96a97 100644
--- a/arch/arm/include/asm/arch-sunxi/cpu.h
+++ b/arch/arm/include/asm/arch-sunxi/cpu.h
@@ -13,4 +13,7 @@
 #include <asm/arch/cpu_sun4i.h>
 #endif
 
+#define SOCID_A64	0x1689
+#define SOCID_H3	0x1680
+
 #endif /* _SUNXI_CPU_H */
diff --git a/arch/arm/include/asm/arch-sunxi/dram.h b/arch/arm/include/asm/arch-sunxi/dram.h
index e0be744..53e6d47 100644
--- a/arch/arm/include/asm/arch-sunxi/dram.h
+++ b/arch/arm/include/asm/arch-sunxi/dram.h
@@ -24,7 +24,7 @@
 #include <asm/arch/dram_sun8i_a33.h>
 #elif defined(CONFIG_MACH_SUN8I_A83T)
 #include <asm/arch/dram_sun8i_a83t.h>
-#elif defined(CONFIG_MACH_SUN8I_H3)
+#elif defined(CONFIG_MACH_SUN8I_H3) || defined(CONFIG_MACH_SUN50I)
 #include <asm/arch/dram_sun8i_h3.h>
 #elif defined(CONFIG_MACH_SUN9I)
 #include <asm/arch/dram_sun9i.h>
diff --git a/arch/arm/include/asm/arch-sunxi/dram_sun8i_h3.h b/arch/arm/include/asm/arch-sunxi/dram_sun8i_h3.h
index d0f2b8a..25d07d9 100644
--- a/arch/arm/include/asm/arch-sunxi/dram_sun8i_h3.h
+++ b/arch/arm/include/asm/arch-sunxi/dram_sun8i_h3.h
@@ -15,7 +15,8 @@
 
 struct sunxi_mctl_com_reg {
 	u32 cr;			/* 0x00 control register */
-	u8 res0[0xc];		/* 0x04 */
+	u8 res0[0x8];		/* 0x04 */
+	u32 tmr;		/* 0x0c (unused on H3) */
 	u32 mcr[16][2];		/* 0x10 */
 	u32 bwcr;		/* 0x90 bandwidth control register */
 	u32 maer;		/* 0x94 master enable register */
@@ -32,7 +33,9 @@
 	u32 swoffr;		/* 0xc4 */
 	u8 res2[0x8];		/* 0xc8 */
 	u32 cccr;		/* 0xd0 */
-	u8 res3[0x72c];		/* 0xd4 */
+	u8 res3[0x54];		/* 0xd4 */
+	u32 mdfs_bwlr[3];	/* 0x128 (unused on H3) */
+	u8 res4[0x6cc];		/* 0x134 */
 	u32 protect;		/* 0x800 */
 };
 
@@ -81,7 +84,8 @@
 	u32 rfshtmg;		/* 0x90 refresh timing */
 	u32 rfshctl1;		/* 0x94 */
 	u32 pwrtmg;		/* 0x98 */
-	u8  res3[0x20];		/* 0x9c */
+	u8 res3[0x1c];		/* 0x9c */
+	u32 vtfcr;		/* 0xb8 (unused on H3) */
 	u32 dqsgmr;		/* 0xbc */
 	u32 dtcr;		/* 0xc0 */
 	u32 dtar[4];		/* 0xc4 */
@@ -106,20 +110,23 @@
 	u32 perfhpr[2];		/* 0x1c4 */
 	u32 perflpr[2];		/* 0x1cc */
 	u32 perfwr[2];		/* 0x1d4 */
-	u8 res8[0x2c];		/* 0x1dc */
-	u32 aciocr;		/* 0x208 */
-	u8 res9[0xf4];		/* 0x20c */
+	u8 res8[0x24];		/* 0x1dc */
+	u32 acmdlr;		/* 0x200 AC master delay line register */
+	u32 aclcdlr;		/* 0x204 AC local calibrated delay line register */
+	u32 aciocr;		/* 0x208 AC I/O configuration register */
+	u8 res9[0x4];		/* 0x20c */
+	u32 acbdlr[31];		/* 0x210 AC bit delay line registers */
+	u8 res10[0x74];		/* 0x28c */
 	struct {		/* 0x300 DATX8 modules*/
-		u32 mdlr;		/* 0x00 */
-		u32 lcdlr[3];		/* 0x04 */
-		u32 iocr[11];		/* 0x10 IO configuration register */
-		u32 bdlr6;		/* 0x3c */
-		u32 gtr;		/* 0x40 */
-		u32 gcr;		/* 0x44 */
-		u32 gsr[3];		/* 0x48 */
+		u32 mdlr;		/* 0x00 master delay line register */
+		u32 lcdlr[3];		/* 0x04 local calibrated delay line registers */
+		u32 bdlr[12];		/* 0x10 bit delay line registers */
+		u32 gtr;		/* 0x40 general timing register */
+		u32 gcr;		/* 0x44 general configuration register */
+		u32 gsr[3];		/* 0x48 general status registers */
 		u8 res0[0x2c];		/* 0x54 */
-	} datx[4];
-	u8 res10[0x388];	/* 0x500 */
+	} dx[4];
+	u8 res11[0x388];	/* 0x500 */
 	u32 upd2;		/* 0x888 */
 };
 
@@ -172,14 +179,16 @@
 
 #define PGSR_INIT_DONE	(0x1 << 0)	/* PHY init done */
 
-#define ZQCR_PWRDOWN	(0x1 << 31)	/* ZQ power down */
+#define ZQCR_PWRDOWN	(1U << 31)	/* ZQ power down */
 
-#define DATX_IOCR_DQ(x)	(x)		/* DQ0-7 IOCR index */
-#define DATX_IOCR_DM	(8)		/* DM IOCR index */
-#define DATX_IOCR_DQS	(9)		/* DQS IOCR index */
-#define DATX_IOCR_DQSN	(10)		/* DQSN IOCR index */
+#define ACBDLR_WRITE_DELAY(x)	((x) << 8)
+
+#define DXBDLR_DQ(x)	(x)		/* DQ0-7 BDLR index */
+#define DXBDLR_DM	8		/* DM BDLR index */
+#define DXBDLR_DQS	9		/* DQS BDLR index */
+#define DXBDLR_DQSN	10		/* DQSN BDLR index */
 
-#define DATX_IOCR_WRITE_DELAY(x)	((x) << 8)
-#define DATX_IOCR_READ_DELAY(x)		((x) << 0)
+#define DXBDLR_WRITE_DELAY(x)	((x) << 8)
+#define DXBDLR_READ_DELAY(x)	((x) << 0)
 
 #endif /* _SUNXI_DRAM_SUN8I_H3_H */
diff --git a/arch/arm/include/asm/armv8/mmu.h b/arch/arm/include/asm/armv8/mmu.h
index aa0f3c4..e9b4cdb 100644
--- a/arch/arm/include/asm/armv8/mmu.h
+++ b/arch/arm/include/asm/armv8/mmu.h
@@ -8,14 +8,6 @@
 #ifndef _ASM_ARMV8_MMU_H_
 #define _ASM_ARMV8_MMU_H_
 
-#ifdef __ASSEMBLY__
-#define _AC(X, Y)	X
-#else
-#define _AC(X, Y)	(X##Y)
-#endif
-
-#define UL(x)		_AC(x, UL)
-
 /***************************************************************/
 /*
  * The following definitions are related each other, shoud be
diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
index 0051f76..024139d 100644
--- a/arch/arm/lib/Makefile
+++ b/arch/arm/lib/Makefile
@@ -77,8 +77,10 @@
 
 # for C files, just apend -marm, which will override previous -mthumb*
 
+ifndef CONFIG_ARM64
 CFLAGS_cache.o := -marm
 CFLAGS_cache-cp15.o := -marm
+endif
 
 # For .S, drop -mthumb* and other thumb-related options.
 # CFLAGS_REMOVE_* would not have an effet, so AFLAGS_REMOVE_*
diff --git a/arch/arm/lib/vectors.S b/arch/arm/lib/vectors.S
index 5cc132b..9fe7415 100644
--- a/arch/arm/lib/vectors.S
+++ b/arch/arm/lib/vectors.S
@@ -67,7 +67,6 @@
  * use it here.
  */
 #include <asm/arch/boot0.h>
-ARM_SOC_BOOT0_HOOK
 #endif
 
 /*
diff --git a/arch/arm/mach-omap2/boot-common.c b/arch/arm/mach-omap2/boot-common.c
index 385310b..7ae3d80 100644
--- a/arch/arm/mach-omap2/boot-common.c
+++ b/arch/arm/mach-omap2/boot-common.c
@@ -228,7 +228,7 @@
 
 	u32 boot_params = *((u32 *)OMAP_SRAM_SCRATCH_BOOT_PARAMS);
 
-	debug("image entry point: 0x%X\n", spl_image->entry_point);
+	debug("image entry point: 0x%lX\n", spl_image->entry_point);
 	/* Pass the saved boot_params from rom code */
 	image_entry((u32 *)boot_params);
 }
diff --git a/arch/arm/mach-sunxi/Makefile b/arch/arm/mach-sunxi/Makefile
index e73114e..7daba11 100644
--- a/arch/arm/mach-sunxi/Makefile
+++ b/arch/arm/mach-sunxi/Makefile
@@ -50,4 +50,5 @@
 obj-$(CONFIG_MACH_SUN8I_A83T)	+= dram_sun8i_a83t.o
 obj-$(CONFIG_MACH_SUN8I_H3)	+= dram_sun8i_h3.o
 obj-$(CONFIG_MACH_SUN9I)	+= dram_sun9i.o
+obj-$(CONFIG_MACH_SUN50I)	+= dram_sun8i_h3.o
 endif
diff --git a/arch/arm/mach-sunxi/board.c b/arch/arm/mach-sunxi/board.c
index aa11493..52be5b0 100644
--- a/arch/arm/mach-sunxi/board.c
+++ b/arch/arm/mach-sunxi/board.c
@@ -133,7 +133,7 @@
 	return 0;
 }
 
-#ifdef CONFIG_SPL_BUILD
+#if defined(CONFIG_SPL_BOARD_LOAD_IMAGE) && defined(CONFIG_SPL_BUILD)
 static int spl_board_load_image(struct spl_image_info *spl_image,
 				struct spl_boot_device *bootdev)
 {
diff --git a/arch/arm/mach-sunxi/clock_sun6i.c b/arch/arm/mach-sunxi/clock_sun6i.c
index ed8cd9b..d123b3a 100644
--- a/arch/arm/mach-sunxi/clock_sun6i.c
+++ b/arch/arm/mach-sunxi/clock_sun6i.c
@@ -21,6 +21,8 @@
 {
 	struct sunxi_ccm_reg * const ccm =
 		(struct sunxi_ccm_reg *)SUNXI_CCM_BASE;
+
+#if !defined(CONFIG_MACH_SUN8I_H3) && !defined(CONFIG_MACH_SUN50I)
 	struct sunxi_prcm_reg * const prcm =
 		(struct sunxi_prcm_reg *)SUNXI_PRCM_BASE;
 
@@ -31,6 +33,7 @@
 		PRCM_PLL_CTRL_LDO_DIGITAL_EN | PRCM_PLL_CTRL_LDO_ANALOG_EN |
 		PRCM_PLL_CTRL_EXT_OSC_EN | PRCM_PLL_CTRL_LDO_OUT_L(1140));
 	clrbits_le32(&prcm->pll_ctrl1, PRCM_PLL_CTRL_LDO_KEY_MASK);
+#endif
 
 	clock_set_pll1(408000000);
 
@@ -41,7 +44,8 @@
 	writel(AHB1_ABP1_DIV_DEFAULT, &ccm->ahb1_apb1_div);
 
 	writel(MBUS_CLK_DEFAULT, &ccm->mbus0_clk_cfg);
-	writel(MBUS_CLK_DEFAULT, &ccm->mbus1_clk_cfg);
+	if (IS_ENABLED(CONFIG_MACH_SUN6I))
+		writel(MBUS_CLK_DEFAULT, &ccm->mbus1_clk_cfg);
 }
 #endif
 
@@ -213,14 +217,14 @@
 }
 #endif
 
-#ifdef CONFIG_MACH_SUN8I_A33
+#if defined(CONFIG_MACH_SUN8I_A33) || defined(CONFIG_MACH_SUN50I)
 void clock_set_pll11(unsigned int clk, bool sigma_delta_enable)
 {
 	struct sunxi_ccm_reg * const ccm =
 		(struct sunxi_ccm_reg *)SUNXI_CCM_BASE;
 
 	if (sigma_delta_enable)
-		writel(CCM_PLL11_PATTERN, &ccm->pll5_pattern_cfg);
+		writel(CCM_PLL11_PATTERN, &ccm->pll11_pattern_cfg0);
 
 	writel(CCM_PLL11_CTRL_EN | CCM_PLL11_CTRL_UPD |
 	       (sigma_delta_enable ? CCM_PLL11_CTRL_SIGMA_DELTA_EN : 0) |
diff --git a/arch/arm/mach-sunxi/dram_sun8i_h3.c b/arch/arm/mach-sunxi/dram_sun8i_h3.c
index b08b8e6..9f7cc7f 100644
--- a/arch/arm/mach-sunxi/dram_sun8i_h3.c
+++ b/arch/arm/mach-sunxi/dram_sun8i_h3.c
@@ -13,15 +13,27 @@
 #include <asm/io.h>
 #include <asm/arch/clock.h>
 #include <asm/arch/dram.h>
+#include <asm/arch/cpu.h>
 #include <linux/kconfig.h>
 
+/*
+ * The delay parameters below allow to allegedly specify delay times of some
+ * unknown unit for each individual bit trace in each of the four data bytes
+ * the 32-bit wide access consists of. Also three control signals can be
+ * adjusted individually.
+ */
+#define BITS_PER_BYTE		8
+#define NR_OF_BYTE_LANES	(32 / BITS_PER_BYTE)
+/* The eight data lines (DQn) plus DM, DQS and DQSN */
+#define LINES_PER_BYTE_LANE	(BITS_PER_BYTE + 3)
 struct dram_para {
-	u32 read_delays;
-	u32 write_delays;
 	u16 page_size;
 	u8 bus_width;
 	u8 dual_rank;
 	u8 row_bits;
+	const u8 dx_read_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE];
+	const u8 dx_write_delays[NR_OF_BYTE_LANES][LINES_PER_BYTE_LANE];
+	const u8 ac_delays[31];
 };
 
 static inline int ns_to_t(int nanoseconds)
@@ -31,30 +43,6 @@
 	return DIV_ROUND_UP(ctrl_freq * nanoseconds, 1000);
 }
 
-static u32 bin_to_mgray(int val)
-{
-	static const u8 lookup_table[32] = {
-		0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x04, 0x05,
-		0x0c, 0x0d, 0x0e, 0x0f, 0x0a, 0x0b, 0x08, 0x09,
-		0x18, 0x19, 0x1a, 0x1b, 0x1e, 0x1f, 0x1c, 0x1d,
-		0x14, 0x15, 0x16, 0x17, 0x12, 0x13, 0x10, 0x11,
-	};
-
-	return lookup_table[clamp(val, 0, 31)];
-}
-
-static int mgray_to_bin(u32 val)
-{
-	static const u8 lookup_table[32] = {
-		0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x04, 0x05,
-		0x0e, 0x0f, 0x0c, 0x0d, 0x08, 0x09, 0x0a, 0x0b,
-		0x1e, 0x1f, 0x1c, 0x1d, 0x18, 0x19, 0x1a, 0x1b,
-		0x10, 0x11, 0x12, 0x13, 0x16, 0x17, 0x14, 0x15,
-	};
-
-	return lookup_table[val & 0x1f];
-}
-
 static void mctl_phy_init(u32 val)
 {
 	struct sunxi_mctl_ctl_reg * const mctl_ctl =
@@ -64,74 +52,144 @@
 	mctl_await_completion(&mctl_ctl->pgsr[0], PGSR_INIT_DONE, 0x1);
 }
 
-static void mctl_dq_delay(u32 read, u32 write)
+static void mctl_set_bit_delays(struct dram_para *para)
 {
 	struct sunxi_mctl_ctl_reg * const mctl_ctl =
 			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
 	int i, j;
-	u32 val;
-
-	for (i = 0; i < 4; i++) {
-		val = DATX_IOCR_WRITE_DELAY((write >> (i * 4)) & 0xf) |
-		      DATX_IOCR_READ_DELAY(((read >> (i * 4)) & 0xf) * 2);
-
-		for (j = DATX_IOCR_DQ(0); j <= DATX_IOCR_DM; j++)
-			writel(val, &mctl_ctl->datx[i].iocr[j]);
-	}
 
 	clrbits_le32(&mctl_ctl->pgcr[0], 1 << 26);
 
-	for (i = 0; i < 4; i++) {
-		val = DATX_IOCR_WRITE_DELAY((write >> (16 + i * 4)) & 0xf) |
-		      DATX_IOCR_READ_DELAY((read >> (16 + i * 4)) & 0xf);
+	for (i = 0; i < NR_OF_BYTE_LANES; i++)
+		for (j = 0; j < LINES_PER_BYTE_LANE; j++)
+			writel(DXBDLR_WRITE_DELAY(para->dx_write_delays[i][j]) |
+			       DXBDLR_READ_DELAY(para->dx_read_delays[i][j]),
+			       &mctl_ctl->dx[i].bdlr[j]);
 
-		writel(val, &mctl_ctl->datx[i].iocr[DATX_IOCR_DQS]);
-		writel(val, &mctl_ctl->datx[i].iocr[DATX_IOCR_DQSN]);
-	}
+	for (i = 0; i < 31; i++)
+		writel(ACBDLR_WRITE_DELAY(para->ac_delays[i]),
+		       &mctl_ctl->acbdlr[i]);
 
 	setbits_le32(&mctl_ctl->pgcr[0], 1 << 26);
+}
 
-	udelay(1);
+enum {
+	MBUS_PORT_CPU           = 0,
+	MBUS_PORT_GPU           = 1,
+	MBUS_PORT_UNUSED	= 2,
+	MBUS_PORT_DMA           = 3,
+	MBUS_PORT_VE            = 4,
+	MBUS_PORT_CSI           = 5,
+	MBUS_PORT_NAND          = 6,
+	MBUS_PORT_SS            = 7,
+	MBUS_PORT_TS            = 8,
+	MBUS_PORT_DI            = 9,
+	MBUS_PORT_DE            = 10,
+	MBUS_PORT_DE_CFD        = 11,
+};
+
+enum {
+	MBUS_QOS_LOWEST = 0,
+	MBUS_QOS_LOW,
+	MBUS_QOS_HIGH,
+	MBUS_QOS_HIGHEST
+};
+
+inline void mbus_configure_port(u8 port,
+				bool bwlimit,
+				bool priority,
+				u8 qos,         /* MBUS_QOS_LOWEST .. MBUS_QOS_HIGEST */
+				u8 waittime,    /* 0 .. 0xf */
+				u8 acs,         /* 0 .. 0xff */
+				u16 bwl0,       /* 0 .. 0xffff, bandwidth limit in MB/s */
+				u16 bwl1,
+				u16 bwl2)
+{
+	struct sunxi_mctl_com_reg * const mctl_com =
+			(struct sunxi_mctl_com_reg *)SUNXI_DRAM_COM_BASE;
+
+	const u32 cfg0 = ( (bwlimit ? (1 << 0) : 0)
+			   | (priority ? (1 << 1) : 0)
+			   | ((qos & 0x3) << 2)
+			   | ((waittime & 0xf) << 4)
+			   | ((acs & 0xff) << 8)
+			   | (bwl0 << 16) );
+	const u32 cfg1 = ((u32)bwl2 << 16) | (bwl1 & 0xffff);
+
+	debug("MBUS port %d cfg0 %08x cfg1 %08x\n", port, cfg0, cfg1);
+	writel(cfg0, &mctl_com->mcr[port][0]);
+	writel(cfg1, &mctl_com->mcr[port][1]);
 }
 
-static void mctl_set_master_priority(void)
+#define MBUS_CONF(port, bwlimit, qos, acs, bwl0, bwl1, bwl2)	\
+	mbus_configure_port(MBUS_PORT_ ## port, bwlimit, false, \
+			    MBUS_QOS_ ## qos, 0, acs, bwl0, bwl1, bwl2)
+
+static void mctl_set_master_priority_h3(void)
 {
 	struct sunxi_mctl_com_reg * const mctl_com =
 			(struct sunxi_mctl_com_reg *)SUNXI_DRAM_COM_BASE;
 
 	/* enable bandwidth limit windows and set windows size 1us */
-	writel(0x00010190, &mctl_com->bwcr);
+	writel((1 << 16) | (400 << 0), &mctl_com->bwcr);
 
 	/* set cpu high priority */
 	writel(0x00000001, &mctl_com->mapr);
 
-	writel(0x0200000d, &mctl_com->mcr[0][0]);
-	writel(0x00800100, &mctl_com->mcr[0][1]);
-	writel(0x06000009, &mctl_com->mcr[1][0]);
-	writel(0x01000400, &mctl_com->mcr[1][1]);
-	writel(0x0200000d, &mctl_com->mcr[2][0]);
-	writel(0x00600100, &mctl_com->mcr[2][1]);
-	writel(0x0100000d, &mctl_com->mcr[3][0]);
-	writel(0x00200080, &mctl_com->mcr[3][1]);
-	writel(0x07000009, &mctl_com->mcr[4][0]);
-	writel(0x01000640, &mctl_com->mcr[4][1]);
-	writel(0x0100000d, &mctl_com->mcr[5][0]);
-	writel(0x00200080, &mctl_com->mcr[5][1]);
-	writel(0x01000009, &mctl_com->mcr[6][0]);
-	writel(0x00400080, &mctl_com->mcr[6][1]);
-	writel(0x0100000d, &mctl_com->mcr[7][0]);
-	writel(0x00400080, &mctl_com->mcr[7][1]);
-	writel(0x0100000d, &mctl_com->mcr[8][0]);
-	writel(0x00400080, &mctl_com->mcr[8][1]);
-	writel(0x04000009, &mctl_com->mcr[9][0]);
-	writel(0x00400100, &mctl_com->mcr[9][1]);
-	writel(0x2000030d, &mctl_com->mcr[10][0]);
-	writel(0x04001800, &mctl_com->mcr[10][1]);
-	writel(0x04000009, &mctl_com->mcr[11][0]);
-	writel(0x00400120, &mctl_com->mcr[11][1]);
+	MBUS_CONF(   CPU,  true, HIGHEST, 0,  512,  256,  128);
+	MBUS_CONF(   GPU,  true,    HIGH, 0, 1536, 1024,  256);
+	MBUS_CONF(UNUSED,  true, HIGHEST, 0,  512,  256,   96);
+	MBUS_CONF(   DMA,  true, HIGHEST, 0,  256,  128,   32);
+	MBUS_CONF(    VE,  true,    HIGH, 0, 1792, 1600,  256);
+	MBUS_CONF(   CSI,  true, HIGHEST, 0,  256,  128,   32);
+	MBUS_CONF(  NAND,  true,    HIGH, 0,  256,  128,   64);
+	MBUS_CONF(    SS,  true, HIGHEST, 0,  256,  128,   64);
+	MBUS_CONF(    TS,  true, HIGHEST, 0,  256,  128,   64);
+	MBUS_CONF(    DI,  true,    HIGH, 0, 1024,  256,   64);
+	MBUS_CONF(    DE,  true, HIGHEST, 3, 8192, 6120, 1024);
+	MBUS_CONF(DE_CFD,  true,    HIGH, 0, 1024,  288,   64);
 }
 
-static void mctl_set_timing_params(struct dram_para *para)
+static void mctl_set_master_priority_a64(void)
+{
+	struct sunxi_mctl_com_reg * const mctl_com =
+			(struct sunxi_mctl_com_reg *)SUNXI_DRAM_COM_BASE;
+
+	/* enable bandwidth limit windows and set windows size 1us */
+	writel(399, &mctl_com->tmr);
+	writel((1 << 16), &mctl_com->bwcr);
+
+	/* Port 2 is reserved per Allwinner's linux-3.10 source, yet they
+	 * initialise it */
+	MBUS_CONF(   CPU,  true, HIGHEST, 0,  160,  100,   80);
+	MBUS_CONF(   GPU, false,    HIGH, 0, 1536, 1400,  256);
+	MBUS_CONF(UNUSED,  true, HIGHEST, 0,  512,  256,   96);
+	MBUS_CONF(   DMA,  true,    HIGH, 0,  256,   80,  100);
+	MBUS_CONF(    VE,  true,    HIGH, 0, 1792, 1600,  256);
+	MBUS_CONF(   CSI,  true,    HIGH, 0,  256,  128,    0);
+	MBUS_CONF(  NAND,  true,    HIGH, 0,  256,  128,   64);
+	MBUS_CONF(    SS,  true, HIGHEST, 0,  256,  128,   64);
+	MBUS_CONF(    TS,  true, HIGHEST, 0,  256,  128,   64);
+	MBUS_CONF(    DI,  true,    HIGH, 0, 1024,  256,   64);
+	MBUS_CONF(    DE,  true,    HIGH, 2, 8192, 6144, 2048);
+	MBUS_CONF(DE_CFD,  true,    HIGH, 0, 1280,  144,   64);
+
+	writel(0x81000004, &mctl_com->mdfs_bwlr[2]);
+}
+
+static void mctl_set_master_priority(uint16_t socid)
+{
+	switch (socid) {
+	case SOCID_H3:
+		mctl_set_master_priority_h3();
+		return;
+	case SOCID_A64:
+		mctl_set_master_priority_a64();
+		return;
+	}
+}
+
+static void mctl_set_timing_params(uint16_t socid, struct dram_para *para)
 {
 	struct sunxi_mctl_ctl_reg * const mctl_ctl =
 			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
@@ -212,7 +270,31 @@
 	writel(RFSHTMG_TREFI(trefi) | RFSHTMG_TRFC(trfc), &mctl_ctl->rfshtmg);
 }
 
+static u32 bin_to_mgray(int val)
+{
+	static const u8 lookup_table[32] = {
+		0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x04, 0x05,
+		0x0c, 0x0d, 0x0e, 0x0f, 0x0a, 0x0b, 0x08, 0x09,
+		0x18, 0x19, 0x1a, 0x1b, 0x1e, 0x1f, 0x1c, 0x1d,
+		0x14, 0x15, 0x16, 0x17, 0x12, 0x13, 0x10, 0x11,
+	};
+
+	return lookup_table[clamp(val, 0, 31)];
+}
+
-static void mctl_zq_calibration(struct dram_para *para)
+static int mgray_to_bin(u32 val)
+{
+	static const u8 lookup_table[32] = {
+		0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x04, 0x05,
+		0x0e, 0x0f, 0x0c, 0x0d, 0x08, 0x09, 0x0a, 0x0b,
+		0x1e, 0x1f, 0x1c, 0x1d, 0x18, 0x19, 0x1a, 0x1b,
+		0x10, 0x11, 0x12, 0x13, 0x16, 0x17, 0x14, 0x15,
+	};
+
+	return lookup_table[val & 0x1f];
+}
+
+static void mctl_h3_zq_calibration_quirk(struct dram_para *para)
 {
 	struct sunxi_mctl_ctl_reg * const mctl_ctl =
 			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
@@ -282,7 +364,7 @@
 	       MCTL_CR_ROW_BITS(para->row_bits), &mctl_com->cr);
 }
 
-static void mctl_sys_init(struct dram_para *para)
+static void mctl_sys_init(uint16_t socid, struct dram_para *para)
 {
 	struct sunxi_ccm_reg * const ccm =
 			(struct sunxi_ccm_reg *)SUNXI_CCM_BASE;
@@ -294,16 +376,30 @@
 	clrbits_le32(&ccm->ahb_gate0, 1 << AHB_GATE_OFFSET_MCTL);
 	clrbits_le32(&ccm->ahb_reset0_cfg, 1 << AHB_RESET_OFFSET_MCTL);
 	clrbits_le32(&ccm->pll5_cfg, CCM_PLL5_CTRL_EN);
+	if (socid == SOCID_A64)
+		clrbits_le32(&ccm->pll11_cfg, CCM_PLL11_CTRL_EN);
 	udelay(10);
 
 	clrbits_le32(&ccm->dram_clk_cfg, CCM_DRAMCLK_CFG_RST);
 	udelay(1000);
 
-	clock_set_pll5(CONFIG_DRAM_CLK * 2 * 1000000, false);
-	clrsetbits_le32(&ccm->dram_clk_cfg,
-			CCM_DRAMCLK_CFG_DIV_MASK | CCM_DRAMCLK_CFG_SRC_MASK,
-			CCM_DRAMCLK_CFG_DIV(1) | CCM_DRAMCLK_CFG_SRC_PLL5 |
-			CCM_DRAMCLK_CFG_UPD);
+	if (socid == SOCID_A64) {
+		clock_set_pll11(CONFIG_DRAM_CLK * 2 * 1000000, false);
+		clrsetbits_le32(&ccm->dram_clk_cfg,
+				CCM_DRAMCLK_CFG_DIV_MASK |
+				CCM_DRAMCLK_CFG_SRC_MASK,
+				CCM_DRAMCLK_CFG_DIV(1) |
+				CCM_DRAMCLK_CFG_SRC_PLL11 |
+				CCM_DRAMCLK_CFG_UPD);
+	} else if (socid == SOCID_H3) {
+		clock_set_pll5(CONFIG_DRAM_CLK * 2 * 1000000, false);
+		clrsetbits_le32(&ccm->dram_clk_cfg,
+				CCM_DRAMCLK_CFG_DIV_MASK |
+				CCM_DRAMCLK_CFG_SRC_MASK,
+				CCM_DRAMCLK_CFG_DIV(1) |
+				CCM_DRAMCLK_CFG_SRC_PLL5 |
+				CCM_DRAMCLK_CFG_UPD);
+	}
 	mctl_await_completion(&ccm->dram_clk_cfg, CCM_DRAMCLK_CFG_UPD, 0);
 
 	setbits_le32(&ccm->ahb_reset0_cfg, 1 << AHB_RESET_OFFSET_MCTL);
@@ -318,7 +414,12 @@
 	udelay(500);
 }
 
-static int mctl_channel_init(struct dram_para *para)
+/* These are more guessed based on some Allwinner code. */
+#define DX_GCR_ODT_DYNAMIC	(0x0 << 4)
+#define DX_GCR_ODT_ALWAYS_ON	(0x1 << 4)
+#define DX_GCR_ODT_OFF		(0x2 << 4)
+
+static int mctl_channel_init(uint16_t socid, struct dram_para *para)
 {
 	struct sunxi_mctl_com_reg * const mctl_com =
 			(struct sunxi_mctl_com_reg *)SUNXI_DRAM_COM_BASE;
@@ -328,8 +429,8 @@
 	unsigned int i;
 
 	mctl_set_cr(para);
-	mctl_set_timing_params(para);
-	mctl_set_master_priority();
+	mctl_set_timing_params(socid, para);
+	mctl_set_master_priority(socid);
 
 	/* setting VTC, default disable all VT */
 	clrbits_le32(&mctl_ctl->pgcr[0], (1 << 30) | 0x3f);
@@ -344,10 +445,11 @@
 
 	/* set dramc odt */
 	for (i = 0; i < 4; i++)
-		clrsetbits_le32(&mctl_ctl->datx[i].gcr, (0x3 << 4) |
+		clrsetbits_le32(&mctl_ctl->dx[i].gcr, (0x3 << 4) |
 				(0x1 << 1) | (0x3 << 2) | (0x3 << 12) |
 				(0x3 << 14),
-				IS_ENABLED(CONFIG_DRAM_ODT_EN) ? 0x0 : 0x2);
+				IS_ENABLED(CONFIG_DRAM_ODT_EN) ?
+					DX_GCR_ODT_DYNAMIC : DX_GCR_ODT_OFF);
 
 	/* AC PDR should always ON */
 	setbits_le32(&mctl_ctl->aciocr, 0x1 << 1);
@@ -355,48 +457,58 @@
 	/* set DQS auto gating PD mode */
 	setbits_le32(&mctl_ctl->pgcr[2], 0x3 << 6);
 
-	/* dx ddr_clk & hdr_clk dynamic mode */
-	clrbits_le32(&mctl_ctl->pgcr[0], (0x3 << 14) | (0x3 << 12));
+	if (socid == SOCID_H3) {
+		/* dx ddr_clk & hdr_clk dynamic mode */
+		clrbits_le32(&mctl_ctl->pgcr[0], (0x3 << 14) | (0x3 << 12));
 
-	/* dphy & aphy phase select 270 degree */
-	clrsetbits_le32(&mctl_ctl->pgcr[2], (0x3 << 10) | (0x3 << 8),
-			(0x1 << 10) | (0x2 << 8));
+		/* dphy & aphy phase select 270 degree */
+		clrsetbits_le32(&mctl_ctl->pgcr[2], (0x3 << 10) | (0x3 << 8),
+				(0x1 << 10) | (0x2 << 8));
+	} else if (socid == SOCID_A64) {
+		/* dphy & aphy phase select ? */
+		clrsetbits_le32(&mctl_ctl->pgcr[2], (0x3 << 10) | (0x3 << 8),
+				(0x0 << 10) | (0x3 << 8));
+	}
 
 	/* set half DQ */
 	if (para->bus_width != 32) {
-		writel(0x0, &mctl_ctl->datx[2].gcr);
-		writel(0x0, &mctl_ctl->datx[3].gcr);
+		writel(0x0, &mctl_ctl->dx[2].gcr);
+		writel(0x0, &mctl_ctl->dx[3].gcr);
 	}
 
 	/* data training configuration */
 	clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24,
 			(para->dual_rank ? 0x3 : 0x1) << 24);
 
+	mctl_set_bit_delays(para);
+	udelay(50);
 
-	if (para->read_delays || para->write_delays) {
-		mctl_dq_delay(para->read_delays, para->write_delays);
-		udelay(50);
-	}
+	if (socid == SOCID_H3) {
+		mctl_h3_zq_calibration_quirk(para);
 
-	mctl_zq_calibration(para);
+		mctl_phy_init(PIR_PLLINIT | PIR_DCAL | PIR_PHYRST |
+			      PIR_DRAMRST | PIR_DRAMINIT | PIR_QSGATE);
+	} else if (socid == SOCID_A64) {
+		clrsetbits_le32(&mctl_ctl->zqcr, 0xffffff, CONFIG_DRAM_ZQ);
 
-	mctl_phy_init(PIR_PLLINIT | PIR_DCAL | PIR_PHYRST | PIR_DRAMRST |
-		      PIR_DRAMINIT | PIR_QSGATE);
+		mctl_phy_init(PIR_ZCAL | PIR_PLLINIT | PIR_DCAL | PIR_PHYRST |
+			      PIR_DRAMRST | PIR_DRAMINIT | PIR_QSGATE);
+	}
 
 	/* detect ranks and bus width */
 	if (readl(&mctl_ctl->pgsr[0]) & (0xfe << 20)) {
 		/* only one rank */
-		if (((readl(&mctl_ctl->datx[0].gsr[0]) >> 24) & 0x2) ||
-		    ((readl(&mctl_ctl->datx[1].gsr[0]) >> 24) & 0x2)) {
+		if (((readl(&mctl_ctl->dx[0].gsr[0]) >> 24) & 0x2) ||
+		    ((readl(&mctl_ctl->dx[1].gsr[0]) >> 24) & 0x2)) {
 			clrsetbits_le32(&mctl_ctl->dtcr, 0xf << 24, 0x1 << 24);
 			para->dual_rank = 0;
 		}
 
 		/* only half DQ width */
-		if (((readl(&mctl_ctl->datx[2].gsr[0]) >> 24) & 0x1) ||
-		    ((readl(&mctl_ctl->datx[3].gsr[0]) >> 24) & 0x1)) {
-			writel(0x0, &mctl_ctl->datx[2].gcr);
-			writel(0x0, &mctl_ctl->datx[3].gcr);
+		if (((readl(&mctl_ctl->dx[2].gsr[0]) >> 24) & 0x1) ||
+		    ((readl(&mctl_ctl->dx[3].gsr[0]) >> 24) & 0x1)) {
+			writel(0x0, &mctl_ctl->dx[2].gcr);
+			writel(0x0, &mctl_ctl->dx[3].gcr);
 			para->bus_width = 16;
 		}
 
@@ -419,7 +531,10 @@
 	udelay(10);
 
 	/* set PGCR3, CKE polarity */
-	writel(0x00aa0060, &mctl_ctl->pgcr[3]);
+	if (socid == SOCID_H3)
+		writel(0x00aa0060, &mctl_ctl->pgcr[3]);
+	else if (socid == SOCID_A64)
+		writel(0xc0aa0060, &mctl_ctl->pgcr[3]);
 
 	/* power down zq calibration module for power save */
 	setbits_le32(&mctl_ctl->zqcr, ZQCR_PWRDOWN);
@@ -450,6 +565,45 @@
 			break;
 }
 
+/*
+ * The actual values used here are taken from Allwinner provided boot0
+ * binaries, though they are probably board specific, so would likely benefit
+ * from invidual tuning for each board. Apparently a lot of boards copy from
+ * some Allwinner reference design, so we go with those generic values for now
+ * in the hope that they are reasonable for most (all?) boards.
+ */
+#define SUN8I_H3_DX_READ_DELAYS					\
+	{{ 18, 18, 18, 18, 18, 18, 18, 18, 18,  0,  0 },	\
+	 { 14, 14, 14, 14, 14, 14, 14, 14, 14,  0,  0 },	\
+	 { 18, 18, 18, 18, 18, 18, 18, 18, 18,  0,  0 },	\
+	 { 14, 14, 14, 14, 14, 14, 14, 14, 14,  0,  0 }}
+#define SUN8I_H3_DX_WRITE_DELAYS				\
+	{{  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 10 },	\
+	 {  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 10 },	\
+	 {  0,  0,  0,  0,  0,  0,  0,  0,  0, 10, 10 },	\
+	 {  0,  0,  0,  0,  0,  0,  0,  0,  0,  6,  6 }}
+#define SUN8I_H3_AC_DELAYS					\
+	{  0,  0,  0,  0,  0,  0,  0,  0,			\
+	   0,  0,  0,  0,  0,  0,  0,  0,			\
+	   0,  0,  0,  0,  0,  0,  0,  0,			\
+	   0,  0,  0,  0,  0,  0,  0      }
+
+#define SUN50I_A64_DX_READ_DELAYS				\
+	{{ 16, 16, 16, 16, 17, 16, 16, 17, 16,  1,  0 },	\
+	 { 17, 17, 17, 17, 17, 17, 17, 17, 17,  1,  0 },	\
+	 { 16, 17, 17, 16, 16, 16, 16, 16, 16,  0,  0 },	\
+	 { 17, 17, 17, 17, 17, 17, 17, 17, 17,  1,  0 }}
+#define SUN50I_A64_DX_WRITE_DELAYS				\
+	{{  0,  0,  0,  0,  0,  0,  0,  0,  0, 15, 15 },	\
+	 {  0,  0,  0,  0,  1,  1,  1,  1,  0, 10, 10 },	\
+	 {  1,  0,  1,  1,  1,  1,  1,  1,  0, 11, 11 },	\
+	 {  1,  0,  0,  1,  1,  1,  1,  1,  0, 12, 12 }}
+#define SUN50I_A64_AC_DELAYS					\
+	{  5,  5, 13, 10,  2,  5,  3,  3,			\
+	   0,  3,  3,  3,  1,  0,  0,  0,			\
+	   3,  4,  0,  3,  4,  1,  4,  0,			\
+	   1,  1,  0,  1, 13,  5,  4      }
+
 unsigned long sunxi_dram_init(void)
 {
 	struct sunxi_mctl_com_reg * const mctl_com =
@@ -458,16 +612,34 @@
 			(struct sunxi_mctl_ctl_reg *)SUNXI_DRAM_CTL0_BASE;
 
 	struct dram_para para = {
-		.read_delays = 0x00007979,	/* dram_tpr12 */
-		.write_delays = 0x6aaa0000,	/* dram_tpr11 */
 		.dual_rank = 0,
 		.bus_width = 32,
 		.row_bits = 15,
 		.page_size = 4096,
+
+#if defined(CONFIG_MACH_SUN8I_H3)
+		.dx_read_delays  = SUN8I_H3_DX_READ_DELAYS,
+		.dx_write_delays = SUN8I_H3_DX_WRITE_DELAYS,
+		.ac_delays	 = SUN8I_H3_AC_DELAYS,
+#elif defined(CONFIG_MACH_SUN50I)
+		.dx_read_delays  = SUN50I_A64_DX_READ_DELAYS,
+		.dx_write_delays = SUN50I_A64_DX_WRITE_DELAYS,
+		.ac_delays	 = SUN50I_A64_AC_DELAYS,
+#endif
 	};
+/*
+ * Let the compiler optimize alternatives away by passing this value into
+ * the static functions. This saves us #ifdefs, but still keeps the binary
+ * small.
+ */
+#if defined(CONFIG_MACH_SUN8I_H3)
+	uint16_t socid = SOCID_H3;
+#elif defined(CONFIG_MACH_SUN50I)
+	uint16_t socid = SOCID_A64;
+#endif
 
-	mctl_sys_init(&para);
-	if (mctl_channel_init(&para))
+	mctl_sys_init(socid, &para);
+	if (mctl_channel_init(socid, &para))
 		return 0;
 
 	if (para.dual_rank)
@@ -477,7 +649,13 @@
 	udelay(1);
 
 	/* odt delay */
-	writel(0x0c000400, &mctl_ctl->odtcfg);
+	if (socid == SOCID_H3)
+		writel(0x0c000400, &mctl_ctl->odtcfg);
+
+	if (socid == SOCID_A64) {
+		setbits_le32(&mctl_ctl->vtfcr, 2 << 8);
+		clrbits_le32(&mctl_ctl->pgcr[2], (1 << 13));
+	}
 
 	/* clear credit value */
 	setbits_le32(&mctl_com->cccr, 1 << 31);
@@ -486,6 +664,6 @@
 	mctl_auto_detect_dram_size(&para);
 	mctl_set_cr(&para);
 
-	return (1 << (para.row_bits + 3)) * para.page_size *
+	return (1UL << (para.row_bits + 3)) * para.page_size *
 						(para.dual_rank ? 2 : 1);
 }
diff --git a/arch/arm/mach-sunxi/rmr_switch.S b/arch/arm/mach-sunxi/rmr_switch.S
new file mode 100644
index 0000000..cefa930
--- /dev/null
+++ b/arch/arm/mach-sunxi/rmr_switch.S
@@ -0,0 +1,41 @@
+@
+@ ARMv8 RMR reset sequence on Allwinner SoCs.
+@
+@ All 64-bit capable Allwinner SoCs reset in AArch32 (and continue to
+@ exectute the Boot ROM in this state), so we need to switch to AArch64
+@ at some point.
+@ Section G6.2.133 of the ARMv8 ARM describes the Reset Management Register
+@ (RMR), which triggers a warm-reset of a core and can request to switch
+@ into a different execution state (AArch32 or AArch64).
+@ The address at which execution starts after the reset is held in the
+@ RVBAR system register, which is architecturally read-only.
+@ Allwinner provides a writable alias of this register in MMIO space, so
+@ we can easily set the start address of AArch64 code.
+@ This code below switches to AArch64 and starts execution at the specified
+@ start address. It needs to be assembled by an ARM(32) assembler and
+@ the machine code must be inserted as verbatim .word statements into the
+@ beginning of the AArch64 U-Boot code.
+@ To get the encoded bytes, use:
+@ ${CROSS_COMPILE}gcc -c -o rmr_switch.o rmr_switch.S
+@ ${CROSS_COMPILE}objdump -d rmr_switch.o
+@
+@ The resulting words should be inserted into the U-Boot file at
+@ arch/arm/include/asm/arch-sunxi/boot0.h.
+@
+@ This file is not build by the U-Boot build system, but provided only as a
+@ reference and to be able to regenerate a (probably fixed) version of this
+@ code found in encoded form in boot0.h.
+
+.text
+
+	ldr	r1, =0x017000a0		@ MMIO mapped RVBAR[0] register
+	ldr	r0, =0x57aA7add		@ start address, to be replaced
+	str	r0, [r1]
+	dsb	sy
+	isb	sy
+	mrc	15, 0, r0, cr12, cr0, 2	@ read RMR register
+	orr	r0, r0, #3		@ request reset in AArch64
+	mcr	15, 0, r0, cr12, cr0, 2 @ write RMR register
+	isb	sy
+1:	wfi
+	b	1b
diff --git a/arch/arm/mach-tegra/spl.c b/arch/arm/mach-tegra/spl.c
index e0f9d5b..41c88cb 100644
--- a/arch/arm/mach-tegra/spl.c
+++ b/arch/arm/mach-tegra/spl.c
@@ -42,7 +42,7 @@
 
 void __noreturn jump_to_image_no_args(struct spl_image_info *spl_image)
 {
-	debug("image entry point: 0x%X\n", spl_image->entry_point);
+	debug("image entry point: 0x%lX\n", spl_image->entry_point);
 
 	start_cpu((u32)spl_image->entry_point);
 	halt_avp();
diff --git a/board/sunxi/Kconfig b/board/sunxi/Kconfig
index e1d4ab1..0001133 100644
--- a/board/sunxi/Kconfig
+++ b/board/sunxi/Kconfig
@@ -125,6 +125,7 @@
 	bool "sun50i (Allwinner A64)"
 	select ARM64
 	select SUNXI_GEN_SUN6I
+	select SUPPORT_SPL
 
 endchoice
 
@@ -133,6 +134,29 @@
 	bool
 	default y if MACH_SUN8I_A23 || MACH_SUN8I_A33 || MACH_SUN8I_H3 || MACH_SUN8I_A83T
 
+config RESERVE_ALLWINNER_BOOT0_HEADER
+	bool "reserve space for Allwinner boot0 header"
+	select ENABLE_ARM_SOC_BOOT0_HOOK
+	---help---
+	Prepend a 1536 byte (empty) header to the U-Boot image file, to be
+	filled with magic values post build. The Allwinner provided boot0
+	blob relies on this information to load and execute U-Boot.
+	Only needed on 64-bit Allwinner boards so far when using boot0.
+
+config ARM_BOOT_HOOK_RMR
+	bool
+	depends on ARM64
+	default y
+	select ENABLE_ARM_SOC_BOOT0_HOOK
+	---help---
+	Insert some ARM32 code at the very beginning of the U-Boot binary
+	which uses an RMR register write to bring the core into AArch64 mode.
+	The very first instruction acts as a switch, since it's carefully
+	chosen to be a NOP in one mode and a branch in the other, so the
+	code would only be executed if not already in AArch64.
+	This allows both the SPL and the U-Boot proper to be entered in
+	either mode and switch to AArch64 if needed.
+
 config DRAM_TYPE
 	int "sunxi dram type"
 	depends on MACH_SUN8I_A83T
@@ -145,6 +169,7 @@
 	default 792 if MACH_SUN9I
 	default 312 if MACH_SUN6I || MACH_SUN8I
 	default 360 if MACH_SUN4I || MACH_SUN5I || MACH_SUN7I
+	default 672 if MACH_SUN50I
 	---help---
 	Set the dram clock speed, valid range 240 - 480 (prior to sun9i),
 	must be a multiple of 24. For the sun9i (A80), the tested values
@@ -164,6 +189,7 @@
 	default 123 if MACH_SUN4I || MACH_SUN5I || MACH_SUN6I || MACH_SUN8I
 	default 127 if MACH_SUN7I
 	default 4145117 if MACH_SUN9I
+	default 3881915 if MACH_SUN50I
 	---help---
 	Set the dram zq value.
 
@@ -171,6 +197,7 @@
 	bool "sunxi dram odt enable"
 	default n if !MACH_SUN8I_A23
 	default y if MACH_SUN8I_A23
+	default y if MACH_SUN50I
 	---help---
 	Select this to enable dram odt (on die termination).
 
diff --git a/common/spl/spl.c b/common/spl/spl.c
index f7df834..a76ea3a 100644
--- a/common/spl/spl.c
+++ b/common/spl/spl.c
@@ -115,7 +115,7 @@
 		}
 		spl_image->os = image_get_os(header);
 		spl_image->name = image_get_name(header);
-		debug("spl: payload image: %.*s load addr: 0x%x size: %d\n",
+		debug("spl: payload image: %.*s load addr: 0x%lx size: %d\n",
 			(int)sizeof(spl_image->name), spl_image->name,
 			spl_image->load_addr, spl_image->size);
 	} else {
@@ -140,7 +140,7 @@
 			spl_image->load_addr = CONFIG_SYS_LOAD_ADDR;
 			spl_image->entry_point = CONFIG_SYS_LOAD_ADDR;
 			spl_image->size = end - start;
-			debug("spl: payload zImage, load addr: 0x%x size: %d\n",
+			debug("spl: payload zImage, load addr: 0x%lx size: %d\n",
 			      spl_image->load_addr, spl_image->size);
 			return 0;
 		}
@@ -164,9 +164,9 @@
 	typedef void __noreturn (*image_entry_noargs_t)(void);
 
 	image_entry_noargs_t image_entry =
-		(image_entry_noargs_t)(unsigned long)spl_image->entry_point;
+		(image_entry_noargs_t)spl_image->entry_point;
 
-	debug("image entry point: 0x%X\n", spl_image->entry_point);
+	debug("image entry point: 0x%lX\n", spl_image->entry_point);
 	image_entry();
 }
 
diff --git a/common/spl/spl_mmc.c b/common/spl/spl_mmc.c
index 85e3de8..0cd355c 100644
--- a/common/spl/spl_mmc.c
+++ b/common/spl/spl_mmc.c
@@ -36,7 +36,7 @@
 	/* Read the header too to avoid extra memcpy */
 	count = blk_dread(mmc_get_blk_desc(mmc), sector, image_size_sectors,
 			  (void *)(ulong)spl_image->load_addr);
-	debug("read %x sectors to %x\n", image_size_sectors,
+	debug("read %x sectors to %lx\n", image_size_sectors,
 	      spl_image->load_addr);
 	if (count != image_size_sectors)
 		return -EIO;
diff --git a/configs/pine64_plus_defconfig b/configs/pine64_plus_defconfig
index 6d0198f..2374170 100644
--- a/configs/pine64_plus_defconfig
+++ b/configs/pine64_plus_defconfig
@@ -1,12 +1,11 @@
 CONFIG_ARM=y
-CONFIG_ENABLE_ARM_SOC_BOOT0_HOOK=y
+CONFIG_RESERVE_ALLWINNER_BOOT0_HEADER=y
 CONFIG_ARCH_SUNXI=y
 CONFIG_MACH_SUN50I=y
-CONFIG_DRAM_CLK=672
-CONFIG_DRAM_ZQ=3881915
 CONFIG_DEFAULT_DEVICE_TREE="sun50i-a64-pine64-plus"
 # CONFIG_SYS_MALLOC_CLEAR_ON_INIT is not set
 CONFIG_CONSOLE_MUX=y
+CONFIG_SPL=y
 # CONFIG_CMD_IMLS is not set
 # CONFIG_CMD_FLASH is not set
 # CONFIG_CMD_FPGA is not set
diff --git a/drivers/mtd/spi/sunxi_spi_spl.c b/drivers/mtd/spi/sunxi_spi_spl.c
index e70064c..a24c115 100644
--- a/drivers/mtd/spi/sunxi_spi_spl.c
+++ b/drivers/mtd/spi/sunxi_spi_spl.c
@@ -284,4 +284,4 @@
 	return 0;
 }
 /* Use priorty 0 to override the default if it happens to be linked in */
-SPL_LOAD_IMAGE_METHOD("sunxi SPI" 0, BOOT_DEVICE_SPI, spl_spi_load_image);
+SPL_LOAD_IMAGE_METHOD("sunxi SPI", 0, BOOT_DEVICE_SPI, spl_spi_load_image);
diff --git a/include/common.h b/include/common.h
index a8d833b..ee0436b 100644
--- a/include/common.h
+++ b/include/common.h
@@ -15,6 +15,9 @@
 typedef volatile unsigned short vu_short;
 typedef volatile unsigned char	vu_char;
 
+/* Allow sharing constants with type modifiers between C and assembly. */
+#define _AC(X, Y)       (X##Y)
+
 #include <config.h>
 #include <errno.h>
 #include <asm-offsets.h>
@@ -936,7 +939,12 @@
 int cpu_release(int nr, int argc, char * const argv[]);
 #endif
 
+#else	/* __ASSEMBLY__ */
+
-#endif /* __ASSEMBLY__ */
+/* Drop a C type modifier (like in 3UL) for constants used in assembly. */
+#define _AC(X, Y)       X
+
+#endif	/* __ASSEMBLY__ */
 
 #ifdef CONFIG_PPC
 /*
@@ -948,6 +956,9 @@
 
 /* Put only stuff here that the assembler can digest */
 
+/* Declare an unsigned long constant digestable both by C and an assembler. */
+#define UL(x)           _AC(x, UL)
+
 #ifdef CONFIG_POST
 #define CONFIG_HAS_POST
 #ifndef CONFIG_POST_ALT_LIST
diff --git a/include/configs/sunxi-common.h b/include/configs/sunxi-common.h
index b0bfc0d..ab2d33f 100644
--- a/include/configs/sunxi-common.h
+++ b/include/configs/sunxi-common.h
@@ -35,7 +35,7 @@
 /*
  * High Level Configuration Options
  */
-#ifdef CONFIG_SPL_BUILD
+#if defined(CONFIG_SPL_BUILD) && !defined(CONFIG_ARM64)
 #define CONFIG_SYS_THUMB_BUILD	/* Thumbs mode to save space in SPL */
 #endif
 
@@ -183,7 +183,9 @@
 
 #define CONFIG_SPL_FRAMEWORK
 
+#ifndef CONFIG_ARM64		/* AArch64 FEL support is not ready yet */
 #define CONFIG_SPL_BOARD_LOAD_IMAGE
+#endif
 
 #if defined(CONFIG_MACH_SUN9I)
 #define CONFIG_SPL_TEXT_BASE		0x10040		/* sram start+header */
diff --git a/include/spl.h b/include/spl.h
index 6e746b2..bde4437 100644
--- a/include/spl.h
+++ b/include/spl.h
@@ -23,8 +23,8 @@
 struct spl_image_info {
 	const char *name;
 	u8 os;
-	u32 load_addr;
-	u32 entry_point;
+	ulong load_addr;
+	ulong entry_point;
 	u32 size;
 	u32 flags;
 };
diff --git a/lib/tiny-printf.c b/lib/tiny-printf.c
index 30ac759..dfa8432 100644
--- a/lib/tiny-printf.c
+++ b/lib/tiny-printf.c
@@ -38,8 +38,8 @@
 	info->zs = 1;
 }
 
-static void div_out(struct printf_info *info, unsigned int *num,
-		    unsigned int div)
+static void div_out(struct printf_info *info, unsigned long *num,
+		    unsigned long div)
 {
 	unsigned char dgt = 0;
 
@@ -56,9 +56,9 @@
 {
 	char ch;
 	char *p;
-	unsigned int num;
+	unsigned long num;
 	char buf[12];
-	unsigned int div;
+	unsigned long div;
 
 	while ((ch = *(fmt++))) {
 		if (ch != '%') {
@@ -66,8 +66,12 @@
 		} else {
 			bool lz = false;
 			int width = 0;
+			bool islong = false;
 
 			ch = *(fmt++);
+			if (ch == '-')
+				ch = *(fmt++);
+
 			if (ch == '0') {
 				ch = *(fmt++);
 				lz = 1;
@@ -80,6 +84,11 @@
 					ch = *fmt++;
 				}
 			}
+			if (ch == 'l') {
+				ch = *(fmt++);
+				islong = true;
+			}
+
 			info->bf = buf;
 			p = info->bf;
 			info->zs = 0;
@@ -89,24 +98,43 @@
 				goto abort;
 			case 'u':
 			case 'd':
-				num = va_arg(va, unsigned int);
-				if (ch == 'd' && (int)num < 0) {
-					num = -(int)num;
-					out(info, '-');
+				div = 1000000000;
+				if (islong) {
+					num = va_arg(va, unsigned long);
+					if (sizeof(long) > 4)
+						div *= div * 10;
+				} else {
+					num = va_arg(va, unsigned int);
+				}
+
+				if (ch == 'd') {
+					if (islong && (long)num < 0) {
+						num = -(long)num;
+						out(info, '-');
+					} else if (!islong && (int)num < 0) {
+						num = -(int)num;
+						out(info, '-');
+					}
 				}
 				if (!num) {
 					out_dgt(info, 0);
 				} else {
-					for (div = 1000000000; div; div /= 10)
+					for (; div; div /= 10)
 						div_out(info, &num, div);
 				}
 				break;
 			case 'x':
-				num = va_arg(va, unsigned int);
+				if (islong) {
+					num = va_arg(va, unsigned long);
+					div = 1UL << (sizeof(long) * 8 - 4);
+				} else {
+					num = va_arg(va, unsigned int);
+					div = 0x10000000;
+				}
 				if (!num) {
 					out_dgt(info, 0);
 				} else {
-					for (div = 0x10000000; div; div /= 0x10)
+					for (; div; div /= 0x10)
 						div_out(info, &num, div);
 				}
 				break;