rockchip: spi: add optimised receive-only implementation

For the RK3399-Q7 we recommend storing SPL and u-boot.itb in the
on-module 32MBit (and sometimes even larger, if requested as part of a
configure-to-order configuration) SPI-NOR flash that is clocked for a
bitrate of 49.5MBit/s and connected in a single-IO configuration (the
RK3399 only supports single-IO for SPI).

Unfortunately, the existing SPI driver is excruciatingly slow at
reading out large chunks of data (in fact it is just as slow for small
chunks of data, but the overheads of the driver-framework make it less
noticeable): before this change, the throughput on a 4MB read from
SPI-NOR is 8.47MBit/s which equates a 17.11% bus-utilisation.

To improve on this, this commit adds an optimised receive-only
transfer (i.e.: out == NULL) handler that hooks into the main transfer
function and processes data in 16bit frames (utilising the full with
of each FIFO element).  As of now, the receive-only handler requires
the in-buffer to be 16bit aligned.  Any lingering data (i.e. either if
the in-buffer was not 16-bit aligned or if an odd number of bytes are
to be received) will be handled by the original 8bit reader/wirter.

Given that the SPI controller's documentation does not guarantuee any
interlocking between the RXFIFO and the master SCLK, the transfer loop
will be restarted for each chunk of 32 frames (i.e. 64 bytes).

With this new receive-only transfer handler, the throughput for a 4MB
read increases to 36.28MBit/s (i.e. 73.29% bus-utilisation): this is a
4x improvement over the baseline.

Signed-off-by: Philipp Tomsich <philipp.tomsich@theobroma-systems.com>
Reported-by: Klaus Goger <klaus.goger@theobroma-systems.com>

Series-Cc: Klaus Goger <klaus.goger@theobroma-systems.com>
Series-Cc: Christoph Muellner <christoph.muellner@theobroma-systems.com>
diff --git a/drivers/spi/rk_spi.c b/drivers/spi/rk_spi.c
index c807d78..fec41a4 100644
--- a/drivers/spi/rk_spi.c
+++ b/drivers/spi/rk_spi.c
@@ -2,6 +2,8 @@
 /*
  * spi driver for rockchip
  *
+ * (C) 2019 Theobroma Systems Design und Consulting GmbH
+ *
  * (C) Copyright 2015 Google, Inc
  *
  * (C) Copyright 2008-2013 Rockchip Electronics
@@ -330,6 +332,81 @@
 
 	rkspi_enable_chip(priv->regs, false);
 
+	return 0;
+}
+
+static inline int rockchip_spi_16bit_reader(struct udevice *dev,
+					    u8 **din, int *len)
+{
+	struct udevice *bus = dev->parent;
+	const struct rockchip_spi_params * const data =
+		(void *)dev_get_driver_data(bus);
+	struct rockchip_spi_priv *priv = dev_get_priv(bus);
+	struct rockchip_spi *regs = priv->regs;
+	const u32 saved_ctrlr0 = readl(&regs->ctrlr0);
+#if defined(DEBUG)
+	u32 statistics_rxlevels[33] = { };
+#endif
+	u32 frames = *len / 2;
+	u16 *in16 = (u16 *)(*din);
+	u32 max_chunk_size = SPI_FIFO_DEPTH;
+
+	if (!frames)
+		return 0;
+
+	/*
+	 * If the destination buffer is unaligned, we'd run into a problem
+	 * on ARMv8.  Given that this doesn't seem to be a real issue, we
+	 * just chicken out and fall back to the unoptimised implementation.
+	 */
+	if ((uintptr_t)*din & 1) {
+		debug("%s: unaligned buffer, din = %p\n", __func__, *din);
+		return 0;
+	}
+
+	// rockchip_spi_configure(dev, mode, size)
+	rkspi_enable_chip(regs, false);
+	clrsetbits_le32(&regs->ctrlr0,
+			TMOD_MASK << TMOD_SHIFT,
+			TMOD_RO << TMOD_SHIFT);
+	/* 16bit data frame size */
+	clrsetbits_le32(&regs->ctrlr0, DFS_MASK, DFS_16BIT);
+
+	/* Update caller's context */
+	const u32 bytes_to_process = 2 * frames;
+	*din += bytes_to_process;
+	*len -= bytes_to_process;
+
+	/* Process our frames */
+	while (frames) {
+		u32 chunk_size = min(frames, max_chunk_size);
+
+		frames -= chunk_size;
+
+		writew(chunk_size - 1, &regs->ctrlr1);
+		rkspi_enable_chip(regs, true);
+
+		do {
+			u32 rx_level = readw(&regs->rxflr);
+#if defined(DEBUG)
+			statistics_rxlevels[rx_level]++;
+#endif
+			chunk_size -= rx_level;
+			while (rx_level--)
+				*in16++ = readw(regs->rxdr);
+		} while (chunk_size);
+
+		rkspi_enable_chip(regs, false);
+	}
+
+#if defined(DEBUG)
+	debug("%s: observed rx_level during processing:\n", __func__);
+	for (int i = 0; i <= 32; ++i)
+		if (statistics_rxlevels[i])
+			debug("\t%2d: %d\n", i, statistics_rxlevels[i]);
+#endif
+	/* Restore the original transfer setup and return error-free. */
+	writel(saved_ctrlr0, &regs->ctrlr0);
 	return 0;
 }
 
@@ -344,7 +421,7 @@
 	const u8 *out = dout;
 	u8 *in = din;
 	int toread, towrite;
-	int ret;
+	int ret = 0;
 
 	debug("%s: dout=%p, din=%p, len=%x, flags=%lx\n", __func__, dout, din,
 	      len, flags);
@@ -355,6 +432,16 @@
 	if (flags & SPI_XFER_BEGIN)
 		spi_cs_activate(dev, slave_plat->cs);
 
+	/*
+	 * To ensure fast loading of firmware images (e.g. full U-Boot
+	 * stage, ATF, Linux kernel) from SPI flash, we optimise the
+	 * case of read-only transfers by using the full 16bits of each
+	 * FIFO element.
+	 */
+	if (!out)
+		ret = rockchip_spi_16bit_reader(dev, &in, &len);
+
+	/* This is the original 8bit reader/writer code */
 	while (len > 0) {
 		int todo = min(len, 0x10000);