MX28: SPI: Fix the DMA chaining

It turns out that for the SPI DMA to properly support continuous
transfers longer than 65280 bytes, there are some very important
details that were left out of the documentation.

Firstly, the XFER_SIZE register is not written once with the whole
length of the transfer; instead, each and every chained descriptor
writes it with the length of that descriptor's data buffer.
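
To illustrate, the total length maps onto per-descriptor XFER_SIZE
values roughly as in the small stand-alone sketch below. The 0xff00
(65280) per-descriptor limit matches the figure above; the variable
names only mirror those used in mxs_spi.c and the program itself is
purely illustrative:

	#include <stdio.h>

	int main(void)
	{
		unsigned int length = 150000;	/* example total transfer length */
		const unsigned int xfer_max_sz = 0xff00; /* max bytes per chained descriptor */
		unsigned int tl;
		int i = 0;

		while (length) {
			tl = (length < xfer_max_sz) ? length : xfer_max_sz;
			/* Each descriptor carries its own chunk size in
			 * XFER_SIZE, never the total transfer length. */
			printf("desc %d: XFER_SIZE = %u\n", i++, tl);
			length -= tl;
		}
		return 0;
	}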

Next, the demo code supplied by FSL writes only one PIO word per
descriptor, but this does not work when the descriptors are chained,
since the XFER_SIZE register must be written as well. It is therefore
essential to use four PIO words: CTRL0, CMD0, CMD1 and XFER_SIZE.
CMD0 and CMD1 are written as zero, since they do not apply here. The
DMA engine programs the PIO words in incrementing register order, so
all four words are needed to reach XFER_SIZE.
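
For reference, this is the resulting per-descriptor PIO setup from
the change below, annotated with the SSP registers the DMA engine
writes them to (register names as in the i.MX28 reference manual):

	dp->cmd.pio_words[0] = ctrl0;	/* HW_SSP_CTRL0 */
	dp->cmd.pio_words[1] = 0;	/* HW_SSP_CMD0, unused here */
	dp->cmd.pio_words[2] = 0;	/* HW_SSP_CMD1, unused here */
	dp->cmd.pio_words[3] = tl;	/* HW_SSP_XFER_SIZE, this descriptor's length */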

Finally, unlike in the demo code supplied by FSL, the
SSP_CTRL0_IGNORE_CRC bit must not be set for the whole transfer; it
must be set only on the last descriptor in the chain.
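
In the code below this means adjusting ctrl0 only once the final
chunk of the chain is being queued, roughly as in this annotated
excerpt from the change (the surrounding descriptor loop is elided):

	/* Last descriptor of the chain: signal completion ... */
	if (!length) {
		dp->cmd.data |= MXS_DMA_DESC_IRQ | MXS_DMA_DESC_DEC_SEM;

		/* ... and, if the caller ends the SPI transfer here, drop
		 * chip-select and set IGNORE_CRC on this descriptor only. */
		if (flags & SPI_XFER_END) {
			ctrl0 &= ~SSP_CTRL0_LOCK_CS;
			ctrl0 |= SSP_CTRL0_IGNORE_CRC;
		}
	}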

Signed-off-by: Marek Vasut <marex@denx.de>
Cc: Fabio Estevam <festevam@gmail.com>
Cc: Otavio Salvador <otavio@ossystems.com.br>
Cc: Stefano Babic <sbabic@denx.de>
diff --git a/drivers/spi/mxs_spi.c b/drivers/spi/mxs_spi.c
index c399707..42e4c99 100644
--- a/drivers/spi/mxs_spi.c
+++ b/drivers/spi/mxs_spi.c
@@ -227,6 +227,7 @@
 	const uint32_t dstart = (uint32_t)data;
 	int dmach;
 	int tl;
+	int ret = 0;
 
 	ALLOC_CACHE_ALIGN_BUFFER(struct mxs_dma_desc, desc, desc_count);
 
@@ -240,8 +241,6 @@
 	if (!write)
 		ctrl0 |= SSP_CTRL0_READ;
 
-	writel(length, &ssp_regs->hw_ssp_xfer_size);
-
 	if (length % ARCH_DMA_MINALIGN)
 		cache_data_count = roundup(length, ARCH_DMA_MINALIGN);
 	else
@@ -284,39 +283,47 @@
 			tl = min(length, xfer_max_sz);
 
 		dp->cmd.data |=
-			(tl << MXS_DMA_DESC_BYTES_OFFSET) |
-			(1 << MXS_DMA_DESC_PIO_WORDS_OFFSET) |
+			((tl & 0xffff) << MXS_DMA_DESC_BYTES_OFFSET) |
+			(4 << MXS_DMA_DESC_PIO_WORDS_OFFSET) |
 			MXS_DMA_DESC_HALT_ON_TERMINATE |
 			MXS_DMA_DESC_TERMINATE_FLUSH;
-		dp->cmd.pio_words[0] = ctrl0;
 
 		data += tl;
 		length -= tl;
 
+		if (!length) {
+			dp->cmd.data |= MXS_DMA_DESC_IRQ | MXS_DMA_DESC_DEC_SEM;
+
+			if (flags & SPI_XFER_END) {
+				ctrl0 &= ~SSP_CTRL0_LOCK_CS;
+				ctrl0 |= SSP_CTRL0_IGNORE_CRC;
+			}
+		}
+
+		/*
+		 * Write CTRL0, CMD0, CMD1, XFER_SIZE registers. It is
+		 * essential that the XFER_SIZE register is written on
+		 * a per-descriptor basis with the same size as is the
+		 * descriptor!
+		 */
+		dp->cmd.pio_words[0] = ctrl0;
+		dp->cmd.pio_words[1] = 0;
+		dp->cmd.pio_words[2] = 0;
+		dp->cmd.pio_words[3] = tl;
+
 		mxs_dma_desc_append(dmach, dp);
 
 		dp++;
 	}
 
-	dp->address = (dma_addr_t)dp;
-	dp->cmd.address = (dma_addr_t)0;
-	dp->cmd.data = MXS_DMA_DESC_COMMAND_NO_DMAXFER |
-			(1 << MXS_DMA_DESC_PIO_WORDS_OFFSET) |
-			MXS_DMA_DESC_IRQ | MXS_DMA_DESC_DEC_SEM;
-	if (flags & SPI_XFER_END) {
-		ctrl0 &= ~SSP_CTRL0_LOCK_CS;
-		dp->cmd.pio_words[0] = ctrl0 | SSP_CTRL0_IGNORE_CRC;
-	}
-	mxs_dma_desc_append(dmach, dp);
-
 	if (mxs_dma_go(dmach))
-		return -EINVAL;
+		ret = -EINVAL;
 
 	/* The data arrived into DRAM, invalidate cache over them */
 	if (!write)
 		invalidate_dcache_range(dstart, dstart + cache_data_count);
 
-	return 0;
+	return ret;
 }
 
 int spi_xfer(struct spi_slave *slave, unsigned int bitlen,