arm: a37xx: pci: Fix a3700_fdt_fix_pcie_regions() function again

The a3700_fdt_fix_pcie_regions() function still computes nonsense.

It computes the fixup offset from the PCI address taken from the first
row of the "ranges" array, which means that:
- PCI address must equal CPU address (otherwise the computed fix offset
  will be wrong),
- the first row must contain the lowest address.

This is the case for the default device-tree, which is why we didn't
notice it.

It also adds the fixup offset to all PCI and CPU addresses, which is
wrong.

Instead:
1) The fixup offset must be computed from the CPU address, not PCI
   address.

2) The fixup offset must be computed from the row containing the lowest
   CPU address, which is not necessarily contained in the first row.

3) The PCI address - the address to which the PCIe controller remaps the
   address space as seen from the point of view of the PCIe device -
   must be fixed by the fix offset in the same way as the CPU address
   only in the special case when the CPU adn PCI addresses are the same.
   Same addresses means that remapping is disabled, and thus if we
   change the CPU address, we need also to change the PCI address so
   that the remapping is still disabled afterwards.

Consider an example:
  The ranges entries contain:
    PCI address   CPU address
    70000000      EA000000
    E9000000      E9000000
    EB000000      EB000000

  By default CPU PCIe window is at:        E8000000 - F0000000
  Consider the case when TF-A moves it to: F2000000 - FA000000

  Until now the function would take the PCI address of the first entry:
  70000000, and the new base, F2000000, to compute the fix offset:
  F2000000 - 70000000 = 82000000, and then add 8200000 to all addresses,
  resulting in
    PCI address   CPU address
    F2000000      6C000000
    6B000000      6B000000
    6D000000      6D000000
  which is complete nonsense - none of the CPU addresses is in the
  requested window.

  Now it will take the lowest CPU address, which is in second row,
  E9000000, and compute the fix offset F2000000 - E9000000 = 09000000,
  and then add it to all CPU addresses and those PCI addresses which
  equal to their corresponding CPU addresses, resulting in
    PCI address   CPU address
    70000000      F3000000
    F2000000      F2000000
    F4000000      F4000000
  where all of the CPU addresses are in the needed window.

Fixes: 4a82fca8e330 ("arm: a37xx: pci: Fix a3700_fdt_fix_pcie_regions() function")
Signed-off-by: Pali Rohár <pali@kernel.org>
Signed-off-by: Marek Behún <marek.behun@nic.cz>
Reviewed-by: Stefan Roese <sr@denx.de>
diff --git a/arch/arm/mach-mvebu/armada3700/cpu.c b/arch/arm/mach-mvebu/armada3700/cpu.c
index 23492f4..52b5109 100644
--- a/arch/arm/mach-mvebu/armada3700/cpu.c
+++ b/arch/arm/mach-mvebu/armada3700/cpu.c
@@ -316,8 +316,8 @@
 
 int a3700_fdt_fix_pcie_regions(void *blob)
 {
-	int acells, pacells, scells;
-	u32 base, fix_offset;
+	u32 base, lowest_cpu_addr, fix_offset;
+	int pci_cells, cpu_cells, size_cells;
 	const u32 *ranges;
 	int node, pnode;
 	int ret, i, len;
@@ -331,51 +331,80 @@
 		return node;
 
 	ranges = fdt_getprop(blob, node, "ranges", &len);
-	if (!ranges || len % sizeof(u32))
-		return -ENOENT;
+	if (!ranges || !len || len % sizeof(u32))
+		return -EINVAL;
 
 	/*
 	 * The "ranges" property is an array of
-	 * { <child address> <parent address> <size in child address space> }
+	 *   { <PCI address> <CPU address> <size in PCI address space> }
+	 * where number of PCI address cells and size cells is stored in the
+	 * "#address-cells" and "#size-cells" properties of the same node
+	 * containing the "ranges" property and number of CPU address cells
+	 * is stored in the parent's "#address-cells" property.
 	 *
-	 * All 3 elements can span a diffent number of cells. Fetch their sizes.
+	 * All 3 elements can span a diffent number of cells. Fetch them.
 	 */
 	pnode = fdt_parent_offset(blob, node);
-	acells = fdt_address_cells(blob, node);
-	pacells = fdt_address_cells(blob, pnode);
-	scells = fdt_size_cells(blob, node);
+	pci_cells = fdt_address_cells(blob, node);
+	cpu_cells = fdt_address_cells(blob, pnode);
+	size_cells = fdt_size_cells(blob, node);
 
-	/* Child PCI addresses always use 3 cells */
-	if (acells != 3)
-		return -ENOENT;
+	/* PCI addresses always use 3 cells */
+	if (pci_cells != 3)
+		return -EINVAL;
+
+	/* CPU addresses on Armada 37xx always use 2 cells */
+	if (cpu_cells != 2)
+		return -EINVAL;
 
-	/* Calculate fixup offset from first child address (in last cell) */
-	fix_offset = base - fdt32_to_cpu(ranges[2]);
+	for (i = 0; i < len / sizeof(u32);
+	     i += pci_cells + cpu_cells + size_cells) {
+		/*
+		 * Parent CPU addresses on Armada 37xx are always 32-bit, so
+		 * check that the high word is zero.
+		 */
+		if (fdt32_to_cpu(ranges[i + pci_cells]))
+			return -EINVAL;
 
-	/* If fixup offset is zero then there is nothing to fix */
+		if (i == 0 ||
+		    fdt32_to_cpu(ranges[i + pci_cells + 1]) < lowest_cpu_addr)
+			lowest_cpu_addr = fdt32_to_cpu(ranges[i + pci_cells + 1]);
+	}
+
+	/* Calculate fixup offset from the lowest (first) CPU address */
+	fix_offset = base - lowest_cpu_addr;
+
+	/* If fixup offset is zero there is nothing to fix */
 	if (!fix_offset)
 		return 0;
 
 	/*
-	 * Fix address (last cell) of each child address and each parent
-	 * address
+	 * Fix each CPU address and corresponding PCI address if PCI address
+	 * is not already remapped (has the same value)
 	 */
-	for (i = 0; i < len / sizeof(u32); i += acells + pacells + scells) {
+	for (i = 0; i < len / sizeof(u32);
+	     i += pci_cells + cpu_cells + size_cells) {
+		u32 cpu_addr;
+		u64 pci_addr;
 		int idx;
 
-		/* fix child address */
-		idx = i + acells - 1;
+		/* Fix CPU address */
+		idx = i + pci_cells + cpu_cells - 1;
+		cpu_addr = fdt32_to_cpu(ranges[idx]);
 		ret = fdt_setprop_inplace_u32_partial(blob, node, "ranges", idx,
-						      fdt32_to_cpu(ranges[idx]) +
-						      fix_offset);
+						      cpu_addr + fix_offset);
 		if (ret)
 			return ret;
 
-		/* fix parent address */
-		idx = i + acells + pacells - 1;
+		/* Fix PCI address only if it isn't remapped (is same as CPU) */
+		idx = i + pci_cells - 1;
+		pci_addr = ((u64)fdt32_to_cpu(ranges[idx - 1]) << 32) |
+			   fdt32_to_cpu(ranges[idx]);
+		if (cpu_addr != pci_addr)
+			continue;
+
 		ret = fdt_setprop_inplace_u32_partial(blob, node, "ranges", idx,
-						      fdt32_to_cpu(ranges[idx]) +
-						      fix_offset);
+						      cpu_addr + fix_offset);
 		if (ret)
 			return ret;
 	}