[][kernel][common][hnat][Refactor HNAT flow to prevent entry modification racing]

[Description]
Refactor HNAT flow to prevent entry modification racing.

[Release-log]
N/A


Change-Id: Ia1541424ed0fc1f744f1bd5a0efc64e9c74cd0bf
Reviewed-on: https://gerrit.mediatek.inc/c/openwrt/feeds/mtk_openwrt_feeds/+/9246920
diff --git a/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat.h b/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat.h
index a5b578b..67309bf 100644
--- a/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat.h
+++ b/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat.h
@@ -1232,7 +1232,7 @@
 #endif
 
 #define UDF_PINGPONG_IFIDX GENMASK(6, 0)
-#define UDF_HNAT_PRE_FILLED BIT(7)
+#define UDF_HNAT_ENTRY_LOCKED BIT(7)
 
 #define HQOS_FLAG(dev, skb, qid)			\
 	((IS_HQOS_UL_MODE && IS_WAN(dev)) ||		\
@@ -1249,7 +1249,7 @@
 extern const struct of_device_id of_hnat_match[];
 extern struct mtk_hnat *hnat_priv;
 
-static inline int is_hnat_pre_filled(struct foe_entry *entry)
+static inline int is_hnat_entry_locked(struct foe_entry *entry)
 {
 	u32 udf = 0;
 
@@ -1258,7 +1258,30 @@
 	else
 		udf = entry->ipv6_5t_route.act_dp;
 
-	return !!(udf & UDF_HNAT_PRE_FILLED);
+	return !!(udf & UDF_HNAT_ENTRY_LOCKED);
+}
+
+static inline void hnat_set_entry_lock(struct foe_entry *entry, bool locked)
+{
+	if (IS_IPV4_GRP(entry)) {
+		if (locked)
+			entry->ipv4_hnapt.act_dp |= UDF_HNAT_ENTRY_LOCKED;
+		else
+			entry->ipv4_hnapt.act_dp &= ~UDF_HNAT_ENTRY_LOCKED;
+	} else {
+		if (locked)
+			entry->ipv6_5t_route.act_dp |= UDF_HNAT_ENTRY_LOCKED;
+		else
+			entry->ipv6_5t_route.act_dp &= ~UDF_HNAT_ENTRY_LOCKED;
+	}
+	/* Ensure the lock has been written to the entry before return */
+	wmb();
+}
+
+static inline void hnat_check_release_entry_lock(struct foe_entry *entry)
+{
+	if (is_hnat_entry_locked(entry))
+		hnat_set_entry_lock(entry, false);
 }
 
 #if defined(CONFIG_NET_DSA_MT7530)
diff --git a/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat_nf_hook.c b/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat_nf_hook.c
index 194af58..9b31e66 100644
--- a/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat_nf_hook.c
+++ b/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/hnat_nf_hook.c
@@ -1246,9 +1246,6 @@
 	if (!hnat_priv->data->mcast && is_multicast_ether_addr(eth->h_dest))
 		return 0;
 
-	if (whnat && is_hnat_pre_filled(foe))
-		return 0;
-
 	entry.bfib1.pkt_type = foe->udib1.pkt_type; /* Get packte type state*/
 	entry.bfib1.state = foe->udib1.state;
 
@@ -1274,8 +1271,7 @@
 			entry.ipv4_hnapt.etype = htons(ETH_P_IP);
 
 			/* DS-Lite WAN->LAN */
-			if (entry.ipv4_hnapt.bfib1.pkt_type == IPV4_DSLITE ||
-			    entry.ipv4_hnapt.bfib1.pkt_type == IPV4_MAP_E) {
+			if (IS_IPV4_DSLITE(&entry) || IS_IPV4_MAPE(&entry)) {
 				entry.ipv4_dslite.sip = foe->ipv4_dslite.sip;
 				entry.ipv4_dslite.dip = foe->ipv4_dslite.dip;
 				entry.ipv4_dslite.sport =
@@ -1284,7 +1280,7 @@
 					foe->ipv4_dslite.dport;
 
 #if defined(CONFIG_MEDIATEK_NETSYS_V2) || defined(CONFIG_MEDIATEK_NETSYS_V3)
-				if (entry.bfib1.pkt_type == IPV4_MAP_E) {
+				if (IS_IPV4_MAPE(&entry)) {
 					pptr = skb_header_pointer(skb,
 								  iph->ihl * 4,
 								  sizeof(_ports),
@@ -1336,7 +1332,7 @@
 				entry.ipv4_dslite.eg_keep_cls = 1;
 #endif
 
-			} else {
+			} else if (IS_IPV4_GRP(&entry)) {
 				entry.ipv4_hnapt.iblk2.dscp = iph->tos;
 				if (hnat_priv->data->per_flow_accounting)
 					entry.ipv4_hnapt.iblk2.mibf = 1;
@@ -1352,7 +1348,7 @@
 					else
 						entry.ipv4_hnapt.vlan1 =
 							skb->vlan_tci;
-			}
+				}
 
 				entry.ipv4_hnapt.sip = foe->ipv4_hnapt.sip;
 				entry.ipv4_hnapt.dip = foe->ipv4_hnapt.dip;
@@ -1361,20 +1357,23 @@
 
 				entry.ipv4_hnapt.new_sip = ntohl(iph->saddr);
 				entry.ipv4_hnapt.new_dip = ntohl(iph->daddr);
-			}
 
-			entry.ipv4_hnapt.bfib1.udp = udp;
-			if (IS_IPV4_HNAPT(foe)) {
-				pptr = skb_header_pointer(skb, iph->ihl * 4,
-							  sizeof(_ports),
-							  &_ports);
-				if (unlikely(!pptr))
-					return -1;
+				if (IS_IPV4_HNAPT(&entry)) {
+					pptr = skb_header_pointer(skb, iph->ihl * 4,
+								sizeof(_ports),
+								&_ports);
+					if (unlikely(!pptr))
+						return -1;
 
-				entry.ipv4_hnapt.new_sport = ntohs(pptr->src);
-				entry.ipv4_hnapt.new_dport = ntohs(pptr->dst);
+					entry.ipv4_hnapt.new_sport = ntohs(pptr->src);
+					entry.ipv4_hnapt.new_dport = ntohs(pptr->dst);
+				}
+			} else {
+				return 0;
 			}
 
+			entry.ipv4_hnapt.bfib1.udp = udp;
+
 #if defined(CONFIG_MEDIATEK_NETSYS_V3)
 			entry.ipv4_hnapt.eg_keep_ecn = 1;
 			entry.ipv4_hnapt.eg_keep_dscp = 1;
@@ -1417,7 +1416,7 @@
 				entry.ipv6_5t_route.iblk2.mibf = 1;
 			entry.ipv6_5t_route.bfib1.udp = udp;
 
-			if (IS_IPV6_6RD(foe)) {
+			if (IS_IPV6_6RD(&entry)) {
 				entry.ipv6_5t_route.bfib1.rmt = 1;
 				entry.ipv6_6rd.tunnel_sipv4 =
 					foe->ipv6_6rd.tunnel_sipv4;
@@ -1448,7 +1447,7 @@
 			entry.ipv6_3t_route.ipv6_dip3 =
 				foe->ipv6_3t_route.ipv6_dip3;
 
-			if (IS_IPV6_3T_ROUTE(foe)) {
+			if (IS_IPV6_3T_ROUTE(&entry)) {
 				entry.ipv6_3t_route.prot =
 					foe->ipv6_3t_route.prot;
 				entry.ipv6_3t_route.hph =
@@ -1457,9 +1456,7 @@
 				entry.ipv6_3t_route.eg_keep_ecn = 1;
 				entry.ipv6_3t_route.eg_keep_cls = 1;
 #endif
-			}
-
-			if (IS_IPV6_5T_ROUTE(foe) || IS_IPV6_6RD(foe)) {
+			} else if (IS_IPV6_5T_ROUTE(&entry) || IS_IPV6_6RD(&entry)) {
 				entry.ipv6_5t_route.sport =
 					foe->ipv6_5t_route.sport;
 				entry.ipv6_5t_route.dport =
@@ -1468,6 +1465,8 @@
 				entry.ipv6_5t_route.eg_keep_ecn = 1;
 				entry.ipv6_5t_route.eg_keep_cls = 1;
 #endif
+			} else {
+				return 0;
 			}
 
 			if (IS_IPV6_5T_ROUTE(&entry) &&
@@ -1794,7 +1793,7 @@
 		}
 	}
 
-	if (IS_IPV4_GRP(foe)) {
+	if (IS_IPV4_GRP(&entry)) {
 		entry.ipv4_hnapt.iblk2.dp = gmac;
 		entry.ipv4_hnapt.iblk2.port_mg =
 			(hnat_priv->data->version == MTK_HNAT_V1_1) ? 0x3f : 0;
@@ -1893,15 +1892,54 @@
 		entry.bfib1.ttl = 1;
 		entry.bfib1.state = BIND;
 	} else {
-		if (IS_IPV4_GRP(foe))
-			entry.ipv4_hnapt.act_dp |= UDF_HNAT_PRE_FILLED;
-		else
-			entry.ipv6_5t_route.act_dp |= UDF_HNAT_PRE_FILLED;
+		if (spin_trylock(&hnat_priv->entry_lock)) {
+			/* If this entry is already lock, we should not modify it right now */
+			if (is_hnat_entry_locked(foe)) {
+				skb_hnat_filled(skb) = HNAT_INFO_FILLED;
+				spin_unlock(&hnat_priv->entry_lock);
+				return 0;
+			}
+
+			/* Final check if the entry is not in UNBIND state,
+			 * we should not modify it right now.
+			 */
+			if (foe->udib1.state != UNBIND) {
+				spin_unlock(&hnat_priv->entry_lock);
+				return 0;
+			}
+
+			/* Keep the entry locked until hook_tx is called */
+			hnat_set_entry_lock(&entry, true);
+
+			/* We must ensure all info has been updated before set to hw */
+			wmb();
+			memcpy(foe, &entry, sizeof(entry));
+
+			skb_hnat_filled(skb) = HNAT_INFO_FILLED;
+			spin_unlock(&hnat_priv->entry_lock);
+		}
+		return 0;
 	}
 
+	/* Final check if the entry is not in UNBIND state,
+	 * we should not modify it right now.
+	 */
+	if (foe->udib1.state != UNBIND)
+		return 0;
+
+	/* We must ensure all info has been updated before set to hw */
 	wmb();
-	memcpy(foe, &entry, sizeof(entry));
-	/*reset statistic for this entry*/
+	/* Before entry enter BIND state, write other fields first,
+	 * prevent racing with hardware accesses.
+	 */
+	memcpy(&foe->ipv4_hnapt.sip, &entry.ipv4_hnapt.sip,
+	       sizeof(entry) - sizeof(entry.bfib1));
+	/* We must ensure all info has been updated before set to hw */
+	wmb();
+	/* After other fields have been written, write info1 to BIND the entry */
+	memcpy(foe, &entry, sizeof(entry.bfib1));
+
+	/* reset statistic for this entry */
 	if (hnat_priv->data->per_flow_accounting &&
 	    skb_hnat_entry(skb) < hnat_priv->foe_etry_num &&
 	    skb_hnat_ppe(skb) < CFG_PPE_NUM) {
@@ -1917,11 +1955,25 @@
 {
 	struct foe_entry *hw_entry, entry;
 	struct ethhdr *eth;
+	struct iphdr *iph;
+	struct ipv6hdr *ip6h;
 
-	if (skb_hnat_alg(skb) ||
-	    !is_magic_tag_valid(skb) || !IS_SPACE_AVAILABLE_HEAD(skb))
+	if (!skb_hnat_is_hashed(skb) || skb_hnat_ppe(skb) >= CFG_PPE_NUM)
+		return NF_ACCEPT;
+
+	hw_entry = &hnat_priv->foe_table_cpu[skb_hnat_ppe(skb)][skb_hnat_entry(skb)];
+	memcpy(&entry, hw_entry, sizeof(entry));
+
+	if (!is_hnat_entry_locked(&entry))
 		return NF_ACCEPT;
 
+	if (unlikely(entry.bfib1.state != UNBIND))
+		goto check_release_entry_lock;
+
+	if (skb_hnat_alg(skb) || !is_hnat_info_filled(skb) ||
+	    !is_magic_tag_valid(skb) || !IS_SPACE_AVAILABLE_HEAD(skb))
+		goto check_release_entry_lock;
+
 	trace_printk(
 		"[%s]entry=%x reason=%x gmac_no=%x wdmaid=%x rxid=%x wcid=%x bssid=%x\n",
 		__func__, skb_hnat_entry(skb), skb_hnat_reason(skb), gmac_no,
@@ -1930,35 +1982,20 @@
 
 	if ((gmac_no != NR_WDMA0_PORT) && (gmac_no != NR_WDMA1_PORT) &&
 	    (gmac_no != NR_WDMA2_PORT) && (gmac_no != NR_WHNAT_WDMA_PORT))
-		return NF_ACCEPT;
+		goto check_release_entry_lock;
 
 	if (unlikely(!skb_mac_header_was_set(skb)))
-		return NF_ACCEPT;
-
-	if (!skb_hnat_is_hashed(skb))
-		return NF_ACCEPT;
-
-	if (skb_hnat_entry(skb) >= hnat_priv->foe_etry_num ||
-	    skb_hnat_ppe(skb) >= CFG_PPE_NUM)
-		return NF_ACCEPT;
-
-	hw_entry = &hnat_priv->foe_table_cpu[skb_hnat_ppe(skb)][skb_hnat_entry(skb)];
-	memcpy(&entry, hw_entry, sizeof(entry));
-	if (entry.bfib1.state == BIND)
-		return NF_ACCEPT;
+		goto check_release_entry_lock;
 
 	if (skb_hnat_reason(skb) != HIT_UNBIND_RATE_REACH)
-		return NF_ACCEPT;
-
-	if (!is_hnat_pre_filled(&entry))
-		return NF_ACCEPT;
+		goto check_release_entry_lock;
 
 	eth = eth_hdr(skb);
 
-	/*not bind multicast if PPE mcast not enable*/
+	/* not bind multicast if PPE mcast not enable */
 	if (!hnat_priv->data->mcast) {
 		if (is_multicast_ether_addr(eth->h_dest))
-			return NF_ACCEPT;
+			goto check_release_entry_lock;
 
 		if (IS_IPV4_GRP(&entry))
 			entry.ipv4_hnapt.iblk2.mcast = 0;
@@ -1966,7 +2003,22 @@
 			entry.ipv6_5t_route.iblk2.mcast = 0;
 	}
 
-	spin_lock(&hnat_priv->entry_lock);
+	/* not bind if IP of entry and skb are different */
+	if (IS_IPV4_HNAPT(&entry)) {
+		iph = ip_hdr(skb);
+		if (!iph ||
+		    entry.ipv4_hnapt.new_sip != ntohl(iph->saddr) ||
+		    entry.ipv4_hnapt.new_dip != ntohl(iph->daddr))
+			goto check_release_entry_lock;
+	} else if (IS_IPV6_5T_ROUTE(&entry)) {
+		ip6h = ipv6_hdr(skb);
+		if (!ip6h ||
+		    !hnat_ipv6_addr_equal(&entry.ipv6_5t_route.ipv6_sip0, &ip6h->saddr) ||
+		    !hnat_ipv6_addr_equal(&entry.ipv6_5t_route.ipv6_dip0, &ip6h->daddr)) {
+			goto check_release_entry_lock;
+		}
+	}
+
 	/* Some mt_wifi virtual interfaces, such as apcli,
 	 * will change the smac for specail purpose.
 	 */
@@ -2166,13 +2218,34 @@
 
 	entry.bfib1.ttl = 1;
 	entry.bfib1.state = BIND;
-	if (IS_IPV4_GRP(&entry))
-		entry.ipv4_hnapt.act_dp &= ~UDF_HNAT_PRE_FILLED;
-	else
-		entry.ipv6_5t_route.act_dp &= ~UDF_HNAT_PRE_FILLED;
+
+	/* Final check if the entry is not in UNBIND state,
+	 * we should not modify it right now.
+	 */
+	if (hw_entry->udib1.state != UNBIND)
+		goto check_release_entry_lock;
+
 	/* We must ensure all info has been updated before set to hw */
 	wmb();
-	memcpy(hw_entry, &entry, sizeof(entry));
+	/* Before entry enter BIND state, write other fields first,
+	 * prevent racing with hardware accesses.
+	 */
+	memcpy(&hw_entry->ipv4_hnapt.sip, &entry.ipv4_hnapt.sip,
+	       sizeof(entry) - sizeof(entry.bfib1));
+	/* We must ensure all info has been updated before set to hw */
+	wmb();
+	/* After other fields have been written, write info1 to BIND the entry */
+	memcpy(hw_entry, &entry, sizeof(entry.bfib1));
+
+	hnat_set_entry_lock(hw_entry, false);
+
+	/* reset statistic for this entry */
+	if (hnat_priv->data->per_flow_accounting) {
+		memset(&hnat_priv->acct[skb_hnat_ppe(skb)][skb_hnat_entry(skb)],
+			0, sizeof(struct hnat_accounting));
+		hnat_priv->acct[skb_hnat_ppe(skb)][skb_hnat_entry(skb)].nfct =
+			skb_get_nfct(skb);
+	}
 
 #if defined(CONFIG_MEDIATEK_NETSYS_V3)
 	if (debug_level >= 7) {
@@ -2227,7 +2300,10 @@
 		}
 	}
 #endif
-	spin_unlock(&hnat_priv->entry_lock);
+	return NF_ACCEPT;
+
+check_release_entry_lock:
+	hnat_check_release_entry_lock(hw_entry);
 	return NF_ACCEPT;
 }
 
@@ -2239,6 +2315,7 @@
 	}
 
 	skb_hnat_alg(skb) = 0;
+	skb_hnat_filled(skb) = 0;
 	skb_hnat_magic_tag(skb) = HNAT_MAGIC_TAG;
 
 	if (skb_hnat_iface(skb) == FOE_MAGIC_WED0)
@@ -2715,9 +2792,7 @@
 		if (fn && fn(skb, arp_dev, &hw_path))
 			break;
 
-		spin_lock(&hnat_priv->entry_lock);
 		skb_to_hnat_info(skb, out, entry, &hw_path);
-		spin_unlock(&hnat_priv->entry_lock);
 		break;
 	case HIT_BIND_KEEPALIVE_DUP_OLD_HDR:
 		/* update hnat count to nf_conntrack by keepalive */
diff --git a/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/nf_hnat_mtk.h b/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/nf_hnat_mtk.h
index 7892cf2..0c4c7ed 100644
--- a/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/nf_hnat_mtk.h
+++ b/21.02/files/target/linux/mediatek/files-5.4/drivers/net/ethernet/mediatek/mtk_hnat/nf_hnat_mtk.h
@@ -85,6 +85,7 @@
 #define HAS_HQOS_MAGIC_TAG(skb) (qos_toggle && skb->protocol == HQOS_MAGIC_TAG)
 
 #define HNAT_MAGIC_TAG 0x6789
+#define HNAT_INFO_FILLED 0x7
 #define WIFI_INFO_LEN 6
 #define FOE_INFO_LEN (10 + WIFI_INFO_LEN)
 #define IS_SPACE_AVAILABLE_HEAD(skb)                                           \
@@ -134,6 +135,7 @@
 #define set_to_ppe(skb) (HNAT_SKB_CB2(skb)->magic = 0x78681415)
 #define is_from_extge(skb) (HNAT_SKB_CB2(skb)->magic == 0x78786688)
 #define is_magic_tag_valid(skb) (skb_hnat_magic_tag(skb) == HNAT_MAGIC_TAG)
+#define is_hnat_info_filled(skb) (skb_hnat_filled(skb) == HNAT_INFO_FILLED)
 #define set_from_mape(skb) (HNAT_SKB_CB2(skb)->magic = 0x78787788)
 #define is_from_mape(skb) (HNAT_SKB_CB2(skb)->magic == 0x78787788)
 #define is_unreserved_port(hdr)						       \