From 03fadbbb3b8750e47f40f472ba71c88b937eadb5 Mon Sep 17 00:00:00 2001
From: Bo Jiao <Bo.Jiao@mediatek.com>
Date: Mon, 6 Feb 2023 11:56:12 +0800
Subject: [PATCH 2000/2010] wifi: mt76: mt7996: add PAO tx with hif_txd ver2.1

Change-Id: I1cef01616b8292cf20991e2683bc1db237095890
---
 Makefile        |   2 +-
 dma.c           |  76 +++++++++++++++++++
 mt76.h          |  39 ++++++++++
 mt7996/init.c   |   3 +-
 mt7996/mac.c    | 160 +++++++++++++++++++++++++++++++++++++--
 mt7996/mac.h    |  28 ++++++-
 mt7996/main.c   |   4 +
 mt7996/mmio.c   |   9 ++-
 mt7996/mt7996.h |   3 +
 mt7996/pci.c    |   4 +
 pao-tx.c        | 194 ++++++++++++++++++++++++++++++++++++++++++++++++
 tx.c            |  19 +++++
 12 files changed, 528 insertions(+), 13 deletions(-)
 create mode 100644 pao-tx.c

diff --git a/Makefile b/Makefile
index 9c287cf..1736df3 100644
--- a/Makefile
+++ b/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_MT76_CONNAC_LIB) += mt76-connac-lib.o
 
 mt76-y := \
 	mmio.o util.o trace.o dma.o mac80211.o debugfs.o eeprom.o \
-	tx.o agg-rx.o mcu.o
+	tx.o agg-rx.o mcu.o pao-tx.o
 
 mt76-$(CONFIG_PCI) += pci.o
 mt76-$(CONFIG_NL80211_TESTMODE) += testmode.o
diff --git a/dma.c b/dma.c
index e1e9062..a3bcc76 100644
--- a/dma.c
+++ b/dma.c
@@ -492,6 +492,76 @@ error:
 	return -ENOMEM;
 }
 
+static int
+mt76_dma_tx_queue_skb_pao(struct mt76_dev *dev, struct mt76_queue *q,
+                         enum mt76_txq_id qid, struct mt76_tx_tid *tid,
+                         struct ieee80211_sta *sta)
+{
+	struct ieee80211_tx_status status = {
+		.sta = sta,
+	};
+	struct mt76_tx_info tx_info = {}; /* .info and skip_unmap must not be stack garbage */
+	struct ieee80211_hw *hw;
+	struct mt76_txwi_cache *t;
+	int i, n = 0, ret;
+	u8 *txwi;
+
+	t = tid->txwi;
+	tx_info.buf[n].addr = t->dma_addr; /* buf[0] is the txwi itself */
+	tx_info.buf[n++].len = dev->drv->txwi_size;
+
+	for (i = 0; i < t->nbuf; i++) { /* then one entry per aggregated frame */
+		tx_info.buf[n].addr = t->buf[i].addr;
+		tx_info.buf[n++].len = t->buf[i].len;
+	}
+	tx_info.skb = t->pao_skb[0]; /* first skb */
+	tx_info.nbuf = n;
+	tx_info.txd_ver = MT_HIF_TXD_V2_1;
+
+	if (q->queued + 1 >= q->ndesc - 1) {
+		ret = -ENOMEM;
+		goto free;
+	}
+
+	txwi = mt76_get_txwi_ptr(dev, t);
+	dma_sync_single_for_cpu(dev->dma_dev, t->dma_addr,
+				dev->drv->txwi_size,
+				DMA_TO_DEVICE);
+	ret = dev->drv->tx_prepare_skb(dev, txwi, qid,
+				       tid->wcid, sta, &tx_info);
+	dma_sync_single_for_device(dev->dma_dev, t->dma_addr,
+				   dev->drv->txwi_size,
+				   DMA_TO_DEVICE);
+	if (ret < 0)
+		goto free;
+
+	return mt76_dma_add_buf(dev, q, tx_info.buf, tx_info.nbuf,
+				tx_info.info, tx_info.skb, t);
+
+free:
+	for (n = 0; n < t->nbuf; n++) {
+		dma_unmap_single(dev->dma_dev, t->buf[n].addr,
+				 t->buf[n].len, DMA_TO_DEVICE);
+
+		/* fix tx_done accounting on queue overflow */
+		status.skb = t->pao_skb[n];
+#ifdef CONFIG_NL80211_TESTMODE
+		if (mt76_is_testmode_skb(dev, t->pao_skb[n], &hw)) {
+			struct mt76_phy *phy = hw->priv;
+
+			if (t->pao_skb[n] == phy->test.tx_skb)
+				phy->test.tx_done--;
+		}
+#endif
+		hw = mt76_tx_status_get_hw(dev, t->pao_skb[n]);
+		ieee80211_tx_status_ext(hw, &status);
+	}
+	t->nbuf = 0;	/* a recycled txwi must not look like a PAO aggregate */
+	mt76_put_txwi(dev, t);
+
+	return ret;
+}
+
 static int
 mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q,
 		      enum mt76_txq_id qid, struct sk_buff *skb,
@@ -546,6 +616,7 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q,
 		tx_info.buf[n++].len = iter->len;
 	}
 	tx_info.nbuf = n;
+	tx_info.txd_ver = MT_HIF_TXD_V1_0;
 
 	if (q->queued + (tx_info.nbuf + 1) / 2 >= q->ndesc - 1) {
 		ret = -ENOMEM;
@@ -560,6 +631,10 @@ mt76_dma_tx_queue_skb(struct mt76_dev *dev, struct mt76_queue *q,
 	if (ret < 0)
 		goto unmap;
 
+	/* PAO tx with hif_txd v2.1 */
+	if (ret)
+		return ret;
+
 	return mt76_dma_add_buf(dev, q, tx_info.buf, tx_info.nbuf,
 				tx_info.info, tx_info.skb, t);
 
@@ -940,6 +1015,7 @@ static const struct mt76_queue_ops mt76_dma_ops = {
 	.alloc = mt76_dma_alloc_queue,
 	.reset_q = mt76_dma_queue_reset,
 	.tx_queue_skb_raw = mt76_dma_tx_queue_skb_raw,
+	.tx_queue_skb_pao = mt76_dma_tx_queue_skb_pao,
 	.tx_queue_skb = mt76_dma_tx_queue_skb,
 	.tx_cleanup = mt76_dma_tx_cleanup,
 	.rx_cleanup = mt76_dma_rx_cleanup,
diff --git a/mt76.h b/mt76.h
index 8abb6f4..069113e 100644
--- a/mt76.h
+++ b/mt76.h
@@ -166,11 +166,30 @@ struct mt76_queue_buf {
 	bool skip_unmap;
 };
 
+#define MT_HIF_TXD_V1_0   2
+#define MT_HIF_TXD_V2_1   4
+struct mt76_tx_tid {
+	struct rcu_head rcu_head; /* used by kfree_rcu() in mt76_tx_pao_stop() */
+
+	struct mt76_dev *dev; /* set once in mt76_tx_pao_start() */
+
+	spinlock_t lock; /* taken around txwi/nframe/cur_len updates in pao-tx.c */
+	struct delayed_work pao_work; /* runs mt76_tx_pao_work() to flush the aggregate */
+
+	struct mt76_txwi_cache *txwi; /* txwi collecting the pending frames, NULL if none */
+	struct mt76_wcid *wcid; /* station entry this context was started for */
+	enum mt76_txq_id qid; /* recorded when the pending aggregate is opened */
+	int token; /* NOTE(review): not referenced by the code in this patch */
+	u32 cur_len; /* sum of the buffer lengths added so far */
+	u8 nframe; /* number of frames in the pending aggregate */
+};
+
 struct mt76_tx_info {
 	struct mt76_queue_buf buf[32];
 	struct sk_buff *skb;
 	int nbuf;
 	u32 info;
+	u8 txd_ver;
 };
 
 struct mt76_queue_entry {
@@ -260,6 +279,10 @@ struct mt76_queue_ops {
 	int (*tx_queue_skb_raw)(struct mt76_dev *dev, struct mt76_queue *q,
 				struct sk_buff *skb, u32 tx_info);
 
+	int (*tx_queue_skb_pao)(struct mt76_dev *dev, struct mt76_queue *q,
+				enum mt76_txq_id qid, struct mt76_tx_tid *tid,
+				struct ieee80211_sta *sta);
+
 	void *(*dequeue)(struct mt76_dev *dev, struct mt76_queue *q, bool flush,
 			 int *len, u32 *info, bool *more);
 
@@ -329,6 +352,7 @@ DECLARE_EWMA(signal, 10, 8);
 
 struct mt76_wcid {
 	struct mt76_rx_tid __rcu *aggr[IEEE80211_NUM_TIDS];
+	struct mt76_tx_tid __rcu *pao[IEEE80211_NUM_TIDS];
 
 	atomic_t non_aql_packets;
 	unsigned long flags;
@@ -375,6 +399,10 @@ struct mt76_txwi_cache {
 		struct sk_buff *skb;
 		void *ptr;
 	};
+
+	struct mt76_queue_buf buf[32];
+	struct sk_buff *pao_skb[32];
+	int nbuf;
 };
 
 struct mt76_rx_tid {
@@ -453,12 +481,15 @@ struct mt76_hw_cap {
 #define MT_DRV_RX_DMA_HDR		BIT(3)
 #define MT_DRV_HW_MGMT_TXQ		BIT(4)
 #define MT_DRV_AMSDU_OFFLOAD		BIT(5)
+#define MT_DRV_TX_PAO			BIT(6)
 
 struct mt76_driver_ops {
 	u32 drv_flags;
 	u32 survey_flags;
 	u16 txwi_size;
 	u16 token_size;
+	u16 tx_pao_num;
+	u16 tx_pao_size;
 	u8 mcs_rates;
 
 	void (*update_survey)(struct mt76_phy *phy);
@@ -1009,6 +1040,7 @@ static inline u16 mt76_rev(struct mt76_dev *dev)
 #define mt76_init_queues(dev, ...)		(dev)->mt76.queue_ops->init(&((dev)->mt76), __VA_ARGS__)
 #define mt76_queue_alloc(dev, ...)	(dev)->mt76.queue_ops->alloc(&((dev)->mt76), __VA_ARGS__)
 #define mt76_tx_queue_skb_raw(dev, ...)	(dev)->mt76.queue_ops->tx_queue_skb_raw(&((dev)->mt76), __VA_ARGS__)
+#define mt76_tx_queue_skb_pao(dev, ...)	(dev)->mt76.queue_ops->tx_queue_skb_pao(&((dev)->mt76), __VA_ARGS__)
 #define mt76_tx_queue_skb(dev, ...)	(dev)->mt76.queue_ops->tx_queue_skb(&((dev)->mt76), __VA_ARGS__)
 #define mt76_queue_rx_reset(dev, ...)	(dev)->mt76.queue_ops->rx_reset(&((dev)->mt76), __VA_ARGS__)
 #define mt76_queue_tx_cleanup(dev, ...)	(dev)->mt76.queue_ops->tx_cleanup(&((dev)->mt76), __VA_ARGS__)
@@ -1252,6 +1284,11 @@ void mt76_set_stream_caps(struct mt76_phy *phy, bool vht);
 int mt76_rx_aggr_start(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tid,
 		       u16 ssn, u16 size);
 void mt76_rx_aggr_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tid);
+int mt76_tx_pao_aggr(struct mt76_dev *dev, struct mt76_wcid *wcid,
+			enum mt76_txq_id qid, struct mt76_txwi_cache *txwi,
+			int id, dma_addr_t addr, u16 len);
+int mt76_tx_pao_start(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno);
+void mt76_tx_pao_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno);
 
 void mt76_wcid_key_setup(struct mt76_dev *dev, struct mt76_wcid *wcid,
 			 struct ieee80211_key_conf *key);
@@ -1263,6 +1300,8 @@ void mt76_tx_status_unlock(struct mt76_dev *dev, struct sk_buff_head *list)
 
 int mt76_tx_status_skb_add(struct mt76_dev *dev, struct mt76_wcid *wcid,
 			   struct sk_buff *skb);
+struct sk_buff *mt76_tx_status_skb_remove(struct mt76_dev *dev,
+					  struct mt76_wcid *wcid, int pktid);
 struct sk_buff *mt76_tx_status_skb_get(struct mt76_dev *dev,
 				       struct mt76_wcid *wcid, int pktid,
 				       struct sk_buff_head *list);
diff --git a/mt7996/init.c b/mt7996/init.c
index 768b298..196740d 100644
--- a/mt7996/init.c
+++ b/mt7996/init.c
@@ -475,7 +475,6 @@ static void mt7996_mac_init_basic_rates(struct mt7996_dev *dev)
 
 void mt7996_mac_init(struct mt7996_dev *dev)
 {
-#define HIF_TXD_V2_1	4
 	int i;
 
 	mt76_clear(dev, MT_MDP_DCR2, MT_MDP_DCR2_RX_TRANS_SHORT);
@@ -501,7 +500,7 @@ void mt7996_mac_init(struct mt7996_dev *dev)
 
 	mt7996_mcu_wa_cmd(dev, MCU_WA_PARAM_CMD(SET),
 			  MCU_WA_PARAM_HW_PATH_HIF_VER,
-			  HIF_TXD_V2_1, 0);
+			  MT_HIF_TXD_V2_1, 0);
 
 	for (i = MT_BAND0; i <= MT_BAND2; i++)
 		mt7996_mac_init_band(dev, i);
diff --git a/mt7996/mac.c b/mt7996/mac.c
index 993b43c..e2e8a97 100644
--- a/mt7996/mac.c
+++ b/mt7996/mac.c
@@ -1089,6 +1089,111 @@ void mt7996_mac_write_txwi(struct mt7996_dev *dev, __le32 *txwi,
 	}
 }
 
+static int
+mt7996_tx_prepare_skb_pao(struct mt76_dev *mdev, void *txwi_ptr,
+			struct mt76_wcid *wcid, struct mt76_tx_info *tx_info)
+{
+	struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx_info->skb);
+	struct ieee80211_vif *vif = info->control.vif;
+	struct mt7996_vif *mvif;
+	struct mt76_txwi_cache *t;
+	int i, j, id, nbuf = tx_info->nbuf - 1; /* buf[0] is the txwi, not a frame */
+	u32 val, total_len = MT_TXD_SIZE;
+	u8 tid, *txwi = (u8 *)txwi_ptr;
+	__le32 *txp, *txd = txwi_ptr;
+	u16 eth_type;
+
+	tid = tx_info->skb->priority & IEEE80211_QOS_CTL_TID_MASK;
+
+	t = (struct mt76_txwi_cache *)(txwi + mdev->drv->txwi_size);
+	id = mt76_token_consume(mdev, &t);
+	if (id < 0)
+		return id;
+	memset(txwi_ptr, 0, mdev->drv->txwi_size);
+
+	/* fill in txd */
+	val = FIELD_PREP(MT_TXD0_VER, 2);
+	for (i = 0; i < nbuf; i++) {
+		eth_type = get_unaligned_be16(&t->pao_skb[i]->data[12]);
+
+		total_len += ((eth_type <= ETH_P_802_3_MIN) ? 0 : 8);
+
+		if (i < nbuf - 1)
+			total_len += ALIGN(tx_info->buf[i + 1].len, 4);
+		else
+			total_len += tx_info->buf[i + 1].len;
+	}
+
+	if (nbuf > 1)
+		val |= FIELD_PREP(MT_TXD0_TX_BYTES, total_len);
+
+	txd[0] = cpu_to_le32(val);
+
+	/* fill in txp */
+	txp = (__le32 *) ((u8 *)txwi + MT_TXD_SIZE);
+
+	/* dw0 */
+	val = FIELD_PREP(GENMASK(31, 16), 0);
+
+	eth_type = get_unaligned_be16(&tx_info->skb->data[12]);
+	if ((eth_type <= ETH_P_802_3_MIN) ? 0 : 1)
+		val |= MT_TXP_ETYPE;
+
+	if (nbuf > 1)
+		val |= MT_TXP_AMSDU;
+
+	val |= FIELD_PREP(MT_TXP_TOKEN_ID, id);
+	val |= MT_TXP_FROM_HOST;
+	txp[0] = cpu_to_le32(val);
+
+	/* dw1 */
+	val = FIELD_PREP(MT_TXP_USER_PRIORITY, tid);
+
+	if (vif) {
+		mvif = (struct mt7996_vif *) vif->drv_priv;
+
+		val |= FIELD_PREP(MT_TXP_BSS_IDX, mvif->mt76.idx);
+	}
+	val |= FIELD_PREP(MT_TXP_BUF_NUM, nbuf);
+
+	val |= FIELD_PREP(MT_TXP_MSDU_CNT, nbuf);
+	txp[1] = cpu_to_le32(val);
+
+	/* dw2 */
+	val = FIELD_PREP(MT_TXP_ETH_TYPE, cpu_to_be16(eth_type));
+
+	val |= FIELD_PREP(MT_TXP_WLAN_IDX, wcid->idx);
+	txp[2] = cpu_to_le32(val);
+
+	for (i = 0, j = 4; i < nbuf; i++) {
+		struct mt76_queue_buf *buf = &tx_info->buf[i + 1];
+		u32 addr_h = upper_32_bits(buf->addr); /* safe with 32-bit dma_addr_t */
+
+		if (i % 2 == 0) { /* even: low addr word, then shared len/high word */
+			txp[j++] = cpu_to_le32(lower_32_bits(buf->addr));
+			val = FIELD_PREP(MT_TXP_BUF_LEN0, buf->len) |
+			      FIELD_PREP(MT_TXP_BUF_PTR0_H, addr_h);
+			txp[j] |= cpu_to_le32(val);
+		} else { /* odd: finish the shared word, then low addr word */
+			val = FIELD_PREP(MT_TXP_BUF_LEN1, buf->len) |
+			      FIELD_PREP(MT_TXP_BUF_PTR1_H, addr_h);
+			txp[j++] |= cpu_to_le32(val);
+			txp[j++] = cpu_to_le32(lower_32_bits(buf->addr));
+		}
+
+		txp[23] |= 1 << (i + MT_TXP_ML_SHIFT);
+	}
+	txp[23] = cpu_to_le32(txp[23]);
+
+	tx_info->skb = DMA_DUMMY_DATA;
+	/* pass partial skb header to fw */
+	tx_info->buf[1].len = MT_CT_PARSE_LEN;
+	tx_info->buf[1].skip_unmap = true;
+	tx_info->nbuf = 1;
+
+	return 0;
+}
+
 int mt7996_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr,
 			  enum mt76_txq_id qid, struct mt76_wcid *wcid,
 			  struct ieee80211_sta *sta,
@@ -1101,7 +1206,7 @@ int mt7996_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr,
 	struct ieee80211_vif *vif = info->control.vif;
 	struct mt76_connac_txp_common *txp;
 	struct mt76_txwi_cache *t;
-	int id, i, pid, nbuf = tx_info->nbuf - 1;
+	int ret, id, i, pid, nbuf = tx_info->nbuf - 1;
 	bool is_8023 = info->flags & IEEE80211_TX_CTL_HW_80211_ENCAP;
 	u8 *txwi = (u8 *)txwi_ptr;
 
@@ -1120,15 +1225,30 @@ int mt7996_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr,
 		}
 	}
 
+	if (tx_info->txd_ver == MT_HIF_TXD_V2_1)
+		return mt7996_tx_prepare_skb_pao(mdev, txwi_ptr,
+						 wcid, tx_info);
+
+	pid = mt76_tx_status_skb_add(mdev, wcid, tx_info->skb);
 	t = (struct mt76_txwi_cache *)(txwi + mdev->drv->txwi_size);
 	t->skb = tx_info->skb;
 
+	/* PAO tx condition check */
+	if (dev->pao_support && is_8023 && pid < MT_PACKET_ID_FIRST &&
+	    nbuf == 1 && tx_info->txd_ver == MT_HIF_TXD_V1_0) {
+		ret = mt76_tx_pao_aggr(mdev, wcid, qid, t, id,
+				       tx_info->buf[1].addr,
+				       tx_info->buf[1].len);
+		if (ret > 0)
+			return ret;
+	}
+
 	id = mt76_token_consume(mdev, &t);
-	if (id < 0)
+	if (id < 0) {
+		mt76_tx_status_skb_remove(mdev, wcid, pid);
 		return id;
-
-	pid = mt76_tx_status_skb_add(mdev, wcid, tx_info->skb);
-	memset(txwi_ptr, 0, MT_TXD_SIZE);
+	}
+	memset(txwi_ptr, 0, mdev->drv->txwi_size);
 	/* Transmit non qos data by 802.11 header and need to fill txd by host*/
 	if (!is_8023 || pid >= MT_PACKET_ID_FIRST)
 		mt7996_mac_write_txwi(dev, txwi_ptr, tx_info->skb, wcid, key,
@@ -1166,6 +1286,7 @@ int mt7996_tx_prepare_skb(struct mt76_dev *mdev, void *txwi_ptr,
 		txp->fw.rept_wds_wcid = cpu_to_le16(wcid->idx);
 
 	tx_info->skb = DMA_DUMMY_DATA;
+	tx_info->buf[0].len = MT_TXD_SIZE + sizeof(struct mt76_connac_fw_txp);
 
 	/* pass partial skb header to fw */
 	tx_info->buf[1].len = MT_CT_PARSE_LEN;
@@ -1200,6 +1321,21 @@ mt7996_tx_check_aggr(struct ieee80211_sta *sta, __le32 *txwi)
 		ieee80211_start_tx_ba_session(sta, tid, 0);
 }
 
+static void
+mt7996_txp_skb_unmap(struct mt76_dev *dev, struct mt76_txwi_cache *t)
+{
+	int i;
+
+	if (t && t->nbuf) {	/* PAO aggregate: buffers are tracked in t->buf[] */
+		for (i = 0; i < t->nbuf; i++)
+			dma_unmap_single(dev->dma_dev, t->buf[i].addr,
+					 t->buf[i].len, DMA_TO_DEVICE);
+		return;
+	}
+
+	mt76_connac_txp_skb_unmap(dev, t);	/* regular (non-PAO) txwi */
+}
+
 static void
 mt7996_txwi_free(struct mt7996_dev *dev, struct mt76_txwi_cache *t,
 		 struct ieee80211_sta *sta, struct list_head *free_list)
@@ -1209,7 +1345,7 @@ mt7996_txwi_free(struct mt7996_dev *dev, struct mt76_txwi_cache *t,
 	__le32 *txwi;
 	u16 wcid_idx;
 
-	mt76_connac_txp_skb_unmap(mdev, t);
+	mt7996_txp_skb_unmap(mdev, t);
 	if (!t->skb)
 		goto out;
 
@@ -1224,6 +1360,18 @@ mt7996_txwi_free(struct mt7996_dev *dev, struct mt76_txwi_cache *t,
 		wcid_idx = le32_get_bits(txwi[1], MT_TXD1_WLAN_IDX);
 	}
 
+	if (t && t->nbuf) {
+		int i;
+
+		for (i = 0; i < t->nbuf; i++) {
+			__mt76_tx_complete_skb(mdev, wcid_idx,
+					       t->pao_skb[i], free_list);
+
+			t->pao_skb[i] = NULL;
+		}
+		t->nbuf = 0;
+		goto out;
+	}
 	__mt76_tx_complete_skb(mdev, wcid_idx, t->skb, free_list);
 
 out:
diff --git a/mt7996/mac.h b/mt7996/mac.h
index 74ad1e8..1e360b3 100644
--- a/mt7996/mac.h
+++ b/mt7996/mac.h
@@ -173,7 +173,8 @@ enum tx_mgnt_type {
 
 #define MT_TXD0_Q_IDX			GENMASK(31, 25)
 #define MT_TXD0_PKT_FMT			GENMASK(24, 23)
-#define MT_TXD0_ETH_TYPE_OFFSET		GENMASK(22, 16)
+#define MT_TXD0_VER			GENMASK(22, 19)
+#define MT_TXD0_ETH_TYPE_OFFSET		GENMASK(18, 16)
 #define MT_TXD0_TX_BYTES		GENMASK(15, 0)
 
 #define MT_TXD1_FIXED_RATE		BIT(31)
@@ -252,6 +253,31 @@ enum tx_mgnt_type {
 /* VHT/HE only use bits 0-3 */
 #define MT_TX_RATE_IDX			GENMASK(5, 0)
 
+#define MT_TXP_V2_SIZE			(24 * 4)
+
+#define MT_TXP_NON_CIPHER		BIT(3)
+#define MT_TXP_BC_MC_FLAG		GENMASK(6, 5)
+#define MT_TXP_FROM_HOST		BIT(7)
+#define MT_TXP_ETYPE			BIT(8)
+#define MT_TXP_AMSDU			BIT(9)
+#define MT_TXP_TOKEN_ID			GENMASK(31, 16)
+
+#define MT_TXP_BSS_IDX			GENMASK(7, 0)
+#define MT_TXP_USER_PRIORITY		GENMASK(15, 8)
+#define MT_TXP_BUF_NUM			GENMASK(20, 16)
+#define MT_TXP_MSDU_CNT			GENMASK(25, 21)
+
+#define MT_TXP_ETH_TYPE			GENMASK(15, 0)
+#define MT_TXP_WLAN_IDX			GENMASK(27, 16)
+
+#define MT_TXP_BUF_PTR0_L		GENMASK(31, 0)
+#define MT_TXP_BUF_LEN0			GENMASK(11, 0)
+#define MT_TXP_BUF_PTR0_H		GENMASK(15, 12)
+#define MT_TXP_BUF_LEN1			GENMASK(27, 16)
+#define MT_TXP_BUF_PTR1_H		GENMASK(31, 28)
+#define MT_TXP_BUF_PTR1_L		GENMASK(31, 0)
+#define MT_TXP_ML_SHIFT			16
+
 #define MT_TXFREE0_PKT_TYPE		GENMASK(31, 27)
 #define MT_TXFREE0_MSDU_CNT		GENMASK(25, 16)
 #define MT_TXFREE0_RX_BYTE		GENMASK(15, 0)
diff --git a/mt7996/main.c b/mt7996/main.c
index f0bdec6..b970595 100644
--- a/mt7996/main.c
+++ b/mt7996/main.c
@@ -803,12 +803,15 @@ mt7996_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	case IEEE80211_AMPDU_TX_OPERATIONAL:
 		mtxq->aggr = true;
 		mtxq->send_bar = false;
+		if (params->amsdu && dev->pao_support)
+			mt76_tx_pao_start(&dev->mt76, &msta->wcid, tid);
 		ret = mt7996_mcu_add_tx_ba(dev, params, true);
 		break;
 	case IEEE80211_AMPDU_TX_STOP_FLUSH:
 	case IEEE80211_AMPDU_TX_STOP_FLUSH_CONT:
 		mtxq->aggr = false;
 		clear_bit(tid, &msta->ampdu_state);
+		mt76_tx_pao_stop(&dev->mt76, &msta->wcid, tid);
 		ret = mt7996_mcu_add_tx_ba(dev, params, false);
 		break;
 	case IEEE80211_AMPDU_TX_START:
@@ -818,6 +821,7 @@ mt7996_ampdu_action(struct ieee80211_hw *hw, struct ieee80211_vif *vif,
 	case IEEE80211_AMPDU_TX_STOP_CONT:
 		mtxq->aggr = false;
 		clear_bit(tid, &msta->ampdu_state);
+		mt76_tx_pao_stop(&dev->mt76, &msta->wcid, tid);
 		ret = mt7996_mcu_add_tx_ba(dev, params, false);
 		ieee80211_stop_tx_ba_cb_irqsafe(vif, sta->addr, tid);
 		break;
diff --git a/mt7996/mmio.c b/mt7996/mmio.c
index 3a591a7..ec8415b 100644
--- a/mt7996/mmio.c
+++ b/mt7996/mmio.c
@@ -324,15 +324,18 @@ struct mt7996_dev *mt7996_mmio_probe(struct device *pdev,
 				     void __iomem *mem_base, u32 device_id)
 {
 	static const struct mt76_driver_ops drv_ops = {
-		/* txwi_size = txd size + txp size */
-		.txwi_size = MT_TXD_SIZE + sizeof(struct mt76_connac_fw_txp),
+		.txwi_size = MT_TXD_SIZE +
+			     max_t(u16, sizeof(struct mt76_connac_fw_txp), MT_TXP_V2_SIZE),
 		.drv_flags = MT_DRV_TXWI_NO_FREE |
 			     MT_DRV_AMSDU_OFFLOAD |
-			     MT_DRV_HW_MGMT_TXQ,
+			     MT_DRV_HW_MGMT_TXQ |
+			     MT_DRV_TX_PAO,
 		.survey_flags = SURVEY_INFO_TIME_TX |
 				SURVEY_INFO_TIME_RX |
 				SURVEY_INFO_TIME_BSS_RX,
 		.token_size = MT7996_TOKEN_SIZE,
+		.tx_pao_size = MT7996_TX_PAO_SIZE,
+		.tx_pao_num = MT7996_TX_PAO_NUM_MAX,
 		.tx_prepare_skb = mt7996_tx_prepare_skb,
 		.tx_complete_skb = mt76_connac_tx_complete_skb,
 		.rx_skb = mt7996_queue_rx_skb,
diff --git a/mt7996/mt7996.h b/mt7996/mt7996.h
index e371964..6ec40d2 100644
--- a/mt7996/mt7996.h
+++ b/mt7996/mt7996.h
@@ -39,6 +39,8 @@
 #define MT7996_EEPROM_SIZE		7680
 #define MT7996_EEPROM_BLOCK_SIZE	16
 #define MT7996_TOKEN_SIZE		16384
+#define MT7996_TX_PAO_SIZE		(IEEE80211_MAX_MPDU_LEN_VHT_11454 - 256)
+#define MT7996_TX_PAO_NUM_MAX		13
 
 #define MT7996_CFEND_RATE_DEFAULT	0x49	/* OFDM 24M */
 #define MT7996_CFEND_RATE_11B		0x03	/* 11B LP, 11M */
@@ -389,6 +391,7 @@ struct mt7996_dev {
 	bool tbtc_support:1;
 	bool flash_mode:1;
 	bool has_eht:1;
+	bool pao_support:1;
 
 	bool testmode_enable;
 	bool bin_file_mode;
diff --git a/mt7996/pci.c b/mt7996/pci.c
index c530105..97c3879 100644
--- a/mt7996/pci.c
+++ b/mt7996/pci.c
@@ -11,6 +11,9 @@
 #include "mac.h"
 #include "../trace.h"
 
+static bool pao_enable = false;
+module_param(pao_enable, bool, 0644);
+
 static LIST_HEAD(hif_list);
 static DEFINE_SPINLOCK(hif_lock);
 static u32 hif_idx;
@@ -121,6 +124,7 @@ static int mt7996_pci_probe(struct pci_dev *pdev,
 	if (IS_ERR(dev))
 		return PTR_ERR(dev);
 
+	dev->pao_support = pao_enable;
 	mdev = &dev->mt76;
 	mt7996_wfsys_reset(dev);
 	hif2 = mt7996_pci_init_hif2(pdev);
diff --git a/pao-tx.c b/pao-tx.c
new file mode 100644
index 0000000..a43f60f
--- /dev/null
+++ b/pao-tx.c
@@ -0,0 +1,194 @@
+// SPDX-License-Identifier: ISC
+/*
+ * Copyright (C) 2022 MediaTek Inc.
+ */
+#include "mt76.h"
+#include "trace.h"
+
+static void
+mt76_tx_pao_release(struct mt76_tx_tid *tid)
+{
+	struct mt76_dev *dev = tid->dev;
+	struct ieee80211_sta *sta = wcid_to_sta(tid->wcid);
+	struct mt76_phy *phy;
+	struct mt76_queue *q;
+
+	phy = mt76_dev_phy(dev, tid->wcid->phy_idx);
+	q = phy->q_tx[tid->qid];
+	/* flush the pending aggregate, if any; callers here hold tid->lock */
+	if (tid->nframe) {
+		dev->queue_ops->tx_queue_skb_pao(dev, q, tid->qid, tid, sta); /* result not checked */
+		dev->queue_ops->kick(dev, q);
+
+		tid->nframe = 0;
+		tid->cur_len = 0;
+		tid->txwi = NULL; /* the next frame opens a new aggregate */
+	}
+}
+
+static void
+mt76_tx_pao_work(struct work_struct *work)
+{
+	struct mt76_tx_tid *tid = container_of(work, struct mt76_tx_tid,
+					       pao_work.work);
+	/* delayed flush, queued by mt76_tx_pao_aggr() when an aggregate opens */
+	local_bh_disable();
+	rcu_read_lock();
+
+	spin_lock_bh(&tid->lock); /* same lock mt76_tx_pao_aggr() takes */
+
+	mt76_tx_pao_release(tid);
+
+	spin_unlock_bh(&tid->lock);
+
+	rcu_read_unlock();
+	local_bh_enable();
+}
+
+int mt76_tx_pao_aggr(struct mt76_dev *dev, struct mt76_wcid *wcid,
+		     enum mt76_txq_id qid, struct mt76_txwi_cache *txwi,
+		     int id, dma_addr_t addr, u16 len)
+{
+	struct mt76_txwi_cache *t;
+	struct sk_buff *skb = txwi->skb;
+	struct mt76_tx_tid *tid;
+	struct ieee80211_sta *sta;
+	u16 eth_type = get_unaligned_be16(&skb->data[12]);
+	u32 pao_size;
+	u8 tidno, nframe;
+
+	if (!(dev->drv->drv_flags & MT_DRV_TX_PAO))
+		return -EINVAL;
+
+	if (eth_type != ETH_P_IP ||
+	    eth_type == ETH_P_8021Q ||
+	    is_broadcast_ether_addr(skb->data) ||
+	    is_multicast_ether_addr(skb->data))
+		return -EINVAL;
+
+	if (skb_is_gso(skb))
+		return -EINVAL;
+
+	sta = wcid_to_sta(wcid);
+	if (!sta)
+		return -EINVAL;
+
+	tidno = skb->priority & IEEE80211_QOS_CTL_TID_MASK;
+	tid = rcu_dereference(wcid->pao[tidno]);
+	if (!tid)
+		return -EINVAL;
+
+	if (tid->wcid != wcid)
+		return -EINVAL;
+
+	spin_lock_bh(&tid->lock);
+	/* amsdu limit check */
+	pao_size = min_t(u16, sta->deflink.agg.max_amsdu_len,
+			 dev->drv->tx_pao_size);
+	if (tid->nframe > dev->drv->tx_pao_num - 1 ||
+	    tid->cur_len + len > pao_size)
+		mt76_tx_pao_release(tid);
+
+	/* insert to amsdu */
+	if (!tid->txwi) {
+		txwi->nbuf = 0;
+		tid->txwi = txwi;
+		tid->qid = qid;
+		ieee80211_queue_delayed_work(dev->hw, &tid->pao_work, 4);
+	} else {
+		txwi->skb = NULL;
+		mt76_put_txwi(dev, txwi);
+	}
+
+	t = tid->txwi;
+	t->buf[t->nbuf].addr = addr;
+	t->buf[t->nbuf].len = len;
+	t->pao_skb[t->nbuf] = skb;
+	t->nbuf++;
+	tid->nframe = t->nbuf;
+	tid->cur_len += len;
+	nframe = tid->nframe; /* snapshot under the lock: a flush may zero it after unlock */
+	spin_unlock_bh(&tid->lock);
+
+	return nframe; /* > 0: the frame is now owned by the aggregate */
+}
+EXPORT_SYMBOL_GPL(mt76_tx_pao_aggr);
+
+int mt76_tx_pao_start(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno)
+{
+	struct mt76_tx_tid *pao;
+
+	if (!(dev->drv->drv_flags & MT_DRV_TX_PAO))
+		return -EINVAL;
+
+	mt76_tx_pao_stop(dev, wcid, tidno);
+
+	pao = kzalloc(sizeof(*pao), GFP_KERNEL);
+	if (!pao)
+		return -ENOMEM;
+
+	spin_lock_init(&pao->lock);
+	INIT_DELAYED_WORK(&pao->pao_work, mt76_tx_pao_work);
+	pao->wcid = wcid;
+	pao->dev = dev;
+
+	/* publish only once the context is fully initialised */
+	rcu_assign_pointer(wcid->pao[tidno], pao);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mt76_tx_pao_start);
+
+static void mt76_tx_pao_shutdown(struct mt76_dev *dev, struct ieee80211_sta *sta,
+			struct mt76_tx_tid *tid)
+{
+	struct ieee80211_tx_status status = { .sta = sta }; /* zero all other fields */
+	struct ieee80211_hw *hw;
+	int i;
+
+	spin_lock_bh(&tid->lock);
+
+	if (tid->txwi) {
+		struct mt76_txwi_cache *t = tid->txwi;
+
+		tid->nframe = 0;
+		tid->cur_len = 0;
+		tid->txwi = NULL;
+
+		/* unmap every pending frame and report it to mac80211 */
+		for (i = 0; i < t->nbuf; i++) {
+			dma_unmap_single(dev->dma_dev, t->buf[i].addr,
+					 t->buf[i].len, DMA_TO_DEVICE);
+
+			status.skb = t->pao_skb[i];
+			hw = mt76_tx_status_get_hw(dev, t->pao_skb[i]);
+			ieee80211_tx_status_ext(hw, &status);
+
+			t->pao_skb[i] = NULL;
+		}
+		t->nbuf = 0; /* a recycled txwi must not look like a PAO aggregate */
+		mt76_put_txwi(dev, t);
+	}
+	spin_unlock_bh(&tid->lock);
+}
+
+void mt76_tx_pao_stop(struct mt76_dev *dev, struct mt76_wcid *wcid, u8 tidno)
+{
+	struct mt76_tx_tid *tid = NULL;
+	struct ieee80211_sta *sta = wcid_to_sta(wcid);
+
+	if (!(dev->drv->drv_flags & MT_DRV_TX_PAO))
+		return;
+
+	if (!sta)
+		return;
+
+	tid = rcu_replace_pointer(wcid->pao[tidno], tid,
+				  lockdep_is_held(&dev->mutex));
+	if (tid) {
+		cancel_delayed_work_sync(&tid->pao_work); /* must not run on freed tid */
+		mt76_tx_pao_shutdown(dev, sta, tid);
+		kfree_rcu(tid, rcu_head);
+	}
+}
+EXPORT_SYMBOL_GPL(mt76_tx_pao_stop);
\ No newline at end of file
diff --git a/tx.c b/tx.c
index 72b3ec7..3962e01 100644
--- a/tx.c
+++ b/tx.c
@@ -159,6 +159,25 @@ out:
 }
 EXPORT_SYMBOL_GPL(mt76_tx_status_skb_add);
 
+struct sk_buff *
+mt76_tx_status_skb_remove(struct mt76_dev *dev,
+			struct mt76_wcid *wcid, int pktid)
+{
+	struct sk_buff *skb;
+
+	spin_lock_bh(&dev->status_lock);
+	/* idr_remove() returns NULL when @pktid is not in the idr */
+	skb = idr_remove(&wcid->pktid, pktid);
+	/* nothing left to track for this wcid: unlink wcid->list */
+	if (idr_is_empty(&wcid->pktid))
+		list_del_init(&wcid->list);
+
+	spin_unlock_bh(&dev->status_lock); /* release, not a second acquire */
+
+	return skb;
+}
+EXPORT_SYMBOL_GPL(mt76_tx_status_skb_remove);
+
 struct sk_buff *
 mt76_tx_status_skb_get(struct mt76_dev *dev, struct mt76_wcid *wcid, int pktid,
 		       struct sk_buff_head *list)
-- 
2.18.0

