+ ctx_txd->tunneling_params =
+ rte_cpu_to_le_32(cd_tunneling_params);
+
+ /* TX context descriptor based double VLAN insert */
+ if (ol_flags & PKT_TX_QINQ) {
+ cd_l2tag2 = tx_pkt->vlan_tci_outer;
+ cd_type_cmd_tso_mss |=
+ ((uint64_t)ICE_TX_CTX_DESC_IL2TAG2 <<
+ ICE_TXD_CTX_QW1_CMD_S);
+ }
+ ctx_txd->l2tag2 = rte_cpu_to_le_16(cd_l2tag2);
+ ctx_txd->qw1 =
+ rte_cpu_to_le_64(cd_type_cmd_tso_mss);
+
+ txe->last_id = tx_last;
+ tx_id = txe->next_id;
+ txe = txn;
+ }
+ m_seg = tx_pkt;
+
+ do {
+ txd = &tx_ring[tx_id];
+ txn = &sw_ring[txe->next_id];
+
+ if (txe->mbuf)
+ rte_pktmbuf_free_seg(txe->mbuf);
+ txe->mbuf = m_seg;
+
+ /* Setup TX Descriptor */
+ slen = m_seg->data_len;
+ buf_dma_addr = rte_mbuf_data_iova(m_seg);
+
+ while ((ol_flags & PKT_TX_TCP_SEG) &&
+ unlikely(slen > ICE_MAX_DATA_PER_TXD)) {
+ txd->buf_addr = rte_cpu_to_le_64(buf_dma_addr);
+ txd->cmd_type_offset_bsz =
+ rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DATA |
+ ((uint64_t)td_cmd << ICE_TXD_QW1_CMD_S) |
+ ((uint64_t)td_offset << ICE_TXD_QW1_OFFSET_S) |
+ ((uint64_t)ICE_MAX_DATA_PER_TXD <<
+ ICE_TXD_QW1_TX_BUF_SZ_S) |
+ ((uint64_t)td_tag << ICE_TXD_QW1_L2TAG1_S));
+
+ buf_dma_addr += ICE_MAX_DATA_PER_TXD;
+ slen -= ICE_MAX_DATA_PER_TXD;
+
+ txe->last_id = tx_last;
+ tx_id = txe->next_id;
+ txe = txn;
+ txd = &tx_ring[tx_id];
+ txn = &sw_ring[txe->next_id];
+ }
+
+ txd->buf_addr = rte_cpu_to_le_64(buf_dma_addr);
+ txd->cmd_type_offset_bsz =
+ rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DATA |
+ ((uint64_t)td_cmd << ICE_TXD_QW1_CMD_S) |
+ ((uint64_t)td_offset << ICE_TXD_QW1_OFFSET_S) |
+ ((uint64_t)slen << ICE_TXD_QW1_TX_BUF_SZ_S) |
+ ((uint64_t)td_tag << ICE_TXD_QW1_L2TAG1_S));
+
+ txe->last_id = tx_last;
+ tx_id = txe->next_id;
+ txe = txn;
+ m_seg = m_seg->next;
+ } while (m_seg);
+
+ /* fill the last descriptor with End of Packet (EOP) bit */
+ td_cmd |= ICE_TX_DESC_CMD_EOP;
+ txq->nb_tx_used = (uint16_t)(txq->nb_tx_used + nb_used);
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_used);
+
+ /* set RS bit on the last descriptor of one packet */
+ if (txq->nb_tx_used >= txq->tx_rs_thresh) {
+ PMD_TX_LOG(DEBUG,
+ "Setting RS bit on TXD id="
+ "%4u (port=%d queue=%d)",
+ tx_last, txq->port_id, txq->queue_id);
+
+ td_cmd |= ICE_TX_DESC_CMD_RS;
+
+ /* Update txq RS bit counters */
+ txq->nb_tx_used = 0;
+ }
+ txd->cmd_type_offset_bsz |=
+ rte_cpu_to_le_64(((uint64_t)td_cmd) <<
+ ICE_TXD_QW1_CMD_S);
+ }
+end_of_tx:
+ /* update Tail register */
+ ICE_PCI_REG_WRITE(txq->qtx_tail, tx_id);
+ txq->tx_tail = tx_id;
+
+ return nb_tx;
+}
+
+static __rte_always_inline int
+ice_tx_free_bufs(struct ice_tx_queue *txq)
+{
+ struct ice_tx_entry *txep;
+ uint16_t i;
+
+ if ((txq->tx_ring[txq->tx_next_dd].cmd_type_offset_bsz &
+ rte_cpu_to_le_64(ICE_TXD_QW1_DTYPE_M)) !=
+ rte_cpu_to_le_64(ICE_TX_DESC_DTYPE_DESC_DONE))
+ return 0;
+
+ txep = &txq->sw_ring[txq->tx_next_dd - (txq->tx_rs_thresh - 1)];
+
+ for (i = 0; i < txq->tx_rs_thresh; i++)
+ rte_prefetch0((txep + i)->mbuf);
+
+ if (txq->offloads & DEV_TX_OFFLOAD_MBUF_FAST_FREE) {
+ for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
+ rte_mempool_put(txep->mbuf->pool, txep->mbuf);
+ txep->mbuf = NULL;
+ }
+ } else {
+ for (i = 0; i < txq->tx_rs_thresh; ++i, ++txep) {
+ rte_pktmbuf_free_seg(txep->mbuf);
+ txep->mbuf = NULL;
+ }
+ }
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free + txq->tx_rs_thresh);
+ txq->tx_next_dd = (uint16_t)(txq->tx_next_dd + txq->tx_rs_thresh);
+ if (txq->tx_next_dd >= txq->nb_tx_desc)
+ txq->tx_next_dd = (uint16_t)(txq->tx_rs_thresh - 1);
+
+ return txq->tx_rs_thresh;
+}
+
+static int
+ice_tx_done_cleanup_full(struct ice_tx_queue *txq,
+ uint32_t free_cnt)
+{
+ struct ice_tx_entry *swr_ring = txq->sw_ring;
+ uint16_t i, tx_last, tx_id;
+ uint16_t nb_tx_free_last;
+ uint16_t nb_tx_to_clean;
+ uint32_t pkt_cnt;
+
+ /* Start free mbuf from the next of tx_tail */
+ tx_last = txq->tx_tail;
+ tx_id = swr_ring[tx_last].next_id;
+
+ if (txq->nb_tx_free == 0 && ice_xmit_cleanup(txq))
+ return 0;
+
+ nb_tx_to_clean = txq->nb_tx_free;
+ nb_tx_free_last = txq->nb_tx_free;
+ if (!free_cnt)
+ free_cnt = txq->nb_tx_desc;
+
+ /* Loop through swr_ring to count the amount of
+ * freeable mubfs and packets.
+ */
+ for (pkt_cnt = 0; pkt_cnt < free_cnt; ) {
+ for (i = 0; i < nb_tx_to_clean &&
+ pkt_cnt < free_cnt &&
+ tx_id != tx_last; i++) {
+ if (swr_ring[tx_id].mbuf != NULL) {
+ rte_pktmbuf_free_seg(swr_ring[tx_id].mbuf);
+ swr_ring[tx_id].mbuf = NULL;
+
+ /*
+ * last segment in the packet,
+ * increment packet count
+ */
+ pkt_cnt += (swr_ring[tx_id].last_id == tx_id);
+ }
+
+ tx_id = swr_ring[tx_id].next_id;
+ }
+
+ if (txq->tx_rs_thresh > txq->nb_tx_desc -
+ txq->nb_tx_free || tx_id == tx_last)
+ break;
+
+ if (pkt_cnt < free_cnt) {
+ if (ice_xmit_cleanup(txq))
+ break;
+
+ nb_tx_to_clean = txq->nb_tx_free - nb_tx_free_last;
+ nb_tx_free_last = txq->nb_tx_free;
+ }
+ }
+
+ return (int)pkt_cnt;
+}
+
+#ifdef RTE_ARCH_X86
+static int
+ice_tx_done_cleanup_vec(struct ice_tx_queue *txq __rte_unused,
+ uint32_t free_cnt __rte_unused)
+{
+ return -ENOTSUP;
+}
+#endif
+
+static int
+ice_tx_done_cleanup_simple(struct ice_tx_queue *txq,
+ uint32_t free_cnt)
+{
+ int i, n, cnt;
+
+ if (free_cnt == 0 || free_cnt > txq->nb_tx_desc)
+ free_cnt = txq->nb_tx_desc;
+
+ cnt = free_cnt - free_cnt % txq->tx_rs_thresh;
+
+ for (i = 0; i < cnt; i += n) {
+ if (txq->nb_tx_desc - txq->nb_tx_free < txq->tx_rs_thresh)
+ break;
+
+ n = ice_tx_free_bufs(txq);
+
+ if (n == 0)
+ break;
+ }
+
+ return i;
+}
+
+int
+ice_tx_done_cleanup(void *txq, uint32_t free_cnt)
+{
+ struct ice_tx_queue *q = (struct ice_tx_queue *)txq;
+ struct rte_eth_dev *dev = &rte_eth_devices[q->port_id];
+ struct ice_adapter *ad =
+ ICE_DEV_PRIVATE_TO_ADAPTER(dev->data->dev_private);
+
+#ifdef RTE_ARCH_X86
+ if (ad->tx_vec_allowed)
+ return ice_tx_done_cleanup_vec(q, free_cnt);
+#endif
+ if (ad->tx_simple_allowed)
+ return ice_tx_done_cleanup_simple(q, free_cnt);
+ else
+ return ice_tx_done_cleanup_full(q, free_cnt);
+}
+
+/* Populate 4 descriptors with data from 4 mbufs */
+static inline void
+tx4(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkts)
+{
+ uint64_t dma_addr;
+ uint32_t i;
+
+ for (i = 0; i < 4; i++, txdp++, pkts++) {
+ dma_addr = rte_mbuf_data_iova(*pkts);
+ txdp->buf_addr = rte_cpu_to_le_64(dma_addr);
+ txdp->cmd_type_offset_bsz =
+ ice_build_ctob((uint32_t)ICE_TD_CMD, 0,
+ (*pkts)->data_len, 0);
+ }
+}
+
+/* Populate 1 descriptor with data from 1 mbuf */
+static inline void
+tx1(volatile struct ice_tx_desc *txdp, struct rte_mbuf **pkts)
+{
+ uint64_t dma_addr;
+
+ dma_addr = rte_mbuf_data_iova(*pkts);
+ txdp->buf_addr = rte_cpu_to_le_64(dma_addr);
+ txdp->cmd_type_offset_bsz =
+ ice_build_ctob((uint32_t)ICE_TD_CMD, 0,
+ (*pkts)->data_len, 0);
+}
+
+static inline void
+ice_tx_fill_hw_ring(struct ice_tx_queue *txq, struct rte_mbuf **pkts,
+ uint16_t nb_pkts)
+{
+ volatile struct ice_tx_desc *txdp = &txq->tx_ring[txq->tx_tail];
+ struct ice_tx_entry *txep = &txq->sw_ring[txq->tx_tail];
+ const int N_PER_LOOP = 4;
+ const int N_PER_LOOP_MASK = N_PER_LOOP - 1;
+ int mainpart, leftover;
+ int i, j;
+
+ /**
+ * Process most of the packets in chunks of N pkts. Any
+ * leftover packets will get processed one at a time.
+ */
+ mainpart = nb_pkts & ((uint32_t)~N_PER_LOOP_MASK);
+ leftover = nb_pkts & ((uint32_t)N_PER_LOOP_MASK);
+ for (i = 0; i < mainpart; i += N_PER_LOOP) {
+ /* Copy N mbuf pointers to the S/W ring */
+ for (j = 0; j < N_PER_LOOP; ++j)
+ (txep + i + j)->mbuf = *(pkts + i + j);
+ tx4(txdp + i, pkts + i);
+ }
+
+ if (unlikely(leftover > 0)) {
+ for (i = 0; i < leftover; ++i) {
+ (txep + mainpart + i)->mbuf = *(pkts + mainpart + i);
+ tx1(txdp + mainpart + i, pkts + mainpart + i);
+ }
+ }
+}
+
+static inline uint16_t
+tx_xmit_pkts(struct ice_tx_queue *txq,
+ struct rte_mbuf **tx_pkts,
+ uint16_t nb_pkts)
+{
+ volatile struct ice_tx_desc *txr = txq->tx_ring;
+ uint16_t n = 0;
+
+ /**
+ * Begin scanning the H/W ring for done descriptors when the number
+ * of available descriptors drops below tx_free_thresh. For each done
+ * descriptor, free the associated buffer.
+ */
+ if (txq->nb_tx_free < txq->tx_free_thresh)
+ ice_tx_free_bufs(txq);
+
+ /* Use available descriptor only */
+ nb_pkts = (uint16_t)RTE_MIN(txq->nb_tx_free, nb_pkts);
+ if (unlikely(!nb_pkts))
+ return 0;
+
+ txq->nb_tx_free = (uint16_t)(txq->nb_tx_free - nb_pkts);
+ if ((txq->tx_tail + nb_pkts) > txq->nb_tx_desc) {
+ n = (uint16_t)(txq->nb_tx_desc - txq->tx_tail);
+ ice_tx_fill_hw_ring(txq, tx_pkts, n);
+ txr[txq->tx_next_rs].cmd_type_offset_bsz |=
+ rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
+ ICE_TXD_QW1_CMD_S);
+ txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+ txq->tx_tail = 0;
+ }
+
+ /* Fill hardware descriptor ring with mbuf data */
+ ice_tx_fill_hw_ring(txq, tx_pkts + n, (uint16_t)(nb_pkts - n));
+ txq->tx_tail = (uint16_t)(txq->tx_tail + (nb_pkts - n));
+
+ /* Determin if RS bit needs to be set */
+ if (txq->tx_tail > txq->tx_next_rs) {
+ txr[txq->tx_next_rs].cmd_type_offset_bsz |=
+ rte_cpu_to_le_64(((uint64_t)ICE_TX_DESC_CMD_RS) <<
+ ICE_TXD_QW1_CMD_S);
+ txq->tx_next_rs =
+ (uint16_t)(txq->tx_next_rs + txq->tx_rs_thresh);
+ if (txq->tx_next_rs >= txq->nb_tx_desc)
+ txq->tx_next_rs = (uint16_t)(txq->tx_rs_thresh - 1);
+ }
+
+ if (txq->tx_tail >= txq->nb_tx_desc)
+ txq->tx_tail = 0;
+
+ /* Update the tx tail register */
+ ICE_PCI_REG_WC_WRITE(txq->qtx_tail, txq->tx_tail);