+ struct mlx5_rxq_data *rxq = dpdk_rxq;
+ const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
+ const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
+ const unsigned int sges_n = rxq->sges_n;
+ struct rte_mbuf *pkt = NULL;
+ struct rte_mbuf *seg = NULL;
+ volatile struct mlx5_cqe *cqe =
+ &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ unsigned int i = 0;
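+ /*
+ * The RQ consumer index is kept here in units of single segments:
+ * each SGE of a multi-segment packet occupies its own WQE slot,
+ * hence the scaling by sges_n.
+ */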
+ unsigned int rq_ci = rxq->rq_ci << sges_n;
+ int len = 0; /* keep its value across iterations. */
+
+ while (pkts_n) {
+ unsigned int idx = rq_ci & wqe_cnt;
+ volatile struct mlx5_wqe_data_seg *wqe =
+ &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
+ struct rte_mbuf *rep = (*rxq->elts)[idx];
+ volatile struct mlx5_mini_cqe8 *mcqe = NULL;
+ uint32_t rss_hash_res;
+
+ if (pkt)
+ NEXT(seg) = rep;
+ seg = rep;
+ rte_prefetch0(seg);
+ rte_prefetch0(cqe);
+ rte_prefetch0(wqe);
+ rep = rte_mbuf_raw_alloc(rxq->mp);
+ if (unlikely(rep == NULL)) {
+ ++rxq->stats.rx_nombuf;
+ if (!pkt) {
+ /*
+ * no buffers before we even started,
+ * bail out silently.
+ */
+ break;
+ }
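+ /*
+ * Allocation of a replacement buffer failed: free the
+ * segments already chained to the partially built packet
+ * and stop receiving.
+ */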
+ while (pkt != seg) {
+ assert(pkt != (*rxq->elts)[idx]);
+ rep = NEXT(pkt);
+ NEXT(pkt) = NULL;
+ NB_SEGS(pkt) = 1;
+ rte_mbuf_raw_free(pkt);
+ pkt = rep;
+ }
+ break;
+ }
+ if (!pkt) {
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
+ len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
+ if (!len) {
+ rte_mbuf_raw_free(rep);
+ break;
+ }
+ pkt = seg;
+ assert(len >= (rxq->crc_present << 2));
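+ /* Keep only EXT_ATTACHED_MBUF; the remaining flags are set below. */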
+ pkt->ol_flags &= EXT_ATTACHED_MBUF;
+ /* If compressed, take hash result from mini-CQE. */
+ rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
+ cqe->rx_hash_res :
+ mcqe->rx_hash_result);
+ rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+ if (rxq->crc_present)
+ len -= RTE_ETHER_CRC_LEN;
+ PKT_LEN(pkt) = len;
+ if (cqe->lro_num_seg > 1) {
+ mlx5_lro_update_hdr
+ (rte_pktmbuf_mtod(pkt, uint8_t *), cqe,
+ len);
+ pkt->ol_flags |= PKT_RX_LRO;
+ pkt->tso_segsz = len / cqe->lro_num_seg;
+ }
+ }
+ DATA_LEN(rep) = DATA_LEN(seg);
+ PKT_LEN(rep) = PKT_LEN(seg);
+ SET_DATA_OFF(rep, DATA_OFF(seg));
+ PORT(rep) = PORT(seg);
+ (*rxq->elts)[idx] = rep;
+ /*
+ * Fill NIC descriptor with the new buffer. The lkey and size
+ * of the buffers are already known, only the buffer address
+ * changes.
+ */
+ wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
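+ /*
+ * The remaining length exceeds this segment: account for the
+ * consumed bytes and continue with the next SGE of the packet.
+ */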
+ if (len > DATA_LEN(seg)) {
+ len -= DATA_LEN(seg);
+ ++NB_SEGS(pkt);
+ ++rq_ci;
+ continue;
+ }
+ DATA_LEN(seg) = len;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment bytes counter. */
+ rxq->stats.ibytes += PKT_LEN(pkt);
+#endif
+ /* Return packet. */
+ *(pkts++) = pkt;
+ pkt = NULL;
+ --pkts_n;
+ ++i;
+ /* Align consumer index to the next stride. */
+ rq_ci >>= sges_n;
+ ++rq_ci;
+ rq_ci <<= sges_n;
+ }
+ if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
+ return 0;
+ /* Update the consumer index. */
+ rxq->rq_ci = rq_ci >> sges_n;
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment packets counter. */
+ rxq->stats.ipackets += i;
+#endif
+ return i;
+}
+
+/**
+ * Update LRO packet TCP header.
+ * The HW LRO feature doesn't update the TCP header after coalescing the
+ * TCP segments but supplies information in the CQE for SW to fill it in.
+ *
+ * @param tcp
+ * Pointer to the TCP header.
+ * @param cqe
+ * Pointer to the completion entry.
+ * @param phcsum
+ * The L3 pseudo-header checksum.
+ */
+static inline void
+mlx5_lro_update_tcp_hdr(struct rte_tcp_hdr *restrict tcp,
+ volatile struct mlx5_cqe *restrict cqe,
+ uint32_t phcsum)
+{
+ uint8_t l4_type = (rte_be_to_cpu_16(cqe->hdr_type_etc) &
+ MLX5_CQE_L4_TYPE_MASK) >> MLX5_CQE_L4_TYPE_SHIFT;
+ /*
+ * The HW calculates only the TCP payload checksum, need to complete
+ * the TCP header checksum and the L3 pseudo-header checksum.
+ */
+ uint32_t csum = phcsum + cqe->csum;
+
+ if (l4_type == MLX5_L4_HDR_TYPE_TCP_EMPTY_ACK ||
+ l4_type == MLX5_L4_HDR_TYPE_TCP_WITH_ACL) {
+ tcp->tcp_flags |= RTE_TCP_ACK_FLAG;
+ tcp->recv_ack = cqe->lro_ack_seq_num;
+ tcp->rx_win = cqe->lro_tcp_win;
+ }
+ if (cqe->lro_tcppsh_abort_dupack & MLX5_CQE_LRO_PUSH_MASK)
+ tcp->tcp_flags |= RTE_TCP_PSH_FLAG;
+ tcp->cksum = 0;
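+ /*
+ * Sum the TCP header bytes (the data offset lives in the upper
+ * nibble of data_off, in 32-bit words), fold the carries into
+ * 16 bits, complement, and avoid emitting a zero checksum.
+ */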
+ csum += rte_raw_cksum(tcp, (tcp->data_off >> 4) * 4);
+ csum = ((csum & 0xffff0000) >> 16) + (csum & 0xffff);
+ csum = (~csum) & 0xffff;
+ if (csum == 0)
+ csum = 0xffff;
+ tcp->cksum = csum;
+}
+
+/**
+ * Update LRO packet headers.
+ * The HW LRO feature doesn't update the L3/TCP headers after coalescing the
+ * TCP segments but supplies information in the CQE for SW to fill them in.
+ *
+ * @param padd
+ * The packet address.
+ * @param cqe
+ * Pointer to the completion entry.
+ * @param len
+ * The packet length.
+ */
+static inline void
+mlx5_lro_update_hdr(uint8_t *restrict padd,
+ volatile struct mlx5_cqe *restrict cqe,
+ uint32_t len)
+{
+ union {
+ struct rte_ether_hdr *eth;
+ struct rte_vlan_hdr *vlan;
+ struct rte_ipv4_hdr *ipv4;
+ struct rte_ipv6_hdr *ipv6;
+ struct rte_tcp_hdr *tcp;
+ uint8_t *hdr;
+ } h = {
+ .hdr = padd,
+ };
+ uint16_t proto = h.eth->ether_type;
+ uint32_t phcsum;
+
+ h.eth++;
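+ /* Skip stacked VLAN/QinQ tags to reach the L3 header. */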
+ while (proto == RTE_BE16(RTE_ETHER_TYPE_VLAN) ||
+ proto == RTE_BE16(RTE_ETHER_TYPE_QINQ)) {
+ proto = h.vlan->eth_proto;
+ h.vlan++;
+ }
+ if (proto == RTE_BE16(RTE_ETHER_TYPE_IPV4)) {
+ h.ipv4->time_to_live = cqe->lro_min_ttl;
+ h.ipv4->total_length = rte_cpu_to_be_16(len - (h.hdr - padd));
+ h.ipv4->hdr_checksum = 0;
+ h.ipv4->hdr_checksum = rte_ipv4_cksum(h.ipv4);
+ phcsum = rte_ipv4_phdr_cksum(h.ipv4, 0);
+ h.ipv4++;
+ } else {
+ h.ipv6->hop_limits = cqe->lro_min_ttl;
+ h.ipv6->payload_len = rte_cpu_to_be_16(len - (h.hdr - padd) -
+ sizeof(*h.ipv6));
+ phcsum = rte_ipv6_phdr_cksum(h.ipv6, 0);
+ h.ipv6++;
+ }
+ mlx5_lro_update_tcp_hdr(h.tcp, cqe, phcsum);
+}
+
+void
+mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
+{
+ struct mlx5_mprq_buf *buf = opaque;
+
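+ /*
+ * A refcnt of 1 means no attached mbuf still references the chunk,
+ * so it can go straight back to the pool. Otherwise drop one
+ * reference and, when the last one is released, reset the refcnt
+ * to 1 before returning the chunk to the pool.
+ */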
+ if (rte_atomic16_read(&buf->refcnt) == 1) {
+ rte_mempool_put(buf->mp, buf);
+ } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
+ rte_atomic16_set(&buf->refcnt, 1);
+ rte_mempool_put(buf->mp, buf);
+ }
+}
+
+void
+mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
+{
+ mlx5_mprq_buf_free_cb(NULL, buf);
+}
+
+static inline void
+mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx,
+ const unsigned int strd_n)
+{
+ struct mlx5_mprq_buf *rep = rxq->mprq_repl;
+ volatile struct mlx5_wqe_data_seg *wqe =
+ &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
+ void *addr;
+
+ assert(rep != NULL);
+ /* Replace MPRQ buf. */
+ (*rxq->mprq_bufs)[rq_idx] = rep;
+ /* Replace WQE. */
+ addr = mlx5_mprq_buf_addr(rep, strd_n);
+ wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
+ /* If there's only one MR, no need to replace LKey in WQE. */
+ if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
+ wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
+ /* Stash a mbuf for next replacement. */
+ if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
+ rxq->mprq_repl = rep;
+ else
+ rxq->mprq_repl = NULL;
+}
+
+/**
+ * DPDK callback for RX with Multi-Packet RQ support.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
+{
+ struct mlx5_rxq_data *rxq = dpdk_rxq;
+ const unsigned int strd_n = 1 << rxq->strd_num_n;
+ const unsigned int strd_sz = 1 << rxq->strd_sz_n;
+ const unsigned int strd_shift =
+ MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
+ const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
+ const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
+ volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
+ unsigned int i = 0;
+ uint32_t rq_ci = rxq->rq_ci;
+ uint16_t consumed_strd = rxq->consumed_strd;
+ uint16_t headroom_sz = rxq->strd_headroom_en * RTE_PKTMBUF_HEADROOM;
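+ /* Zero when no head-room is reserved in the strides (see the LRO note below). */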
+ struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
+
+ while (i < pkts_n) {
+ struct rte_mbuf *pkt;
+ void *addr;
+ int ret;
+ unsigned int len;
+ uint16_t strd_cnt;
+ uint16_t strd_idx;
+ uint32_t offset;
+ uint32_t byte_cnt;
+ volatile struct mlx5_mini_cqe8 *mcqe = NULL;
+ uint32_t rss_hash_res = 0;
+ uint8_t lro_num_seg;
+
+ if (consumed_strd == strd_n) {
+ /* Replace WQE only if the buffer is still in use. */
+ if (rte_atomic16_read(&buf->refcnt) > 1) {
+ mprq_buf_replace(rxq, rq_ci & wq_mask, strd_n);
+ /* Release the old buffer. */
+ mlx5_mprq_buf_free(buf);
+ } else if (unlikely(rxq->mprq_repl == NULL)) {
+ struct mlx5_mprq_buf *rep;
+
+ /*
+ * The MPRQ mempool is currently out of buffers,
+ * so memcpy is done regardless of the Rx packet
+ * size. Retry the allocation to get back to
+ * normal.
+ */
+ if (!rte_mempool_get(rxq->mprq_mp,
+ (void **)&rep))
+ rxq->mprq_repl = rep;
+ }
+ /* Advance to the next WQE. */
+ consumed_strd = 0;
+ ++rq_ci;
+ buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
+ }
+ cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
+ ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
+ if (!ret)
+ break;
+ byte_cnt = ret;
+ strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
+ MLX5_MPRQ_STRIDE_NUM_SHIFT;
+ assert(strd_cnt);
+ consumed_strd += strd_cnt;
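+ /* A filler CQE only consumes strides, it carries no packet. */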
+ if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
+ continue;
+ if (mcqe == NULL) {
+ rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
+ strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
+ } else {
+ /* mini-CQE for MPRQ doesn't have hash result. */
+ strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
+ }
+ assert(strd_idx < strd_n);
+ assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask));
+ lro_num_seg = cqe->lro_num_seg;
+ /*
+ * Currently configured to receive one packet per stride. But if the
+ * MTU is adjusted through the kernel interface, the device could
+ * consume multiple strides without raising an error. In this
+ * case, the packet should be dropped because it is bigger than
+ * the max_rx_pkt_len.
+ */
+ if (unlikely(!lro_num_seg && strd_cnt > 1)) {
+ ++rxq->stats.idropped;
+ continue;
+ }
+ pkt = rte_pktmbuf_alloc(rxq->mp);
+ if (unlikely(pkt == NULL)) {
+ ++rxq->stats.rx_nombuf;
+ break;
+ }
+ len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
+ assert((int)len >= (rxq->crc_present << 2));
+ if (rxq->crc_present)
+ len -= RTE_ETHER_CRC_LEN;
+ offset = strd_idx * strd_sz + strd_shift;
+ addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf, strd_n), offset);
+ /*
+ * Memcpy packets to the target mbuf if:
+ * - The size of packet is smaller than mprq_max_memcpy_len.
+ * - Out of buffer in the Mempool for Multi-Packet RQ.
+ */
+ if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
+ /*
+ * When memcpy'ing the packet because of an out-of-buffer
+ * condition, the packet must fit into the target mbuf.
+ */
+ if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
+ rte_pktmbuf_free_seg(pkt);
+ ++rxq->stats.idropped;
+ continue;
+ }
+ rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
+ DATA_LEN(pkt) = len;
+ } else {
+ rte_iova_t buf_iova;
+ struct rte_mbuf_ext_shared_info *shinfo;
+ uint16_t buf_len = strd_cnt * strd_sz;
+ void *buf_addr;
+
+ /* Increment the refcnt of the whole chunk. */
+ rte_atomic16_add_return(&buf->refcnt, 1);
+ assert((uint16_t)rte_atomic16_read(&buf->refcnt) <=
+ strd_n + 1);
+ buf_addr = RTE_PTR_SUB(addr, headroom_sz);
+ /*
+ * The MLX5 device doesn't use the iova itself, but it is
+ * necessary in case the Rx packet is transmitted via a
+ * different PMD.
+ */
+ buf_iova = rte_mempool_virt2iova(buf) +
+ RTE_PTR_DIFF(buf_addr, buf);
+ shinfo = &buf->shinfos[strd_idx];
+ rte_mbuf_ext_refcnt_set(shinfo, 1);
+ /*
+ * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
+ * attaching the stride to mbuf and more offload flags
+ * will be added below by calling rxq_cq_to_mbuf().
+ * Other fields will be overwritten.
+ */
+ rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova,
+ buf_len, shinfo);
+ /* Set mbuf head-room. */
+ pkt->data_off = headroom_sz;
+ assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
+ /*
+ * Prevent potential overflow due to MTU change through
+ * kernel interface.
+ */
+ if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
+ rte_pktmbuf_free_seg(pkt);
+ ++rxq->stats.idropped;
+ continue;
+ }
+ DATA_LEN(pkt) = len;
+ /*
+ * An LRO packet may consume all the stride memory; in this
+ * case packet head-room space is not guaranteed, so an
+ * empty mbuf must be prepended to provide the head-room.
+ */
+ if (!rxq->strd_headroom_en) {
+ struct rte_mbuf *headroom_mbuf =
+ rte_pktmbuf_alloc(rxq->mp);
+
+ if (unlikely(headroom_mbuf == NULL)) {
+ rte_pktmbuf_free_seg(pkt);
+ ++rxq->stats.rx_nombuf;
+ break;
+ }
+ PORT(pkt) = rxq->port_id;
+ NEXT(headroom_mbuf) = pkt;
+ pkt = headroom_mbuf;
+ NB_SEGS(pkt) = 2;
+ }
+ }
+ rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
+ if (lro_num_seg > 1) {
+ mlx5_lro_update_hdr(addr, cqe, len);
+ pkt->ol_flags |= PKT_RX_LRO;
+ pkt->tso_segsz = strd_sz;
+ }
+ PKT_LEN(pkt) = len;
+ PORT(pkt) = rxq->port_id;
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment bytes counter. */
+ rxq->stats.ibytes += PKT_LEN(pkt);
+#endif
+ /* Return packet. */
+ *(pkts++) = pkt;
+ ++i;
+ }
+ /* Update the consumer indexes. */
+ rxq->consumed_strd = consumed_strd;
+ rte_cio_wmb();
+ *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
+ if (rq_ci != rxq->rq_ci) {
+ rxq->rq_ci = rq_ci;
+ rte_cio_wmb();
+ *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
+ }
+#ifdef MLX5_PMD_SOFT_COUNTERS
+ /* Increment packets counter. */
+ rxq->stats.ipackets += i;
+#endif
+ return i;
+}
+
+/**
+ * Dummy DPDK callback for TX.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_txq
+ * Generic pointer to TX queue structure.
+ * @param[in] pkts
+ * Packets to transmit.
+ * @param pkts_n
+ * Number of packets in array.
+ *
+ * @return
+ * Number of packets successfully transmitted (<= pkts_n).
+ */
+uint16_t
+removed_tx_burst(void *dpdk_txq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ rte_mb();
+ return 0;
+}
+
+/**
+ * Dummy DPDK callback for RX.
+ *
+ * This function is used to temporarily replace the real callback during
+ * unsafe control operations on the queue, or in case of error.
+ *
+ * @param dpdk_rxq
+ * Generic pointer to RX queue structure.
+ * @param[out] pkts
+ * Array to store received packets.
+ * @param pkts_n
+ * Maximum number of packets in array.
+ *
+ * @return
+ * Number of packets successfully received (<= pkts_n).
+ */
+uint16_t
+removed_rx_burst(void *dpdk_rxq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ rte_mb();
+ return 0;
+}
+
+/*
+ * Vectorized Rx/Tx routines are not compiled in when the required vector
+ * instructions are not supported on the target architecture. The following
+ * null stubs are needed for linkage when those routines (e.g.
+ * mlx5_rxtx_vec_sse.c for x86) are not built into the PMD.
+ */
+
+__rte_weak uint16_t
+mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
+ struct rte_mbuf **pkts __rte_unused,
+ uint16_t pkts_n __rte_unused)
+{
+ return 0;
+}
+
+__rte_weak int
+mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+__rte_weak int
+mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
+{
+ return -ENOTSUP;
+}
+
+/**
+ * Free the mbufs from the linear array of pointers.
+ *
+ * @param pkts
+ * Pointer to the array of packets to be freed.
+ * @param pkts_n
+ * Number of packets to be freed.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_mbuf(struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ unsigned int olx __rte_unused)
+{
+ struct rte_mempool *pool = NULL;
+ struct rte_mbuf **p_free = NULL;
+ struct rte_mbuf *mbuf;
+ unsigned int n_free = 0;
+
+ /*
+ * The implemented algorithm eliminates
+ * copying pointers to a temporary array
+ * for rte_mempool_put_bulk() calls.
+ */
+ assert(pkts);
+ assert(pkts_n);
+ for (;;) {
+ for (;;) {
+ /*
+ * Decrement mbuf reference counter, detach
+ * indirect and external buffers if needed.
+ */
+ mbuf = rte_pktmbuf_prefree_seg(*pkts);
+ if (likely(mbuf != NULL)) {
+ assert(mbuf == *pkts);
+ if (likely(n_free != 0)) {
+ if (unlikely(pool != mbuf->pool))
+ /* From different pool. */
+ break;
+ } else {
+ /* Start a new scan array. */
+ pool = mbuf->pool;
+ p_free = pkts;
+ }
+ ++n_free;
+ ++pkts;
+ --pkts_n;
+ if (unlikely(pkts_n == 0)) {
+ mbuf = NULL;
+ break;
+ }
+ } else {
+ /*
+ * This happens if the mbuf is still referenced.
+ * We can't put it back to the pool; skip it.
+ */
+ ++pkts;
+ --pkts_n;
+ if (unlikely(n_free != 0))
+ /* There is some array to free. */
+ break;
+ if (unlikely(pkts_n == 0))
+ /* Last mbuf, nothing to free. */
+ return;
+ }
+ }
+ for (;;) {
+ /*
+ * This loop is implemented to avoid multiple
+ * inlining of rte_mempool_put_bulk().
+ */
+ assert(pool);
+ assert(p_free);
+ assert(n_free);
+ /*
+ * Free the array of pre-freed mbufs
+ * belonging to the same memory pool.
+ */
+ rte_mempool_put_bulk(pool, (void *)p_free, n_free);
+ if (unlikely(mbuf != NULL)) {
+ /* There is a request to start a new scan. */
+ pool = mbuf->pool;
+ p_free = pkts++;
+ n_free = 1;
+ --pkts_n;
+ if (likely(pkts_n != 0))
+ break;
+ /*
+ * This is the last mbuf to be freed.
+ * Do one more loop iteration to complete.
+ * This is the rare case of the last unique mbuf.
+ */
+ mbuf = NULL;
+ continue;
+ }
+ if (likely(pkts_n == 0))
+ return;
+ n_free = 0;
+ break;
+ }
+ }
+}
+
+/**
+ * Free the mbufs from the elts ring buffer up to the new tail.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param tail
+ * Index in elts to free up to, becomes new elts tail.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_free_elts(struct mlx5_txq_data *restrict txq,
+ uint16_t tail,
+ unsigned int olx __rte_unused)
+{
+ uint16_t n_elts = tail - txq->elts_tail;
+
+ assert(n_elts);
+ assert(n_elts <= txq->elts_s);
+ /*
+ * Implement a loop to support ring buffer wraparound
+ * with single inlining of mlx5_tx_free_mbuf().
+ */
+ do {
+ unsigned int part;
+
+ part = txq->elts_s - (txq->elts_tail & txq->elts_m);
+ part = RTE_MIN(part, n_elts);
+ assert(part);
+ assert(part <= txq->elts_s);
+ mlx5_tx_free_mbuf(&txq->elts[txq->elts_tail & txq->elts_m],
+ part, olx);
+ txq->elts_tail += part;
+ n_elts -= part;
+ } while (n_elts);
+}
+
+/**
+ * Store the mbuf being sent into elts ring buffer.
+ * On Tx completion these mbufs will be freed.
+ *
+ * @param txq
+ * Pointer to Tx queue structure.
+ * @param pkts
+ * Pointer to array of packets to be stored.
+ * @param pkts_n
+ * Number of packets to be stored.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_copy_elts(struct mlx5_txq_data *restrict txq,
+ struct rte_mbuf **restrict pkts,
+ unsigned int pkts_n,
+ unsigned int olx __rte_unused)
+{
+ unsigned int part;
+ struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
+
+ assert(pkts);
+ assert(pkts_n);
+ part = txq->elts_s - (txq->elts_head & txq->elts_m);
+ assert(part);
+ assert(part <= txq->elts_s);
+ /* This code is a good candidate for vectorizing with SIMD. */
+ rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
+ (void *)pkts,
+ RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
+ txq->elts_head += pkts_n;
+ if (unlikely(part < pkts_n))
+ /* The copy is wrapping around the elts array. */
+ rte_memcpy((void *)elts, (void *)(pkts + part),
+ (pkts_n - part) * sizeof(struct rte_mbuf *));
+}
+
+/**
+ * Update completion queue consuming index via doorbell
+ * and flush the completed data buffers.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param last_cqe
+ * Valid CQE pointer; if not NULL, update txq->wqe_pi and flush the buffers.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_comp_flush(struct mlx5_txq_data *restrict txq,
+ volatile struct mlx5_cqe *last_cqe,
+ unsigned int olx __rte_unused)
+{
+ if (likely(last_cqe != NULL)) {
+ uint16_t tail;
+
+ txq->wqe_pi = rte_be_to_cpu_16(last_cqe->wqe_counter);
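+ /*
+ * The control segment "misc" field of the completed WQE carries
+ * the elts_head value saved by mlx5_tx_request_completion(),
+ * i.e. the new elts tail.
+ */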
+ tail = ((volatile struct mlx5_wqe_cseg *)
+ (txq->wqes + (txq->wqe_pi & txq->wqe_m)))->misc;
+ if (likely(tail != txq->elts_tail)) {
+ mlx5_tx_free_elts(txq, tail, olx);
+ assert(tail == txq->elts_tail);
+ }
+ }
+}
+
+/**
+ * Manage TX completions. This routine checks the CQ for
+ * arrived CQEs, deduces the last accomplished WQE in the SQ,
+ * updates the SQ producing index and frees all completed mbufs.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ *
+ * NOTE: not inlined intentionally; per experiments this keeps the
+ * tx_burst routine smaller, simpler and faster.
+ */
+static void
+mlx5_tx_handle_completion(struct mlx5_txq_data *restrict txq,
+ unsigned int olx __rte_unused)
+{
+ unsigned int count = MLX5_TX_COMP_MAX_CQE;
+ volatile struct mlx5_cqe *last_cqe = NULL;
+ uint16_t ci = txq->cq_ci;
+ int ret;
+
+ static_assert(MLX5_CQE_STATUS_HW_OWN < 0, "Must be negative value");
+ static_assert(MLX5_CQE_STATUS_SW_OWN < 0, "Must be negative value");
+ do {
+ volatile struct mlx5_cqe *cqe;
+
+ cqe = &txq->cqes[ci & txq->cqe_m];
+ ret = check_cqe(cqe, txq->cqe_s, ci);
+ if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
+ if (likely(ret != MLX5_CQE_STATUS_ERR)) {
+ /* No new CQEs in completion queue. */
+ assert(ret == MLX5_CQE_STATUS_HW_OWN);
+ break;
+ }
+ /*
+ * Some error occurred, try to restart.
+ * There is no barrier after the WQE-related doorbell
+ * is written, so make sure all writes are completed
+ * here before we might perform an SQ reset.
+ */
+ rte_wmb();
+ ret = mlx5_tx_error_cqe_handle
+ (txq, (volatile struct mlx5_err_cqe *)cqe);
+ if (unlikely(ret < 0)) {
+ /*
+ * Some error occurred during queue error
+ * handling; we do not advance the index
+ * here, allowing a retry on the next call.
+ */
+ return;
+ }
+ /*
+ * We are going to fetch all entries with
+ * MLX5_CQE_SYNDROME_WR_FLUSH_ERR status.
+ */
+ ++ci;
+ continue;
+ }
+ /* Normal transmit completion. */
+ ++ci;
+ last_cqe = cqe;
+#ifndef NDEBUG
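+ /* Debug-only balance of the counter bumped in mlx5_tx_request_completion(). */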
+ if (txq->cq_pi)
+ --txq->cq_pi;
+#endif
+ /*
+ * We have to restrict the amount of processed CQEs
+ * in one tx_burst routine call. The CQ may be large
+ * and many CQEs may be updated by the NIC in one
+ * transaction. Buffer freeing is time consuming and
+ * multiple iterations may introduce significant
+ * latency.
+ */
+ if (--count == 0)
+ break;
+ } while (true);
+ if (likely(ci != txq->cq_ci)) {
+ /*
+ * Update completion queue consuming index
+ * and ring doorbell to notify hardware.
+ */
+ rte_compiler_barrier();
+ txq->cq_ci = ci;
+ *txq->cq_db = rte_cpu_to_be_32(ci);
+ mlx5_tx_comp_flush(txq, last_cqe, olx);
+ }
+}
+
+/**
+ * Check if the completion request flag should be set in the last WQE.
+ * Both pushed mbufs and WQEs are monitored and the completion request
+ * flag is set if any of the thresholds is reached.
+ *
+ * @param txq
+ * Pointer to TX queue structure.
+ * @param loc
+ * Pointer to burst routine local context.
+ * @param olx
+ * Configured Tx offloads mask. It is fully defined at
+ * compile time and may be used for optimization.
+ */
+static __rte_always_inline void
+mlx5_tx_request_completion(struct mlx5_txq_data *restrict txq,
+ struct mlx5_txq_local *restrict loc,
+ unsigned int olx)
+{
+ uint16_t head = txq->elts_head;
+ unsigned int part;
+
+ part = MLX5_TXOFF_CONFIG(INLINE) ?
+ 0 : loc->pkts_sent - loc->pkts_copy;
+ head += part;
+ if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
+ (MLX5_TXOFF_CONFIG(INLINE) &&
+ (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
+ volatile struct mlx5_wqe *last = loc->wqe_last;
+
+ txq->elts_comp = head;
+ if (MLX5_TXOFF_CONFIG(INLINE))
+ txq->wqe_comp = txq->wqe_ci;
+ /* Request unconditional completion on last WQE. */
+ last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
+ MLX5_COMP_MODE_OFFSET);
+ /* Save elts_head in unused "immediate" field of WQE. */
+ last->cseg.misc = head;
+ /*
+ * A CQE slot must always be available. Count the
+ * issued CQE "always" requests instead of the production
+ * index, because error CQEs may occur and the
+ * difference with ci could become inconsistent.
+ */
+ assert(txq->cqe_s > ++txq->cq_pi);
+ }
+}
+
+/**
+ * DPDK callback to check the status of a tx descriptor.
+ *
+ * @param tx_queue
+ * The tx queue.
+ * @param[in] offset
+ * The index of the descriptor in the ring.
+ *
+ * @return
+ * The status of the tx descriptor.
+ */
+int
+mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
+{
+ struct mlx5_txq_data *restrict txq = tx_queue;
+ uint16_t used;