4 * Copyright 2015 6WIND S.A.
5 * Copyright 2015 Mellanox.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of 6WIND S.A. nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
40 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
42 #pragma GCC diagnostic ignored "-Wpedantic"
44 #include <infiniband/verbs.h>
45 #include <infiniband/mlx5_hw.h>
46 #include <infiniband/arch.h>
48 #pragma GCC diagnostic error "-Wpedantic"
51 /* DPDK headers don't like -pedantic. */
53 #pragma GCC diagnostic ignored "-Wpedantic"
56 #include <rte_mempool.h>
57 #include <rte_prefetch.h>
58 #include <rte_common.h>
59 #include <rte_branch_prediction.h>
60 #include <rte_ether.h>
62 #pragma GCC diagnostic error "-Wpedantic"
66 #include "mlx5_utils.h"
67 #include "mlx5_rxtx.h"
68 #include "mlx5_autoconf.h"
69 #include "mlx5_defs.h"
72 static __rte_always_inline int
73 check_cqe(volatile struct mlx5_cqe *cqe,
74 unsigned int cqes_n, const uint16_t ci);
76 static __rte_always_inline void
77 txq_complete(struct txq *txq);
79 static __rte_always_inline uint32_t
80 txq_mp2mr(struct txq *txq, struct rte_mempool *mp);
82 static __rte_always_inline void
83 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe);
85 static __rte_always_inline uint32_t
86 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);
88 static __rte_always_inline int
89 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
90 uint16_t cqe_cnt, uint32_t *rss_hash);
92 static __rte_always_inline uint32_t
93 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe);
98 * Verify or set magic value in CQE.
107 check_cqe_seen(volatile struct mlx5_cqe *cqe)
109 static const uint8_t magic[] = "seen";
110 volatile uint8_t (*buf)[sizeof(cqe->rsvd0)] = &cqe->rsvd0;
114 for (i = 0; i < sizeof(magic) && i < sizeof(*buf); ++i)
115 if (!ret || (*buf)[i] != magic[i]) {
117 (*buf)[i] = magic[i];
125 * Check whether CQE is valid.
130 * Size of completion queue.
135 * 0 on success, 1 on failure.
138 check_cqe(volatile struct mlx5_cqe *cqe,
139 unsigned int cqes_n, const uint16_t ci)
141 uint16_t idx = ci & cqes_n;
142 uint8_t op_own = cqe->op_own;
143 uint8_t op_owner = MLX5_CQE_OWNER(op_own);
144 uint8_t op_code = MLX5_CQE_OPCODE(op_own);
146 if (unlikely((op_owner != (!!(idx))) || (op_code == MLX5_CQE_INVALID)))
147 return 1; /* No CQE. */
149 if ((op_code == MLX5_CQE_RESP_ERR) ||
150 (op_code == MLX5_CQE_REQ_ERR)) {
151 volatile struct mlx5_err_cqe *err_cqe = (volatile void *)cqe;
152 uint8_t syndrome = err_cqe->syndrome;
154 if ((syndrome == MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR) ||
155 (syndrome == MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR))
157 if (!check_cqe_seen(cqe))
158 ERROR("unexpected CQE error %u (0x%02x)"
160 op_code, op_code, syndrome);
162 } else if ((op_code != MLX5_CQE_RESP_SEND) &&
163 (op_code != MLX5_CQE_REQ)) {
164 if (!check_cqe_seen(cqe))
165 ERROR("unexpected CQE opcode %u (0x%02x)",
174 * Return the address of the WQE.
177 * Pointer to TX queue structure.
179 * WQE consumer index.
184 static inline uintptr_t *
185 tx_mlx5_wqe(struct txq *txq, uint16_t ci)
187 ci &= ((1 << txq->wqe_n) - 1);
188 return (uintptr_t *)((uintptr_t)txq->wqes + ci * MLX5_WQE_SIZE);
192 * Return the size of tailroom of WQ.
195 * Pointer to TX queue structure.
197 * Pointer to tail of WQ.
203 tx_mlx5_wq_tailroom(struct txq *txq, void *addr)
206 tailroom = (uintptr_t)(txq->wqes) +
207 (1 << txq->wqe_n) * MLX5_WQE_SIZE -
213 * Copy data to tailroom of circular queue.
216 * Pointer to destination.
220 * Number of bytes to copy.
222 * Pointer to head of queue.
224 * Size of tailroom from dst.
227 * Pointer after copied data.
230 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
231 void *base, size_t tailroom)
236 rte_memcpy(dst, src, tailroom);
237 rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
239 ret = (uint8_t *)base + n - tailroom;
241 rte_memcpy(dst, src, n);
242 ret = (n == tailroom) ? base : (uint8_t *)dst + n;
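/*
 * Illustrative usage sketch for mlx5_copy_to_wq() (not part of the
 * driver; the toy 64-byte ring and sizes are assumptions): when the
 * requested copy exceeds the tailroom, the tail is filled first and the
 * remainder wraps to the base, the returned pointer being
 * base + (n - tailroom).
 */
static __rte_unused void
copy_to_wq_example(void)
{
	static uint8_t ring[64];
	const uint8_t data[16] = { 0 };
	void *next;

	/* Write position 56: only 8 bytes of tailroom remain before wrap. */
	next = mlx5_copy_to_wq(&ring[56], data, sizeof(data), ring, 8);
	/* 8 bytes land in ring[56..63], the other 8 wrap to ring[0..7]. */
	assert(next == &ring[8]);
	(void)next;
}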
248 * Manage TX completions.
250 * When sending a burst, mlx5_tx_burst() posts several WRs.
253 * Pointer to TX queue structure.
256 txq_complete(struct txq *txq)
258 const unsigned int elts_n = 1 << txq->elts_n;
259 const unsigned int cqe_n = 1 << txq->cqe_n;
260 const unsigned int cqe_cnt = cqe_n - 1;
261 uint16_t elts_free = txq->elts_tail;
263 uint16_t cq_ci = txq->cq_ci;
264 volatile struct mlx5_cqe *cqe = NULL;
265 volatile struct mlx5_wqe_ctrl *ctrl;
268 volatile struct mlx5_cqe *tmp;
270 tmp = &(*txq->cqes)[cq_ci & cqe_cnt];
271 if (check_cqe(tmp, cqe_n, cq_ci))
275 if (MLX5_CQE_FORMAT(cqe->op_own) == MLX5_COMPRESSED) {
276 if (!check_cqe_seen(cqe))
277 ERROR("unexpected compressed CQE, TX stopped");
280 if ((MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) ||
281 (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_REQ_ERR)) {
282 if (!check_cqe_seen(cqe))
283 ERROR("unexpected error CQE, TX stopped");
289 if (unlikely(cqe == NULL))
291 txq->wqe_pi = ntohs(cqe->wqe_counter);
292 ctrl = (volatile struct mlx5_wqe_ctrl *)
293 tx_mlx5_wqe(txq, txq->wqe_pi);
294 elts_tail = ctrl->ctrl3;
295 assert(elts_tail < (1 << txq->wqe_n));
297 while (elts_free != elts_tail) {
298 struct rte_mbuf *elt = (*txq->elts)[elts_free];
299 unsigned int elts_free_next =
300 (elts_free + 1) & (elts_n - 1);
301 struct rte_mbuf *elt_next = (*txq->elts)[elts_free_next];
305 memset(&(*txq->elts)[elts_free],
307 sizeof((*txq->elts)[elts_free]));
309 RTE_MBUF_PREFETCH_TO_FREE(elt_next);
310 /* Only one segment needs to be freed. */
311 rte_pktmbuf_free_seg(elt);
312 elts_free = elts_free_next;
315 txq->elts_tail = elts_tail;
316 /* Update the consumer index. */
318 *txq->cq_db = htonl(cq_ci);
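/*
 * Note on txq_complete(): the new elts_tail is taken from ctrl3 of the
 * WQE reported by the completion. That field is the elts_head value the
 * burst functions store in the otherwise unused "immediate" field when
 * they request a completion, so every mbuf between the old and the new
 * tail is known to have left the hardware and can be released.
 */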
322 * Get Memory Pool (MP) from mbuf. If mbuf is indirect, the pool from which
323 * the cloned mbuf is allocated is returned instead.
329 * Memory pool where data is located for given mbuf.
331 static struct rte_mempool *
332 txq_mb2mp(struct rte_mbuf *buf)
334 if (unlikely(RTE_MBUF_INDIRECT(buf)))
335 return rte_mbuf_from_indirect(buf)->pool;
340 * Get Memory Region (MR) <-> Memory Pool (MP) association from txq->mp2mr[].
341 * Add MP to txq->mp2mr[] if it's not registered yet. If mp2mr[] is full,
342 * remove an entry first.
345 * Pointer to TX queue structure.
347 * Memory Pool for which a Memory Region lkey must be returned.
350 * mr->lkey on success, (uint32_t)-1 on failure.
352 static inline uint32_t
353 txq_mp2mr(struct txq *txq, struct rte_mempool *mp)
356 uint32_t lkey = (uint32_t)-1;
358 for (i = 0; (i != RTE_DIM(txq->mp2mr)); ++i) {
359 if (unlikely(txq->mp2mr[i].mp == NULL)) {
360 /* Unknown MP, add a new MR for it. */
363 if (txq->mp2mr[i].mp == mp) {
364 assert(txq->mp2mr[i].lkey != (uint32_t)-1);
365 assert(htonl(txq->mp2mr[i].mr->lkey) ==
367 lkey = txq->mp2mr[i].lkey;
371 if (unlikely(lkey == (uint32_t)-1))
372 lkey = txq_mp2mr_reg(txq, mp, i);
377 * Ring TX queue doorbell.
380 * Pointer to TX queue structure.
382 * Pointer to the last WQE posted in the NIC.
385 mlx5_tx_dbrec(struct txq *txq, volatile struct mlx5_wqe *wqe)
387 uint64_t *dst = (uint64_t *)((uintptr_t)txq->bf_reg);
388 volatile uint64_t *src = ((volatile uint64_t *)wqe);
391 *txq->qp_db = htonl(txq->wqe_ci);
392 /* Ensure ordering between DB record and BF copy. */
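/*
 * Note on mlx5_tx_dbrec(): the queue doorbell record is updated with the
 * latest wqe_ci first, then a write memory barrier orders that store
 * before copying the beginning of the last WQE into the BlueFlame
 * register (dst/src above), so the device never observes a doorbell that
 * points at a WQE it cannot read yet.
 */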
398 * DPDK callback to check the status of a tx descriptor.
403 * The index of the descriptor in the ring.
406 * The status of the tx descriptor.
409 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
411 struct txq *txq = tx_queue;
412 const unsigned int elts_n = 1 << txq->elts_n;
413 const unsigned int elts_cnt = elts_n - 1;
417 used = (txq->elts_head - txq->elts_tail) & elts_cnt;
419 return RTE_ETH_TX_DESC_FULL;
420 return RTE_ETH_TX_DESC_DONE;
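/*
 * Illustrative sketch (not part of the driver; helper name is made up):
 * with power-of-two rings, the number of in-flight descriptors used above
 * is a masked subtraction, which remains correct across 16-bit index
 * wrap-around.
 */
static __rte_unused uint16_t
ring_used_entries(uint16_t head, uint16_t tail, uint16_t size)
{
	/* size must be a power of two, e.g. 1 << txq->elts_n. */
	return (head - tail) & (size - 1);
}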
424 * DPDK callback to check the status of a rx descriptor.
429 * The index of the descriptor in the ring.
432 * The status of the rx descriptor.
435 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
437 struct rxq *rxq = rx_queue;
438 struct rxq_zip *zip = &rxq->zip;
439 volatile struct mlx5_cqe *cqe;
440 const unsigned int cqe_n = (1 << rxq->cqe_n);
441 const unsigned int cqe_cnt = cqe_n - 1;
445 /* if we are processing a compressed cqe */
447 used = zip->cqe_cnt - zip->ca;
453 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
454 while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
458 op_own = cqe->op_own;
459 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
460 n = ntohl(cqe->byte_cnt);
465 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
467 used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
469 return RTE_ETH_RX_DESC_DONE;
470 return RTE_ETH_RX_DESC_AVAIL;
474 * DPDK callback for TX.
477 * Generic pointer to TX queue structure.
479 * Packets to transmit.
481 * Number of packets in array.
484 * Number of packets successfully transmitted (<= pkts_n).
487 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
489 struct txq *txq = (struct txq *)dpdk_txq;
490 uint16_t elts_head = txq->elts_head;
491 const unsigned int elts_n = 1 << txq->elts_n;
496 unsigned int max_inline = txq->max_inline;
497 const unsigned int inline_en = !!max_inline && txq->inline_en;
500 volatile struct mlx5_wqe_v *wqe = NULL;
501 volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
502 unsigned int segs_n = 0;
503 struct rte_mbuf *buf = NULL;
506 if (unlikely(!pkts_n))
508 /* Prefetch first packet cacheline. */
509 rte_prefetch0(*pkts);
510 /* Start processing. */
512 max = (elts_n - (elts_head - txq->elts_tail));
515 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
516 if (unlikely(!max_wqe))
519 volatile rte_v128u32_t *dseg = NULL;
522 unsigned int sg = 0; /* counter of additional segs attached. */
525 uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
526 uint16_t tso_header_sz = 0;
528 uint8_t cs_flags = 0;
530 uint16_t tso_segsz = 0;
531 #ifdef MLX5_PMD_SOFT_COUNTERS
532 uint32_t total_length = 0;
537 segs_n = buf->nb_segs;
539 * Make sure there is enough room to store this packet and
540 * that one ring entry remains unused.
543 if (max < segs_n + 1)
547 if (unlikely(--max_wqe == 0))
549 wqe = (volatile struct mlx5_wqe_v *)
550 tx_mlx5_wqe(txq, txq->wqe_ci);
551 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
553 rte_prefetch0(*(pkts + 1));
554 addr = rte_pktmbuf_mtod(buf, uintptr_t);
555 length = DATA_LEN(buf);
556 ehdr = (((uint8_t *)addr)[1] << 8) |
557 ((uint8_t *)addr)[0];
558 #ifdef MLX5_PMD_SOFT_COUNTERS
559 total_length = length;
561 if (length < (MLX5_WQE_DWORD_SIZE + 2))
563 /* Update element. */
564 (*txq->elts)[elts_head] = buf;
565 /* Prefetch next buffer data. */
568 rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
569 /* Should we enable HW CKSUM offload */
571 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM)) {
572 const uint64_t is_tunneled = buf->ol_flags &
574 PKT_TX_TUNNEL_VXLAN);
576 if (is_tunneled && txq->tunnel_en) {
577 cs_flags = MLX5_ETH_WQE_L3_INNER_CSUM |
578 MLX5_ETH_WQE_L4_INNER_CSUM;
579 if (buf->ol_flags & PKT_TX_OUTER_IP_CKSUM)
580 cs_flags |= MLX5_ETH_WQE_L3_CSUM;
582 cs_flags = MLX5_ETH_WQE_L3_CSUM |
583 MLX5_ETH_WQE_L4_CSUM;
586 raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
587 /* Replace the Ethernet type by the VLAN if necessary. */
588 if (buf->ol_flags & PKT_TX_VLAN_PKT) {
589 uint32_t vlan = htonl(0x81000000 | buf->vlan_tci);
590 unsigned int len = 2 * ETHER_ADDR_LEN - 2;
594 /* Copy destination and source MAC addresses. */
595 memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
597 memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
598 /* Copy missing two bytes to end the DSeg. */
599 memcpy((uint8_t *)raw + len + sizeof(vlan),
600 ((uint8_t *)addr) + len, 2);
604 memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
605 MLX5_WQE_DWORD_SIZE);
606 length -= pkt_inline_sz;
607 addr += pkt_inline_sz;
610 tso = buf->ol_flags & PKT_TX_TCP_SEG;
612 uintptr_t end = (uintptr_t)
613 (((uintptr_t)txq->wqes) +
617 uint8_t vlan_sz = (buf->ol_flags &
618 PKT_TX_VLAN_PKT) ? 4 : 0;
619 const uint64_t is_tunneled =
622 PKT_TX_TUNNEL_VXLAN);
624 tso_header_sz = buf->l2_len + vlan_sz +
625 buf->l3_len + buf->l4_len;
626 tso_segsz = buf->tso_segsz;
628 if (is_tunneled && txq->tunnel_en) {
629 tso_header_sz += buf->outer_l2_len +
631 cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
633 cs_flags |= MLX5_ETH_WQE_L4_CSUM;
635 if (unlikely(tso_header_sz >
636 MLX5_MAX_TSO_HEADER))
638 copy_b = tso_header_sz - pkt_inline_sz;
639 /* First seg must contain all headers. */
640 assert(copy_b <= length);
641 raw += MLX5_WQE_DWORD_SIZE;
643 ((end - (uintptr_t)raw) > copy_b)) {
644 uint16_t n = (MLX5_WQE_DS(copy_b) -
647 if (unlikely(max_wqe < n))
650 rte_memcpy((void *)raw,
651 (void *)addr, copy_b);
654 pkt_inline_sz += copy_b;
656 * Another DWORD will be added
657 * in the inline part.
659 raw += MLX5_WQE_DS(copy_b) *
660 MLX5_WQE_DWORD_SIZE -
664 wqe->ctrl = (rte_v128u32_t){
665 htonl(txq->wqe_ci << 8),
666 htonl(txq->qp_num_8s | 1),
677 /* Inline if enough room. */
678 if (inline_en || tso) {
679 uintptr_t end = (uintptr_t)
680 (((uintptr_t)txq->wqes) +
681 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
682 unsigned int inline_room = max_inline *
683 RTE_CACHE_LINE_SIZE -
685 uintptr_t addr_end = (addr + inline_room) &
686 ~(RTE_CACHE_LINE_SIZE - 1);
687 unsigned int copy_b = (addr_end > addr) ?
688 RTE_MIN((addr_end - addr), length) :
691 raw += MLX5_WQE_DWORD_SIZE;
692 if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
694 * One Dseg remains in the current WQE. To
695 * keep the computation positive, it is
696 * removed after the bytes to Dseg conversion.
698 uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
700 if (unlikely(max_wqe < n))
705 htonl(copy_b | MLX5_INLINE_SEG);
708 MLX5_WQE_DS(tso_header_sz) *
710 rte_memcpy((void *)raw,
711 (void *)&inl, sizeof(inl));
713 pkt_inline_sz += sizeof(inl);
715 rte_memcpy((void *)raw, (void *)addr, copy_b);
718 pkt_inline_sz += copy_b;
721 * 2 DWORDs consumed by the WQE header + ETH segment +
722 * the size of the inline part of the packet.
724 ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
726 if (ds % (MLX5_WQE_SIZE /
727 MLX5_WQE_DWORD_SIZE) == 0) {
728 if (unlikely(--max_wqe == 0))
730 dseg = (volatile rte_v128u32_t *)
731 tx_mlx5_wqe(txq, txq->wqe_ci +
734 dseg = (volatile rte_v128u32_t *)
736 (ds * MLX5_WQE_DWORD_SIZE));
739 } else if (!segs_n) {
742 /* dseg will be advanced as part of next_seg */
743 dseg = (volatile rte_v128u32_t *)
745 ((ds - 1) * MLX5_WQE_DWORD_SIZE));
750 * No inline has been done in the packet, only the
751 * Ethernet header has been stored.
753 dseg = (volatile rte_v128u32_t *)
754 ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
757 /* Add the remaining packet as a simple ds. */
758 naddr = htonll(addr);
759 *dseg = (rte_v128u32_t){
761 txq_mp2mr(txq, txq_mb2mp(buf)),
774 * Spill on next WQE when the current one does not have
775 * enough room left. Size of WQE must be a multiple
776 * of data segment size.
778 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
779 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
780 if (unlikely(--max_wqe == 0))
782 dseg = (volatile rte_v128u32_t *)
783 tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
784 rte_prefetch0(tx_mlx5_wqe(txq,
785 txq->wqe_ci + ds / 4 + 1));
792 length = DATA_LEN(buf);
793 #ifdef MLX5_PMD_SOFT_COUNTERS
794 total_length += length;
796 /* Store segment information. */
797 naddr = htonll(rte_pktmbuf_mtod(buf, uintptr_t));
798 *dseg = (rte_v128u32_t){
800 txq_mp2mr(txq, txq_mb2mp(buf)),
804 elts_head = (elts_head + 1) & (elts_n - 1);
805 (*txq->elts)[elts_head] = buf;
807 /* Advance counter only if all segs are successfully posted. */
813 elts_head = (elts_head + 1) & (elts_n - 1);
816 /* Initialize known and common part of the WQE structure. */
818 wqe->ctrl = (rte_v128u32_t){
819 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_TSO),
820 htonl(txq->qp_num_8s | ds),
824 wqe->eseg = (rte_v128u32_t){
826 cs_flags | (htons(tso_segsz) << 16),
828 (ehdr << 16) | htons(tso_header_sz),
831 wqe->ctrl = (rte_v128u32_t){
832 htonl((txq->wqe_ci << 8) | MLX5_OPCODE_SEND),
833 htonl(txq->qp_num_8s | ds),
837 wqe->eseg = (rte_v128u32_t){
841 (ehdr << 16) | htons(pkt_inline_sz),
845 txq->wqe_ci += (ds + 3) / 4;
846 /* Save the last successful WQE for completion request */
847 last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
848 #ifdef MLX5_PMD_SOFT_COUNTERS
849 /* Increment sent bytes counter. */
850 txq->stats.obytes += total_length;
852 } while (i < pkts_n);
853 /* Take a shortcut if nothing must be sent. */
854 if (unlikely((i + k) == 0))
856 txq->elts_head = (txq->elts_head + i + j) & (elts_n - 1);
857 /* Check whether completion threshold has been reached. */
858 comp = txq->elts_comp + i + j + k;
859 if (comp >= MLX5_TX_COMP_THRESH) {
860 /* Request completion on last WQE. */
861 last_wqe->ctrl2 = htonl(8);
862 /* Save elts_head in unused "immediate" field of WQE. */
863 last_wqe->ctrl3 = txq->elts_head;
866 txq->elts_comp = comp;
868 #ifdef MLX5_PMD_SOFT_COUNTERS
869 /* Increment sent packets counter. */
870 txq->stats.opackets += i;
872 /* Ring QP doorbell. */
873 mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
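/*
 * Illustrative usage sketch (application side, not part of the PMD; port
 * and queue numbers are assumptions): mlx5_tx_burst() and its MPW
 * variants are never called directly, the application reaches them
 * through rte_eth_tx_burst() and retries mbufs that could not be posted.
 */
static __rte_unused void
app_tx_example(struct rte_mbuf **pkts, uint16_t n)
{
	uint16_t sent = 0;

	while (sent < n)
		sent += rte_eth_tx_burst(0 /* port */, 0 /* queue */,
					 pkts + sent, n - sent);
}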
878 * Open an MPW session.
881 * Pointer to TX queue structure.
883 * Pointer to MPW session structure.
888 mlx5_mpw_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
890 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
891 volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
892 (volatile struct mlx5_wqe_data_seg (*)[])
893 tx_mlx5_wqe(txq, idx + 1);
895 mpw->state = MLX5_MPW_STATE_OPENED;
899 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
900 mpw->wqe->eseg.mss = htons(length);
901 mpw->wqe->eseg.inline_hdr_sz = 0;
902 mpw->wqe->eseg.rsvd0 = 0;
903 mpw->wqe->eseg.rsvd1 = 0;
904 mpw->wqe->eseg.rsvd2 = 0;
905 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
906 (txq->wqe_ci << 8) | MLX5_OPCODE_TSO);
907 mpw->wqe->ctrl[2] = 0;
908 mpw->wqe->ctrl[3] = 0;
909 mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
910 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
911 mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
912 (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
913 mpw->data.dseg[2] = &(*dseg)[0];
914 mpw->data.dseg[3] = &(*dseg)[1];
915 mpw->data.dseg[4] = &(*dseg)[2];
919 * Close an MPW session.
922 * Pointer to TX queue structure.
924 * Pointer to MPW session structure.
927 mlx5_mpw_close(struct txq *txq, struct mlx5_mpw *mpw)
929 unsigned int num = mpw->pkts_n;
932 * Store size in multiples of 16 bytes. Control and Ethernet segments count as 2.
935 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | (2 + num));
936 mpw->state = MLX5_MPW_STATE_CLOSED;
941 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
942 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
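/*
 * Note on the layout built by mlx5_mpw_new(): the title WQEBB holds the
 * control segment, the Ethernet segment and the first two data segments
 * (dseg[0]/dseg[1]); dseg[2]..dseg[4] point into the following WQEBB, so
 * a fully packed legacy MPW session spans two WQEBBs. mlx5_mpw_close()
 * then encodes the data-segment count as 2 + pkts_n in ctrl[1], the
 * control and Ethernet segments counting as 2.
 */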
946 * DPDK callback for TX with MPW support.
949 * Generic pointer to TX queue structure.
951 * Packets to transmit.
953 * Number of packets in array.
956 * Number of packets successfully transmitted (<= pkts_n).
959 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
961 struct txq *txq = (struct txq *)dpdk_txq;
962 uint16_t elts_head = txq->elts_head;
963 const unsigned int elts_n = 1 << txq->elts_n;
969 struct mlx5_mpw mpw = {
970 .state = MLX5_MPW_STATE_CLOSED,
973 if (unlikely(!pkts_n))
975 /* Prefetch first packet cacheline. */
976 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
977 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
978 /* Start processing. */
980 max = (elts_n - (elts_head - txq->elts_tail));
983 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
984 if (unlikely(!max_wqe))
987 struct rte_mbuf *buf = *(pkts++);
988 unsigned int elts_head_next;
990 unsigned int segs_n = buf->nb_segs;
991 uint32_t cs_flags = 0;
994 * Make sure there is enough room to store this packet and
995 * that one ring entry remains unused.
998 if (max < segs_n + 1)
1000 /* Do not bother with large packets MPW cannot handle. */
1001 if (segs_n > MLX5_MPW_DSEG_MAX)
1005 /* Should we enable HW CKSUM offload */
1007 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1008 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1009 /* Retrieve packet information. */
1010 length = PKT_LEN(buf);
1012 /* Start new session if packet differs. */
1013 if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
1014 ((mpw.len != length) ||
1016 (mpw.wqe->eseg.cs_flags != cs_flags)))
1017 mlx5_mpw_close(txq, &mpw);
1018 if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1020 * Multi-Packet WQE consumes at most two WQEs.
1021 * mlx5_mpw_new() expects to be able to use such resources.
1024 if (unlikely(max_wqe < 2))
1027 mlx5_mpw_new(txq, &mpw, length);
1028 mpw.wqe->eseg.cs_flags = cs_flags;
1030 /* Multi-segment packets must be alone in their MPW. */
1031 assert((segs_n == 1) || (mpw.pkts_n == 0));
1032 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1036 volatile struct mlx5_wqe_data_seg *dseg;
1039 elts_head_next = (elts_head + 1) & (elts_n - 1);
1041 (*txq->elts)[elts_head] = buf;
1042 dseg = mpw.data.dseg[mpw.pkts_n];
1043 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1044 *dseg = (struct mlx5_wqe_data_seg){
1045 .byte_count = htonl(DATA_LEN(buf)),
1046 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1047 .addr = htonll(addr),
1049 elts_head = elts_head_next;
1050 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1051 length += DATA_LEN(buf);
1057 assert(length == mpw.len);
1058 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1059 mlx5_mpw_close(txq, &mpw);
1060 elts_head = elts_head_next;
1061 #ifdef MLX5_PMD_SOFT_COUNTERS
1062 /* Increment sent bytes counter. */
1063 txq->stats.obytes += length;
1067 /* Take a shortcut if nothing must be sent. */
1068 if (unlikely(i == 0))
1070 /* Check whether completion threshold has been reached. */
1071 /* "j" includes both packets and segments. */
1072 comp = txq->elts_comp + j;
1073 if (comp >= MLX5_TX_COMP_THRESH) {
1074 volatile struct mlx5_wqe *wqe = mpw.wqe;
1076 /* Request completion on last WQE. */
1077 wqe->ctrl[2] = htonl(8);
1078 /* Save elts_head in unused "immediate" field of WQE. */
1079 wqe->ctrl[3] = elts_head;
1082 txq->elts_comp = comp;
1084 #ifdef MLX5_PMD_SOFT_COUNTERS
1085 /* Increment sent packets counter. */
1086 txq->stats.opackets += i;
1088 /* Ring QP doorbell. */
1089 if (mpw.state == MLX5_MPW_STATE_OPENED)
1090 mlx5_mpw_close(txq, &mpw);
1091 mlx5_tx_dbrec(txq, mpw.wqe);
1092 txq->elts_head = elts_head;
1097 * Open an MPW inline session.
1100 * Pointer to TX queue structure.
1102 * Pointer to MPW session structure.
1107 mlx5_mpw_inline_new(struct txq *txq, struct mlx5_mpw *mpw, uint32_t length)
1109 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1110 struct mlx5_wqe_inl_small *inl;
1112 mpw->state = MLX5_MPW_INL_STATE_OPENED;
1116 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1117 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_MPW << 24) |
1118 (txq->wqe_ci << 8) |
1120 mpw->wqe->ctrl[2] = 0;
1121 mpw->wqe->ctrl[3] = 0;
1122 mpw->wqe->eseg.mss = htons(length);
1123 mpw->wqe->eseg.inline_hdr_sz = 0;
1124 mpw->wqe->eseg.cs_flags = 0;
1125 mpw->wqe->eseg.rsvd0 = 0;
1126 mpw->wqe->eseg.rsvd1 = 0;
1127 mpw->wqe->eseg.rsvd2 = 0;
1128 inl = (struct mlx5_wqe_inl_small *)
1129 (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
1130 mpw->data.raw = (uint8_t *)&inl->raw;
1134 * Close an MPW inline session.
1137 * Pointer to TX queue structure.
1139 * Pointer to MPW session structure.
1142 mlx5_mpw_inline_close(struct txq *txq, struct mlx5_mpw *mpw)
1145 struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
1146 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
1148 size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
1150 * Store size in multiples of 16 bytes. Control and Ethernet segments count as 2.
1153 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(size));
1154 mpw->state = MLX5_MPW_STATE_CLOSED;
1155 inl->byte_cnt = htonl(mpw->total_len | MLX5_INLINE_SEG);
1156 txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
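/*
 * Illustrative sketch (not part of the driver; helper name is made up) of
 * the rounding done when an inline session is closed: the accumulated
 * byte count is rounded up to 16-byte segment units for the DS field of
 * ctrl[1] and to 64-byte WQEBBs to advance the WQE producer index.
 */
static __rte_unused void
mpw_inline_rounding_example(unsigned int size)
{
	unsigned int ds = (size + 15) / 16;	/* as MLX5_WQE_DS() does */
	unsigned int wqebbs = (size + 63) / 64;	/* WQEBBs consumed */

	(void)ds;
	(void)wqebbs;
}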
1160 * DPDK callback for TX with MPW inline support.
1163 * Generic pointer to TX queue structure.
1165 * Packets to transmit.
1167 * Number of packets in array.
1170 * Number of packets successfully transmitted (<= pkts_n).
1173 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1176 struct txq *txq = (struct txq *)dpdk_txq;
1177 uint16_t elts_head = txq->elts_head;
1178 const unsigned int elts_n = 1 << txq->elts_n;
1184 unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1185 struct mlx5_mpw mpw = {
1186 .state = MLX5_MPW_STATE_CLOSED,
1189 * Compute the maximum number of WQEs which can be consumed by inline code:
1192 * - 1 control segment,
1193 * - 1 Ethernet segment,
1194 * - N Dseg from the inline request.
1196 const unsigned int wqe_inl_n =
1197 ((2 * MLX5_WQE_DWORD_SIZE +
1198 txq->max_inline * RTE_CACHE_LINE_SIZE) +
1199 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
1201 if (unlikely(!pkts_n))
1203 /* Prefetch first packet cacheline. */
1204 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1205 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1206 /* Start processing. */
1208 max = (elts_n - (elts_head - txq->elts_tail));
1212 struct rte_mbuf *buf = *(pkts++);
1213 unsigned int elts_head_next;
1216 unsigned int segs_n = buf->nb_segs;
1217 uint32_t cs_flags = 0;
1220 * Make sure there is enough room to store this packet and
1221 * that one ring entry remains unused.
1224 if (max < segs_n + 1)
1226 /* Do not bother with large packets MPW cannot handle. */
1227 if (segs_n > MLX5_MPW_DSEG_MAX)
1232 * Compute max_wqe in case less WQE were consumed in previous
1235 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1236 /* Should we enable HW CKSUM offload */
1238 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1239 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1240 /* Retrieve packet information. */
1241 length = PKT_LEN(buf);
1242 /* Start new session if packet differs. */
1243 if (mpw.state == MLX5_MPW_STATE_OPENED) {
1244 if ((mpw.len != length) ||
1246 (mpw.wqe->eseg.cs_flags != cs_flags))
1247 mlx5_mpw_close(txq, &mpw);
1248 } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1249 if ((mpw.len != length) ||
1251 (length > inline_room) ||
1252 (mpw.wqe->eseg.cs_flags != cs_flags)) {
1253 mlx5_mpw_inline_close(txq, &mpw);
1255 txq->max_inline * RTE_CACHE_LINE_SIZE;
1258 if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1259 if ((segs_n != 1) ||
1260 (length > inline_room)) {
1262 * Multi-Packet WQE consumes at most two WQEs.
1263 * mlx5_mpw_new() expects to be able to use such resources.
1266 if (unlikely(max_wqe < 2))
1269 mlx5_mpw_new(txq, &mpw, length);
1270 mpw.wqe->eseg.cs_flags = cs_flags;
1272 if (unlikely(max_wqe < wqe_inl_n))
1274 max_wqe -= wqe_inl_n;
1275 mlx5_mpw_inline_new(txq, &mpw, length);
1276 mpw.wqe->eseg.cs_flags = cs_flags;
1279 /* Multi-segment packets must be alone in their MPW. */
1280 assert((segs_n == 1) || (mpw.pkts_n == 0));
1281 if (mpw.state == MLX5_MPW_STATE_OPENED) {
1282 assert(inline_room ==
1283 txq->max_inline * RTE_CACHE_LINE_SIZE);
1284 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1288 volatile struct mlx5_wqe_data_seg *dseg;
1291 (elts_head + 1) & (elts_n - 1);
1293 (*txq->elts)[elts_head] = buf;
1294 dseg = mpw.data.dseg[mpw.pkts_n];
1295 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1296 *dseg = (struct mlx5_wqe_data_seg){
1297 .byte_count = htonl(DATA_LEN(buf)),
1298 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1299 .addr = htonll(addr),
1301 elts_head = elts_head_next;
1302 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1303 length += DATA_LEN(buf);
1309 assert(length == mpw.len);
1310 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1311 mlx5_mpw_close(txq, &mpw);
1315 assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1316 assert(length <= inline_room);
1317 assert(length == DATA_LEN(buf));
1318 elts_head_next = (elts_head + 1) & (elts_n - 1);
1319 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1320 (*txq->elts)[elts_head] = buf;
1321 /* Maximum number of bytes before wrapping. */
1322 max = ((((uintptr_t)(txq->wqes)) +
1325 (uintptr_t)mpw.data.raw);
1327 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1330 mpw.data.raw = (volatile void *)txq->wqes;
1331 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1332 (void *)(addr + max),
1334 mpw.data.raw += length - max;
1336 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1342 (volatile void *)txq->wqes;
1344 mpw.data.raw += length;
1347 mpw.total_len += length;
1349 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1350 mlx5_mpw_inline_close(txq, &mpw);
1352 txq->max_inline * RTE_CACHE_LINE_SIZE;
1354 inline_room -= length;
1357 elts_head = elts_head_next;
1358 #ifdef MLX5_PMD_SOFT_COUNTERS
1359 /* Increment sent bytes counter. */
1360 txq->stats.obytes += length;
1364 /* Take a shortcut if nothing must be sent. */
1365 if (unlikely(i == 0))
1367 /* Check whether completion threshold has been reached. */
1368 /* "j" includes both packets and segments. */
1369 comp = txq->elts_comp + j;
1370 if (comp >= MLX5_TX_COMP_THRESH) {
1371 volatile struct mlx5_wqe *wqe = mpw.wqe;
1373 /* Request completion on last WQE. */
1374 wqe->ctrl[2] = htonl(8);
1375 /* Save elts_head in unused "immediate" field of WQE. */
1376 wqe->ctrl[3] = elts_head;
1379 txq->elts_comp = comp;
1381 #ifdef MLX5_PMD_SOFT_COUNTERS
1382 /* Increment sent packets counter. */
1383 txq->stats.opackets += i;
1385 /* Ring QP doorbell. */
1386 if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1387 mlx5_mpw_inline_close(txq, &mpw);
1388 else if (mpw.state == MLX5_MPW_STATE_OPENED)
1389 mlx5_mpw_close(txq, &mpw);
1390 mlx5_tx_dbrec(txq, mpw.wqe);
1391 txq->elts_head = elts_head;
1396 * Open an Enhanced MPW session.
1399 * Pointer to TX queue structure.
1401 * Pointer to MPW session structure.
1406 mlx5_empw_new(struct txq *txq, struct mlx5_mpw *mpw, int padding)
1408 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1410 mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1412 mpw->total_len = sizeof(struct mlx5_wqe);
1413 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1414 mpw->wqe->ctrl[0] = htonl((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1415 (txq->wqe_ci << 8) |
1416 MLX5_OPCODE_ENHANCED_MPSW);
1417 mpw->wqe->ctrl[2] = 0;
1418 mpw->wqe->ctrl[3] = 0;
1419 memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1420 if (unlikely(padding)) {
1421 uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1423 /* Pad the first 2 DWORDs with zero-length inline header. */
1424 *(volatile uint32_t *)addr = htonl(MLX5_INLINE_SEG);
1425 *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1426 htonl(MLX5_INLINE_SEG);
1427 mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1428 /* Start from the next WQEBB. */
1429 mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1431 mpw->data.raw = (volatile void *)(mpw->wqe + 1);
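/*
 * Note on mlx5_empw_new(): when padding is requested, the two DWORDs
 * following the control and Ethernet segments are filled with zero-length
 * inline headers so that packet data starts on a fresh WQEBB; otherwise
 * inline bytes or pointer dsegs are appended right after the Ethernet
 * segment inside the title WQEBB.
 */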
1436 * Close an Enhanced MPW session.
1439 * Pointer to TX queue structure.
1441 * Pointer to MPW session structure.
1444 * Number of consumed WQEs.
1446 static inline uint16_t
1447 mlx5_empw_close(struct txq *txq, struct mlx5_mpw *mpw)
1451 /* Store size in multiples of 16 bytes; control and Ethernet segments count as 2. */
1454 mpw->wqe->ctrl[1] = htonl(txq->qp_num_8s | MLX5_WQE_DS(mpw->total_len));
1455 mpw->state = MLX5_MPW_STATE_CLOSED;
1456 ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1462 * DPDK callback for TX with Enhanced MPW support.
1465 * Generic pointer to TX queue structure.
1467 * Packets to transmit.
1469 * Number of packets in array.
1472 * Number of packets successfully transmitted (<= pkts_n).
1475 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1477 struct txq *txq = (struct txq *)dpdk_txq;
1478 uint16_t elts_head = txq->elts_head;
1479 const unsigned int elts_n = 1 << txq->elts_n;
1482 unsigned int max_elts;
1484 unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1485 unsigned int mpw_room = 0;
1486 unsigned int inl_pad = 0;
1488 struct mlx5_mpw mpw = {
1489 .state = MLX5_MPW_STATE_CLOSED,
1492 if (unlikely(!pkts_n))
1494 /* Start processing. */
1496 max_elts = (elts_n - (elts_head - txq->elts_tail));
1497 if (max_elts > elts_n)
1499 /* A CQE slot must always be available. */
1500 assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1501 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1502 if (unlikely(!max_wqe))
1505 struct rte_mbuf *buf = *(pkts++);
1506 unsigned int elts_head_next;
1510 unsigned int do_inline = 0; /* Whether inline is possible. */
1512 unsigned int segs_n = buf->nb_segs;
1513 uint32_t cs_flags = 0;
1516 * Make sure there is enough room to store this packet and
1517 * that one ring entry remains unused.
1520 if (max_elts - j < segs_n + 1)
1522 /* Do not bother with large packets MPW cannot handle. */
1523 if (segs_n > MLX5_MPW_DSEG_MAX)
1525 /* Should we enable HW CKSUM offload. */
1527 (PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM | PKT_TX_UDP_CKSUM))
1528 cs_flags = MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
1529 /* Retrieve packet information. */
1530 length = PKT_LEN(buf);
1531 /* Start new session if:
1532 * - multi-segment packet
1533 * - no space left even for a dseg
1534 * - next packet can be inlined with a new WQE
1536 * It can't be MLX5_MPW_STATE_OPENED as it always carries a single segmented packet.
1539 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1540 if ((segs_n != 1) ||
1541 (inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1543 (length <= txq->inline_max_packet_sz &&
1544 inl_pad + sizeof(inl_hdr) + length >
1546 (mpw.wqe->eseg.cs_flags != cs_flags))
1547 max_wqe -= mlx5_empw_close(txq, &mpw);
1549 if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1550 if (unlikely(segs_n != 1)) {
1551 /* Fall back to legacy MPW.
1552 * An MPW session consumes 2 WQEs at most to
1553 * include MLX5_MPW_DSEG_MAX pointers.
1555 if (unlikely(max_wqe < 2))
1557 mlx5_mpw_new(txq, &mpw, length);
1559 /* In Enhanced MPW, inline as much as the budget
1560 * allows. The remaining space is to be
1561 * filled with dsegs. If the title WQEBB isn't
1562 * padded, it will have 2 dsegs there.
1564 mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1565 (max_inline ? max_inline :
1566 pkts_n * MLX5_WQE_DWORD_SIZE) +
1568 if (unlikely(max_wqe * MLX5_WQE_SIZE <
1571 /* Don't pad the title WQEBB, to avoid wasting WQ space. */
1572 mlx5_empw_new(txq, &mpw, 0);
1573 mpw_room -= mpw.total_len;
1576 length <= txq->inline_max_packet_sz &&
1577 sizeof(inl_hdr) + length <= mpw_room &&
1580 mpw.wqe->eseg.cs_flags = cs_flags;
1582 /* Evaluate whether the next packet can be inlined.
1583 * Inlining is possible when:
1584 * - length is less than configured value
1585 * - length fits for remaining space
1586 * - not required to fill the title WQEBB with dsegs
1589 length <= txq->inline_max_packet_sz &&
1590 inl_pad + sizeof(inl_hdr) + length <=
1592 (!txq->mpw_hdr_dseg ||
1593 mpw.total_len >= MLX5_WQE_SIZE);
1595 /* Multi-segment packets must be alone in their MPW. */
1596 assert((segs_n == 1) || (mpw.pkts_n == 0));
1597 if (unlikely(mpw.state == MLX5_MPW_STATE_OPENED)) {
1598 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1602 volatile struct mlx5_wqe_data_seg *dseg;
1605 (elts_head + 1) & (elts_n - 1);
1607 (*txq->elts)[elts_head] = buf;
1608 dseg = mpw.data.dseg[mpw.pkts_n];
1609 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1610 *dseg = (struct mlx5_wqe_data_seg){
1611 .byte_count = htonl(DATA_LEN(buf)),
1612 .lkey = txq_mp2mr(txq, txq_mb2mp(buf)),
1613 .addr = htonll(addr),
1615 elts_head = elts_head_next;
1616 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1617 length += DATA_LEN(buf);
1623 /* A multi-segmented packet takes one MPW session.
1624 * TODO: Pack more multi-segmented packets if possible.
1626 mlx5_mpw_close(txq, &mpw);
1631 } else if (do_inline) {
1632 /* Inline packet into WQE. */
1635 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1636 assert(length == DATA_LEN(buf));
1637 inl_hdr = htonl(length | MLX5_INLINE_SEG);
1638 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1639 mpw.data.raw = (volatile void *)
1640 ((uintptr_t)mpw.data.raw + inl_pad);
1641 max = tx_mlx5_wq_tailroom(txq,
1642 (void *)(uintptr_t)mpw.data.raw);
1643 /* Copy inline header. */
1644 mpw.data.raw = (volatile void *)
1646 (void *)(uintptr_t)mpw.data.raw,
1649 (void *)(uintptr_t)txq->wqes,
1651 max = tx_mlx5_wq_tailroom(txq,
1652 (void *)(uintptr_t)mpw.data.raw);
1653 /* Copy packet data. */
1654 mpw.data.raw = (volatile void *)
1656 (void *)(uintptr_t)mpw.data.raw,
1659 (void *)(uintptr_t)txq->wqes,
1662 mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1663 /* No need to get completion as the entire packet is
1664 * copied to WQ. Free the buf right away.
1666 elts_head_next = elts_head;
1667 rte_pktmbuf_free_seg(buf);
1668 mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1669 /* Add pad in the next packet if any. */
1670 inl_pad = (((uintptr_t)mpw.data.raw +
1671 (MLX5_WQE_DWORD_SIZE - 1)) &
1672 ~(MLX5_WQE_DWORD_SIZE - 1)) -
1673 (uintptr_t)mpw.data.raw;
1675 /* No inline. Load a dseg of packet pointer. */
1676 volatile rte_v128u32_t *dseg;
1678 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1679 assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1680 assert(length == DATA_LEN(buf));
1681 if (!tx_mlx5_wq_tailroom(txq,
1682 (void *)((uintptr_t)mpw.data.raw
1684 dseg = (volatile void *)txq->wqes;
1686 dseg = (volatile void *)
1687 ((uintptr_t)mpw.data.raw +
1689 elts_head_next = (elts_head + 1) & (elts_n - 1);
1690 (*txq->elts)[elts_head] = buf;
1691 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1692 for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
1693 rte_prefetch2((void *)(addr +
1694 n * RTE_CACHE_LINE_SIZE));
1695 naddr = htonll(addr);
1696 *dseg = (rte_v128u32_t) {
1698 txq_mp2mr(txq, txq_mb2mp(buf)),
1702 mpw.data.raw = (volatile void *)(dseg + 1);
1703 mpw.total_len += (inl_pad + sizeof(*dseg));
1706 mpw_room -= (inl_pad + sizeof(*dseg));
1709 elts_head = elts_head_next;
1710 #ifdef MLX5_PMD_SOFT_COUNTERS
1711 /* Increment sent bytes counter. */
1712 txq->stats.obytes += length;
1715 } while (i < pkts_n);
1716 /* Take a shortcut if nothing must be sent. */
1717 if (unlikely(i == 0))
1719 /* Check whether completion threshold has been reached. */
1720 if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1721 (uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1722 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1723 volatile struct mlx5_wqe *wqe = mpw.wqe;
1725 /* Request completion on last WQE. */
1726 wqe->ctrl[2] = htonl(8);
1727 /* Save elts_head in unused "immediate" field of WQE. */
1728 wqe->ctrl[3] = elts_head;
1730 txq->mpw_comp = txq->wqe_ci;
1733 txq->elts_comp += j;
1735 #ifdef MLX5_PMD_SOFT_COUNTERS
1736 /* Increment sent packets counter. */
1737 txq->stats.opackets += i;
1739 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1740 mlx5_empw_close(txq, &mpw);
1741 else if (mpw.state == MLX5_MPW_STATE_OPENED)
1742 mlx5_mpw_close(txq, &mpw);
1743 /* Ring QP doorbell. */
1744 mlx5_tx_dbrec(txq, mpw.wqe);
1745 txq->elts_head = elts_head;
1750 * Translate RX completion flags to packet type.
1755 * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
1758 * Packet type for struct rte_mbuf.
1760 static inline uint32_t
1761 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1764 uint16_t flags = ntohs(cqe->hdr_type_etc);
1766 if (cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) {
1769 MLX5_CQE_RX_IPV4_PACKET,
1770 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN) |
1772 MLX5_CQE_RX_IPV6_PACKET,
1773 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN);
1774 pkt_type |= ((cqe->pkt_info & MLX5_CQE_RX_OUTER_PACKET) ?
1775 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN :
1776 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
1780 MLX5_CQE_L3_HDR_TYPE_IPV6,
1781 RTE_PTYPE_L3_IPV6_EXT_UNKNOWN) |
1783 MLX5_CQE_L3_HDR_TYPE_IPV4,
1784 RTE_PTYPE_L3_IPV4_EXT_UNKNOWN);
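/*
 * Note on CQE compression, handled by mlx5_rx_poll_len() below: a
 * compressed CQE advertises in byte_cnt how many packets it stands for;
 * per-packet byte counts and RSS hashes are then read from arrays of
 * eight mini CQEs overlaid on the CQ ring (zip->ca is the array base,
 * zip->ai the position inside it, zip->na the next array). The consumer
 * index only advances, and the covered ring entries are only marked
 * MLX5_CQE_INVALIDATE, once a whole group of mini CQEs has been drained.
 */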
1790 * Get size of the next packet for a given CQE. For compressed CQEs, the
1791 * consumer index is updated only once all packets of the current one have been processed.
1795 * Pointer to RX queue.
1798 * @param[out] rss_hash
1799 * Packet RSS Hash result.
1802 * Packet size in bytes (0 if there is none), -1 in case of completion with error.
1806 mlx5_rx_poll_len(struct rxq *rxq, volatile struct mlx5_cqe *cqe,
1807 uint16_t cqe_cnt, uint32_t *rss_hash)
1809 struct rxq_zip *zip = &rxq->zip;
1810 uint16_t cqe_n = cqe_cnt + 1;
1814 /* Process compressed data in the CQE and mini arrays. */
1816 volatile struct mlx5_mini_cqe8 (*mc)[8] =
1817 (volatile struct mlx5_mini_cqe8 (*)[8])
1818 (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt]);
1820 len = ntohl((*mc)[zip->ai & 7].byte_cnt);
1821 *rss_hash = ntohl((*mc)[zip->ai & 7].rx_hash_result);
1822 if ((++zip->ai & 7) == 0) {
1823 /* Invalidate consumed CQEs */
1826 while (idx != end) {
1827 (*rxq->cqes)[idx & cqe_cnt].op_own =
1828 MLX5_CQE_INVALIDATE;
1832 * Increment consumer index to skip the number of
1833 * CQEs consumed. Hardware leaves holes in the CQ
1834 * ring for software use.
1839 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1840 /* Invalidate the rest */
1844 while (idx != end) {
1845 (*rxq->cqes)[idx & cqe_cnt].op_own =
1846 MLX5_CQE_INVALIDATE;
1849 rxq->cq_ci = zip->cq_ci;
1852 /* No compressed data, get next CQE and verify if it is compressed. */
1857 ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
1858 if (unlikely(ret == 1))
1861 op_own = cqe->op_own;
1862 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1863 volatile struct mlx5_mini_cqe8 (*mc)[8] =
1864 (volatile struct mlx5_mini_cqe8 (*)[8])
1865 (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1868 /* Fix endianness. */
1869 zip->cqe_cnt = ntohl(cqe->byte_cnt);
1871 * Current mini array position is the one returned by check_cqe().
1874 * If completion comprises several mini arrays, as a
1875 * special case the second one is located 7 CQEs after
1876 * the initial CQE instead of 8 for subsequent ones.
1878 zip->ca = rxq->cq_ci;
1879 zip->na = zip->ca + 7;
1880 /* Compute the next non compressed CQE. */
1882 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1883 /* Get packet size to return. */
1884 len = ntohl((*mc)[0].byte_cnt);
1885 *rss_hash = ntohl((*mc)[0].rx_hash_result);
1887 /* Prefetch all the entries to be invalidated */
1890 while (idx != end) {
1891 rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1895 len = ntohl(cqe->byte_cnt);
1896 *rss_hash = ntohl(cqe->rx_hash_res);
1898 /* Error while receiving packet. */
1899 if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1906 * Translate RX completion flags to offload flags.
1909 * Pointer to RX queue structure.
1914 * Offload flags (ol_flags) for struct rte_mbuf.
1916 static inline uint32_t
1917 rxq_cq_to_ol_flags(struct rxq *rxq, volatile struct mlx5_cqe *cqe)
1919 uint32_t ol_flags = 0;
1920 uint16_t flags = ntohs(cqe->hdr_type_etc);
1924 MLX5_CQE_RX_L3_HDR_VALID,
1925 PKT_RX_IP_CKSUM_GOOD) |
1927 MLX5_CQE_RX_L4_HDR_VALID,
1928 PKT_RX_L4_CKSUM_GOOD);
1929 if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1932 MLX5_CQE_RX_L3_HDR_VALID,
1933 PKT_RX_IP_CKSUM_GOOD) |
1935 MLX5_CQE_RX_L4_HDR_VALID,
1936 PKT_RX_L4_CKSUM_GOOD);
1941 * DPDK callback for RX.
1944 * Generic pointer to RX queue structure.
1946 * Array to store received packets.
1948 * Maximum number of packets in array.
1951 * Number of packets successfully received (<= pkts_n).
1954 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1956 struct rxq *rxq = dpdk_rxq;
1957 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1958 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1959 const unsigned int sges_n = rxq->sges_n;
1960 struct rte_mbuf *pkt = NULL;
1961 struct rte_mbuf *seg = NULL;
1962 volatile struct mlx5_cqe *cqe =
1963 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1965 unsigned int rq_ci = rxq->rq_ci << sges_n;
1966 int len = 0; /* keep its value across iterations. */
1969 unsigned int idx = rq_ci & wqe_cnt;
1970 volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1971 struct rte_mbuf *rep = (*rxq->elts)[idx];
1972 uint32_t rss_hash_res = 0;
1980 rep = rte_mbuf_raw_alloc(rxq->mp);
1981 if (unlikely(rep == NULL)) {
1982 ++rxq->stats.rx_nombuf;
1985 * no buffers before we even started,
1986 * bail out silently.
1990 while (pkt != seg) {
1991 assert(pkt != (*rxq->elts)[idx]);
1995 rte_mbuf_raw_free(pkt);
2001 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2002 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
2005 rte_mbuf_raw_free(rep);
2008 if (unlikely(len == -1)) {
2009 /* RX error, packet is likely too large. */
2010 rte_mbuf_raw_free(rep);
2011 ++rxq->stats.idropped;
2015 assert(len >= (rxq->crc_present << 2));
2016 /* Update packet information. */
2017 pkt->packet_type = 0;
2019 if (rss_hash_res && rxq->rss_hash) {
2020 pkt->hash.rss = rss_hash_res;
2021 pkt->ol_flags = PKT_RX_RSS_HASH;
2024 MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
2025 pkt->ol_flags |= PKT_RX_FDIR;
2026 if (cqe->sop_drop_qpn !=
2027 htonl(MLX5_FLOW_MARK_DEFAULT)) {
2028 uint32_t mark = cqe->sop_drop_qpn;
2030 pkt->ol_flags |= PKT_RX_FDIR_ID;
2032 mlx5_flow_mark_get(mark);
2035 if (rxq->csum | rxq->csum_l2tun) {
2036 pkt->packet_type = rxq_cq_to_pkt_type(cqe);
2037 pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
2039 if (rxq->vlan_strip &&
2040 (cqe->hdr_type_etc &
2041 htons(MLX5_CQE_VLAN_STRIPPED))) {
2042 pkt->ol_flags |= PKT_RX_VLAN_PKT |
2043 PKT_RX_VLAN_STRIPPED;
2044 pkt->vlan_tci = ntohs(cqe->vlan_info);
2046 if (rxq->crc_present)
2047 len -= ETHER_CRC_LEN;
2050 DATA_LEN(rep) = DATA_LEN(seg);
2051 PKT_LEN(rep) = PKT_LEN(seg);
2052 SET_DATA_OFF(rep, DATA_OFF(seg));
2053 PORT(rep) = PORT(seg);
2054 (*rxq->elts)[idx] = rep;
2056 * Fill NIC descriptor with the new buffer. The lkey and size
2057 * of the buffers are already known, only the buffer address changes.
2060 wqe->addr = htonll(rte_pktmbuf_mtod(rep, uintptr_t));
2061 if (len > DATA_LEN(seg)) {
2062 len -= DATA_LEN(seg);
2067 DATA_LEN(seg) = len;
2068 #ifdef MLX5_PMD_SOFT_COUNTERS
2069 /* Increment bytes counter. */
2070 rxq->stats.ibytes += PKT_LEN(pkt);
2072 /* Return packet. */
2078 /* Align consumer index to the next stride. */
2083 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
2085 /* Update the consumer index. */
2086 rxq->rq_ci = rq_ci >> sges_n;
2088 *rxq->cq_db = htonl(rxq->cq_ci);
2090 *rxq->rq_db = htonl(rxq->rq_ci);
2091 #ifdef MLX5_PMD_SOFT_COUNTERS
2092 /* Increment packets counter. */
2093 rxq->stats.ipackets += i;
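/*
 * Illustrative usage sketch (application side, not part of the PMD; port,
 * queue and burst size are assumptions): mlx5_rx_burst() is reached
 * through rte_eth_rx_burst(), and the application owns the returned mbufs
 * until it frees them.
 */
static __rte_unused void
app_rx_example(void)
{
	struct rte_mbuf *pkts[32];
	uint16_t n = rte_eth_rx_burst(0 /* port */, 0 /* queue */, pkts, 32);

	while (n)
		rte_pktmbuf_free(pkts[--n]);
}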
2099 * Dummy DPDK callback for TX.
2101 * This function is used to temporarily replace the real callback during
2102 * unsafe control operations on the queue, or in case of error.
2105 * Generic pointer to TX queue structure.
2107 * Packets to transmit.
2109 * Number of packets in array.
2112 * Number of packets successfully transmitted (<= pkts_n).
2115 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
2124 * Dummy DPDK callback for RX.
2126 * This function is used to temporarily replace the real callback during
2127 * unsafe control operations on the queue, or in case of error.
2130 * Generic pointer to RX queue structure.
2132 * Array to store received packets.
2134 * Maximum number of packets in array.
2137 * Number of packets successfully received (<= pkts_n).
2140 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)