1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2015 6WIND S.A.
3 * Copyright 2015 Mellanox Technologies, Ltd
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
14 #pragma GCC diagnostic ignored "-Wpedantic"
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
19 #pragma GCC diagnostic error "-Wpedantic"
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28 #include <rte_cycles.h>
31 #include "mlx5_utils.h"
32 #include "mlx5_rxtx.h"
33 #include "mlx5_autoconf.h"
34 #include "mlx5_defs.h"
37 static __rte_always_inline uint32_t
38 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
40 static __rte_always_inline int
41 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
42 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
44 static __rte_always_inline uint32_t
45 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
47 static __rte_always_inline void
48 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
49 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
51 static __rte_always_inline void
52 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx);
54 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
55 [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
58 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
59 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
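/*
 * The three lookup tables above are filled once at startup by the
 * mlx5_set_*_table() helpers below and then serve as cheap per-packet
 * translation tables on the data path.
 */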
62 * Build a table to translate Rx completion flags to packet type.
64 * @note: fix mlx5_dev_supported_ptypes_get() if any change is made here.
67 mlx5_set_ptype_table(void)
70 uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
72 /* Last entry must not be overwritten, reserved for errored packet. */
73 for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
74 (*p)[i] = RTE_PTYPE_UNKNOWN;
76 * The index to the array should have:
77 * bit[1:0] = l3_hdr_type
78 * bit[4:2] = l4_hdr_type
81 * bit[7] = outer_l3_type
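 *
 * For example, entry 0x42 below decodes as a tunneled packet (bit 6,
 * also tested in rxq_cq_to_pkt_type() below) with outer IPv4 (bit 7
 * clear) and inner IPv4 (l3_hdr_type 2), mapped to
 * RTE_PTYPE_INNER_L4_NONFRAG since no inner L4 header is reported.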
84 (*p)[0x00] = RTE_PTYPE_L2_ETHER;
86 (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
88 (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
91 (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
93 (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
96 (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
98 (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
100 (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
102 (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
104 (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
106 (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
109 (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
111 (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
113 /* Repeat with outer_l3_type being set. Just in case. */
114 (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
115 RTE_PTYPE_L4_NONFRAG;
116 (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
117 RTE_PTYPE_L4_NONFRAG;
118 (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
120 (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
122 (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
124 (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
126 (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
128 (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
130 (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
132 (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
134 (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
136 (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
139 (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
140 (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
141 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
142 RTE_PTYPE_INNER_L4_NONFRAG;
143 (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
144 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
145 RTE_PTYPE_INNER_L4_NONFRAG;
146 (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
147 (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
148 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
149 RTE_PTYPE_INNER_L4_NONFRAG;
150 (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
151 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
152 RTE_PTYPE_INNER_L4_NONFRAG;
153 /* Tunneled - Fragmented */
154 (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
155 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
156 RTE_PTYPE_INNER_L4_FRAG;
157 (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
158 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
159 RTE_PTYPE_INNER_L4_FRAG;
160 (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
161 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
162 RTE_PTYPE_INNER_L4_FRAG;
163 (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
164 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
165 RTE_PTYPE_INNER_L4_FRAG;
167 (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
168 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
169 RTE_PTYPE_INNER_L4_TCP;
170 (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
171 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
172 RTE_PTYPE_INNER_L4_TCP;
173 (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
174 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
175 RTE_PTYPE_INNER_L4_TCP;
176 (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
177 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
178 RTE_PTYPE_INNER_L4_TCP;
179 (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
180 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
181 RTE_PTYPE_INNER_L4_TCP;
182 (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
183 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
184 RTE_PTYPE_INNER_L4_TCP;
185 (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
186 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
187 RTE_PTYPE_INNER_L4_TCP;
188 (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
189 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
190 RTE_PTYPE_INNER_L4_TCP;
191 (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
192 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
193 RTE_PTYPE_INNER_L4_TCP;
194 (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
195 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
196 RTE_PTYPE_INNER_L4_TCP;
197 (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
198 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
199 RTE_PTYPE_INNER_L4_TCP;
200 (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
201 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
202 RTE_PTYPE_INNER_L4_TCP;
204 (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
205 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
206 RTE_PTYPE_INNER_L4_UDP;
207 (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
208 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
209 RTE_PTYPE_INNER_L4_UDP;
210 (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
211 RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
212 RTE_PTYPE_INNER_L4_UDP;
213 (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
214 RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
215 RTE_PTYPE_INNER_L4_UDP;
219 * Build a table to translate packet checksum offload flags to Verbs checksum types.
222 mlx5_set_cksum_table(void)
228 * The index should have:
229 * bit[0] = PKT_TX_TCP_SEG
230 * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
231 * bit[4] = PKT_TX_IP_CKSUM
232 * bit[8] = PKT_TX_OUTER_IP_CKSUM
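 *
 * For example, a non-tunneled packet requesting IP and TCP checksum
 * offloads maps to MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM in the
 * loop below.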
235 for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) {
238 /* Tunneled packet. */
239 if (i & (1 << 8)) /* Outer IP. */
240 v |= MLX5_ETH_WQE_L3_CSUM;
241 if (i & (1 << 4)) /* Inner IP. */
242 v |= MLX5_ETH_WQE_L3_INNER_CSUM;
243 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
244 v |= MLX5_ETH_WQE_L4_INNER_CSUM;
247 if (i & (1 << 4)) /* IP. */
248 v |= MLX5_ETH_WQE_L3_CSUM;
249 if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
250 v |= MLX5_ETH_WQE_L4_CSUM;
252 mlx5_cksum_table[i] = v;
257 * Build a table to translate mbuf offload flags to Verbs SWP (software parser) types.
260 mlx5_set_swp_types_table(void)
266 * The index should have:
267 * bit[0:1] = PKT_TX_L4_MASK
268 * bit[4] = PKT_TX_IPV6
269 * bit[8] = PKT_TX_OUTER_IPV6
270 * bit[9] = PKT_TX_OUTER_UDP
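 *
 * For example, an index with only PKT_TX_OUTER_IPV6 (bit[8]) set
 * translates to MLX5_ETH_WQE_L3_OUTER_IPV6 below.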
272 for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
275 v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
277 v |= MLX5_ETH_WQE_L4_OUTER_UDP;
279 v |= MLX5_ETH_WQE_L3_INNER_IPV6;
280 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
281 v |= MLX5_ETH_WQE_L4_INNER_UDP;
282 mlx5_swp_types_table[i] = v;
287 * Return the size of the tailroom of the WQ.
290 * Pointer to TX queue structure.
292 * Pointer to the tail of the WQ.
298 tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
301 tailroom = (uintptr_t)(txq->wqes) +
302 (1 << txq->wqe_n) * MLX5_WQE_SIZE -
308 * Copy data to the tailroom of a circular queue.
311 * Pointer to destination.
315 * Number of bytes to copy.
317 * Pointer to head of queue.
319 * Size of tailroom from dst.
322 * Pointer after copied data.
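 *
 * For example, with tailroom == 16 and n == 24, the first 16 bytes are
 * copied to dst, the remaining 8 wrap around to base, and the returned
 * pointer is base + 8.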
325 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
326 void *base, size_t tailroom)
331 rte_memcpy(dst, src, tailroom);
332 rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
334 ret = (uint8_t *)base + n - tailroom;
336 rte_memcpy(dst, src, n);
337 ret = (n == tailroom) ? base : (uint8_t *)dst + n;
343 * Inline TSO headers into WQE.
346 * 0 on success, negative errno value on failure.
349 inline_tso(struct mlx5_txq_data *txq, struct rte_mbuf *buf,
352 uint16_t *pkt_inline_sz,
356 uint16_t *tso_header_sz)
358 uintptr_t end = (uintptr_t)(((uintptr_t)txq->wqes) +
359 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
361 uint8_t vlan_sz = (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
362 const uint8_t tunneled = txq->tunnel_en && (buf->ol_flags &
366 *tso_segsz = buf->tso_segsz;
367 *tso_header_sz = buf->l2_len + vlan_sz + buf->l3_len + buf->l4_len;
368 if (unlikely(*tso_segsz == 0 || *tso_header_sz == 0)) {
369 txq->stats.oerrors++;
373 *tso_header_sz += buf->outer_l2_len + buf->outer_l3_len;
374 /* First seg must contain all TSO headers. */
375 if (unlikely(*tso_header_sz > MLX5_MAX_TSO_HEADER) ||
376 *tso_header_sz > DATA_LEN(buf)) {
377 txq->stats.oerrors++;
380 copy_b = *tso_header_sz - *pkt_inline_sz;
381 if (!copy_b || ((end - (uintptr_t)*raw) < copy_b))
383 n_wqe = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
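/* Same rounding as in mlx5_tx_burst(): one Dseg remains in the current
 * WQE, so it is subtracted after the bytes-to-Dseg conversion and the
 * rest is rounded up to whole WQEBBs of 4 Dsegs each.
 */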
384 if (unlikely(*max_wqe < n_wqe))
387 rte_memcpy((void *)*raw, (void *)*addr, copy_b);
390 copy_b = MLX5_WQE_DS(copy_b) * MLX5_WQE_DWORD_SIZE;
391 *pkt_inline_sz += copy_b;
397 * DPDK callback to check the status of a tx descriptor.
402 * The index of the descriptor in the ring.
405 * The status of the tx descriptor.
408 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
410 struct mlx5_txq_data *txq = tx_queue;
413 mlx5_tx_complete(txq);
414 used = txq->elts_head - txq->elts_tail;
416 return RTE_ETH_TX_DESC_FULL;
417 return RTE_ETH_TX_DESC_DONE;
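/*
 * Illustrative (hypothetical) application usage, through the generic
 * ethdev wrapper that dispatches to this callback:
 *
 *	if (rte_eth_tx_descriptor_status(port_id, queue_id, offset) ==
 *	    RTE_ETH_TX_DESC_DONE)
 *		...;	the descriptor at "offset" has been transmitted
 */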
421 * Internal function to compute the number of used descriptors in an RX queue
427 * The number of used Rx descriptors.
430 rx_queue_count(struct mlx5_rxq_data *rxq)
432 struct rxq_zip *zip = &rxq->zip;
433 volatile struct mlx5_cqe *cqe;
434 const unsigned int cqe_n = (1 << rxq->cqe_n);
435 const unsigned int cqe_cnt = cqe_n - 1;
439 /* If we are processing a compressed CQE. */
441 used = zip->cqe_cnt - zip->ca;
447 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
448 while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
452 op_own = cqe->op_own;
453 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
454 n = rte_be_to_cpu_32(cqe->byte_cnt);
459 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
461 used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
466 * DPDK callback to check the status of a rx descriptor.
471 * The index of the descriptor in the ring.
474 * The status of the Rx descriptor.
477 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
479 struct mlx5_rxq_data *rxq = rx_queue;
480 struct mlx5_rxq_ctrl *rxq_ctrl =
481 container_of(rxq, struct mlx5_rxq_ctrl, rxq);
482 struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
484 if (dev->rx_pkt_burst != mlx5_rx_burst) {
488 if (offset >= (1 << rxq->elts_n)) {
492 if (offset < rx_queue_count(rxq))
493 return RTE_ETH_RX_DESC_DONE;
494 return RTE_ETH_RX_DESC_AVAIL;
498 * DPDK callback to get the number of used descriptors in an RX queue
501 * Pointer to the device structure.
507 * The number of used Rx descriptors.
508 * -EINVAL if the queue is invalid.
511 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
513 struct mlx5_priv *priv = dev->data->dev_private;
514 struct mlx5_rxq_data *rxq;
516 if (dev->rx_pkt_burst != mlx5_rx_burst) {
520 rxq = (*priv->rxqs)[rx_queue_id];
525 return rx_queue_count(rxq);
528 #define MLX5_SYSTEM_LOG_DIR "/var/log"
530 * Dump debug information to log file.
535 * If not NULL this string is printed as a header to the output
536 * and the output will be in hexadecimal view.
538 * This is the buffer address to print out.
540 * The number of bytes to dump out.
543 mlx5_dump_debug_information(const char *fname, const char *hex_title,
544 const void *buf, unsigned int hex_len)
548 MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
549 fd = fopen(path, "a+");
551 DRV_LOG(WARNING, "cannot open %s for debug dump\n",
553 MKSTR(path2, "./%s", fname);
554 fd = fopen(path2, "a+");
556 DRV_LOG(ERR, "cannot open %s for debug dump\n",
560 DRV_LOG(INFO, "New debug dump in file %s\n", path2);
562 DRV_LOG(INFO, "New debug dump in file %s\n", path);
565 rte_hexdump(fd, hex_title, buf, hex_len);
567 fprintf(fd, "%s", (const char *)buf);
568 fprintf(fd, "\n\n\n");
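/*
 * Typical usage is shown in mlx5_rx_err_handle() below, which appends a
 * textual description of the error CQE followed by hexdumps of the whole
 * CQ and RQ rings to the same dump file.
 */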
573 * DPDK callback for TX.
576 * Generic pointer to TX queue structure.
578 * Packets to transmit.
580 * Number of packets in array.
583 * Number of packets successfully transmitted (<= pkts_n).
586 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
588 struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
589 uint16_t elts_head = txq->elts_head;
590 const uint16_t elts_n = 1 << txq->elts_n;
591 const uint16_t elts_m = elts_n - 1;
598 volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
599 unsigned int segs_n = 0;
600 const unsigned int max_inline = txq->max_inline;
603 if (unlikely(!pkts_n))
605 /* Prefetch first packet cacheline. */
606 rte_prefetch0(*pkts);
607 /* Start processing. */
608 mlx5_tx_complete(txq);
609 max_elts = (elts_n - (elts_head - txq->elts_tail));
610 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
611 if (unlikely(!max_wqe))
614 struct rte_mbuf *buf = *pkts; /* First_seg. */
616 volatile struct mlx5_wqe_v *wqe = NULL;
617 volatile rte_v128u32_t *dseg = NULL;
620 unsigned int sg = 0; /* counter of additional segs attached. */
622 uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
623 uint16_t tso_header_sz = 0;
626 uint8_t tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
627 uint32_t swp_offsets = 0;
628 uint8_t swp_types = 0;
630 uint16_t tso_segsz = 0;
631 #ifdef MLX5_PMD_SOFT_COUNTERS
632 uint32_t total_length = 0;
636 segs_n = buf->nb_segs;
638 * Make sure there is enough room to store this packet and
639 * that one ring entry remains unused.
642 if (max_elts < segs_n)
646 if (unlikely(--max_wqe == 0))
648 wqe = (volatile struct mlx5_wqe_v *)
649 tx_mlx5_wqe(txq, txq->wqe_ci);
650 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
652 rte_prefetch0(*(pkts + 1));
653 addr = rte_pktmbuf_mtod(buf, uintptr_t);
654 length = DATA_LEN(buf);
655 ehdr = (((uint8_t *)addr)[1] << 8) |
656 ((uint8_t *)addr)[0];
657 #ifdef MLX5_PMD_SOFT_COUNTERS
658 total_length = length;
660 if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
661 txq->stats.oerrors++;
664 /* Update element. */
665 (*txq->elts)[elts_head & elts_m] = buf;
666 /* Prefetch next buffer data. */
669 rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
670 cs_flags = txq_ol_cksum_to_cs(buf);
671 txq_mbuf_to_swp(txq, buf, (uint8_t *)&swp_offsets, &swp_types);
672 raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
673 /* Copy metadata from mbuf if valid */
674 metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
676 /* Replace the Ethernet type by the VLAN if necessary. */
677 if (buf->ol_flags & PKT_TX_VLAN_PKT) {
678 uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
680 unsigned int len = 2 * RTE_ETHER_ADDR_LEN - 2;
684 /* Copy Destination and source mac address. */
685 memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
687 memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
688 /* Copy missing two bytes to end the DSeg. */
689 memcpy((uint8_t *)raw + len + sizeof(vlan),
690 ((uint8_t *)addr) + len, 2);
694 memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
695 MLX5_WQE_DWORD_SIZE);
696 length -= pkt_inline_sz;
697 addr += pkt_inline_sz;
699 raw += MLX5_WQE_DWORD_SIZE;
701 ret = inline_tso(txq, buf, &length,
702 &addr, &pkt_inline_sz,
704 &tso_segsz, &tso_header_sz);
705 if (ret == -EINVAL) {
707 } else if (ret == -EAGAIN) {
709 wqe->ctrl = (rte_v128u32_t){
710 rte_cpu_to_be_32(txq->wqe_ci << 8),
711 rte_cpu_to_be_32(txq->qp_num_8s | 1),
716 #ifdef MLX5_PMD_SOFT_COUNTERS
723 /* Inline if enough room. */
724 if (max_inline || tso) {
726 uintptr_t end = (uintptr_t)
727 (((uintptr_t)txq->wqes) +
728 (1 << txq->wqe_n) * MLX5_WQE_SIZE);
729 unsigned int inline_room = max_inline *
730 RTE_CACHE_LINE_SIZE -
731 (pkt_inline_sz - 2) -
737 addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
738 RTE_CACHE_LINE_SIZE);
739 copy_b = (addr_end > addr) ?
740 RTE_MIN((addr_end - addr), length) : 0;
741 if (copy_b && ((end - (uintptr_t)raw) >
742 (copy_b + sizeof(inl)))) {
744 * One Dseg remains in the current WQE. To
745 * keep the computation positive, it is
746 * removed after the bytes to Dseg conversion.
748 uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
750 if (unlikely(max_wqe < n))
755 inl = rte_cpu_to_be_32(copy_b |
757 rte_memcpy((void *)raw,
758 (void *)&inl, sizeof(inl));
760 pkt_inline_sz += sizeof(inl);
762 rte_memcpy((void *)raw, (void *)addr, copy_b);
765 pkt_inline_sz += copy_b;
768 * 2 DWORDs consumed by the WQE header + ETH segment +
769 * the size of the inline part of the packet.
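 * E.g. with pkt_inline_sz == 18 and assuming MLX5_WQE_DS() rounds a
 * byte count up to 16-byte Dseg units, ds == 2 + 1 == 3.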
771 ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
773 if (ds % (MLX5_WQE_SIZE /
774 MLX5_WQE_DWORD_SIZE) == 0) {
775 if (unlikely(--max_wqe == 0))
777 dseg = (volatile rte_v128u32_t *)
778 tx_mlx5_wqe(txq, txq->wqe_ci +
781 dseg = (volatile rte_v128u32_t *)
783 (ds * MLX5_WQE_DWORD_SIZE));
786 } else if (!segs_n) {
790 * Further inline the next segment only for
795 inline_room -= copy_b;
799 /* Move to the next segment. */
803 addr = rte_pktmbuf_mtod(buf, uintptr_t);
804 length = DATA_LEN(buf);
805 #ifdef MLX5_PMD_SOFT_COUNTERS
806 total_length += length;
808 (*txq->elts)[++elts_head & elts_m] = buf;
813 * No inline has been done in the packet, only the
814 * Ethernet header has been stored.
816 dseg = (volatile rte_v128u32_t *)
817 ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
820 /* Add the remaining packet as a simple ds. */
821 addr_64 = rte_cpu_to_be_64(addr);
822 *dseg = (rte_v128u32_t){
823 rte_cpu_to_be_32(length),
824 mlx5_tx_mb2mr(txq, buf),
837 * Spill on next WQE when the current one does not have
838 * enough room left. Size of WQE must be a multiple
839 * of data segment size.
841 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
842 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
843 if (unlikely(--max_wqe == 0))
845 dseg = (volatile rte_v128u32_t *)
846 tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
847 rte_prefetch0(tx_mlx5_wqe(txq,
848 txq->wqe_ci + ds / 4 + 1));
855 length = DATA_LEN(buf);
856 #ifdef MLX5_PMD_SOFT_COUNTERS
857 total_length += length;
859 /* Store segment information. */
860 addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
861 *dseg = (rte_v128u32_t){
862 rte_cpu_to_be_32(length),
863 mlx5_tx_mb2mr(txq, buf),
867 (*txq->elts)[++elts_head & elts_m] = buf;
871 if (ds > MLX5_DSEG_MAX) {
872 txq->stats.oerrors++;
879 /* Initialize known and common part of the WQE structure. */
881 wqe->ctrl = (rte_v128u32_t){
882 rte_cpu_to_be_32((txq->wqe_ci << 8) |
884 rte_cpu_to_be_32(txq->qp_num_8s | ds),
888 wqe->eseg = (rte_v128u32_t){
890 cs_flags | (swp_types << 8) |
891 (rte_cpu_to_be_16(tso_segsz) << 16),
893 (ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
896 wqe->ctrl = (rte_v128u32_t){
897 rte_cpu_to_be_32((txq->wqe_ci << 8) |
899 rte_cpu_to_be_32(txq->qp_num_8s | ds),
903 wqe->eseg = (rte_v128u32_t){
905 cs_flags | (swp_types << 8),
907 (ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
911 txq->wqe_ci += (ds + 3) / 4;
912 /* Save the last successful WQE for completion request */
913 last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
914 #ifdef MLX5_PMD_SOFT_COUNTERS
915 /* Increment sent bytes counter. */
916 txq->stats.obytes += total_length;
918 } while (i < pkts_n);
919 /* Take a shortcut if nothing must be sent. */
920 if (unlikely((i + k) == 0))
922 txq->elts_head += (i + j);
923 /* Check whether completion threshold has been reached. */
924 comp = txq->elts_comp + i + j + k;
925 if (comp >= MLX5_TX_COMP_THRESH) {
926 /* A CQE slot must always be available. */
927 assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
928 /* Request completion on last WQE. */
929 last_wqe->ctrl2 = rte_cpu_to_be_32(8);
930 /* Save elts_head in unused "immediate" field of WQE. */
931 last_wqe->ctrl3 = txq->elts_head;
934 txq->elts_comp = comp;
936 #ifdef MLX5_PMD_SOFT_COUNTERS
937 /* Increment sent packets counter. */
938 txq->stats.opackets += i;
940 /* Ring QP doorbell. */
941 mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
946 * Open an MPW session.
949 * Pointer to TX queue structure.
951 * Pointer to MPW session structure.
956 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
958 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
959 volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
960 (volatile struct mlx5_wqe_data_seg (*)[])
961 tx_mlx5_wqe(txq, idx + 1);
963 mpw->state = MLX5_MPW_STATE_OPENED;
967 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
968 mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
969 mpw->wqe->eseg.inline_hdr_sz = 0;
970 mpw->wqe->eseg.rsvd0 = 0;
971 mpw->wqe->eseg.rsvd1 = 0;
972 mpw->wqe->eseg.flow_table_metadata = 0;
973 mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
976 mpw->wqe->ctrl[2] = 0;
977 mpw->wqe->ctrl[3] = 0;
978 mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
979 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
980 mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
981 (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
982 mpw->data.dseg[2] = &(*dseg)[0];
983 mpw->data.dseg[3] = &(*dseg)[1];
984 mpw->data.dseg[4] = &(*dseg)[2];
988 * Close an MPW session.
991 * Pointer to TX queue structure.
993 * Pointer to MPW session structure.
996 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
998 unsigned int num = mpw->pkts_n;
1001 * Store size in multiple of 16 bytes. Control and Ethernet segments count as 2.
1004 mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
1005 mpw->state = MLX5_MPW_STATE_CLOSED;
1010 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1011 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1015 * DPDK callback for TX with MPW support.
1018 * Generic pointer to TX queue structure.
1020 * Packets to transmit.
1022 * Number of packets in array.
1025 * Number of packets successfully transmitted (<= pkts_n).
1028 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1030 struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1031 uint16_t elts_head = txq->elts_head;
1032 const uint16_t elts_n = 1 << txq->elts_n;
1033 const uint16_t elts_m = elts_n - 1;
1039 struct mlx5_mpw mpw = {
1040 .state = MLX5_MPW_STATE_CLOSED,
1043 if (unlikely(!pkts_n))
1045 /* Prefetch first packet cacheline. */
1046 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1047 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1048 /* Start processing. */
1049 mlx5_tx_complete(txq);
1050 max_elts = (elts_n - (elts_head - txq->elts_tail));
1051 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1052 if (unlikely(!max_wqe))
1055 struct rte_mbuf *buf = *(pkts++);
1057 unsigned int segs_n = buf->nb_segs;
1059 rte_be32_t metadata;
1062 * Make sure there is enough room to store this packet and
1063 * that one ring entry remains unused.
1066 if (max_elts < segs_n)
1068 /* Do not bother with large packets MPW cannot handle. */
1069 if (segs_n > MLX5_MPW_DSEG_MAX) {
1070 txq->stats.oerrors++;
1075 cs_flags = txq_ol_cksum_to_cs(buf);
1076 /* Copy metadata from mbuf if valid */
1077 metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
1079 /* Retrieve packet information. */
1080 length = PKT_LEN(buf);
1082 /* Start new session if packet differs. */
1083 if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
1084 ((mpw.len != length) ||
1086 (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1087 (mpw.wqe->eseg.cs_flags != cs_flags)))
1088 mlx5_mpw_close(txq, &mpw);
1089 if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1091 * Multi-Packet WQE consumes at most two WQEs.
1092 * mlx5_mpw_new() expects to be able to use such
1095 if (unlikely(max_wqe < 2))
1098 mlx5_mpw_new(txq, &mpw, length);
1099 mpw.wqe->eseg.cs_flags = cs_flags;
1100 mpw.wqe->eseg.flow_table_metadata = metadata;
1102 /* Multi-segment packets must be alone in their MPW. */
1103 assert((segs_n == 1) || (mpw.pkts_n == 0));
1104 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1108 volatile struct mlx5_wqe_data_seg *dseg;
1112 (*txq->elts)[elts_head++ & elts_m] = buf;
1113 dseg = mpw.data.dseg[mpw.pkts_n];
1114 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1115 *dseg = (struct mlx5_wqe_data_seg){
1116 .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
1117 .lkey = mlx5_tx_mb2mr(txq, buf),
1118 .addr = rte_cpu_to_be_64(addr),
1120 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1121 length += DATA_LEN(buf);
1127 assert(length == mpw.len);
1128 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1129 mlx5_mpw_close(txq, &mpw);
1130 #ifdef MLX5_PMD_SOFT_COUNTERS
1131 /* Increment sent bytes counter. */
1132 txq->stats.obytes += length;
1136 /* Take a shortcut if nothing must be sent. */
1137 if (unlikely(i == 0))
1139 /* Check whether completion threshold has been reached. */
1140 /* "j" includes both packets and segments. */
1141 comp = txq->elts_comp + j;
1142 if (comp >= MLX5_TX_COMP_THRESH) {
1143 volatile struct mlx5_wqe *wqe = mpw.wqe;
1145 /* A CQE slot must always be available. */
1146 assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
1147 /* Request completion on last WQE. */
1148 wqe->ctrl[2] = rte_cpu_to_be_32(8);
1149 /* Save elts_head in unused "immediate" field of WQE. */
1150 wqe->ctrl[3] = elts_head;
1153 txq->elts_comp = comp;
1155 #ifdef MLX5_PMD_SOFT_COUNTERS
1156 /* Increment sent packets counter. */
1157 txq->stats.opackets += i;
1159 /* Ring QP doorbell. */
1160 if (mpw.state == MLX5_MPW_STATE_OPENED)
1161 mlx5_mpw_close(txq, &mpw);
1162 mlx5_tx_dbrec(txq, mpw.wqe);
1163 txq->elts_head = elts_head;
1168 * Open an MPW inline session.
1171 * Pointer to TX queue structure.
1173 * Pointer to MPW session structure.
1178 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
1181 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1182 struct mlx5_wqe_inl_small *inl;
1184 mpw->state = MLX5_MPW_INL_STATE_OPENED;
1188 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1189 mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
1190 (txq->wqe_ci << 8) |
1192 mpw->wqe->ctrl[2] = 0;
1193 mpw->wqe->ctrl[3] = 0;
1194 mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
1195 mpw->wqe->eseg.inline_hdr_sz = 0;
1196 mpw->wqe->eseg.cs_flags = 0;
1197 mpw->wqe->eseg.rsvd0 = 0;
1198 mpw->wqe->eseg.rsvd1 = 0;
1199 mpw->wqe->eseg.flow_table_metadata = 0;
1200 inl = (struct mlx5_wqe_inl_small *)
1201 (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
1202 mpw->data.raw = (uint8_t *)&inl->raw;
1206 * Close an MPW inline session.
1209 * Pointer to TX queue structure.
1211 * Pointer to MPW session structure.
1214 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1217 struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
1218 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
1220 size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
1222 * Store size in multiple of 16 bytes. Control and Ethernet segments
1225 mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1227 mpw->state = MLX5_MPW_STATE_CLOSED;
1228 inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
1229 txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1233 * DPDK callback for TX with MPW inline support.
1236 * Generic pointer to TX queue structure.
1238 * Packets to transmit.
1240 * Number of packets in array.
1243 * Number of packets successfully transmitted (<= pkts_n).
1246 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
1249 struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1250 uint16_t elts_head = txq->elts_head;
1251 const uint16_t elts_n = 1 << txq->elts_n;
1252 const uint16_t elts_m = elts_n - 1;
1258 unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1259 struct mlx5_mpw mpw = {
1260 .state = MLX5_MPW_STATE_CLOSED,
1263 * Compute the maximum number of WQE which can be consumed by inline
1266 * - 1 control segment,
1267 * - 1 Ethernet segment,
1268 * - N Dseg from the inline request.
1270 const unsigned int wqe_inl_n =
1271 ((2 * MLX5_WQE_DWORD_SIZE +
1272 txq->max_inline * RTE_CACHE_LINE_SIZE) +
1273 RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
1275 if (unlikely(!pkts_n))
1277 /* Prefetch first packet cacheline. */
1278 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1279 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1280 /* Start processing. */
1281 mlx5_tx_complete(txq);
1282 max_elts = (elts_n - (elts_head - txq->elts_tail));
1284 struct rte_mbuf *buf = *(pkts++);
1287 unsigned int segs_n = buf->nb_segs;
1289 rte_be32_t metadata;
1292 * Make sure there is enough room to store this packet and
1293 * that one ring entry remains unused.
1296 if (max_elts < segs_n)
1298 /* Do not bother with large packets MPW cannot handle. */
1299 if (segs_n > MLX5_MPW_DSEG_MAX) {
1300 txq->stats.oerrors++;
1306 * Compute max_wqe in case less WQE were consumed in previous
1309 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1310 cs_flags = txq_ol_cksum_to_cs(buf);
1311 /* Copy metadata from mbuf if valid */
1312 metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
1314 /* Retrieve packet information. */
1315 length = PKT_LEN(buf);
1316 /* Start new session if packet differs. */
1317 if (mpw.state == MLX5_MPW_STATE_OPENED) {
1318 if ((mpw.len != length) ||
1320 (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1321 (mpw.wqe->eseg.cs_flags != cs_flags))
1322 mlx5_mpw_close(txq, &mpw);
1323 } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1324 if ((mpw.len != length) ||
1326 (length > inline_room) ||
1327 (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1328 (mpw.wqe->eseg.cs_flags != cs_flags)) {
1329 mlx5_mpw_inline_close(txq, &mpw);
1331 txq->max_inline * RTE_CACHE_LINE_SIZE;
1334 if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1335 if ((segs_n != 1) ||
1336 (length > inline_room)) {
1338 * Multi-Packet WQE consumes at most two WQEs.
1339 * mlx5_mpw_new() expects to be able to use
1342 if (unlikely(max_wqe < 2))
1345 mlx5_mpw_new(txq, &mpw, length);
1346 mpw.wqe->eseg.cs_flags = cs_flags;
1347 mpw.wqe->eseg.flow_table_metadata = metadata;
1349 if (unlikely(max_wqe < wqe_inl_n))
1351 max_wqe -= wqe_inl_n;
1352 mlx5_mpw_inline_new(txq, &mpw, length);
1353 mpw.wqe->eseg.cs_flags = cs_flags;
1354 mpw.wqe->eseg.flow_table_metadata = metadata;
1357 /* Multi-segment packets must be alone in their MPW. */
1358 assert((segs_n == 1) || (mpw.pkts_n == 0));
1359 if (mpw.state == MLX5_MPW_STATE_OPENED) {
1360 assert(inline_room ==
1361 txq->max_inline * RTE_CACHE_LINE_SIZE);
1362 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1366 volatile struct mlx5_wqe_data_seg *dseg;
1369 (*txq->elts)[elts_head++ & elts_m] = buf;
1370 dseg = mpw.data.dseg[mpw.pkts_n];
1371 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1372 *dseg = (struct mlx5_wqe_data_seg){
1374 rte_cpu_to_be_32(DATA_LEN(buf)),
1375 .lkey = mlx5_tx_mb2mr(txq, buf),
1376 .addr = rte_cpu_to_be_64(addr),
1378 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1379 length += DATA_LEN(buf);
1385 assert(length == mpw.len);
1386 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1387 mlx5_mpw_close(txq, &mpw);
1391 assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1392 assert(length <= inline_room);
1393 assert(length == DATA_LEN(buf));
1394 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1395 (*txq->elts)[elts_head++ & elts_m] = buf;
1396 /* Maximum number of bytes before wrapping. */
1397 max = ((((uintptr_t)(txq->wqes)) +
1400 (uintptr_t)mpw.data.raw);
1402 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1405 mpw.data.raw = (volatile void *)txq->wqes;
1406 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1407 (void *)(addr + max),
1409 mpw.data.raw += length - max;
1411 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1417 (volatile void *)txq->wqes;
1419 mpw.data.raw += length;
1422 mpw.total_len += length;
1424 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1425 mlx5_mpw_inline_close(txq, &mpw);
1427 txq->max_inline * RTE_CACHE_LINE_SIZE;
1429 inline_room -= length;
1432 #ifdef MLX5_PMD_SOFT_COUNTERS
1433 /* Increment sent bytes counter. */
1434 txq->stats.obytes += length;
1438 /* Take a shortcut if nothing must be sent. */
1439 if (unlikely(i == 0))
1441 /* Check whether completion threshold has been reached. */
1442 /* "j" includes both packets and segments. */
1443 comp = txq->elts_comp + j;
1444 if (comp >= MLX5_TX_COMP_THRESH) {
1445 volatile struct mlx5_wqe *wqe = mpw.wqe;
1447 /* A CQE slot must always be available. */
1448 assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
1449 /* Request completion on last WQE. */
1450 wqe->ctrl[2] = rte_cpu_to_be_32(8);
1451 /* Save elts_head in unused "immediate" field of WQE. */
1452 wqe->ctrl[3] = elts_head;
1455 txq->elts_comp = comp;
1457 #ifdef MLX5_PMD_SOFT_COUNTERS
1458 /* Increment sent packets counter. */
1459 txq->stats.opackets += i;
1461 /* Ring QP doorbell. */
1462 if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1463 mlx5_mpw_inline_close(txq, &mpw);
1464 else if (mpw.state == MLX5_MPW_STATE_OPENED)
1465 mlx5_mpw_close(txq, &mpw);
1466 mlx5_tx_dbrec(txq, mpw.wqe);
1467 txq->elts_head = elts_head;
1472 * Open an Enhanced MPW session.
1475 * Pointer to TX queue structure.
1477 * Pointer to MPW session structure.
1482 mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
1484 uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1486 mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1488 mpw->total_len = sizeof(struct mlx5_wqe);
1489 mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1491 rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1492 (txq->wqe_ci << 8) |
1493 MLX5_OPCODE_ENHANCED_MPSW);
1494 mpw->wqe->ctrl[2] = 0;
1495 mpw->wqe->ctrl[3] = 0;
1496 memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1497 if (unlikely(padding)) {
1498 uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1500 /* Pad the first 2 DWORDs with zero-length inline header. */
1501 *(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
1502 *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1503 rte_cpu_to_be_32(MLX5_INLINE_SEG);
1504 mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1505 /* Start from the next WQEBB. */
1506 mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1508 mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1513 * Close an Enhanced MPW session.
1516 * Pointer to TX queue structure.
1518 * Pointer to MPW session structure.
1521 * Number of consumed WQEs.
1523 static inline uint16_t
1524 mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1528 /* Store size in multiple of 16 bytes. Control and Ethernet segments
1531 mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1532 MLX5_WQE_DS(mpw->total_len));
1533 mpw->state = MLX5_MPW_STATE_CLOSED;
1534 ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1540 * TX with Enhanced MPW support.
1543 * Pointer to TX queue structure.
1545 * Packets to transmit.
1547 * Number of packets in array.
1550 * Number of packets successfully transmitted (<= pkts_n).
1552 static inline uint16_t
1553 txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
1556 uint16_t elts_head = txq->elts_head;
1557 const uint16_t elts_n = 1 << txq->elts_n;
1558 const uint16_t elts_m = elts_n - 1;
1563 unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1564 unsigned int mpw_room = 0;
1565 unsigned int inl_pad = 0;
1568 struct mlx5_mpw mpw = {
1569 .state = MLX5_MPW_STATE_CLOSED,
1572 if (unlikely(!pkts_n))
1574 /* Start processing. */
1575 mlx5_tx_complete(txq);
1576 max_elts = (elts_n - (elts_head - txq->elts_tail));
1577 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1578 if (unlikely(!max_wqe))
1581 struct rte_mbuf *buf = *(pkts++);
1583 unsigned int do_inline = 0; /* Whether inline is possible. */
1586 rte_be32_t metadata;
1588 /* Multi-segmented packet is handled in slow-path outside. */
1589 assert(NB_SEGS(buf) == 1);
1590 /* Make sure there is enough room to store this packet. */
1591 if (max_elts - j == 0)
1593 cs_flags = txq_ol_cksum_to_cs(buf);
1594 /* Copy metadata from mbuf if valid */
1595 metadata = buf->ol_flags & PKT_TX_METADATA ? buf->tx_metadata :
1597 /* Retrieve packet information. */
1598 length = PKT_LEN(buf);
1599 /* Start new session if:
1600 * - multi-segment packet
1601 * - no space left even for a dseg
1602 * - next packet can be inlined with a new WQE
1605 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1606 if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1608 (length <= txq->inline_max_packet_sz &&
1609 inl_pad + sizeof(inl_hdr) + length >
1611 (mpw.wqe->eseg.flow_table_metadata != metadata) ||
1612 (mpw.wqe->eseg.cs_flags != cs_flags))
1613 max_wqe -= mlx5_empw_close(txq, &mpw);
1615 if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1616 /* In Enhanced MPW, inline as much as the budget allows.
1617 * The remaining space is to be filled with
1618 * dsegs. If the title WQEBB isn't padded, it will have
1621 mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1622 (max_inline ? max_inline :
1623 pkts_n * MLX5_WQE_DWORD_SIZE) +
1625 if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
1627 /* Don't pad the title WQEBB, so as not to waste WQ space. */
1628 mlx5_empw_new(txq, &mpw, 0);
1629 mpw_room -= mpw.total_len;
1631 do_inline = length <= txq->inline_max_packet_sz &&
1632 sizeof(inl_hdr) + length <= mpw_room &&
1634 mpw.wqe->eseg.cs_flags = cs_flags;
1635 mpw.wqe->eseg.flow_table_metadata = metadata;
1637 /* Evaluate whether the next packet can be inlined.
1638 * Inlining is possible when:
1639 * - length is less than configured value
1640 * - length fits in the remaining space
1641 * - not required to fill the title WQEBB with dsegs
1644 length <= txq->inline_max_packet_sz &&
1645 inl_pad + sizeof(inl_hdr) + length <=
1647 (!txq->mpw_hdr_dseg ||
1648 mpw.total_len >= MLX5_WQE_SIZE);
1650 if (max_inline && do_inline) {
1651 /* Inline packet into WQE. */
1654 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1655 assert(length == DATA_LEN(buf));
1656 inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
1657 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1658 mpw.data.raw = (volatile void *)
1659 ((uintptr_t)mpw.data.raw + inl_pad);
1660 max = tx_mlx5_wq_tailroom(txq,
1661 (void *)(uintptr_t)mpw.data.raw);
1662 /* Copy inline header. */
1663 mpw.data.raw = (volatile void *)
1665 (void *)(uintptr_t)mpw.data.raw,
1668 (void *)(uintptr_t)txq->wqes,
1670 max = tx_mlx5_wq_tailroom(txq,
1671 (void *)(uintptr_t)mpw.data.raw);
1672 /* Copy packet data. */
1673 mpw.data.raw = (volatile void *)
1675 (void *)(uintptr_t)mpw.data.raw,
1678 (void *)(uintptr_t)txq->wqes,
1681 mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1682 /* No need to get completion as the entire packet is
1683 * copied to WQ. Free the buf right away.
1685 rte_pktmbuf_free_seg(buf);
1686 mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1687 /* Add pad in the next packet if any. */
1688 inl_pad = (((uintptr_t)mpw.data.raw +
1689 (MLX5_WQE_DWORD_SIZE - 1)) &
1690 ~(MLX5_WQE_DWORD_SIZE - 1)) -
1691 (uintptr_t)mpw.data.raw;
1693 /* No inline. Load a dseg of packet pointer. */
1694 volatile rte_v128u32_t *dseg;
1696 assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1697 assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1698 assert(length == DATA_LEN(buf));
1699 if (!tx_mlx5_wq_tailroom(txq,
1700 (void *)((uintptr_t)mpw.data.raw
1702 dseg = (volatile void *)txq->wqes;
1704 dseg = (volatile void *)
1705 ((uintptr_t)mpw.data.raw +
1707 (*txq->elts)[elts_head++ & elts_m] = buf;
1708 addr_64 = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
1710 *dseg = (rte_v128u32_t) {
1711 rte_cpu_to_be_32(length),
1712 mlx5_tx_mb2mr(txq, buf),
1716 mpw.data.raw = (volatile void *)(dseg + 1);
1717 mpw.total_len += (inl_pad + sizeof(*dseg));
1720 mpw_room -= (inl_pad + sizeof(*dseg));
1723 #ifdef MLX5_PMD_SOFT_COUNTERS
1724 /* Increment sent bytes counter. */
1725 txq->stats.obytes += length;
1728 } while (i < pkts_n);
1729 /* Take a shortcut if nothing must be sent. */
1730 if (unlikely(i == 0))
1732 /* Check whether completion threshold has been reached. */
1733 if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1734 (uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1735 (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1736 volatile struct mlx5_wqe *wqe = mpw.wqe;
1738 /* A CQE slot must always be available. */
1739 assert((1u << txq->cqe_n) - (txq->cq_pi++ - txq->cq_ci));
1740 /* Request completion on last WQE. */
1741 wqe->ctrl[2] = rte_cpu_to_be_32(8);
1742 /* Save elts_head in unused "immediate" field of WQE. */
1743 wqe->ctrl[3] = elts_head;
1745 txq->mpw_comp = txq->wqe_ci;
1747 txq->elts_comp += j;
1749 #ifdef MLX5_PMD_SOFT_COUNTERS
1750 /* Increment sent packets counter. */
1751 txq->stats.opackets += i;
1753 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1754 mlx5_empw_close(txq, &mpw);
1755 /* Ring QP doorbell. */
1756 mlx5_tx_dbrec(txq, mpw.wqe);
1757 txq->elts_head = elts_head;
1762 * DPDK callback for TX with Enhanced MPW support.
1765 * Generic pointer to TX queue structure.
1767 * Packets to transmit.
1769 * Number of packets in array.
1772 * Number of packets successfully transmitted (<= pkts_n).
1775 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1777 struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1780 while (pkts_n > nb_tx) {
1784 n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
1786 ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
1791 n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
1793 ret = txq_burst_empw(txq, &pkts[nb_tx], n);
1803 * Translate RX completion flags to packet type.
1806 * Pointer to RX queue structure.
1810 * @note: fix mlx5_dev_supported_ptypes_get() if any change is made here.
1813 * Packet type for struct rte_mbuf.
1815 static inline uint32_t
1816 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
1819 uint8_t pinfo = cqe->pkt_info;
1820 uint16_t ptype = cqe->hdr_type_etc;
1823 * The index to the array should have:
1824 * bit[1:0] = l3_hdr_type
1825 * bit[4:2] = l4_hdr_type
1828 * bit[7] = outer_l3_type
1830 idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
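/*
 * The multiplication below ORs rxq->tunnel into the packet type only
 * when the tunneled bit (bit 6) of the index is set, without branching.
 */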
1831 return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
1835 * Initialize Rx WQ and indexes.
1838 * Pointer to RX queue structure.
1841 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
1843 const unsigned int wqe_n = 1 << rxq->elts_n;
1846 for (i = 0; (i != wqe_n); ++i) {
1847 volatile struct mlx5_wqe_data_seg *scat;
1849 uint32_t byte_count;
1851 if (mlx5_rxq_mprq_enabled(rxq)) {
1852 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
1854 scat = &((volatile struct mlx5_wqe_mprq *)
1856 addr = (uintptr_t)mlx5_mprq_buf_addr(buf);
1857 byte_count = (1 << rxq->strd_sz_n) *
1858 (1 << rxq->strd_num_n);
1860 struct rte_mbuf *buf = (*rxq->elts)[i];
1862 scat = &((volatile struct mlx5_wqe_data_seg *)
1864 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1865 byte_count = DATA_LEN(buf);
1867 /* scat->addr must be able to store a pointer. */
1868 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
1869 *scat = (struct mlx5_wqe_data_seg){
1870 .addr = rte_cpu_to_be_64(addr),
1871 .byte_count = rte_cpu_to_be_32(byte_count),
1872 .lkey = mlx5_rx_addr2mr(rxq, addr),
1875 rxq->consumed_strd = 0;
1876 rxq->decompressed = 0;
1878 rxq->zip = (struct rxq_zip){
1881 /* Update doorbell counter. */
1882 rxq->rq_ci = wqe_n >> rxq->sges_n;
1884 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1888 * Handle an Rx error.
1889 * The function moves the RQ state to reset when the first error CQE is
1890 * seen; the CQ is then drained by the caller's receive loop. When the CQ is
1891 * empty, it moves the RQ state back to ready and reinitializes the RQ.
1892 * Identifying the next CQE and counting errors are the caller's responsibility.
1895 * Pointer to RX queue structure.
1896 * @param[in] mbuf_prepare
1897 * Whether to prepare mbufs for the RQ.
1900 * -1 in case of recovery error, otherwise the CQE status.
1903 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare)
1905 const uint16_t cqe_n = 1 << rxq->cqe_n;
1906 const uint16_t cqe_mask = cqe_n - 1;
1907 const unsigned int wqe_n = 1 << rxq->elts_n;
1908 struct mlx5_rxq_ctrl *rxq_ctrl =
1909 container_of(rxq, struct mlx5_rxq_ctrl, rxq);
1910 struct ibv_wq_attr mod = {
1911 .attr_mask = IBV_WQ_ATTR_STATE,
1914 volatile struct mlx5_cqe *cqe;
1915 volatile struct mlx5_err_cqe *err_cqe;
1917 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
1921 switch (rxq->err_state) {
1922 case MLX5_RXQ_ERR_STATE_NO_ERROR:
1923 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
1925 case MLX5_RXQ_ERR_STATE_NEED_RESET:
1926 if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1928 mod.wq_state = IBV_WQS_RESET;
1929 ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
1931 DRV_LOG(ERR, "Cannot change Rx WQ state to RESET %s\n",
1935 if (rxq_ctrl->dump_file_n <
1936 rxq_ctrl->priv->config.max_dump_files_num) {
1937 MKSTR(err_str, "Unexpected CQE error syndrome "
1938 "0x%02x CQN = %u RQN = %u wqe_counter = %u"
1939 " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
1940 rxq->cqn, rxq_ctrl->ibv->wq->wq_num,
1941 rte_be_to_cpu_16(u.err_cqe->wqe_counter),
1942 rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
1943 MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
1944 rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
1945 mlx5_dump_debug_information(name, NULL, err_str, 0);
1946 mlx5_dump_debug_information(name, "MLX5 Error CQ:",
1947 (const void *)((uintptr_t)
1949 sizeof(*u.cqe) * cqe_n);
1950 mlx5_dump_debug_information(name, "MLX5 Error RQ:",
1951 (const void *)((uintptr_t)
1954 rxq_ctrl->dump_file_n++;
1956 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
1958 case MLX5_RXQ_ERR_STATE_NEED_READY:
1959 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
1960 if (ret == MLX5_CQE_STATUS_HW_OWN) {
1962 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1965 * The RQ consumer index must be zeroed while moving
1966 * from RESET state to RDY state.
1968 *rxq->rq_db = rte_cpu_to_be_32(0);
1970 mod.wq_state = IBV_WQS_RDY;
1971 ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
1973 DRV_LOG(ERR, "Cannot change Rx WQ state to RDY"
1974 " %s\n", strerror(errno));
1978 const uint16_t q_mask = wqe_n - 1;
1980 struct rte_mbuf **elt;
1982 unsigned int n = wqe_n - (rxq->rq_ci -
1985 for (i = 0; i < (int)n; ++i) {
1986 elt_idx = (rxq->rq_ci + i) & q_mask;
1987 elt = &(*rxq->elts)[elt_idx];
1988 *elt = rte_mbuf_raw_alloc(rxq->mp);
1990 for (i--; i >= 0; --i) {
1991 elt_idx = (rxq->rq_ci +
1995 rte_pktmbuf_free_seg
2002 mlx5_rxq_initialize(rxq);
2003 rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
2012 * Get size of the next packet for a given CQE. For compressed CQEs, the
2013 * consumer index is updated only once all packets of the current one have been processed.
2017 * Pointer to RX queue.
2021 * Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
2025 * 0 in case of empty CQE, otherwise the packet size in bytes.
2028 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
2029 uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
2031 struct rxq_zip *zip = &rxq->zip;
2032 uint16_t cqe_n = cqe_cnt + 1;
2038 /* Process compressed data in the CQE and mini arrays. */
2040 volatile struct mlx5_mini_cqe8 (*mc)[8] =
2041 (volatile struct mlx5_mini_cqe8 (*)[8])
2042 (uintptr_t)(&(*rxq->cqes)[zip->ca &
2045 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
2046 *mcqe = &(*mc)[zip->ai & 7];
2047 if ((++zip->ai & 7) == 0) {
2048 /* Invalidate consumed CQEs */
2051 while (idx != end) {
2052 (*rxq->cqes)[idx & cqe_cnt].op_own =
2053 MLX5_CQE_INVALIDATE;
2057 * Increment consumer index to skip the number
2058 * of CQEs consumed. Hardware leaves holes in
2059 * the CQ ring for software use.
2064 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
2065 /* Invalidate the rest */
2069 while (idx != end) {
2070 (*rxq->cqes)[idx & cqe_cnt].op_own =
2071 MLX5_CQE_INVALIDATE;
2074 rxq->cq_ci = zip->cq_ci;
2078 * No compressed data, get next CQE and verify if it is
2085 ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
2086 if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
2087 if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
2089 ret = mlx5_rx_err_handle(rxq, 0);
2090 if (ret == MLX5_CQE_STATUS_HW_OWN ||
2098 op_own = cqe->op_own;
2099 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
2100 volatile struct mlx5_mini_cqe8 (*mc)[8] =
2101 (volatile struct mlx5_mini_cqe8 (*)[8])
2102 (uintptr_t)(&(*rxq->cqes)
2106 /* Fix endianness. */
2107 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
2109 * Current mini array position is the one
2110 * returned by check_cqe().
2112 * If completion comprises several mini arrays,
2113 * as a special case the second one is located
2114 * 7 CQEs after the initial CQE instead of 8
2115 * for subsequent ones.
2117 zip->ca = rxq->cq_ci;
2118 zip->na = zip->ca + 7;
2119 /* Compute the next non compressed CQE. */
2121 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
2122 /* Get packet size to return. */
2123 len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
2126 /* Prefetch all to be invalidated */
2129 while (idx != end) {
2130 rte_prefetch0(&(*rxq->cqes)[(idx) &
2135 len = rte_be_to_cpu_32(cqe->byte_cnt);
2138 if (unlikely(rxq->err_state)) {
2139 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2140 ++rxq->stats.idropped;
2148 * Translate RX completion flags to offload flags.
2154 * Offload flags (ol_flags) for struct rte_mbuf.
2156 static inline uint32_t
2157 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
2159 uint32_t ol_flags = 0;
2160 uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
2164 MLX5_CQE_RX_L3_HDR_VALID,
2165 PKT_RX_IP_CKSUM_GOOD) |
2167 MLX5_CQE_RX_L4_HDR_VALID,
2168 PKT_RX_L4_CKSUM_GOOD);
2173 * Fill in mbuf fields from RX completion flags.
2174 * Note that pkt->ol_flags should be initialized outside of this function.
2177 * Pointer to RX queue.
2182 * @param rss_hash_res
2183 * Packet RSS Hash result.
2186 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
2187 volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
2189 /* Update packet information. */
2190 pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
2191 if (rss_hash_res && rxq->rss_hash) {
2192 pkt->hash.rss = rss_hash_res;
2193 pkt->ol_flags |= PKT_RX_RSS_HASH;
2195 if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
2196 pkt->ol_flags |= PKT_RX_FDIR;
2197 if (cqe->sop_drop_qpn !=
2198 rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
2199 uint32_t mark = cqe->sop_drop_qpn;
2201 pkt->ol_flags |= PKT_RX_FDIR_ID;
2202 pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
2206 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
2207 if (rxq->vlan_strip &&
2208 (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
2209 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
2210 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
2212 if (rxq->hw_timestamp) {
2213 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp);
2214 pkt->ol_flags |= PKT_RX_TIMESTAMP;
2219 * DPDK callback for RX.
2222 * Generic pointer to RX queue structure.
2224 * Array to store received packets.
2226 * Maximum number of packets in array.
2229 * Number of packets successfully received (<= pkts_n).
2232 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
2234 struct mlx5_rxq_data *rxq = dpdk_rxq;
2235 const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
2236 const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
2237 const unsigned int sges_n = rxq->sges_n;
2238 struct rte_mbuf *pkt = NULL;
2239 struct rte_mbuf *seg = NULL;
2240 volatile struct mlx5_cqe *cqe =
2241 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2243 unsigned int rq_ci = rxq->rq_ci << sges_n;
2244 int len = 0; /* keep its value across iterations. */
2247 unsigned int idx = rq_ci & wqe_cnt;
2248 volatile struct mlx5_wqe_data_seg *wqe =
2249 &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
2250 struct rte_mbuf *rep = (*rxq->elts)[idx];
2251 volatile struct mlx5_mini_cqe8 *mcqe = NULL;
2252 uint32_t rss_hash_res;
2260 rep = rte_mbuf_raw_alloc(rxq->mp);
2261 if (unlikely(rep == NULL)) {
2262 ++rxq->stats.rx_nombuf;
2265 * no buffers before we even started,
2266 * bail out silently.
2270 while (pkt != seg) {
2271 assert(pkt != (*rxq->elts)[idx]);
2275 rte_mbuf_raw_free(pkt);
2281 cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
2282 len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
2284 rte_mbuf_raw_free(rep);
2288 assert(len >= (rxq->crc_present << 2));
2290 /* If compressed, take hash result from mini-CQE. */
2291 rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
2293 mcqe->rx_hash_result);
2294 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
2295 if (rxq->crc_present)
2296 len -= RTE_ETHER_CRC_LEN;
2299 DATA_LEN(rep) = DATA_LEN(seg);
2300 PKT_LEN(rep) = PKT_LEN(seg);
2301 SET_DATA_OFF(rep, DATA_OFF(seg));
2302 PORT(rep) = PORT(seg);
2303 (*rxq->elts)[idx] = rep;
2305 * Fill NIC descriptor with the new buffer. The lkey and size
2306 * of the buffers are already known, only the buffer address
2309 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
2310 /* If there's only one MR, no need to replace LKey in WQE. */
2311 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
2312 wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
2313 if (len > DATA_LEN(seg)) {
2314 len -= DATA_LEN(seg);
2319 DATA_LEN(seg) = len;
2320 #ifdef MLX5_PMD_SOFT_COUNTERS
2321 /* Increment bytes counter. */
2322 rxq->stats.ibytes += PKT_LEN(pkt);
2324 /* Return packet. */
2329 /* Align consumer index to the next stride. */
2334 if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
2336 /* Update the consumer index. */
2337 rxq->rq_ci = rq_ci >> sges_n;
2339 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
2341 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
2342 #ifdef MLX5_PMD_SOFT_COUNTERS
2343 /* Increment packets counter. */
2344 rxq->stats.ipackets += i;
2350 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
2352 struct mlx5_mprq_buf *buf = opaque;
2354 if (rte_atomic16_read(&buf->refcnt) == 1) {
2355 rte_mempool_put(buf->mp, buf);
2356 } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
2357 rte_atomic16_set(&buf->refcnt, 1);
2358 rte_mempool_put(buf->mp, buf);
2363 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
2365 mlx5_mprq_buf_free_cb(NULL, buf);
2369 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx)
2371 struct mlx5_mprq_buf *rep = rxq->mprq_repl;
2372 volatile struct mlx5_wqe_data_seg *wqe =
2373 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
2376 assert(rep != NULL);
2377 /* Replace MPRQ buf. */
2378 (*rxq->mprq_bufs)[rq_idx] = rep;
2380 addr = mlx5_mprq_buf_addr(rep);
2381 wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
2382 /* If there's only one MR, no need to replace LKey in WQE. */
2383 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
2384 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
2385 /* Stash a mbuf for next replacement. */
2386 if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
2387 rxq->mprq_repl = rep;
2389 rxq->mprq_repl = NULL;
2393 * DPDK callback for RX with Multi-Packet RQ support.
2396 * Generic pointer to RX queue structure.
2398 * Array to store received packets.
2400 * Maximum number of packets in array.
2403 * Number of packets successfully received (<= pkts_n).
2406 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
2408 struct mlx5_rxq_data *rxq = dpdk_rxq;
2409 const unsigned int strd_n = 1 << rxq->strd_num_n;
2410 const unsigned int strd_sz = 1 << rxq->strd_sz_n;
2411 const unsigned int strd_shift =
2412 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
2413 const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
2414 const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
2415 volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
2417 uint32_t rq_ci = rxq->rq_ci;
2418 uint16_t consumed_strd = rxq->consumed_strd;
2419 struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
2421 while (i < pkts_n) {
2422 struct rte_mbuf *pkt;
2430 volatile struct mlx5_mini_cqe8 *mcqe = NULL;
2431 uint32_t rss_hash_res = 0;
2433 if (consumed_strd == strd_n) {
2434 /* Replace WQE only if the buffer is still in use. */
2435 if (rte_atomic16_read(&buf->refcnt) > 1) {
2436 mprq_buf_replace(rxq, rq_ci & wq_mask);
2437 /* Release the old buffer. */
2438 mlx5_mprq_buf_free(buf);
2439 } else if (unlikely(rxq->mprq_repl == NULL)) {
2440 struct mlx5_mprq_buf *rep;
2443 * Currently, the MPRQ mempool is out of buffers
2444 * and memcpy is done regardless of the size of the Rx
2445 * packet. Retry allocation to get back to normal.
2448 if (!rte_mempool_get(rxq->mprq_mp,
2450 rxq->mprq_repl = rep;
2452 /* Advance to the next WQE. */
2455 buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
2457 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
2458 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
2462 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
2463 MLX5_MPRQ_STRIDE_NUM_SHIFT;
2465 consumed_strd += strd_cnt;
2466 if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
2469 rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
2470 strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
2472 /* mini-CQE for MPRQ doesn't have hash result. */
2473 strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
2475 assert(strd_idx < strd_n);
2476 assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask));
2478 * Currently configured to receive a packet per stride. But if
2479 * MTU is adjusted through kernel interface, device could
2480 * consume multiple strides without raising an error. In this
2481 * case, the packet should be dropped because it is bigger than
2482 * the max_rx_pkt_len.
2484 if (unlikely(strd_cnt > 1)) {
2485 ++rxq->stats.idropped;
2488 pkt = rte_pktmbuf_alloc(rxq->mp);
2489 if (unlikely(pkt == NULL)) {
2490 ++rxq->stats.rx_nombuf;
2493 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
2494 assert((int)len >= (rxq->crc_present << 2));
2495 if (rxq->crc_present)
2496 len -= RTE_ETHER_CRC_LEN;
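/*
 * Locate the packet inside the MPRQ buffer, e.g. with 2048-byte strides
 * a stride index of 3 starts 6144 bytes (plus the optional shift) into
 * the buffer.
 */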
2497 offset = strd_idx * strd_sz + strd_shift;
2498 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf), offset);
2499 /* Initialize the offload flag. */
2502 * Memcpy packets to the target mbuf if:
2503 * - The size of packet is smaller than mprq_max_memcpy_len.
2504 * - Out of buffer in the Mempool for Multi-Packet RQ.
2506 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
2508 * When memcpy'ing packet due to out-of-buffer, the
2509 * packet must be smaller than the target mbuf.
2511 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
2512 rte_pktmbuf_free_seg(pkt);
2513 ++rxq->stats.idropped;
2516 rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
2518 rte_iova_t buf_iova;
2519 struct rte_mbuf_ext_shared_info *shinfo;
2520 uint16_t buf_len = strd_cnt * strd_sz;
2522 /* Increment the refcnt of the whole chunk. */
2523 rte_atomic16_add_return(&buf->refcnt, 1);
2524 assert((uint16_t)rte_atomic16_read(&buf->refcnt) <=
2526 addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
2528 * MLX5 device doesn't use iova but it is necessary in a
2529 * case where the Rx packet is transmitted via a
2532 buf_iova = rte_mempool_virt2iova(buf) +
2533 RTE_PTR_DIFF(addr, buf);
2534 shinfo = rte_pktmbuf_ext_shinfo_init_helper(addr,
2535 &buf_len, mlx5_mprq_buf_free_cb, buf);
2537 * EXT_ATTACHED_MBUF will be set to pkt->ol_flags when
2538 * attaching the stride to mbuf and more offload flags
2539 * will be added below by calling rxq_cq_to_mbuf().
2540 * Other fields will be overwritten.
2542 rte_pktmbuf_attach_extbuf(pkt, addr, buf_iova, buf_len,
2544 rte_pktmbuf_reset_headroom(pkt);
2545 assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
2547 * Prevent potential overflow due to MTU change through
2550 if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
2551 rte_pktmbuf_free_seg(pkt);
2552 ++rxq->stats.idropped;
2556 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
2558 DATA_LEN(pkt) = len;
2559 PORT(pkt) = rxq->port_id;
2560 #ifdef MLX5_PMD_SOFT_COUNTERS
2561 /* Increment bytes counter. */
2562 rxq->stats.ibytes += PKT_LEN(pkt);
2564 /* Return packet. */
2568 /* Update the consumer indexes. */
2569 rxq->consumed_strd = consumed_strd;
2571 *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
2572 if (rq_ci != rxq->rq_ci) {
2575 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
2577 #ifdef MLX5_PMD_SOFT_COUNTERS
2578 /* Increment packets counter. */
2579 rxq->stats.ipackets += i;
2585 * Dummy DPDK callback for TX.
2587 * This function is used to temporarily replace the real callback during
2588 * unsafe control operations on the queue, or in case of error.
2591 * Generic pointer to TX queue structure.
2593 * Packets to transmit.
2595 * Number of packets in array.
2598 * Number of packets successfully transmitted (<= pkts_n).
2601 removed_tx_burst(void *dpdk_txq __rte_unused,
2602 struct rte_mbuf **pkts __rte_unused,
2603 uint16_t pkts_n __rte_unused)
2610 * Dummy DPDK callback for RX.
2612 * This function is used to temporarily replace the real callback during
2613 * unsafe control operations on the queue, or in case of error.
2616 * Generic pointer to RX queue structure.
2618 * Array to store received packets.
2620 * Maximum number of packets in array.
2623 * Number of packets successfully received (<= pkts_n).
2626 removed_rx_burst(void *dpdk_rxq __rte_unused,
2627 struct rte_mbuf **pkts __rte_unused,
2628 uint16_t pkts_n __rte_unused)
2635 * Vectorized Rx/Tx routines are not compiled in when required vector
2636 * instructions are not supported on a target architecture. The following null
2637 * stubs are needed for linkage when the vectorized implementations are not
2638 * compiled in (e.g. mlx5_rxtx_vec_sse.c for x86).
2642 mlx5_tx_burst_raw_vec(void *dpdk_txq __rte_unused,
2643 struct rte_mbuf **pkts __rte_unused,
2644 uint16_t pkts_n __rte_unused)
2650 mlx5_tx_burst_vec(void *dpdk_txq __rte_unused,
2651 struct rte_mbuf **pkts __rte_unused,
2652 uint16_t pkts_n __rte_unused)
2658 mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
2659 struct rte_mbuf **pkts __rte_unused,
2660 uint16_t pkts_n __rte_unused)
2666 mlx5_check_raw_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2672 mlx5_check_vec_tx_support(struct rte_eth_dev *dev __rte_unused)
2678 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
2684 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)