1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2021 6WIND S.A.
3 * Copyright 2021 Mellanox Technologies, Ltd
6 #ifndef RTE_PMD_MLX5_TX_H_
7 #define RTE_PMD_MLX5_TX_H_
10 #include <sys/queue.h>
13 #include <rte_mempool.h>
14 #include <rte_common.h>
15 #include <rte_spinlock.h>
17 #include <mlx5_common_mr.h>
20 #include "mlx5_autoconf.h"
22 /* TX burst subroutines return codes. */
23 enum mlx5_txcmp_code {
24 MLX5_TXCMP_CODE_EXIT = 0,
25 MLX5_TXCMP_CODE_ERROR,
26 MLX5_TXCMP_CODE_SINGLE,
27 MLX5_TXCMP_CODE_MULTI,
33 * These defines are used to configure the set of Tx burst routine options
34 * supported at compile time. The options not specified are optimized out,
35 * since the 'if' conditions on them can be evaluated at compile time.
36 * The offloads with bigger runtime check overhead (requiring more CPU cycles
37 * to skip) should have the bigger index - this is needed to select the better
38 * matching routine function if there is no exact match and some offloads are not
41 #define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets. */
42 #define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported. */
43 #define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads. */
44 #define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
45 #define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
46 #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported. */
47 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
48 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported. */
49 #define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported. */
50 #define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp. */
52 /* The most common offloads groups. */
53 #define MLX5_TXOFF_CONFIG_NONE 0
54 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
55 MLX5_TXOFF_CONFIG_TSO | \
56 MLX5_TXOFF_CONFIG_SWP | \
57 MLX5_TXOFF_CONFIG_CSUM | \
58 MLX5_TXOFF_CONFIG_INLINE | \
59 MLX5_TXOFF_CONFIG_VLAN | \
60 MLX5_TXOFF_CONFIG_METADATA)
62 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
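/*
 * Illustrative sketch (not part of the driver sources): within a templated
 * burst routine "olx" is a compile-time constant, so a check such as
 *
 *   if (MLX5_TXOFF_CONFIG(TSO) &&
 *       (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
 *           ... TSO path, dropped by the compiler when TSO is not in olx ...
 *   }
 *
 * is fully resolved at compile time and the unused branch is optimized out.
 */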
64 #define MLX5_TXOFF_PRE_DECL(func) \
65 uint16_t mlx5_tx_burst_##func(void *txq, \
66 struct rte_mbuf **pkts, \
69 #define MLX5_TXOFF_DECL(func, olx) \
70 uint16_t mlx5_tx_burst_##func(void *txq, \
71 struct rte_mbuf **pkts, \
74 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
75 pkts, pkts_n, (olx)); \
78 /* Mbuf dynamic flag offset for inline. */
79 extern uint64_t rte_net_mlx5_dynf_inline_mask;
80 #define RTE_MBUF_F_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
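/*
 * Illustrative usage (assumes the application has registered the no-inline
 * hint dynamic flag through the rte_pmd_mlx5 API before device start):
 *
 *   mbuf->ol_flags |= RTE_MBUF_F_TX_DYNF_NOINLINE;
 *
 * marks a single packet so that its data is not copied (inlined) into the
 * transmit WQE by the burst routines below.
 */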
82 extern uint32_t mlx5_ptype_table[] __rte_cache_aligned;
83 extern uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
84 extern uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
86 struct mlx5_txq_stats {
87 #ifdef MLX5_PMD_SOFT_COUNTERS
88 uint64_t opackets; /**< Total of successfully sent packets. */
89 uint64_t obytes; /**< Total of successfully sent bytes. */
91 uint64_t oerrors; /**< Total number of failed transmitted packets. */
94 /* TX queue send local data. */
96 struct mlx5_txq_local {
97 struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
98 struct rte_mbuf *mbuf; /* first mbuf to process. */
99 uint16_t pkts_copy; /* packets copied to elts. */
100 uint16_t pkts_sent; /* packets sent. */
101 uint16_t pkts_loop; /* packets sent on loop entry. */
102 uint16_t elts_free; /* available elts remain. */
103 uint16_t wqe_free; /* available wqe remain. */
104 uint16_t mbuf_off; /* data offset in current mbuf. */
105 uint16_t mbuf_nseg; /* Number of remaining mbufs. */
106 uint16_t mbuf_free; /* number of inline mbufs to free. */
109 /* TX queue descriptor. */
111 struct mlx5_txq_data {
112 uint16_t elts_head; /* Current counter in (*elts)[]. */
113 uint16_t elts_tail; /* Counter of first element awaiting completion. */
114 uint16_t elts_comp; /* elts index since last completion request. */
115 uint16_t elts_s; /* Number of mbuf elements. */
116 uint16_t elts_m; /* Mask for mbuf elements indices. */
117 /* Fields related to elts mbuf storage. */
118 uint16_t wqe_ci; /* Consumer index for work queue. */
119 uint16_t wqe_pi; /* Producer index for work queue. */
120 uint16_t wqe_s; /* Number of WQ elements. */
121 uint16_t wqe_m; /* Mask for WQ element indices. */
122 uint16_t wqe_comp; /* WQE index since last completion request. */
123 uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
124 /* WQ related fields. */
125 uint16_t cq_ci; /* Consumer index for completion queue. */
126 uint16_t cq_pi; /* Producer index for completion queue. */
127 uint16_t cqe_s; /* Number of CQ elements. */
128 uint16_t cqe_m; /* Mask for CQ indices. */
129 /* CQ related fields. */
130 uint16_t elts_n:4; /* elts[] length (in log2). */
131 uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
132 uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
133 uint16_t tso_en:1; /* When set hardware TSO is enabled. */
134 uint16_t tunnel_en:1;
135 /* When set, TX offloads for tunneled packets are supported. */
136 uint16_t swp_en:1; /* Whether SW parser is enabled. */
137 uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
138 uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
139 uint16_t db_heu:1; /* Doorbell heuristic write barrier. */
140 uint16_t fast_free:1; /* mbuf fast free on Tx is enabled. */
141 uint16_t inlen_send; /* Ordinary send data inline size. */
142 uint16_t inlen_empw; /* eMPW max packet size to inline. */
143 uint16_t inlen_mode; /* Minimal data length to inline. */
144 uint32_t qp_num_8s; /* QP number shifted by 8. */
145 uint64_t offloads; /* Offloads for Tx Queue. */
146 struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
147 struct mlx5_wqe *wqes; /* Work queue. */
148 struct mlx5_wqe *wqes_end; /* Work queue array limit. */
149 #ifdef RTE_LIBRTE_MLX5_DEBUG
150 uint32_t *fcqs; /* Free completion queue (debug extended). */
152 uint16_t *fcqs; /* Free completion queue. */
154 volatile struct mlx5_cqe *cqes; /* Completion queue. */
155 volatile uint32_t *qp_db; /* Work queue doorbell. */
156 volatile uint32_t *cq_db; /* Completion queue doorbell. */
157 uint16_t port_id; /* Port ID of device. */
158 uint16_t idx; /* Queue index. */
159 uint64_t ts_mask; /* Timestamp flag dynamic mask. */
160 int32_t ts_offset; /* Timestamp field dynamic offset. */
161 struct mlx5_dev_ctx_shared *sh; /* Shared context. */
162 struct mlx5_txq_stats stats; /* TX queue counters. */
164 rte_spinlock_t *uar_lock;
165 /* UAR access lock required for 32bit implementations */
167 struct rte_mbuf *elts[0];
168 /* Storage for queued packets, must be the last field. */
169 } __rte_cache_aligned;
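/*
 * Illustrative note: the elts/wqe/cq counters above are free-running 16-bit
 * values and the *_m masks provide cheap ring wraparound, e.g. (sketch):
 *
 *   struct rte_mbuf *mb = txq->elts[txq->elts_head & txq->elts_m];
 *   uint16_t in_use = (uint16_t)(txq->elts_head - txq->elts_tail);
 */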
172 MLX5_TXQ_TYPE_STANDARD, /* Standard Tx queue. */
173 MLX5_TXQ_TYPE_HAIRPIN, /* Hairpin Tx queue. */
176 /* TX queue control descriptor. */
177 struct mlx5_txq_ctrl {
178 LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
179 uint32_t refcnt; /* Reference counter. */
180 unsigned int socket; /* CPU socket ID for allocations. */
181 enum mlx5_txq_type type; /* The txq ctrl type. */
182 unsigned int max_inline_data; /* Max inline data. */
183 unsigned int max_tso_header; /* Max TSO header size. */
184 struct mlx5_txq_obj *obj; /* Verbs/DevX queue object. */
185 struct mlx5_priv *priv; /* Back pointer to private data. */
186 off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
187 uint16_t dump_file_n; /* Number of dump files. */
188 struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
189 uint32_t hairpin_status; /* Hairpin binding status. */
190 struct mlx5_txq_data txq; /* Data path structure. */
191 /* Must be the last field in the structure, contains elts[]. */
196 int mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t queue_id);
197 int mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t queue_id);
198 int mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t queue_id);
199 int mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t queue_id);
200 int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
201 unsigned int socket, const struct rte_eth_txconf *conf);
202 int mlx5_tx_hairpin_queue_setup
203 (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
204 const struct rte_eth_hairpin_conf *hairpin_conf);
205 void mlx5_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid);
206 void txq_uar_init(struct mlx5_txq_ctrl *txq_ctrl, void *bf_reg);
207 int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
208 void mlx5_tx_uar_uninit_secondary(struct rte_eth_dev *dev);
209 int mlx5_txq_obj_verify(struct rte_eth_dev *dev);
210 struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
211 uint16_t desc, unsigned int socket,
212 const struct rte_eth_txconf *conf);
213 struct mlx5_txq_ctrl *mlx5_txq_hairpin_new
214 (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
215 const struct rte_eth_hairpin_conf *hairpin_conf);
216 struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx);
217 int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx);
218 int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx);
219 int mlx5_txq_verify(struct rte_eth_dev *dev);
220 void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl);
221 void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl);
222 uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev);
223 void mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev);
227 uint16_t removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts,
229 void mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
230 unsigned int olx __rte_unused);
231 int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset);
232 void mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
233 struct rte_eth_txq_info *qinfo);
234 int mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
235 struct rte_eth_burst_mode *mode);
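/*
 * Typical control path flow (illustrative sketch, the callbacks above are
 * reached through the generic ethdev API rather than being called directly):
 *
 *   struct rte_eth_txconf txconf = { .tx_free_thresh = 0 };
 *   ret = rte_eth_tx_queue_setup(port_id, 0, 1024, rte_socket_id(), &txconf);
 *   if (ret == 0)
 *           ret = rte_eth_dev_start(port_id);
 */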
239 MLX5_TXOFF_PRE_DECL(full_empw);
240 MLX5_TXOFF_PRE_DECL(none_empw);
241 MLX5_TXOFF_PRE_DECL(md_empw);
242 MLX5_TXOFF_PRE_DECL(mt_empw);
243 MLX5_TXOFF_PRE_DECL(mtsc_empw);
244 MLX5_TXOFF_PRE_DECL(mti_empw);
245 MLX5_TXOFF_PRE_DECL(mtv_empw);
246 MLX5_TXOFF_PRE_DECL(mtiv_empw);
247 MLX5_TXOFF_PRE_DECL(sc_empw);
248 MLX5_TXOFF_PRE_DECL(sci_empw);
249 MLX5_TXOFF_PRE_DECL(scv_empw);
250 MLX5_TXOFF_PRE_DECL(sciv_empw);
251 MLX5_TXOFF_PRE_DECL(i_empw);
252 MLX5_TXOFF_PRE_DECL(v_empw);
253 MLX5_TXOFF_PRE_DECL(iv_empw);
255 /* mlx5_tx_nompw.c */
257 MLX5_TXOFF_PRE_DECL(full);
258 MLX5_TXOFF_PRE_DECL(none);
259 MLX5_TXOFF_PRE_DECL(md);
260 MLX5_TXOFF_PRE_DECL(mt);
261 MLX5_TXOFF_PRE_DECL(mtsc);
262 MLX5_TXOFF_PRE_DECL(mti);
263 MLX5_TXOFF_PRE_DECL(mtv);
264 MLX5_TXOFF_PRE_DECL(mtiv);
265 MLX5_TXOFF_PRE_DECL(sc);
266 MLX5_TXOFF_PRE_DECL(sci);
267 MLX5_TXOFF_PRE_DECL(scv);
268 MLX5_TXOFF_PRE_DECL(sciv);
269 MLX5_TXOFF_PRE_DECL(i);
270 MLX5_TXOFF_PRE_DECL(v);
271 MLX5_TXOFF_PRE_DECL(iv);
275 MLX5_TXOFF_PRE_DECL(full_ts_nompw);
276 MLX5_TXOFF_PRE_DECL(full_ts_nompwi);
277 MLX5_TXOFF_PRE_DECL(full_ts);
278 MLX5_TXOFF_PRE_DECL(full_ts_noi);
279 MLX5_TXOFF_PRE_DECL(none_ts);
280 MLX5_TXOFF_PRE_DECL(mdi_ts);
281 MLX5_TXOFF_PRE_DECL(mti_ts);
282 MLX5_TXOFF_PRE_DECL(mtiv_ts);
286 MLX5_TXOFF_PRE_DECL(none_mpw);
287 MLX5_TXOFF_PRE_DECL(mci_mpw);
288 MLX5_TXOFF_PRE_DECL(mc_mpw);
289 MLX5_TXOFF_PRE_DECL(i_mpw);
291 static __rte_always_inline uint64_t *
292 mlx5_tx_bfreg(struct mlx5_txq_data *txq)
294 return MLX5_PROC_PRIV(txq->port_id)->uar_table[txq->idx];
298 * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and
299 * 64bit architectures.
302 * value to write in CPU endian format.
304 * Address to write to.
306 * Address of the lock to use for that UAR access.
308 static __rte_always_inline void
309 __mlx5_uar_write64_relaxed(uint64_t val, void *addr,
310 rte_spinlock_t *lock __rte_unused)
313 *(uint64_t *)addr = val;
314 #else /* !RTE_ARCH_64 */
315 rte_spinlock_lock(lock);
316 *(uint32_t *)addr = val;
318 *((uint32_t *)addr + 1) = val >> 32;
319 rte_spinlock_unlock(lock);
324 * Provide safe 64bit store operation to mlx5 UAR region for both 32bit and
325 * 64bit architectures while guaranteeing the order of execution with the
326 * code being executed.
329 * value to write in CPU endian format.
331 * Address to write to.
333 * Address of the lock to use for that UAR access.
335 static __rte_always_inline void
336 __mlx5_uar_write64(uint64_t val, void *addr, rte_spinlock_t *lock)
339 __mlx5_uar_write64_relaxed(val, addr, lock);
342 /* Assist macros, used instead of directly calling the functions they wrap. */
344 #define mlx5_uar_write64_relaxed(val, dst, lock) \
345 __mlx5_uar_write64_relaxed(val, dst, NULL)
346 #define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, NULL)
348 #define mlx5_uar_write64_relaxed(val, dst, lock) \
349 __mlx5_uar_write64_relaxed(val, dst, lock)
350 #define mlx5_uar_write64(val, dst, lock) __mlx5_uar_write64(val, dst, lock)
354 * Ring TX queue doorbell and flush the update if requested.
357 * Pointer to TX queue structure.
359 * Pointer to the last WQE posted in the NIC.
361 * Request for write memory barrier after BlueFlame update.
363 static __rte_always_inline void
364 mlx5_tx_dbrec_cond_wmb(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe,
367 uint64_t *dst = mlx5_tx_bfreg(txq);
368 volatile uint64_t *src = ((volatile uint64_t *)wqe);
371 *txq->qp_db = rte_cpu_to_be_32(txq->wqe_ci);
372 /* Ensure ordering between DB record and BF copy. */
374 mlx5_uar_write64_relaxed(*src, dst, txq->uar_lock);
380 * Ring TX queue doorbell and flush the update by write memory barrier.
383 * Pointer to TX queue structure.
385 * Pointer to the last WQE posted in the NIC.
387 static __rte_always_inline void
388 mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
390 mlx5_tx_dbrec_cond_wmb(txq, wqe, 1);
394 * Convert timestamp from mbuf format to linear counter
395 * of Clock Queue completions (24 bits).
398 * Pointer to the device shared context to fetch Tx
399 * packet pacing timestamp and parameters.
401 * Timestamp from mbuf to convert.
403 * positive or zero value - completion ID to wait.
404 * negative value - conversion error.
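 *
 * Worked example (illustrative): with a Clock Queue tick of 1 usec, a
 * packet timestamp 25 usec ahead of the last Clock Queue timestamp (after
 * skew correction) converts to a delta of 25 completions, so the returned
 * value is (ci + 25) truncated to MLX5_CQ_INDEX_WIDTH bits.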
406 static __rte_always_inline int32_t
407 mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts)
414 * Read atomically two uint64_t fields and compare lsb bits.
415 * If there is no match - the timestamp was updated in
416 * the service thread and the data should be re-read.
418 rte_compiler_barrier();
419 ci = __atomic_load_n(&sh->txpp.ts.ci_ts, __ATOMIC_RELAXED);
420 ts = __atomic_load_n(&sh->txpp.ts.ts, __ATOMIC_RELAXED);
421 rte_compiler_barrier();
422 if (!((ts ^ ci) << (64 - MLX5_CQ_INDEX_WIDTH)))
425 /* Perform the skew correction, positive value to send earlier. */
426 mts -= sh->txpp.skew;
428 if (unlikely(mts >= UINT64_MAX / 2)) {
429 /* The delta is negative, the mts is in the past. */
430 __atomic_fetch_add(&sh->txpp.err_ts_past,
431 1, __ATOMIC_RELAXED);
434 tick = sh->txpp.tick;
436 /* Convert delta to completions, round up. */
437 mts = (mts + tick - 1) / tick;
438 if (unlikely(mts >= (1 << MLX5_CQ_INDEX_WIDTH) / 2 - 1)) {
439 /* The mts is too far in the future. */
440 __atomic_fetch_add(&sh->txpp.err_ts_future,
441 1, __ATOMIC_RELAXED);
444 mts <<= 64 - MLX5_CQ_INDEX_WIDTH;
446 ci >>= 64 - MLX5_CQ_INDEX_WIDTH;
451 * Set Software Parser flags and offsets in Ethernet Segment of WQE.
452 * Flags must be initialized to zero beforehand.
455 * Pointer to burst routine local context.
457 * Pointer to store Software Parser flags.
459 * Configured Tx offloads mask. It is fully defined at
460 * compile time and may be used for optimization.
463 * Software Parser offsets packed in dword.
464 * Software Parser flags are set by pointer.
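 *
 * Layout of the returned dword (offsets are stored divided by two), per the
 * code below: bits 0-7 outer L4 offset, bits 8-15 outer L3 offset,
 * bits 16-23 inner L4 offset, bits 24-31 inner L3 offset.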
466 static __rte_always_inline uint32_t
467 txq_mbuf_to_swp(struct mlx5_txq_local *__rte_restrict loc,
472 unsigned int idx, off;
475 if (!MLX5_TXOFF_CONFIG(SWP))
477 ol = loc->mbuf->ol_flags;
478 tunnel = ol & RTE_MBUF_F_TX_TUNNEL_MASK;
480 * Check whether Software Parser is required.
481 * Only customized tunnels may require it.
483 if (likely(tunnel != RTE_MBUF_F_TX_TUNNEL_UDP && tunnel != RTE_MBUF_F_TX_TUNNEL_IP))
486 * The index should have:
487 * bit[0:1] = RTE_MBUF_F_TX_L4_MASK
488 * bit[4] = RTE_MBUF_F_TX_IPV6
489 * bit[8] = RTE_MBUF_F_TX_OUTER_IPV6
490 * bit[9] = RTE_MBUF_F_TX_OUTER_UDP
492 idx = (ol & (RTE_MBUF_F_TX_L4_MASK | RTE_MBUF_F_TX_IPV6 | RTE_MBUF_F_TX_OUTER_IPV6)) >> 52;
493 idx |= (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP) ? (1 << 9) : 0;
494 *swp_flags = mlx5_swp_types_table[idx];
496 * Set offsets for SW parser. Since ConnectX-5, SW parser just
497 * complements HW parser. SW parser starts to engage only if HW parser
498 * can't reach a header. For the older devices, HW parser will not kick
499 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
500 * should be set regardless of HW offload.
502 off = loc->mbuf->outer_l2_len;
503 if (MLX5_TXOFF_CONFIG(VLAN) && ol & RTE_MBUF_F_TX_VLAN)
504 off += sizeof(struct rte_vlan_hdr);
505 set = (off >> 1) << 8; /* Outer L3 offset. */
506 off += loc->mbuf->outer_l3_len;
507 if (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP)
508 set |= off >> 1; /* Outer L4 offset. */
509 if (ol & (RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IPV6)) { /* Inner IP. */
510 const uint64_t csum = ol & RTE_MBUF_F_TX_L4_MASK;
511 off += loc->mbuf->l2_len;
512 set |= (off >> 1) << 24; /* Inner L3 offset. */
513 if (csum == RTE_MBUF_F_TX_TCP_CKSUM ||
514 csum == RTE_MBUF_F_TX_UDP_CKSUM ||
515 (MLX5_TXOFF_CONFIG(TSO) && ol & RTE_MBUF_F_TX_TCP_SEG)) {
516 off += loc->mbuf->l3_len;
517 set |= (off >> 1) << 16; /* Inner L4 offset. */
520 set = rte_cpu_to_le_32(set);
525 * Convert the Checksum offloads to Verbs.
528 * Pointer to the mbuf.
531 * Converted checksum flags.
533 static __rte_always_inline uint8_t
534 txq_ol_cksum_to_cs(struct rte_mbuf *buf)
537 uint8_t is_tunnel = !!(buf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK);
538 const uint64_t ol_flags_mask = RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_L4_MASK |
539 RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_OUTER_IP_CKSUM;
542 * The index should have:
543 * bit[0] = RTE_MBUF_F_TX_TCP_SEG
544 * bit[2:3] = RTE_MBUF_F_TX_UDP_CKSUM, RTE_MBUF_F_TX_TCP_CKSUM
545 * bit[4] = RTE_MBUF_F_TX_IP_CKSUM
546 * bit[8] = RTE_MBUF_F_TX_OUTER_IP_CKSUM
549 idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
550 return mlx5_cksum_table[idx];
554 * Free the mbufs from the linear array of pointers.
557 * Pointer to Tx queue structure.
559 * Pointer to array of packets to be freed.
561 * Number of packets to be freed.
563 * Configured Tx offloads mask. It is fully defined at
564 * compile time and may be used for optimization.
566 static __rte_always_inline void
567 mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
568 struct rte_mbuf **__rte_restrict pkts,
570 unsigned int olx __rte_unused)
572 struct rte_mempool *pool = NULL;
573 struct rte_mbuf **p_free = NULL;
574 struct rte_mbuf *mbuf;
575 unsigned int n_free = 0;
578 * The implemented algorithm eliminates
579 * copying pointers to temporary array
580 * for rte_mempool_put_bulk() calls.
585 * Free mbufs directly to the pool in bulk
586 * if fast free offload is engaged
588 if (!MLX5_TXOFF_CONFIG(MULTI) && txq->fast_free) {
591 rte_mempool_put_bulk(pool, (void *)pkts, pkts_n);
597 * Decrement mbuf reference counter, detach
598 * indirect and external buffers if needed.
600 mbuf = rte_pktmbuf_prefree_seg(*pkts);
601 if (likely(mbuf != NULL)) {
602 MLX5_ASSERT(mbuf == *pkts);
603 if (likely(n_free != 0)) {
604 if (unlikely(pool != mbuf->pool))
605 /* From different pool. */
608 /* Start new scan array. */
615 if (unlikely(pkts_n == 0)) {
621 * This happens if mbuf is still referenced.
622 * We can't put it back to the pool, skip.
626 if (unlikely(n_free != 0))
627 /* There is some array to free. */
629 if (unlikely(pkts_n == 0))
630 /* Last mbuf, nothing to free. */
636 * This loop is implemented to avoid multiple
637 * inlining of rte_mempool_put_bulk().
643 * Free the array of pre-freed mbufs
644 * belonging to the same memory pool.
646 rte_mempool_put_bulk(pool, (void *)p_free, n_free);
647 if (unlikely(mbuf != NULL)) {
648 /* There is a request to start a new scan. */
653 if (likely(pkts_n != 0))
656 * This is the last mbuf to be freed.
657 * Do one more loop iteration to complete.
658 * This is a rare case of the last unique mbuf.
663 if (likely(pkts_n == 0))
672 * Non-inlined version to free buffers, for an optimal call
673 * on the tx_burst completion path.
675 static __rte_noinline void
676 __mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
677 struct rte_mbuf **__rte_restrict pkts,
679 unsigned int olx __rte_unused)
681 mlx5_tx_free_mbuf(txq, pkts, pkts_n, olx);
685 * Free the mbufs from the elts ring buffer up to the new tail.
688 * Pointer to Tx queue structure.
690 * Index in elts to free up to, becomes new elts tail.
692 * Configured Tx offloads mask. It is fully defined at
693 * compile time and may be used for optimization.
695 static __rte_always_inline void
696 mlx5_tx_free_elts(struct mlx5_txq_data *__rte_restrict txq,
698 unsigned int olx __rte_unused)
700 uint16_t n_elts = tail - txq->elts_tail;
703 MLX5_ASSERT(n_elts <= txq->elts_s);
705 * Implement a loop to support ring buffer wraparound
706 * with single inlining of mlx5_tx_free_mbuf().
711 part = txq->elts_s - (txq->elts_tail & txq->elts_m);
712 part = RTE_MIN(part, n_elts);
714 MLX5_ASSERT(part <= txq->elts_s);
715 mlx5_tx_free_mbuf(txq,
716 &txq->elts[txq->elts_tail & txq->elts_m],
718 txq->elts_tail += part;
724 * Store the mbufs being sent into the elts ring buffer.
725 * On Tx completion these mbufs will be freed.
728 * Pointer to Tx queue structure.
730 * Pointer to array of packets to be stored.
732 * Number of packets to be stored.
734 * Configured Tx offloads mask. It is fully defined at
735 * compile time and may be used for optimization.
737 static __rte_always_inline void
738 mlx5_tx_copy_elts(struct mlx5_txq_data *__rte_restrict txq,
739 struct rte_mbuf **__rte_restrict pkts,
741 unsigned int olx __rte_unused)
744 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
748 part = txq->elts_s - (txq->elts_head & txq->elts_m);
750 MLX5_ASSERT(part <= txq->elts_s);
751 /* This code is a good candidate for vectorizing with SIMD. */
752 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
754 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
755 txq->elts_head += pkts_n;
756 if (unlikely(part < pkts_n))
757 /* The copy is wrapping around the elts array. */
758 rte_memcpy((void *)elts, (void *)(pkts + part),
759 (pkts_n - part) * sizeof(struct rte_mbuf *));
763 * Check if the completion request flag should be set in the last WQE.
764 * Both pushed mbufs and WQEs are monitored and the completion request
765 * flag is set if any of thresholds is reached.
768 * Pointer to TX queue structure.
770 * Pointer to burst routine local context.
772 * Configured Tx offloads mask. It is fully defined at
773 * compile time and may be used for optimization.
775 static __rte_always_inline void
776 mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
777 struct mlx5_txq_local *__rte_restrict loc,
780 uint16_t head = txq->elts_head;
783 part = MLX5_TXOFF_CONFIG(INLINE) ?
784 0 : loc->pkts_sent - loc->pkts_copy;
786 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
787 (MLX5_TXOFF_CONFIG(INLINE) &&
788 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
789 volatile struct mlx5_wqe *last = loc->wqe_last;
792 txq->elts_comp = head;
793 if (MLX5_TXOFF_CONFIG(INLINE))
794 txq->wqe_comp = txq->wqe_ci;
795 /* Request unconditional completion on last WQE. */
796 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
797 MLX5_COMP_MODE_OFFSET);
798 /* Save elts_head in the dedicated free-on-completion queue. */
799 #ifdef RTE_LIBRTE_MLX5_DEBUG
800 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
801 (last->cseg.opcode >> 8) << 16;
803 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
805 /* A CQE slot must always be available. */
806 MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
811 * Build the Control Segment with specified opcode:
813 * - MLX5_OPCODE_ENHANCED_MPSW
817 * Pointer to TX queue structure.
819 * Pointer to burst routine local context.
821 * Pointer to WQE to fill with built Control Segment.
823 * Supposed length of WQE in segments.
825 * SQ WQE opcode to put into Control Segment.
827 * Configured Tx offloads mask. It is fully defined at
828 * compile time and may be used for optimization.
830 static __rte_always_inline void
831 mlx5_tx_cseg_init(struct mlx5_txq_data *__rte_restrict txq,
832 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
833 struct mlx5_wqe *__rte_restrict wqe,
836 unsigned int olx __rte_unused)
838 struct mlx5_wqe_cseg *__rte_restrict cs = &wqe->cseg;
840 /* For legacy MPW replace the EMPW by TSO with modifier. */
841 if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW)
842 opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24;
843 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
844 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
845 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
846 MLX5_COMP_MODE_OFFSET);
847 cs->misc = RTE_BE32(0);
851 * Build the Synchronize Queue Segment with specified completion index.
854 * Pointer to TX queue structure.
856 * Pointer to burst routine local context.
858 * Pointer to WQE to fill with the built Synchronize Queue Segment.
860 * Completion index in Clock Queue to wait.
862 * Configured Tx offloads mask. It is fully defined at
863 * compile time and may be used for optimization.
865 static __rte_always_inline void
866 mlx5_tx_wseg_init(struct mlx5_txq_data *restrict txq,
867 struct mlx5_txq_local *restrict loc __rte_unused,
868 struct mlx5_wqe *restrict wqe,
870 unsigned int olx __rte_unused)
872 struct mlx5_wqe_qseg *qs;
874 qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
875 qs->max_index = rte_cpu_to_be_32(wci);
876 qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq_obj.cq->id);
877 qs->reserved0 = RTE_BE32(0);
878 qs->reserved1 = RTE_BE32(0);
882 * Build the Ethernet Segment without inlined data.
883 * Supports Software Parser, Checksums and VLAN insertion Tx offload features.
886 * Pointer to TX queue structure.
888 * Pointer to burst routine local context.
890 * Pointer to WQE to fill with built Ethernet Segment.
892 * Configured Tx offloads mask. It is fully defined at
893 * compile time and may be used for optimization.
895 static __rte_always_inline void
896 mlx5_tx_eseg_none(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
897 struct mlx5_txq_local *__rte_restrict loc,
898 struct mlx5_wqe *__rte_restrict wqe,
901 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
905 * Calculate and set check sum flags first, dword field
906 * in segment may be shared with Software Parser flags.
908 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
909 es->flags = rte_cpu_to_le_32(csum);
911 * Calculate and set Software Parser offsets and flags.
912 * These flags are set for custom UDP and IP tunnel packets.
914 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
915 /* Fill metadata field if needed. */
916 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
917 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
918 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
920 /* Engage VLAN tag insertion feature if requested. */
921 if (MLX5_TXOFF_CONFIG(VLAN) &&
922 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
924 * We should get here only if the device supports
925 * this feature correctly.
927 MLX5_ASSERT(txq->vlan_en);
928 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
929 loc->mbuf->vlan_tci);
931 es->inline_hdr = RTE_BE32(0);
936 * Build the Ethernet Segment with minimal inlined data
937 * of MLX5_ESEG_MIN_INLINE_SIZE bytes in length. This is
938 * used to fill the gap in single WQEBB WQEs.
939 * Supports Software Parser, Checksums and VLAN
940 * insertion Tx offload features.
943 * Pointer to TX queue structure.
945 * Pointer to burst routine local context.
947 * Pointer to WQE to fill with built Ethernet Segment.
949 * Length of VLAN tag insertion if any.
951 * Configured Tx offloads mask. It is fully defined at
952 * compile time and may be used for optimization.
954 static __rte_always_inline void
955 mlx5_tx_eseg_dmin(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
956 struct mlx5_txq_local *__rte_restrict loc,
957 struct mlx5_wqe *__rte_restrict wqe,
961 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
963 uint8_t *psrc, *pdst;
966 * Calculate and set check sum flags first, dword field
967 * in segment may be shared with Software Parser flags.
969 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
970 es->flags = rte_cpu_to_le_32(csum);
972 * Calculate and set Software Parser offsets and flags.
973 * These flags are set for custom UDP and IP tunnel packets.
975 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
976 /* Fill metadata field if needed. */
977 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
978 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
979 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
981 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
982 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
983 es->inline_data = *(unaligned_uint16_t *)psrc;
984 psrc += sizeof(uint16_t);
985 pdst = (uint8_t *)(es + 1);
986 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
988 /* Implement VLAN tag insertion as part of inline data. */
988 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
989 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
990 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
991 /* Insert VLAN ethertype + VLAN tag. */
992 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
993 ((RTE_ETHER_TYPE_VLAN << 16) |
994 loc->mbuf->vlan_tci);
995 pdst += sizeof(struct rte_vlan_hdr);
996 /* Copy the remaining two bytes from the packet data. */
997 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
998 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
1000 /* Fill the gap in the title WQEBB with inline data. */
1001 rte_mov16(pdst, psrc);
1006 * Build the Ethernet Segment with entire packet data inlining. Checks the
1007 * boundary of WQEBB and ring buffer wrapping, supports Software Parser,
1008 * Checksums and VLAN insertion Tx offload features.
1011 * Pointer to TX queue structure.
1013 * Pointer to burst routine local context.
1015 * Pointer to WQE to fill with built Ethernet Segment.
1017 * Length of VLAN tag insertion if any.
1019 * Length of data to inline (VLAN included, if any).
1021 * TSO flag, set mss field from the packet.
1023 * Configured Tx offloads mask. It is fully defined at
1024 * compile time and may be used for optimization.
1027 * Pointer to the next Data Segment (aligned and wrapped around).
1029 static __rte_always_inline struct mlx5_wqe_dseg *
1030 mlx5_tx_eseg_data(struct mlx5_txq_data *__rte_restrict txq,
1031 struct mlx5_txq_local *__rte_restrict loc,
1032 struct mlx5_wqe *__rte_restrict wqe,
1038 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1040 uint8_t *psrc, *pdst;
1044 * Calculate and set check sum flags first, dword field
1045 * in segment may be shared with Software Parser flags.
1047 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1050 csum |= loc->mbuf->tso_segsz;
1051 es->flags = rte_cpu_to_be_32(csum);
1053 es->flags = rte_cpu_to_le_32(csum);
1056 * Calculate and set Software Parser offsets and flags.
1057 * These flags are set for custom UDP and IP tunnel packets.
1059 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1060 /* Fill metadata field if needed. */
1061 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1062 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1063 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1065 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
1066 es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
1067 es->inline_data = *(unaligned_uint16_t *)psrc;
1068 psrc += sizeof(uint16_t);
1069 pdst = (uint8_t *)(es + 1);
1070 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1072 /* Implement VLAN tag insertion as part of inline data. */
1072 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
1073 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1074 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
1075 /* Insert VLAN ethertype + VLAN tag. */
1076 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1077 ((RTE_ETHER_TYPE_VLAN << 16) |
1078 loc->mbuf->vlan_tci);
1079 pdst += sizeof(struct rte_vlan_hdr);
1081 /* Copy the remaining two bytes from the packet data. */
1081 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
1082 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
1083 psrc += sizeof(uint16_t);
1085 /* Fill the gap in the title WQEBB with inline data. */
1086 rte_mov16(pdst, psrc);
1087 psrc += sizeof(rte_v128u32_t);
1089 pdst = (uint8_t *)(es + 2);
1090 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1091 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1092 inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
1094 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1095 return (struct mlx5_wqe_dseg *)pdst;
1098 * The WQEBB space availability is checked by caller.
1099 * Here we should be aware of WQE ring buffer wraparound only.
1101 part = (uint8_t *)txq->wqes_end - pdst;
1102 part = RTE_MIN(part, inlen);
1104 rte_memcpy(pdst, psrc, part);
1106 if (likely(!inlen)) {
1108 * If return value is not used by the caller
1109 * the code below will be optimized out.
1112 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1113 if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1114 pdst = (uint8_t *)txq->wqes;
1115 return (struct mlx5_wqe_dseg *)pdst;
1117 pdst = (uint8_t *)txq->wqes;
1124 * Copy data from a chain of mbufs to the specified linear buffer.
1125 * Supports Checksums and VLAN insertion Tx offload features. If data
1126 * from some mbuf is copied completely, this mbuf is freed. The local
1127 * structure is used to keep the byte stream state.
1130 * Pointer to the destination linear buffer.
1132 * Pointer to burst routine local context.
1134 * Length of data to be copied.
1136 * Length of data to be copied ignoring no inline hint.
1138 * Configured Tx offloads mask. It is fully defined at
1139 * compile time and may be used for optimization.
1142 * Number of actually copied data bytes. This is always greater than or
1143 * equal to the "must" parameter and might be less than "len" if the no
1144 * inline hint flag is encountered.
1146 static __rte_always_inline unsigned int
1147 mlx5_tx_mseg_memcpy(uint8_t *pdst,
1148 struct mlx5_txq_local *__rte_restrict loc,
1151 unsigned int olx __rte_unused)
1153 struct rte_mbuf *mbuf;
1154 unsigned int part, dlen, copy = 0;
1158 MLX5_ASSERT(must <= len);
1160 /* Allow zero length packets, must check first. */
1161 dlen = rte_pktmbuf_data_len(loc->mbuf);
1162 if (dlen <= loc->mbuf_off) {
1163 /* Exhausted packet, just free. */
1165 loc->mbuf = mbuf->next;
1166 rte_pktmbuf_free_seg(mbuf);
1168 MLX5_ASSERT(loc->mbuf_nseg > 1);
1169 MLX5_ASSERT(loc->mbuf);
1171 if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
1176 * We already copied the minimal
1177 * requested amount of data.
1182 if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
1184 * Copy only the minimal required
1185 * part of the data buffer.
1192 dlen -= loc->mbuf_off;
1193 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1195 part = RTE_MIN(len, dlen);
1196 rte_memcpy(pdst, psrc, part);
1198 loc->mbuf_off += part;
1201 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
1203 /* Exhausted packet, just free. */
1205 loc->mbuf = mbuf->next;
1206 rte_pktmbuf_free_seg(mbuf);
1208 MLX5_ASSERT(loc->mbuf_nseg >= 1);
1218 * Build the Ethernet Segment with inlined data from multi-segment packet.
1219 * Checks the boundary of WQEBB and ring buffer wrapping, supports Software
1220 * Parser, Checksums and VLAN insertion Tx offload features.
1223 * Pointer to TX queue structure.
1225 * Pointer to burst routine local context.
1227 * Pointer to WQE to fill with built Ethernet Segment.
1229 * Length of VLAN tag insertion if any.
1231 * Length of data to inline (VLAN included, if any).
1233 * TSO flag, set mss field from the packet.
1235 * Configured Tx offloads mask. It is fully defined at
1236 * compile time and may be used for optimization.
1239 * Pointer to the next Data Segment (aligned and possibly NOT wrapped
1240 * around - caller should do wrapping check on its own).
1242 static __rte_always_inline struct mlx5_wqe_dseg *
1243 mlx5_tx_eseg_mdat(struct mlx5_txq_data *__rte_restrict txq,
1244 struct mlx5_txq_local *__rte_restrict loc,
1245 struct mlx5_wqe *__rte_restrict wqe,
1251 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1254 unsigned int part, tlen = 0;
1257 * Calculate and set check sum flags first, uint32_t field
1258 * in segment may be shared with Software Parser flags.
1260 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1263 csum |= loc->mbuf->tso_segsz;
1264 es->flags = rte_cpu_to_be_32(csum);
1266 es->flags = rte_cpu_to_le_32(csum);
1269 * Calculate and set Software Parser offsets and flags.
1270 * These flags are set for custom UDP and IP tunnel packets.
1272 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1273 /* Fill metadata field if needed. */
1274 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1275 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1276 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1278 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1279 pdst = (uint8_t *)&es->inline_data;
1280 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1281 /* Implement VLAN tag insertion as part of inline data. */
1282 mlx5_tx_mseg_memcpy(pdst, loc,
1283 2 * RTE_ETHER_ADDR_LEN,
1284 2 * RTE_ETHER_ADDR_LEN, olx);
1285 pdst += 2 * RTE_ETHER_ADDR_LEN;
1286 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1287 ((RTE_ETHER_TYPE_VLAN << 16) |
1288 loc->mbuf->vlan_tci);
1289 pdst += sizeof(struct rte_vlan_hdr);
1290 tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
1292 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1294 * The WQEBB space availability is checked by caller.
1295 * Here we should be aware of WQE ring buffer wraparound only.
1297 part = (uint8_t *)txq->wqes_end - pdst;
1298 part = RTE_MIN(part, inlen - tlen);
1304 * Copying may be interrupted inside the routine
1305 * if the no inline hint flag is encountered.
1307 copy = tso ? inlen : txq->inlen_mode;
1308 copy = tlen >= copy ? 0 : (copy - tlen);
1309 copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx);
1311 if (likely(inlen <= tlen) || copy < part) {
1312 es->inline_hdr_sz = rte_cpu_to_be_16(tlen);
1314 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1315 return (struct mlx5_wqe_dseg *)pdst;
1317 pdst = (uint8_t *)txq->wqes;
1318 part = inlen - tlen;
1323 * Build the Data Segment of pointer type.
1326 * Pointer to TX queue structure.
1328 * Pointer to burst routine local context.
1330 * Pointer to WQE to fill with built Data Segment.
1332 * Data buffer to point.
1334 * Data buffer length.
1336 * Configured Tx offloads mask. It is fully defined at
1337 * compile time and may be used for optimization.
1339 static __rte_always_inline void
1340 mlx5_tx_dseg_ptr(struct mlx5_txq_data *__rte_restrict txq,
1341 struct mlx5_txq_local *__rte_restrict loc,
1342 struct mlx5_wqe_dseg *__rte_restrict dseg,
1345 unsigned int olx __rte_unused)
1349 dseg->bcount = rte_cpu_to_be_32(len);
1350 dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1351 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1355 * Build the Data Segment of pointer type, or inline the data if its length
1356 * fits into the minimal Data Segment size.
1359 * Pointer to TX queue structure.
1361 * Pointer to burst routine local context.
1363 * Pointer to WQE to fill with built Data Segment.
1365 * Data buffer to point.
1367 * Data buffer length.
1369 * Configured Tx offloads mask. It is fully defined at
1370 * compile time and may be used for optimization.
1372 static __rte_always_inline void
1373 mlx5_tx_dseg_iptr(struct mlx5_txq_data *__rte_restrict txq,
1374 struct mlx5_txq_local *__rte_restrict loc,
1375 struct mlx5_wqe_dseg *__rte_restrict dseg,
1378 unsigned int olx __rte_unused)
1384 if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
1385 dseg->bcount = rte_cpu_to_be_32(len);
1386 dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1387 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1391 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1392 /* Unrolled implementation of generic rte_memcpy. */
1393 dst = (uintptr_t)&dseg->inline_data[0];
1394 src = (uintptr_t)buf;
1396 #ifdef RTE_ARCH_STRICT_ALIGN
1397 MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
1398 *(uint32_t *)dst = *(unaligned_uint32_t *)src;
1399 dst += sizeof(uint32_t);
1400 src += sizeof(uint32_t);
1401 *(uint32_t *)dst = *(unaligned_uint32_t *)src;
1402 dst += sizeof(uint32_t);
1403 src += sizeof(uint32_t);
1405 *(uint64_t *)dst = *(unaligned_uint64_t *)src;
1406 dst += sizeof(uint64_t);
1407 src += sizeof(uint64_t);
1411 *(uint32_t *)dst = *(unaligned_uint32_t *)src;
1412 dst += sizeof(uint32_t);
1413 src += sizeof(uint32_t);
1416 *(uint16_t *)dst = *(unaligned_uint16_t *)src;
1417 dst += sizeof(uint16_t);
1418 src += sizeof(uint16_t);
1421 *(uint8_t *)dst = *(uint8_t *)src;
1425 * Build the Data Segment of inlined data from single
1426 * segment packet, no VLAN insertion.
1429 * Pointer to TX queue structure.
1431 * Pointer to burst routine local context.
1433 * Pointer to WQE to fill with built Data Segment.
1435 * Data buffer to point.
1437 * Data buffer length.
1439 * Configured Tx offloads mask. It is fully defined at
1440 * compile time and may be used for optimization.
1443 * Pointer to the next Data Segment after inlined data.
1444 * Ring buffer wraparound check is needed. We do not do it here because it
1445 * may not be needed for the last packet in the eMPW session.
1447 static __rte_always_inline struct mlx5_wqe_dseg *
1448 mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq,
1449 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1450 struct mlx5_wqe_dseg *__rte_restrict dseg,
1453 unsigned int olx __rte_unused)
1458 if (!MLX5_TXOFF_CONFIG(MPW)) {
1459 /* Store the descriptor byte counter for eMPW sessions. */
1460 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1461 pdst = &dseg->inline_data[0];
1463 /* The entire legacy MPW session counter is stored on close. */
1464 pdst = (uint8_t *)dseg;
1467 * The WQEBB space availability is checked by caller.
1468 * Here we should be aware of WQE ring buffer wraparound only.
1470 part = (uint8_t *)txq->wqes_end - pdst;
1471 part = RTE_MIN(part, len);
1473 rte_memcpy(pdst, buf, part);
1477 if (!MLX5_TXOFF_CONFIG(MPW))
1478 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1479 /* Note: no final wraparound check here. */
1480 return (struct mlx5_wqe_dseg *)pdst;
1482 pdst = (uint8_t *)txq->wqes;
1489 * Build the Data Segment of inlined data from single
1490 * segment packet with VLAN insertion.
1493 * Pointer to TX queue structure.
1495 * Pointer to burst routine local context.
1497 * Pointer to the dseg to fill with the built Data Segment.
1499 * Data buffer to point.
1501 * Data buffer length.
1503 * Configured Tx offloads mask. It is fully defined at
1504 * compile time and may be used for optimization.
1507 * Pointer to the next Data Segment after inlined data.
1508 * Ring buffer wraparound check is needed.
1510 static __rte_always_inline struct mlx5_wqe_dseg *
1511 mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq,
1512 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1513 struct mlx5_wqe_dseg *__rte_restrict dseg,
1516 unsigned int olx __rte_unused)
1522 MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE);
1523 if (!MLX5_TXOFF_CONFIG(MPW)) {
1524 /* Store the descriptor byte counter for eMPW sessions. */
1525 dseg->bcount = rte_cpu_to_be_32
1526 ((len + sizeof(struct rte_vlan_hdr)) |
1527 MLX5_ETH_WQE_DATA_INLINE);
1528 pdst = &dseg->inline_data[0];
1530 /* The entire legacy MPW session counter is stored on close. */
1531 pdst = (uint8_t *)dseg;
1533 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
1534 buf += MLX5_DSEG_MIN_INLINE_SIZE;
1535 pdst += MLX5_DSEG_MIN_INLINE_SIZE;
1536 len -= MLX5_DSEG_MIN_INLINE_SIZE;
1537 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
1538 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1539 if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1540 pdst = (uint8_t *)txq->wqes;
1541 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
1542 loc->mbuf->vlan_tci);
1543 pdst += sizeof(struct rte_vlan_hdr);
1545 * The WQEBB space availability is checked by caller.
1546 * Here we should be aware of WQE ring buffer wraparound only.
1548 part = (uint8_t *)txq->wqes_end - pdst;
1549 part = RTE_MIN(part, len);
1551 rte_memcpy(pdst, buf, part);
1555 if (!MLX5_TXOFF_CONFIG(MPW))
1556 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1557 /* Note: no final wraparound check here. */
1558 return (struct mlx5_wqe_dseg *)pdst;
1560 pdst = (uint8_t *)txq->wqes;
1567 * Build the Ethernet Segment with optionally inlined data with
1568 * VLAN insertion and following Data Segments (if any) from
1569 * multi-segment packet. Used by ordinary send and TSO.
1572 * Pointer to TX queue structure.
1574 * Pointer to burst routine local context.
1576 * Pointer to WQE to fill with built Ethernet/Data Segments.
1578 * Length of VLAN header to insert, 0 means no VLAN insertion.
1580 * Data length to inline. For TSO this parameter specifies the exact value,
1581 * for the ordinary send routine it can be aligned by the caller to provide
1582 * better WQE space saving and data buffer start address alignment.
1583 * This length includes the VLAN header being inserted.
1585 * Zero means ordinary send, inlined data can be extended,
1586 * otherwise this is TSO, inlined data length is fixed.
1588 * Configured Tx offloads mask. It is fully defined at
1589 * compile time and may be used for optimization.
1592 * Actual size of built WQE in segments.
1594 static __rte_always_inline unsigned int
1595 mlx5_tx_mseg_build(struct mlx5_txq_data *__rte_restrict txq,
1596 struct mlx5_txq_local *__rte_restrict loc,
1597 struct mlx5_wqe *__rte_restrict wqe,
1601 unsigned int olx __rte_unused)
1603 struct mlx5_wqe_dseg *__rte_restrict dseg;
1606 MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
1607 loc->mbuf_nseg = NB_SEGS(loc->mbuf);
1610 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
1611 if (!loc->mbuf_nseg)
1614 * There are still some mbufs remaining, not inlined.
1615 * The first mbuf may be partially inlined and we
1616 * must process the possible non-zero data offset.
1618 if (loc->mbuf_off) {
1623 * Exhausted packets must have been dropped before.
1624 * A non-zero offset means there is some data
1625 * remaining in the packet.
1627 MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
1628 MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf));
1629 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1631 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
1633 * Build the pointer/minimal Data Segment.
1634 * Do ring buffer wrapping check in advance.
1636 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1637 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1638 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
1639 /* Store the mbuf to be freed on completion. */
1640 MLX5_ASSERT(loc->elts_free);
1641 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1644 if (--loc->mbuf_nseg == 0)
1646 loc->mbuf = loc->mbuf->next;
1650 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
1651 struct rte_mbuf *mbuf;
1653 /* Zero length segment found, just skip. */
1655 loc->mbuf = loc->mbuf->next;
1656 rte_pktmbuf_free_seg(mbuf);
1657 if (--loc->mbuf_nseg == 0)
1660 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1661 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1664 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
1665 rte_pktmbuf_data_len(loc->mbuf), olx);
1666 MLX5_ASSERT(loc->elts_free);
1667 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1670 if (--loc->mbuf_nseg == 0)
1672 loc->mbuf = loc->mbuf->next;
1677 /* Calculate actual segments used from the dseg pointer. */
1678 if ((uintptr_t)wqe < (uintptr_t)dseg)
1679 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
1681 ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
1682 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
1687 * The routine checks the timestamp flag in the current packet,
1688 * and pushes a WAIT WQE into the queue if scheduling is required.
1691 * Pointer to TX queue structure.
1693 * Pointer to burst routine local context.
1695 * Configured Tx offloads mask. It is fully defined at
1696 * compile time and may be used for optimization.
1699 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1700 * MLX5_TXCMP_CODE_SINGLE - continue processing with the packet.
1701 * MLX5_TXCMP_CODE_MULTI - the WAIT inserted, continue processing.
1702 * Local context variables partially updated.
1704 static __rte_always_inline enum mlx5_txcmp_code
1705 mlx5_tx_schedule_send(struct mlx5_txq_data *restrict txq,
1706 struct mlx5_txq_local *restrict loc,
1709 if (MLX5_TXOFF_CONFIG(TXPP) &&
1710 loc->mbuf->ol_flags & txq->ts_mask) {
1711 struct mlx5_wqe *wqe;
1716 * Estimate the required space quickly and roughly.
1717 * We would like to ensure the packet can be pushed
1718 * to the queue and we won't get an orphan WAIT WQE.
1720 if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
1721 loc->elts_free < NB_SEGS(loc->mbuf))
1722 return MLX5_TXCMP_CODE_EXIT;
1723 /* Convert the timestamp into a completion index to wait for. */
1724 ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
1725 wci = mlx5_txpp_convert_tx_ts(txq->sh, ts);
1726 if (unlikely(wci < 0))
1727 return MLX5_TXCMP_CODE_SINGLE;
1728 /* Build the WAIT WQE with specified completion. */
1729 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1730 mlx5_tx_cseg_init(txq, loc, wqe, 2, MLX5_OPCODE_WAIT, olx);
1731 mlx5_tx_wseg_init(txq, loc, wqe, wci, olx);
1734 return MLX5_TXCMP_CODE_MULTI;
1736 return MLX5_TXCMP_CODE_SINGLE;
1740 * Tx one packet function for multi-segment TSO. Supports all
1741 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
1742 * sends one packet per WQE.
1744 * This routine is responsible for storing the processed mbuf
1745 * into the elts ring buffer and updating elts_head.
1748 * Pointer to TX queue structure.
1750 * Pointer to burst routine local context.
1752 * Configured Tx offloads mask. It is fully defined at
1753 * compile time and may be used for optimization.
1756 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1757 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1758 * Local context variables partially updated.
1760 static __rte_always_inline enum mlx5_txcmp_code
1761 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *__rte_restrict txq,
1762 struct mlx5_txq_local *__rte_restrict loc,
1765 struct mlx5_wqe *__rte_restrict wqe;
1766 unsigned int ds, dlen, inlen, ntcp, vlan = 0;
1768 if (MLX5_TXOFF_CONFIG(TXPP)) {
1769 enum mlx5_txcmp_code wret;
1771 /* Generate WAIT for scheduling if requested. */
1772 wret = mlx5_tx_schedule_send(txq, loc, olx);
1773 if (wret == MLX5_TXCMP_CODE_EXIT)
1774 return MLX5_TXCMP_CODE_EXIT;
1775 if (wret == MLX5_TXCMP_CODE_ERROR)
1776 return MLX5_TXCMP_CODE_ERROR;
1779 * Calculate data length to be inlined to estimate
1780 * the required space in WQE ring buffer.
1782 dlen = rte_pktmbuf_pkt_len(loc->mbuf);
1783 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1784 vlan = sizeof(struct rte_vlan_hdr);
1785 inlen = loc->mbuf->l2_len + vlan +
1786 loc->mbuf->l3_len + loc->mbuf->l4_len;
1787 if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
1788 return MLX5_TXCMP_CODE_ERROR;
1789 if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
1790 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
1791 /* Packet must contain all TSO headers. */
1792 if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
1793 inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
1794 inlen > (dlen + vlan)))
1795 return MLX5_TXCMP_CODE_ERROR;
1796 MLX5_ASSERT(inlen >= txq->inlen_mode);
1798 * Check whether there are enough free WQEBBs:
1800 * - Ethernet Segment
1801 * - First Segment of inlined Ethernet data
1802 * - ... data continued ...
1803 * - Data Segments of pointer/min inline type
1805 ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
1806 MLX5_ESEG_MIN_INLINE_SIZE +
1808 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
1809 if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1810 return MLX5_TXCMP_CODE_EXIT;
1811 /* Check for maximal WQE size. */
1812 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
1813 return MLX5_TXCMP_CODE_ERROR;
1814 #ifdef MLX5_PMD_SOFT_COUNTERS
1815 /* Update sent data bytes/packets counters. */
1816 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
1817 loc->mbuf->tso_segsz;
1819 * One will be added for mbuf itself at the end of the mlx5_tx_burst
1820 * from loc->pkts_sent field.
1823 txq->stats.opackets += ntcp;
1824 txq->stats.obytes += dlen + vlan + ntcp * inlen;
1826 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1827 loc->wqe_last = wqe;
1828 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
1829 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
1830 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
1831 txq->wqe_ci += (ds + 3) / 4;
1832 loc->wqe_free -= (ds + 3) / 4;
1833 return MLX5_TXCMP_CODE_MULTI;
1837 * Tx one packet function for multi-segment SEND. Supports all types of Tx
1838 * offloads, uses MLX5_OPCODE_SEND to build WQEs, sends one packet per WQE,
1839 * without any data inlining in Ethernet Segment.
1841 * This routine is responsible for storing the processed mbuf
1842 * into the elts ring buffer and updating elts_head.
1845 * Pointer to TX queue structure.
1847 * Pointer to burst routine local context.
1849 * Configured Tx offloads mask. It is fully defined at
1850 * compile time and may be used for optimization.
1853 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1854 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1855 * Local context variables partially updated.
1857 static __rte_always_inline enum mlx5_txcmp_code
1858 mlx5_tx_packet_multi_send(struct mlx5_txq_data *__rte_restrict txq,
1859 struct mlx5_txq_local *__rte_restrict loc,
1862 struct mlx5_wqe_dseg *__rte_restrict dseg;
1863 struct mlx5_wqe *__rte_restrict wqe;
1864 unsigned int ds, nseg;
1866 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
1867 if (MLX5_TXOFF_CONFIG(TXPP)) {
1868 enum mlx5_txcmp_code wret;
1870 /* Generate WAIT for scheduling if requested. */
1871 wret = mlx5_tx_schedule_send(txq, loc, olx);
1872 if (wret == MLX5_TXCMP_CODE_EXIT)
1873 return MLX5_TXCMP_CODE_EXIT;
1874 if (wret == MLX5_TXCMP_CODE_ERROR)
1875 return MLX5_TXCMP_CODE_ERROR;
1878 * No inlining at all, which means that saving CPU cycles was prioritized
1879 * in the configuration, so we should not copy any packet data to the WQE.
1881 nseg = NB_SEGS(loc->mbuf);
1883 if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1884 return MLX5_TXCMP_CODE_EXIT;
1885 /* Check for maximal WQE size. */
1886 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
1887 return MLX5_TXCMP_CODE_ERROR;
1889 * Some Tx offloads may cause an error if packet is not long enough,
1890 * check against assumed minimal length.
1892 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
1893 return MLX5_TXCMP_CODE_ERROR;
1894 #ifdef MLX5_PMD_SOFT_COUNTERS
1895 /* Update sent data bytes counter. */
1896 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
1897 if (MLX5_TXOFF_CONFIG(VLAN) &&
1898 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1899 txq->stats.obytes += sizeof(struct rte_vlan_hdr);
1902 * SEND WQE, one WQEBB:
1903 * - Control Segment, SEND opcode
1904 * - Ethernet Segment, optional VLAN, no inline
1905 * - Data Segments, pointer only type
1907 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1908 loc->wqe_last = wqe;
1909 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
1910 mlx5_tx_eseg_none(txq, loc, wqe, olx);
1911 dseg = &wqe->dseg[0];
1913 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
1914 struct rte_mbuf *mbuf;
1917 * Zero length segment found, the total WQE size
1918 * in segments has to be corrected.
1919 * This is supposed to be a rare occasion, so in the normal
1920 * case (no zero length segments) we avoid the extra
1921 * write to the Control Segment.
1924 wqe->cseg.sq_ds -= RTE_BE32(1);
1926 loc->mbuf = mbuf->next;
1927 rte_pktmbuf_free_seg(mbuf);
1933 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
1934 rte_pktmbuf_data_len(loc->mbuf), olx);
1935 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1940 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1941 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1942 loc->mbuf = loc->mbuf->next;
1945 txq->wqe_ci += (ds + 3) / 4;
1946 loc->wqe_free -= (ds + 3) / 4;
1947 return MLX5_TXCMP_CODE_MULTI;
1951 * Tx one packet function for multi-segment SEND. Supports all
1952 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
1953 * sends one packet per WQE, with data inlining in
1954 * Ethernet Segment and minimal Data Segments.
1956 * This routine is responsible for storing the processed mbuf
1957 * into the elts ring buffer and updating elts_head.
1960 * Pointer to TX queue structure.
1962 * Pointer to burst routine local context.
1964 * Configured Tx offloads mask. It is fully defined at
1965 * compile time and may be used for optimization.
1968 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1969 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1970 * Local context variables partially updated.
1972 static __rte_always_inline enum mlx5_txcmp_code
1973 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *__rte_restrict txq,
1974 struct mlx5_txq_local *__rte_restrict loc,
1977 struct mlx5_wqe *__rte_restrict wqe;
1978 unsigned int ds, inlen, dlen, vlan = 0;
1980 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
1981 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
1982 if (MLX5_TXOFF_CONFIG(TXPP)) {
1983 enum mlx5_txcmp_code wret;
1985 /* Generate WAIT for scheduling if requested. */
1986 wret = mlx5_tx_schedule_send(txq, loc, olx);
1987 if (wret == MLX5_TXCMP_CODE_EXIT)
1988 return MLX5_TXCMP_CODE_EXIT;
1989 if (wret == MLX5_TXCMP_CODE_ERROR)
1990 return MLX5_TXCMP_CODE_ERROR;
1993 * First calculate data length to be inlined
1994 * to estimate the required space for WQE.
1996 dlen = rte_pktmbuf_pkt_len(loc->mbuf);
1997 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1998 vlan = sizeof(struct rte_vlan_hdr);
1999 inlen = dlen + vlan;
2000 /* Check against minimal length. */
2001 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
2002 return MLX5_TXCMP_CODE_ERROR;
2003 MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
2004 if (inlen > txq->inlen_send ||
2005 loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
2006 struct rte_mbuf *mbuf;
2011 nxlen = rte_pktmbuf_data_len(mbuf);
2013 * Packet length exceeds the allowed inline data length,
2014 * check whether the minimal inlining is required.
2016 if (txq->inlen_mode) {
2017 MLX5_ASSERT(txq->inlen_mode >=
2018 MLX5_ESEG_MIN_INLINE_SIZE);
2019 MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
2020 inlen = txq->inlen_mode;
2021 } else if (vlan && !txq->vlan_en) {
2023 * VLAN insertion is requested and the hardware does not
2024 * support the offload, we will do it with software inlining.
2026 inlen = MLX5_ESEG_MIN_INLINE_SIZE;
2027 } else if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE ||
2028 nxlen > txq->inlen_send) {
2029 return mlx5_tx_packet_multi_send(txq, loc, olx);
2034 * Now we know the minimal amount of data requested
2035 * to be inlined. Check whether we should inline the buffers
2036 * from the beginning of the chain to eliminate some mbufs.
2038 if (unlikely(nxlen <= txq->inlen_send)) {
2039 /* We can inline first mbuf at least. */
2040 if (nxlen < inlen) {
2043 /* Scan mbufs till inlen filled. */
2048 nxlen = rte_pktmbuf_data_len(mbuf);
2050 } while (unlikely(nxlen < inlen));
2051 if (unlikely(nxlen > txq->inlen_send)) {
2052 /* We cannot inline entire mbuf. */
2053 smlen = inlen - smlen;
2054 start = rte_pktmbuf_mtod_offset
2055 (mbuf, uintptr_t, smlen);
2063 /* We must not reach the end of the packet here. */
2065 nxlen = inlen + rte_pktmbuf_data_len(mbuf);
2066 } while (unlikely(nxlen < txq->inlen_send));
2068 start = rte_pktmbuf_mtod(mbuf, uintptr_t);
2070 * Check whether we can do inline to align start
2071 * address of data buffer to cacheline.
2074 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
2075 if (unlikely(start)) {
2077 if (start <= txq->inlen_send)
2082 * Check whether there are enough free WQEBBs:
2084 * - Ethernet Segment
2085 * - First Segment of inlined Ethernet data
2086 * - ... data continued ...
2087 * - Data Segments of pointer/min inline type
2089 * Estimate the number of Data Segments conservatively,
2090 * supposing no mbufs are freed during inlining.
2092 MLX5_ASSERT(inlen <= txq->inlen_send);
2093 ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
2094 MLX5_ESEG_MIN_INLINE_SIZE +
2096 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2097 if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
2098 return MLX5_TXCMP_CODE_EXIT;
2099 /* Check for maximal WQE size. */
2100 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
2101 return MLX5_TXCMP_CODE_ERROR;
2102 #ifdef MLX5_PMD_SOFT_COUNTERS
2103 /* Update sent data bytes/packets counters. */
2104 txq->stats.obytes += dlen + vlan;
2106 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2107 loc->wqe_last = wqe;
2108 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
2109 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
2110 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2111 txq->wqe_ci += (ds + 3) / 4;
2112 loc->wqe_free -= (ds + 3) / 4;
2113 return MLX5_TXCMP_CODE_MULTI;
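/*
 * Illustrative worked example (hypothetical figures, assuming the minimal
 * Ethernet Segment inline size is 18 bytes and WSEG is 16 bytes): the
 * conservative estimate above is ds = nseg + 2 + ceil((inlen - 18) / 16).
 * For a 3-segment packet with inlen = 210 bytes:
 *
 *   ds     = 3 + 2 + (210 - 18 + 15) / 16 = 5 + 12 = 17;
 *   wqebbs = (17 + 3) / 4 = 5;
 *
 * mlx5_tx_mseg_build() then returns the exact DS count, which is written
 * back to the Control Segment.
 */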
2117 * Tx burst function for multi-segment packets. Supports all
2118 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
2119 * sends one packet per WQE. The function stops sending if it
2120 * encounters a single-segment packet.
2122 * This routine is responsible for storing the processed mbuf
2123 * into the elts ring buffer and updating elts_head.
2126 * Pointer to TX queue structure.
2128 * Packets to transmit.
2130 * Number of packets in array.
2132 * Pointer to burst routine local context.
2134 * Configured Tx offloads mask. It is fully defined at
2135 * compile time and may be used for optimization.
2138 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2139 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2140 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2141 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
2142 * Local context variables updated.
2144 static __rte_always_inline enum mlx5_txcmp_code
2145 mlx5_tx_burst_mseg(struct mlx5_txq_data *__rte_restrict txq,
2146 struct rte_mbuf **__rte_restrict pkts,
2147 unsigned int pkts_n,
2148 struct mlx5_txq_local *__rte_restrict loc,
2151 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2152 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2153 pkts += loc->pkts_sent + 1;
2154 pkts_n -= loc->pkts_sent;
2156 enum mlx5_txcmp_code ret;
2158 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
2160 * Estimate the number of free elts quickly but conservatively.
2161 * Some segments may be fully inlined and freed,
2162 * ignore this here - precise estimation is costly.
2164 if (loc->elts_free < NB_SEGS(loc->mbuf))
2165 return MLX5_TXCMP_CODE_EXIT;
2166 if (MLX5_TXOFF_CONFIG(TSO) &&
2167 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
2168 /* Proceed with multi-segment TSO. */
2169 ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
2170 } else if (MLX5_TXOFF_CONFIG(INLINE)) {
2171 /* Proceed with multi-segment SEND with inlining. */
2172 ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
2174 /* Proceed with multi-segment SEND w/o inlining. */
2175 ret = mlx5_tx_packet_multi_send(txq, loc, olx);
2177 if (ret == MLX5_TXCMP_CODE_EXIT)
2178 return MLX5_TXCMP_CODE_EXIT;
2179 if (ret == MLX5_TXCMP_CODE_ERROR)
2180 return MLX5_TXCMP_CODE_ERROR;
2181 /* WQE is built, go to the next packet. */
2184 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2185 return MLX5_TXCMP_CODE_EXIT;
2186 loc->mbuf = *pkts++;
2188 rte_prefetch0(*pkts);
2189 if (likely(NB_SEGS(loc->mbuf) > 1))
2191 /* Here ends the series of multi-segment packets. */
2192 if (MLX5_TXOFF_CONFIG(TSO) &&
2193 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2194 return MLX5_TXCMP_CODE_TSO;
2195 return MLX5_TXCMP_CODE_SINGLE;
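/*
 * Illustrative summary (editorial sketch, not driver code): the dispatch
 * above resolves at compile time through the olx mask into a simple
 * priority order:
 *
 *   if TSO is configured and the mbuf requests TCP segmentation
 *           -> mlx5_tx_packet_multi_tso()    (TSO opcode, headers inlined)
 *   else if data inlining is configured
 *           -> mlx5_tx_packet_multi_inline() (SEND with data inlining)
 *   else
 *           -> mlx5_tx_packet_multi_send()   (SEND, pointer segments only)
 *
 * The loop stops at the first single-segment packet and reports it back
 * via MLX5_TXCMP_CODE_SINGLE or MLX5_TXCMP_CODE_TSO so the caller can
 * switch to the appropriate branch.
 */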
2201 * Tx burst function for single-segment packets with TSO.
2202 * Supports all types of Tx offloads, except multi-packets.
2203 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
2204 * The function stops sending if it encounters a multi-segment
2205 * packet or a packet without TSO requested.
2207 * The routine is responsible for storing the processed mbuf into the elts ring
2208 * buffer and updating elts_head if inline offload is requested, due to possible early
2209 * freeing of the inlined mbufs (the pkts array cannot be stored in elts as a batch).
2212 * Pointer to TX queue structure.
2214 * Packets to transmit.
2216 * Number of packets in array.
2218 * Pointer to burst routine local context.
2220 * Configured Tx offloads mask. It is fully defined at
2221 * compile time and may be used for optimization.
2224 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2225 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2226 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2227 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2228 * Local context variables updated.
2230 static __rte_always_inline enum mlx5_txcmp_code
2231 mlx5_tx_burst_tso(struct mlx5_txq_data *__rte_restrict txq,
2232 struct rte_mbuf **__rte_restrict pkts,
2233 unsigned int pkts_n,
2234 struct mlx5_txq_local *__rte_restrict loc,
2237 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2238 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2239 pkts += loc->pkts_sent + 1;
2240 pkts_n -= loc->pkts_sent;
2242 struct mlx5_wqe_dseg *__rte_restrict dseg;
2243 struct mlx5_wqe *__rte_restrict wqe;
2244 unsigned int ds, dlen, hlen, ntcp, vlan = 0;
2247 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2248 if (MLX5_TXOFF_CONFIG(TXPP)) {
2249 enum mlx5_txcmp_code wret;
2251 /* Generate WAIT for scheduling if requested. */
2252 wret = mlx5_tx_schedule_send(txq, loc, olx);
2253 if (wret == MLX5_TXCMP_CODE_EXIT)
2254 return MLX5_TXCMP_CODE_EXIT;
2255 if (wret == MLX5_TXCMP_CODE_ERROR)
2256 return MLX5_TXCMP_CODE_ERROR;
2258 dlen = rte_pktmbuf_data_len(loc->mbuf);
2259 if (MLX5_TXOFF_CONFIG(VLAN) &&
2260 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
2261 vlan = sizeof(struct rte_vlan_hdr);
2264 * First calculate the WQE size to check
2265 * whether we have enough space in ring buffer.
2267 hlen = loc->mbuf->l2_len + vlan +
2268 loc->mbuf->l3_len + loc->mbuf->l4_len;
2269 if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
2270 return MLX5_TXCMP_CODE_ERROR;
2271 if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
2272 hlen += loc->mbuf->outer_l2_len +
2273 loc->mbuf->outer_l3_len;
2274 /* Segment must contain all TSO headers. */
2275 if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
2276 hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
2277 hlen > (dlen + vlan)))
2278 return MLX5_TXCMP_CODE_ERROR;
2280 * Check whether there are enough free WQEBBs:
2282 * - Ethernet Segment
2283 * - First Segment of inlined Ethernet data
2284 * - ... data continued ...
2285 * - Finishing Data Segment of pointer type
2287 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
2288 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2289 if (loc->wqe_free < ((ds + 3) / 4))
2290 return MLX5_TXCMP_CODE_EXIT;
2291 #ifdef MLX5_PMD_SOFT_COUNTERS
2292 /* Update sent data bytes/packets counters. */
2293 ntcp = (dlen + vlan - hlen +
2294 loc->mbuf->tso_segsz - 1) /
2295 loc->mbuf->tso_segsz;
2297 * One will be added for the mbuf itself at the end
2298 * of mlx5_tx_burst from the loc->pkts_sent field.
2301 txq->stats.opackets += ntcp;
2302 txq->stats.obytes += dlen + vlan + ntcp * hlen;
2305 * Build the TSO WQE:
2307 * - Ethernet Segment with hlen bytes inlined
2308 * - Data Segment of pointer type
2310 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2311 loc->wqe_last = wqe;
2312 mlx5_tx_cseg_init(txq, loc, wqe, ds,
2313 MLX5_OPCODE_TSO, olx);
2314 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
2315 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
2316 dlen -= hlen - vlan;
2317 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
2319 * WQE is built, update the loop parameters
2320 * and go to the next packet.
2322 txq->wqe_ci += (ds + 3) / 4;
2323 loc->wqe_free -= (ds + 3) / 4;
2324 if (MLX5_TXOFF_CONFIG(INLINE))
2325 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2329 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2330 return MLX5_TXCMP_CODE_EXIT;
2331 loc->mbuf = *pkts++;
2333 rte_prefetch0(*pkts);
2334 if (MLX5_TXOFF_CONFIG(MULTI) &&
2335 unlikely(NB_SEGS(loc->mbuf) > 1))
2336 return MLX5_TXCMP_CODE_MULTI;
2337 if (likely(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)))
2338 return MLX5_TXCMP_CODE_SINGLE;
2339 /* Continue with the next TSO packet. */
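/*
 * Illustrative worked example (hypothetical figures, assuming an 18-byte
 * minimal Ethernet Segment inline and 16-byte WSEGs): for the TSO WQE
 * above, with plain Ethernet/IPv4/TCP headers (hlen = 14 + 20 + 20 = 54,
 * no VLAN, no tunnel), a 2000-byte single-segment mbuf and tso_segsz = 1400:
 *
 *   ntcp   = (2000 - 54 + 1400 - 1) / 1400 = 2;  TCP segments on the wire
 *   obytes += 2000 + 2 * 54 = 2108;              headers counted per segment
 *   ds     = 4 + (54 - 18 + 15) / 16 = 7;
 *   wqebbs = (7 + 3) / 4 = 2;
 *
 * so the packet consumes two WQEBBs, with the 54 header bytes inlined into
 * the Ethernet Segment and the payload referenced by a pointer Data Segment.
 */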
2345 * Analyze the packet and select the best method to send.
2348 * Pointer to TX queue structure.
2350 * Pointer to burst routine local context.
2352 * Configured Tx offloads mask. It is fully defined at
2353 * compile time and may be used for optimization.
2355 * The predefined flag indicating whether to do the complete check for
2356 * multi-segment packets and TSO.
2359 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2360 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
2361 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
2362 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
2364 static __rte_always_inline enum mlx5_txcmp_code
2365 mlx5_tx_able_to_empw(struct mlx5_txq_data *__rte_restrict txq,
2366 struct mlx5_txq_local *__rte_restrict loc,
2370 /* Check for multi-segment packet. */
2372 MLX5_TXOFF_CONFIG(MULTI) &&
2373 unlikely(NB_SEGS(loc->mbuf) > 1))
2374 return MLX5_TXCMP_CODE_MULTI;
2375 /* Check for TSO packet. */
2377 MLX5_TXOFF_CONFIG(TSO) &&
2378 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2379 return MLX5_TXCMP_CODE_TSO;
2380 /* Check if eMPW is enabled at all. */
2381 if (!MLX5_TXOFF_CONFIG(EMPW))
2382 return MLX5_TXCMP_CODE_SINGLE;
2383 /* Check if eMPW can be engaged. */
2384 if (MLX5_TXOFF_CONFIG(VLAN) &&
2385 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) &&
2386 (!MLX5_TXOFF_CONFIG(INLINE) ||
2387 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
2388 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
2390 * eMPW does not support the VLAN insertion offload, we would have to
2391 * inline the entire packet, but the packet is too long for inlining.
2393 return MLX5_TXCMP_CODE_SINGLE;
2395 return MLX5_TXCMP_CODE_EMPW;
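/*
 * Illustrative example (hypothetical figures): the last check above means a
 * packet with the VLAN insertion flag can still go through eMPW only when
 * inlining is configured and the whole frame plus the 4-byte VLAN header
 * fits into inlen_empw. For example, with inlen_empw = 256:
 *
 *   128-byte frame + 4  <= 256 -> MLX5_TXCMP_CODE_EMPW (inlined, VLAN in SW)
 *   1400-byte frame + 4 >  256 -> MLX5_TXCMP_CODE_SINGLE (ordinary SEND)
 */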
2399 * Check whether the next packet attributes match the eMPW batch attributes.
2400 * In addition, for legacy MPW the packet length is checked as well.
2403 * Pointer to TX queue structure.
2405 * Pointer to Ethernet Segment of eMPW batch.
2407 * Pointer to burst routine local context.
2409 * Length of previous packet in MPW descriptor.
2411 * Configured Tx offloads mask. It is fully defined at
2412 * compile time and may be used for optimization.
2415 * true - packet match with eMPW batch attributes.
2416 * false - no match, eMPW should be restarted.
2418 static __rte_always_inline bool
2419 mlx5_tx_match_empw(struct mlx5_txq_data *__rte_restrict txq,
2420 struct mlx5_wqe_eseg *__rte_restrict es,
2421 struct mlx5_txq_local *__rte_restrict loc,
2425 uint8_t swp_flags = 0;
2427 /* Compare the checksum flags, if any. */
2428 if (MLX5_TXOFF_CONFIG(CSUM) &&
2429 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
2431 /* Compare the Software Parser offsets and flags. */
2432 if (MLX5_TXOFF_CONFIG(SWP) &&
2433 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
2434 es->swp_flags != swp_flags))
2436 /* Compare the metadata field, if any. */
2437 if (MLX5_TXOFF_CONFIG(METADATA) &&
2438 es->metadata != (loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
2439 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) : 0))
2441 /* Legacy MPW can send packets with the same length only. */
2442 if (MLX5_TXOFF_CONFIG(MPW) &&
2443 dlen != rte_pktmbuf_data_len(loc->mbuf))
2445 /* There must be no VLAN packets in eMPW loop. */
2446 if (MLX5_TXOFF_CONFIG(VLAN))
2447 MLX5_ASSERT(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN));
2448 /* Check if the scheduling is requested. */
2449 if (MLX5_TXOFF_CONFIG(TXPP) &&
2450 loc->mbuf->ol_flags & txq->ts_mask)
2456 * Update send loop variables and WQE for eMPW loop without data inlining.
2457 * Number of Data Segments is equal to the number of sent packets.
2460 * Pointer to TX queue structure.
2462 * Pointer to burst routine local context.
2464 * Number of packets (one Data Segment is built per packet).
2466 * Accumulated statistics, bytes sent.
2468 * Configured Tx offloads mask. It is fully defined at
2469 * compile time and may be used for optimization.
2472 * No return value. The local context and the WQE Control Segment
2473 * are updated in place.
2475 static __rte_always_inline void
2476 mlx5_tx_sdone_empw(struct mlx5_txq_data *__rte_restrict txq,
2477 struct mlx5_txq_local *__rte_restrict loc,
2480 unsigned int olx __rte_unused)
2482 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2483 #ifdef MLX5_PMD_SOFT_COUNTERS
2484 /* Update sent data bytes counter. */
2485 txq->stats.obytes += slen;
2489 loc->elts_free -= ds;
2490 loc->pkts_sent += ds;
2492 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2493 txq->wqe_ci += (ds + 3) / 4;
2494 loc->wqe_free -= (ds + 3) / 4;
2498 * Update send loop variables and WQE for eMPW loop with data inlining.
2499 * Gets the size of the descriptors and data pushed to the WQE.
2502 * Pointer to TX queue structure.
2504 * Pointer to burst routine local context.
2506 * Total size of descriptor/data in bytes.
2508 * Accumulated statistics, data bytes sent.
2510 * The base WQE for the eMPW/MPW descriptor.
2512 * Configured Tx offloads mask. It is fully defined at
2513 * compile time and may be used for optimization.
2516 * No return value. The local context and the eMPW WQE
2517 * are updated in place.
2519 static __rte_always_inline void
2520 mlx5_tx_idone_empw(struct mlx5_txq_data *__rte_restrict txq,
2521 struct mlx5_txq_local *__rte_restrict loc,
2524 struct mlx5_wqe *__rte_restrict wqem,
2525 unsigned int olx __rte_unused)
2527 struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
2529 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2530 #ifdef MLX5_PMD_SOFT_COUNTERS
2531 /* Update sent data bytes counter. */
2532 txq->stats.obytes += slen;
2536 if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
2538 * If the legacy MPW session contains inline packets,
2539 * we should set the length of the only inline data segment
2540 * and align the total length to the segment size.
2542 MLX5_ASSERT(len > sizeof(dseg->bcount));
2543 dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
2544 MLX5_ETH_WQE_DATA_INLINE);
2545 len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
2548 * The session is not legacy MPW, or it contains
2549 * data buffer pointer segments.
2551 MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
2552 len = len / MLX5_WSEG_SIZE + 2;
2554 wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
2555 txq->wqe_ci += (len + 3) / 4;
2556 loc->wqe_free -= (len + 3) / 4;
2557 loc->wqe_last = wqem;
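/*
 * Illustrative worked example (hypothetical figures): for a legacy MPW
 * inline session closed above, len counts the 4-byte bcount field plus the
 * inlined packet data consumed from the session. With 96 bytes of inlined
 * data, len = 4 + 96 = 100 and:
 *
 *   DS     = (100 + 15) / 16 + 2 = 9;   the +2 covers Control/Ethernet Segments
 *   wqebbs = (9 + 3) / 4 = 3;
 *
 * For a non-legacy session len is already WSEG-aligned, so DS = len / 16 + 2.
 */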
2561 * The set of Tx burst functions for single-segment packets without TSO
2562 * and with Multi-Packet Writing feature support.
2563 * Supports all types of Tx offloads, except multi-packets and TSO.
2565 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends as many packets
2566 * per WQE as it can. If eMPW is not configured or the packet cannot be sent with
2567 * eMPW (VLAN insertion) the ordinary SEND opcode is used and only one packet
2570 * The functions stop sending if they encounter a multi-segment packet or a packet
2571 * with TSO requested.
2573 * The routines are responsible for storing the processed mbuf into the elts ring
2574 * buffer and updating elts_head if inlining offload is requested. Otherwise copying
2575 * the mbufs to elts can be postponed and completed at the end of the burst routine.
2578 * Pointer to TX queue structure.
2580 * Packets to transmit.
2582 * Number of packets in array.
2584 * Pointer to burst routine local context.
2586 * Configured Tx offloads mask. It is fully defined at
2587 * compile time and may be used for optimization.
2590 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2591 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2592 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2593 * MLX5_TXCMP_CODE_TSO - TSO packet encountered.
2594 * MLX5_TXCMP_CODE_SINGLE - used inside functions set.
2595 * MLX5_TXCMP_CODE_EMPW - used inside functions set.
2597 * Local context variables updated.
2600 * The routine sends packets with MLX5_OPCODE_EMPW
2601 * without inlining, this is a dedicated optimized branch.
2602 * No VLAN insertion is supported.
2604 static __rte_always_inline enum mlx5_txcmp_code
2605 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *__rte_restrict txq,
2606 struct rte_mbuf **__rte_restrict pkts,
2607 unsigned int pkts_n,
2608 struct mlx5_txq_local *__rte_restrict loc,
2612 * This subroutine is part of mlx5_tx_burst_single() and sends
2613 * single-segment packets with the eMPW opcode without data inlining.
2615 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2616 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2617 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2618 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2619 pkts += loc->pkts_sent + 1;
2620 pkts_n -= loc->pkts_sent;
2622 struct mlx5_wqe_dseg *__rte_restrict dseg;
2623 struct mlx5_wqe_eseg *__rte_restrict eseg;
2624 enum mlx5_txcmp_code ret;
2625 unsigned int part, loop;
2626 unsigned int slen = 0;
2629 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2630 if (MLX5_TXOFF_CONFIG(TXPP)) {
2631 enum mlx5_txcmp_code wret;
2633 /* Generate WAIT for scheduling if requested. */
2634 wret = mlx5_tx_schedule_send(txq, loc, olx);
2635 if (wret == MLX5_TXCMP_CODE_EXIT)
2636 return MLX5_TXCMP_CODE_EXIT;
2637 if (wret == MLX5_TXCMP_CODE_ERROR)
2638 return MLX5_TXCMP_CODE_ERROR;
2640 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2641 MLX5_MPW_MAX_PACKETS :
2642 MLX5_EMPW_MAX_PACKETS);
2643 if (unlikely(loc->elts_free < part)) {
2644 /* We do not have enough elts to save all mbufs. */
2645 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
2646 return MLX5_TXCMP_CODE_EXIT;
2647 /* But we are still able to send at least a minimal eMPW. */
2648 part = loc->elts_free;
2650 /* Check whether we have enough WQEs */
2651 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
2652 if (unlikely(loc->wqe_free <
2653 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2654 return MLX5_TXCMP_CODE_EXIT;
2655 part = (loc->wqe_free * 4) - 2;
2657 if (likely(part > 1))
2658 rte_prefetch0(*pkts);
2659 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2661 * Build eMPW title WQEBB:
2662 * - Control Segment, eMPW opcode
2663 * - Ethernet Segment, no inline
2665 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
2666 MLX5_OPCODE_ENHANCED_MPSW, olx);
2667 mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
2668 olx & ~MLX5_TXOFF_CONFIG_VLAN);
2669 eseg = &loc->wqe_last->eseg;
2670 dseg = &loc->wqe_last->dseg[0];
2672 /* Store the packet length for legacy MPW. */
2673 if (MLX5_TXOFF_CONFIG(MPW))
2674 eseg->mss = rte_cpu_to_be_16
2675 (rte_pktmbuf_data_len(loc->mbuf));
2677 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2678 #ifdef MLX5_PMD_SOFT_COUNTERS
2679 /* Update sent data bytes counter. */
2684 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
2686 if (unlikely(--loop == 0))
2688 loc->mbuf = *pkts++;
2689 if (likely(loop > 1))
2690 rte_prefetch0(*pkts);
2691 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2693 * Unroll the completion code to avoid
2694 * returning a variable value - it results in
2695 * unoptimized sequential checking in the caller.
2697 if (ret == MLX5_TXCMP_CODE_MULTI) {
2699 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2700 if (unlikely(!loc->elts_free ||
2702 return MLX5_TXCMP_CODE_EXIT;
2703 return MLX5_TXCMP_CODE_MULTI;
2705 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2706 if (ret == MLX5_TXCMP_CODE_TSO) {
2708 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2709 if (unlikely(!loc->elts_free ||
2711 return MLX5_TXCMP_CODE_EXIT;
2712 return MLX5_TXCMP_CODE_TSO;
2714 if (ret == MLX5_TXCMP_CODE_SINGLE) {
2716 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2717 if (unlikely(!loc->elts_free ||
2719 return MLX5_TXCMP_CODE_EXIT;
2720 return MLX5_TXCMP_CODE_SINGLE;
2722 if (ret != MLX5_TXCMP_CODE_EMPW) {
2725 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2726 return MLX5_TXCMP_CODE_ERROR;
2729 * Check whether the packet parameters match
2730 * those assumed for the eMPW batch:
2731 * - check sum settings
2733 * - software parser settings
2734 * - packets length (legacy MPW only)
2735 * - scheduling is not required
2737 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
2740 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2741 if (unlikely(!loc->elts_free ||
2743 return MLX5_TXCMP_CODE_EXIT;
2747 /* Packet attributes match, continue the same eMPW. */
2749 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2750 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2752 /* eMPW is built successfully, update loop parameters. */
2754 MLX5_ASSERT(pkts_n >= part);
2755 #ifdef MLX5_PMD_SOFT_COUNTERS
2756 /* Update sent data bytes counter. */
2757 txq->stats.obytes += slen;
2759 loc->elts_free -= part;
2760 loc->pkts_sent += part;
2761 txq->wqe_ci += (2 + part + 3) / 4;
2762 loc->wqe_free -= (2 + part + 3) / 4;
2764 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2765 return MLX5_TXCMP_CODE_EXIT;
2766 loc->mbuf = *pkts++;
2767 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2768 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
2770 /* Continue sending eMPW batches. */
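/*
 * Illustrative worked example (hypothetical figures): a no-inline eMPW
 * session occupies two WSEGs for the title (Control + Ethernet Segments)
 * plus one pointer Data Segment per packet. Closing a batch of
 * part = 6 packets:
 *
 *   segs   = 2 + 6 = 8;
 *   wqebbs = (2 + 6 + 3) / 4 = 2;
 *
 * matching the wqe_ci/wqe_free arithmetic used when the batch is closed.
 */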
2776 * The routine sends packets with MLX5_OPCODE_EMPW
2777 * with inlining, optionally supports VLAN insertion.
2779 static __rte_always_inline enum mlx5_txcmp_code
2780 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *__rte_restrict txq,
2781 struct rte_mbuf **__rte_restrict pkts,
2782 unsigned int pkts_n,
2783 struct mlx5_txq_local *__rte_restrict loc,
2787 * This subroutine is part of mlx5_tx_burst_single() and sends
2788 * single-segment packets with the eMPW opcode with data inlining.
2790 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2791 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2792 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2793 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2794 pkts += loc->pkts_sent + 1;
2795 pkts_n -= loc->pkts_sent;
2797 struct mlx5_wqe_dseg *__rte_restrict dseg;
2798 struct mlx5_wqe *__rte_restrict wqem;
2799 enum mlx5_txcmp_code ret;
2800 unsigned int room, part, nlim;
2801 unsigned int slen = 0;
2803 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2804 if (MLX5_TXOFF_CONFIG(TXPP)) {
2805 enum mlx5_txcmp_code wret;
2807 /* Generate WAIT for scheduling if requested. */
2808 wret = mlx5_tx_schedule_send(txq, loc, olx);
2809 if (wret == MLX5_TXCMP_CODE_EXIT)
2810 return MLX5_TXCMP_CODE_EXIT;
2811 if (wret == MLX5_TXCMP_CODE_ERROR)
2812 return MLX5_TXCMP_CODE_ERROR;
2815 * Limit the number of packets in one WQE
2816 * to improve CQE generation latency.
2818 nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2819 MLX5_MPW_INLINE_MAX_PACKETS :
2820 MLX5_EMPW_MAX_PACKETS);
2821 /* Check whether we have the minimal amount of WQEs. */
2822 if (unlikely(loc->wqe_free <
2823 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2824 return MLX5_TXCMP_CODE_EXIT;
2825 if (likely(pkts_n > 1))
2826 rte_prefetch0(*pkts);
2827 wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2829 * Build eMPW title WQEBB:
2830 * - Control Segment, eMPW opcode, zero DS
2831 * - Ethernet Segment, no inline
2833 mlx5_tx_cseg_init(txq, loc, wqem, 0,
2834 MLX5_OPCODE_ENHANCED_MPSW, olx);
2835 mlx5_tx_eseg_none(txq, loc, wqem,
2836 olx & ~MLX5_TXOFF_CONFIG_VLAN);
2837 dseg = &wqem->dseg[0];
2838 /* Store the packet length for legacy MPW. */
2839 if (MLX5_TXOFF_CONFIG(MPW))
2840 wqem->eseg.mss = rte_cpu_to_be_16
2841 (rte_pktmbuf_data_len(loc->mbuf));
2842 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
2843 loc->wqe_free) * MLX5_WQE_SIZE -
2844 MLX5_WQE_CSEG_SIZE -
2846 /* Limit the room for legacy MPW sessions for performance. */
2847 if (MLX5_TXOFF_CONFIG(MPW))
2848 room = RTE_MIN(room,
2849 RTE_MAX(txq->inlen_empw +
2850 sizeof(dseg->bcount) +
2851 (MLX5_TXOFF_CONFIG(VLAN) ?
2852 sizeof(struct rte_vlan_hdr) : 0),
2853 MLX5_MPW_INLINE_MAX_PACKETS *
2854 MLX5_WQE_DSEG_SIZE));
2855 /* Build WQE till we have space, packets and resources. */
2858 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2859 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2862 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
2863 MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0);
2864 MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
2866 * Some Tx offloads may cause an error if the packet is not
2867 * long enough, check against the assumed minimal length.
2869 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
2871 if (unlikely(!part))
2872 return MLX5_TXCMP_CODE_ERROR;
2874 * We have some successfully built
2875 * packet Data Segments to send.
2877 mlx5_tx_idone_empw(txq, loc, part,
2879 return MLX5_TXCMP_CODE_ERROR;
2881 /* Inline or not inline - that's the Question. */
2882 if (dlen > txq->inlen_empw ||
2883 loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
2885 if (MLX5_TXOFF_CONFIG(MPW)) {
2886 if (dlen > txq->inlen_send)
2890 /* Open new inline MPW session. */
2891 tlen += sizeof(dseg->bcount);
2892 dseg->bcount = RTE_BE32(0);
2894 (dseg, sizeof(dseg->bcount));
2897 * No intermixing of pointer and inline descriptors
2898 * within a legacy MPW session.
2900 if (wqem->dseg[0].bcount)
2904 tlen = sizeof(dseg->bcount) + dlen;
2906 /* Inline entire packet, optional VLAN insertion. */
2907 if (MLX5_TXOFF_CONFIG(VLAN) &&
2908 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
2910 * The packet length must have been checked in
2911 * mlx5_tx_able_to_empw(), so the packet is
2912 * guaranteed to fit into the inline length.
2915 sizeof(struct rte_vlan_hdr)) <=
2917 tlen += sizeof(struct rte_vlan_hdr);
2920 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
2922 #ifdef MLX5_PMD_SOFT_COUNTERS
2923 /* Update sent data bytes counter. */
2924 slen += sizeof(struct rte_vlan_hdr);
2929 dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
2932 if (!MLX5_TXOFF_CONFIG(MPW))
2933 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
2934 MLX5_ASSERT(room >= tlen);
2937 * Packet data are completely inlined,
2938 * we can try to free the packet.
2940 if (likely(loc->pkts_sent == loc->mbuf_free)) {
2942 * All the packets from the beginning of the burst
2943 * are inlined, we can free the mbufs directly
2944 * from the original array on tx_burst exit.
2950 * In order not to call rte_pktmbuf_free_seg() here,
2951 * in the innermost loop (which might be very
2952 * expensive), we just save the mbuf in elts.
2954 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2959 * No intermixing of pointer and inline descriptors
2960 * within a legacy MPW session.
2962 if (MLX5_TXOFF_CONFIG(MPW) &&
2964 wqem->dseg[0].bcount == RTE_BE32(0))
2967 * Non-inlinable VLAN packets are
2968 * processed outside of this routine.
2970 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
2971 if (MLX5_TXOFF_CONFIG(VLAN))
2972 MLX5_ASSERT(!(loc->mbuf->ol_flags &
2973 RTE_MBUF_F_TX_VLAN));
2974 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
2975 /* We have to store mbuf in elts.*/
2976 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2978 room -= MLX5_WQE_DSEG_SIZE;
2979 /* Ring buffer wraparound is checked at the loop end.*/
2982 #ifdef MLX5_PMD_SOFT_COUNTERS
2983 /* Update sent data bytes counter. */
2988 if (unlikely(!pkts_n || !loc->elts_free)) {
2990 * We have no resources/packets to
2991 * continue building descriptors.
2994 mlx5_tx_idone_empw(txq, loc, part,
2996 return MLX5_TXCMP_CODE_EXIT;
2998 loc->mbuf = *pkts++;
2999 if (likely(pkts_n > 1))
3000 rte_prefetch0(*pkts);
3001 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3003 * Unroll the completion code to avoid
3004 * returning a variable value - it results in
3005 * unoptimized sequential checking in the caller.
3007 if (ret == MLX5_TXCMP_CODE_MULTI) {
3009 mlx5_tx_idone_empw(txq, loc, part,
3011 if (unlikely(!loc->elts_free ||
3013 return MLX5_TXCMP_CODE_EXIT;
3014 return MLX5_TXCMP_CODE_MULTI;
3016 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3017 if (ret == MLX5_TXCMP_CODE_TSO) {
3019 mlx5_tx_idone_empw(txq, loc, part,
3021 if (unlikely(!loc->elts_free ||
3023 return MLX5_TXCMP_CODE_EXIT;
3024 return MLX5_TXCMP_CODE_TSO;
3026 if (ret == MLX5_TXCMP_CODE_SINGLE) {
3028 mlx5_tx_idone_empw(txq, loc, part,
3030 if (unlikely(!loc->elts_free ||
3032 return MLX5_TXCMP_CODE_EXIT;
3033 return MLX5_TXCMP_CODE_SINGLE;
3035 if (ret != MLX5_TXCMP_CODE_EMPW) {
3038 mlx5_tx_idone_empw(txq, loc, part,
3040 return MLX5_TXCMP_CODE_ERROR;
3042 /* Check if we have minimal room left. */
3044 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
3047 * Check whether the packet parameters match
3048 * those assumed for the eMPW batch:
3049 * - check sum settings
3051 * - software parser settings
3052 * - packets length (legacy MPW only)
3053 * - scheduling is not required
3055 if (!mlx5_tx_match_empw(txq, &wqem->eseg,
3058 /* Packet attributes match, continue the same eMPW. */
3059 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
3060 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
3063 * We get here to close an existing eMPW
3064 * session and start a new one.
3066 MLX5_ASSERT(pkts_n);
3068 if (unlikely(!part))
3069 return MLX5_TXCMP_CODE_EXIT;
3070 mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
3071 if (unlikely(!loc->elts_free ||
3073 return MLX5_TXCMP_CODE_EXIT;
3074 /* Continue the loop with new eMPW session. */
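/*
 * Illustrative worked example (hypothetical figures): in the inline eMPW
 * loop above, every inlined packet consumes room for a 4-byte bcount header
 * plus its data, aligned up to the 16-byte WSEG (non-legacy eMPW):
 *
 *   dlen = 60 -> tlen = RTE_ALIGN(4 + 60, 16) = 64 bytes (4 WSEGs);
 *   dlen = 57 -> tlen = RTE_ALIGN(4 + 57, 16) = 64 bytes as well;
 *
 * while a packet sent by pointer consumes exactly one 16-byte Data Segment
 * of room. The session is closed once room or packets are exhausted and the
 * consumed total is converted to a DS count in mlx5_tx_idone_empw().
 */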
3080 * The routine sends packets with ordinary MLX5_OPCODE_SEND.
3081 * Data inlining and VLAN insertion are supported.
3083 static __rte_always_inline enum mlx5_txcmp_code
3084 mlx5_tx_burst_single_send(struct mlx5_txq_data *__rte_restrict txq,
3085 struct rte_mbuf **__rte_restrict pkts,
3086 unsigned int pkts_n,
3087 struct mlx5_txq_local *__rte_restrict loc,
3091 * This subroutine is part of mlx5_tx_burst_single()
3092 * and sends single-segment packets with the SEND opcode.
3094 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3095 MLX5_ASSERT(pkts_n > loc->pkts_sent);
3096 pkts += loc->pkts_sent + 1;
3097 pkts_n -= loc->pkts_sent;
3099 struct mlx5_wqe *__rte_restrict wqe;
3100 enum mlx5_txcmp_code ret;
3102 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3103 if (MLX5_TXOFF_CONFIG(TXPP)) {
3104 enum mlx5_txcmp_code wret;
3106 /* Generate WAIT for scheduling if requested. */
3107 wret = mlx5_tx_schedule_send(txq, loc, olx);
3108 if (wret == MLX5_TXCMP_CODE_EXIT)
3109 return MLX5_TXCMP_CODE_EXIT;
3110 if (wret == MLX5_TXCMP_CODE_ERROR)
3111 return MLX5_TXCMP_CODE_ERROR;
3113 if (MLX5_TXOFF_CONFIG(INLINE)) {
3114 unsigned int inlen, vlan = 0;
3116 inlen = rte_pktmbuf_data_len(loc->mbuf);
3117 if (MLX5_TXOFF_CONFIG(VLAN) &&
3118 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
3119 vlan = sizeof(struct rte_vlan_hdr);
3123 * If inlining is enabled at configuration time,
3124 * the limit must not be less than the minimal size.
3125 * Otherwise we would need an extra check for the data
3126 * size to avoid crashes due to length overflow.
3128 MLX5_ASSERT(txq->inlen_send >=
3129 MLX5_ESEG_MIN_INLINE_SIZE);
3130 if (inlen <= txq->inlen_send) {
3131 unsigned int seg_n, wqe_n;
3133 rte_prefetch0(rte_pktmbuf_mtod
3134 (loc->mbuf, uint8_t *));
3135 /* Check against minimal length. */
3136 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
3137 return MLX5_TXCMP_CODE_ERROR;
3138 if (loc->mbuf->ol_flags &
3139 RTE_MBUF_F_TX_DYNF_NOINLINE) {
3141 * The hint flag not to inline packet
3142 * data is set. Check whether we can
3145 if ((!MLX5_TXOFF_CONFIG(EMPW) &&
3147 (MLX5_TXOFF_CONFIG(MPW) &&
3149 if (inlen <= txq->inlen_send)
3152 * The hardware requires the
3153 * minimal inline data header.
3155 goto single_min_inline;
3157 if (MLX5_TXOFF_CONFIG(VLAN) &&
3158 vlan && !txq->vlan_en) {
3160 * We must insert VLAN tag
3161 * by software means.
3163 goto single_part_inline;
3165 goto single_no_inline;
3169 * Completely inlined packet data WQE:
3170 * - Control Segment, SEND opcode
3171 * - Ethernet Segment, no VLAN insertion
3172 * - Data inlined, VLAN optionally inserted
3173 * - Alignment to MLX5_WSEG_SIZE
3174 * We have to estimate the number of WQEBBs
3176 seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
3177 MLX5_ESEG_MIN_INLINE_SIZE +
3178 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3179 /* Check if there are enough WQEBBs. */
3180 wqe_n = (seg_n + 3) / 4;
3181 if (wqe_n > loc->wqe_free)
3182 return MLX5_TXCMP_CODE_EXIT;
3183 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3184 loc->wqe_last = wqe;
3185 mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
3186 MLX5_OPCODE_SEND, olx);
3187 mlx5_tx_eseg_data(txq, loc, wqe,
3188 vlan, inlen, 0, olx);
3189 txq->wqe_ci += wqe_n;
3190 loc->wqe_free -= wqe_n;
3192 * Packet data are completely inlined,
3193 * free the packet immediately.
3195 rte_pktmbuf_free_seg(loc->mbuf);
3196 } else if ((!MLX5_TXOFF_CONFIG(EMPW) ||
3197 MLX5_TXOFF_CONFIG(MPW)) &&
3200 * If minimal inlining is requested, the eMPW
3201 * feature should be disabled because data is
3202 * inlined into the Ethernet Segment, which
3203 * cannot contain inlined data for eMPW since
3204 * the segment is shared by all packets.
3206 struct mlx5_wqe_dseg *__rte_restrict dseg;
3211 * The inline-mode settings require
3212 * inlining the specified amount of
3213 * data bytes into the Ethernet Segment.
3214 * We should check the free space in
3215 * the WQE ring buffer to inline partially.
3218 MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode);
3219 MLX5_ASSERT(inlen > txq->inlen_mode);
3220 MLX5_ASSERT(txq->inlen_mode >=
3221 MLX5_ESEG_MIN_INLINE_SIZE);
3223 * Check whether there are enough free WQEBBs:
3225 * - Ethernet Segment
3226 * - First Segment of inlined Ethernet data
3227 * - ... data continued ...
3228 * - Finishing Data Segment of pointer type
3230 ds = (MLX5_WQE_CSEG_SIZE +
3231 MLX5_WQE_ESEG_SIZE +
3232 MLX5_WQE_DSEG_SIZE +
3234 MLX5_ESEG_MIN_INLINE_SIZE +
3235 MLX5_WQE_DSEG_SIZE +
3236 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3237 if (loc->wqe_free < ((ds + 3) / 4))
3238 return MLX5_TXCMP_CODE_EXIT;
3240 * Build the ordinary SEND WQE:
3242 * - Ethernet Segment, inline inlen_mode bytes
3243 * - Data Segment of pointer type
3245 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3246 loc->wqe_last = wqe;
3247 mlx5_tx_cseg_init(txq, loc, wqe, ds,
3248 MLX5_OPCODE_SEND, olx);
3249 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
3252 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3253 txq->inlen_mode - vlan;
3254 inlen -= txq->inlen_mode;
3255 mlx5_tx_dseg_ptr(txq, loc, dseg,
3258 * WQE is built, update the loop parameters
3259 * and go to the next packet.
3261 txq->wqe_ci += (ds + 3) / 4;
3262 loc->wqe_free -= (ds + 3) / 4;
3263 /* We have to store mbuf in elts.*/
3264 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3265 txq->elts[txq->elts_head++ & txq->elts_m] =
3273 * Partially inlined packet data WQE: we have
3274 * some space in the title WQEBB, we can fill it
3275 * with some packet data. It takes one WQEBB,
3276 * which is available, no extra space check is needed:
3277 * - Control Segment, SEND opcode
3278 * - Ethernet Segment, no VLAN insertion
3279 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
3280 * - Data Segment, pointer type
3282 * We also get here if VLAN insertion is not
3283 * supported by HW but inlining is enabled.
3286 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3287 loc->wqe_last = wqe;
3288 mlx5_tx_cseg_init(txq, loc, wqe, 4,
3289 MLX5_OPCODE_SEND, olx);
3290 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
3291 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3292 MLX5_ESEG_MIN_INLINE_SIZE - vlan;
3294 * The length check is performed above, by
3295 * comparing with txq->inlen_send. We should
3296 * not get an overflow here.
3298 MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
3299 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
3300 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
3304 /* We have to store mbuf in elts.*/
3305 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3306 txq->elts[txq->elts_head++ & txq->elts_m] =
3310 #ifdef MLX5_PMD_SOFT_COUNTERS
3311 /* Update sent data bytes counter. */
3312 txq->stats.obytes += vlan +
3313 rte_pktmbuf_data_len(loc->mbuf);
3317 * No inlining at all: it means that saving CPU cycles
3318 * was prioritized at configuration time, we should not
3319 * copy any packet data to the WQE.
3321 * SEND WQE, one WQEBB:
3322 * - Control Segment, SEND opcode
3323 * - Ethernet Segment, optional VLAN, no inline
3324 * - Data Segment, pointer type
3327 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3328 loc->wqe_last = wqe;
3329 mlx5_tx_cseg_init(txq, loc, wqe, 3,
3330 MLX5_OPCODE_SEND, olx);
3331 mlx5_tx_eseg_none(txq, loc, wqe, olx);
3333 (txq, loc, &wqe->dseg[0],
3334 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3335 rte_pktmbuf_data_len(loc->mbuf), olx);
3339 * We should not store the mbuf pointer in elts
3340 * if no inlining is configured, this is done
3341 * by the calling routine in a batch copy.
3343 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
3345 #ifdef MLX5_PMD_SOFT_COUNTERS
3346 /* Update sent data bytes counter. */
3347 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
3348 if (MLX5_TXOFF_CONFIG(VLAN) &&
3349 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
3350 txq->stats.obytes +=
3351 sizeof(struct rte_vlan_hdr);
3356 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3357 return MLX5_TXCMP_CODE_EXIT;
3358 loc->mbuf = *pkts++;
3360 rte_prefetch0(*pkts);
3361 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3362 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
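/*
 * Illustrative worked example (hypothetical figures, assuming an 18-byte
 * minimal Ethernet Segment inline and 16-byte WSEGs): for the completely
 * inlined SEND path above, a 142-byte frame with a 4-byte VLAN header
 * inserted in software (inlen = 146, assuming it fits within inlen_send)
 * needs:
 *
 *   seg_n = (146 + 3 * 16 - 18 + 15) / 16 = 11;
 *   wqe_n = (11 + 3) / 4 = 3 WQEBBs;
 *
 * and the mbuf is freed immediately since all of its data is in the WQE.
 * With inlen_mode set, only inlen_mode bytes go into the Ethernet Segment
 * and the remainder is attached as a pointer Data Segment instead.
 */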
3368 static __rte_always_inline enum mlx5_txcmp_code
3369 mlx5_tx_burst_single(struct mlx5_txq_data *__rte_restrict txq,
3370 struct rte_mbuf **__rte_restrict pkts,
3371 unsigned int pkts_n,
3372 struct mlx5_txq_local *__rte_restrict loc,
3375 enum mlx5_txcmp_code ret;
3377 ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
3378 if (ret == MLX5_TXCMP_CODE_SINGLE)
3380 MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
3382 /* Optimize for inline/no inline eMPW send. */
3383 ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
3384 mlx5_tx_burst_empw_inline
3385 (txq, pkts, pkts_n, loc, olx) :
3386 mlx5_tx_burst_empw_simple
3387 (txq, pkts, pkts_n, loc, olx);
3388 if (ret != MLX5_TXCMP_CODE_SINGLE)
3390 /* The resources to send one packet should remain. */
3391 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3393 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
3394 MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
3395 if (ret != MLX5_TXCMP_CODE_EMPW)
3397 /* The resources to send one packet should remain. */
3398 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3403 * DPDK Tx callback template. This is a configured template used to generate
3404 * routines optimized for a specified offload setup.
3405 * One of these generated functions is chosen at SQ configuration time.
3408 * Generic pointer to TX queue structure.
3410 * Packets to transmit.
3412 * Number of packets in array.
3414 * Configured offloads mask, presenting the bits of MLX5_TXOFF_CONFIG_xxx
3415 * values. Should be static to take the compile-time static configuration
3419 * Number of packets successfully transmitted (<= pkts_n).
3421 static __rte_always_inline uint16_t
3422 mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict txq,
3423 struct rte_mbuf **__rte_restrict pkts,
3427 struct mlx5_txq_local loc;
3428 enum mlx5_txcmp_code ret;
3431 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3432 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3433 if (unlikely(!pkts_n))
3435 if (MLX5_TXOFF_CONFIG(INLINE))
3439 loc.wqe_last = NULL;
3442 loc.pkts_loop = loc.pkts_sent;
3444 * Check if there are some CQEs, if any:
3445 * - process encountered errors
3446 * - process the completed WQEs
3447 * - free related mbufs
3448 * - doorbell the NIC about processed CQEs
3450 rte_prefetch0(*(pkts + loc.pkts_sent));
3451 mlx5_tx_handle_completion(txq, olx);
3453 * Calculate the number of available resources - elts and WQEs.
3454 * There are two possible different scenarios:
3455 * - no data inlining into WQEs, one WQEBB may contain up to
3456 * four packets, in this case elts become the scarce resource
3457 * - data inlining into WQEs, one packet may require multiple
3458 * WQEBBs, the WQEs become the limiting factor.
3460 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3461 loc.elts_free = txq->elts_s -
3462 (uint16_t)(txq->elts_head - txq->elts_tail);
3463 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3464 loc.wqe_free = txq->wqe_s -
3465 (uint16_t)(txq->wqe_ci - txq->wqe_pi);
3466 if (unlikely(!loc.elts_free || !loc.wqe_free))
3470 * Fetch the packet from array. Usually this is the first
3471 * packet in series of multi/single segment packets.
3473 loc.mbuf = *(pkts + loc.pkts_sent);
3474 /* Dedicated branch for multi-segment packets. */
3475 if (MLX5_TXOFF_CONFIG(MULTI) &&
3476 unlikely(NB_SEGS(loc.mbuf) > 1)) {
3478 * Multi-segment packet encountered.
3479 * Hardware is able to process it only
3480 * with SEND/TSO opcodes, one packet
3481 * per WQE, do it in dedicated routine.
3484 MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
3485 part = loc.pkts_sent - loc.pkts_copy;
3486 if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3488 * There are some single-segment mbufs not
3489 * stored in elts. The mbufs must be in the
3490 * same order as the WQEs, so we must copy the
3491 * mbufs to elts here, before the mbufs of the coming
3492 * multi-segment packet are appended.
3494 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
3496 loc.pkts_copy = loc.pkts_sent;
3498 MLX5_ASSERT(pkts_n > loc.pkts_sent);
3499 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
3500 if (!MLX5_TXOFF_CONFIG(INLINE))
3501 loc.pkts_copy = loc.pkts_sent;
3503 * These returned code checks are supposed
3504 * to be optimized out due to routine inlining.
3506 if (ret == MLX5_TXCMP_CODE_EXIT) {
3508 * The routine returns this code when
3509 * all packets are sent or there are not
3510 * enough resources to complete the request.
3514 if (ret == MLX5_TXCMP_CODE_ERROR) {
3516 * The routine returns this code when some error
3517 * occurred in the incoming packets format.
3519 txq->stats.oerrors++;
3522 if (ret == MLX5_TXCMP_CODE_SINGLE) {
3524 * The single-segment packet was encountered
3525 * in the array, try to send it with the
3526 * best optimized way, possible engaging eMPW.
3528 goto enter_send_single;
3530 if (MLX5_TXOFF_CONFIG(TSO) &&
3531 ret == MLX5_TXCMP_CODE_TSO) {
3533 * The single-segment TSO packet was
3534 * encountered in the array.
3536 goto enter_send_tso;
3538 /* We must not get here. Something is going wrong. */
3540 txq->stats.oerrors++;
3543 /* Dedicated branch for single-segment TSO packets. */
3544 if (MLX5_TXOFF_CONFIG(TSO) &&
3545 unlikely(loc.mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
3547 * TSO might require a special way of inlining
3548 * (dedicated parameters) and is sent with
3549 * the MLX5_OPCODE_TSO opcode only, handle this
3550 * in a dedicated branch.
3553 MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
3554 MLX5_ASSERT(pkts_n > loc.pkts_sent);
3555 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
3557 * These returned code checks are supposed
3558 * to be optimized out due to routine inlining.
3560 if (ret == MLX5_TXCMP_CODE_EXIT)
3562 if (ret == MLX5_TXCMP_CODE_ERROR) {
3563 txq->stats.oerrors++;
3566 if (ret == MLX5_TXCMP_CODE_SINGLE)
3567 goto enter_send_single;
3568 if (MLX5_TXOFF_CONFIG(MULTI) &&
3569 ret == MLX5_TXCMP_CODE_MULTI) {
3571 * The multi-segment packet was
3572 * encountered in the array.
3574 goto enter_send_multi;
3576 /* We must not get here. Something is going wrong. */
3578 txq->stats.oerrors++;
3582 * The dedicated branch for single-segment packets
3583 * without TSO. Often these can be sent using
3584 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
3585 * The routine builds the WQEs till it encounters
3586 * a TSO or multi-segment packet (in case these
3587 * offloads are requested at SQ configuration time).
3590 MLX5_ASSERT(pkts_n > loc.pkts_sent);
3591 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
3593 * These returned code checks are supposed
3594 * to be optimized out due to routine inlining.
3596 if (ret == MLX5_TXCMP_CODE_EXIT)
3598 if (ret == MLX5_TXCMP_CODE_ERROR) {
3599 txq->stats.oerrors++;
3602 if (MLX5_TXOFF_CONFIG(MULTI) &&
3603 ret == MLX5_TXCMP_CODE_MULTI) {
3605 * The multi-segment packet was
3606 * encountered in the array.
3608 goto enter_send_multi;
3610 if (MLX5_TXOFF_CONFIG(TSO) &&
3611 ret == MLX5_TXCMP_CODE_TSO) {
3613 * The single-segment TSO packet was
3614 * encountered in the array.
3616 goto enter_send_tso;
3618 /* We must not get here. Something is going wrong. */
3620 txq->stats.oerrors++;
3624 * Main Tx loop is completed, do the rest:
3625 * - set completion request if thresholds are reached
3626 * - doorbell the hardware
3627 * - copy the rest of mbufs to elts (if any)
3629 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
3630 loc.pkts_sent >= loc.pkts_copy);
3631 /* Take a shortcut if nothing is sent. */
3632 if (unlikely(loc.pkts_sent == loc.pkts_loop))
3634 /* Request CQE generation if limits are reached. */
3635 mlx5_tx_request_completion(txq, &loc, olx);
3637 * Ring QP doorbell immediately after WQE building completion
3638 * to improve latencies. The purely software-related data treatment
3639 * can be completed after the doorbell. Tx CQEs for this SQ are
3640 * processed in this thread only by polling.
3642 * The rdma core library can map doorbell register in two ways,
3643 * depending on the environment variable "MLX5_SHUT_UP_BF":
3645 * - as regular cached memory, when the variable is either missing or
3646 * set to zero. This type of mapping may cause significant
3647 * doorbell register writing latency and requires an explicit memory
3648 * write barrier to mitigate this issue and prevent write combining.
3650 * - as non-cached memory, when the variable is present and set to a non-"0"
3651 * value. This type of mapping may cause a performance impact under
3652 * heavy load conditions, but the explicit write memory barrier is
3653 * not required and it may improve core performance.
3655 * - the legacy behaviour (prior to the 19.08 release) was to use some
3656 * heuristics to decide whether the write memory barrier should
3657 * be performed. This behavior is enabled by specifying
3658 * tx_db_nc=2; the write barrier is skipped if the application provides
3659 * the full recommended burst of packets, as it supposes the next
3660 * packets are coming and the write barrier will be issued on
3661 * the next burst (after descriptor writing, at least).
3663 mlx5_tx_dbrec_cond_wmb(txq, loc.wqe_last, !txq->db_nc &&
3664 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
3665 /* Not all of the mbufs may be stored into elts yet. */
3666 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
3667 if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3669 * There are some single-segment mbufs not stored in elts.
3670 * This can only happen if the last packet was single-segment.
3671 * The copying is gathered into one place because it is
3672 * a good opportunity to optimize it with SIMD.
3673 * Unfortunately, if inlining is enabled, gaps in the pointer
3674 * array may happen due to early freeing of the inlined mbufs.
3676 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
3677 loc.pkts_copy = loc.pkts_sent;
3679 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3680 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3681 if (pkts_n > loc.pkts_sent) {
3683 * If the burst size is large, there might be not enough CQEs
3684 * fetched from the completion queue and not enough resources
3685 * freed to send all the packets.
3690 #ifdef MLX5_PMD_SOFT_COUNTERS
3691 /* Increment sent packets counter. */
3692 txq->stats.opackets += loc.pkts_sent;
3694 if (MLX5_TXOFF_CONFIG(INLINE) && loc.mbuf_free)
3695 __mlx5_tx_free_mbuf(txq, pkts, loc.mbuf_free, olx);
3696 return loc.pkts_sent;
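/*
 * A compact reading of the barrier condition passed to
 * mlx5_tx_dbrec_cond_wmb() above (no new behaviour, just the same rule
 * laid out as a table):
 *
 *   db_nc != 0 (non-cached mapping)             -> barrier is never issued
 *   db_nc == 0 and db_heu == 0                  -> barrier is always issued
 *   db_nc == 0, db_heu != 0, full default burst -> barrier skipped, the next
 *                                                  burst is expected
 *   db_nc == 0, db_heu != 0, partial burst      -> barrier is issued
 */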
3699 #endif /* RTE_PMD_MLX5_TX_H_ */