1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright 2021 6WIND S.A.
3 * Copyright 2021 Mellanox Technologies, Ltd
6 #ifndef RTE_PMD_MLX5_TX_H_
7 #define RTE_PMD_MLX5_TX_H_
10 #include <sys/queue.h>
13 #include <rte_mempool.h>
14 #include <rte_common.h>
15 #include <rte_spinlock.h>
17 #include <mlx5_common.h>
18 #include <mlx5_common_mr.h>
21 #include "mlx5_autoconf.h"
23 /* TX burst subroutines return codes. */
24 enum mlx5_txcmp_code {
25 MLX5_TXCMP_CODE_EXIT = 0,
26 MLX5_TXCMP_CODE_ERROR,
27 MLX5_TXCMP_CODE_SINGLE,
28 MLX5_TXCMP_CODE_MULTI,
34 * These defines are used to configure the Tx burst routine option set
35 * supported at compile time. Options that are not specified are optimized
36 * out, since the related if conditions can be evaluated at compile time.
37 * Offloads with a bigger runtime check overhead (requiring more CPU cycles
38 * to skip) should have the bigger index - this is needed to select the
39 * better matching routine when there is no exact match and some offloads are not
42 #define MLX5_TXOFF_CONFIG_MULTI (1u << 0) /* Multi-segment packets. */
43 #define MLX5_TXOFF_CONFIG_TSO (1u << 1) /* TCP send offload supported. */
44 #define MLX5_TXOFF_CONFIG_SWP (1u << 2) /* Tunnels/SW Parser offloads. */
45 #define MLX5_TXOFF_CONFIG_CSUM (1u << 3) /* Check Sums offloaded. */
46 #define MLX5_TXOFF_CONFIG_INLINE (1u << 4) /* Data inlining supported. */
47 #define MLX5_TXOFF_CONFIG_VLAN (1u << 5) /* VLAN insertion supported. */
48 #define MLX5_TXOFF_CONFIG_METADATA (1u << 6) /* Flow metadata. */
49 #define MLX5_TXOFF_CONFIG_EMPW (1u << 8) /* Enhanced MPW supported. */
50 #define MLX5_TXOFF_CONFIG_MPW (1u << 9) /* Legacy MPW supported. */
51 #define MLX5_TXOFF_CONFIG_TXPP (1u << 10) /* Scheduling on timestamp. */
53 /* The most common offloads groups. */
54 #define MLX5_TXOFF_CONFIG_NONE 0
55 #define MLX5_TXOFF_CONFIG_FULL (MLX5_TXOFF_CONFIG_MULTI | \
56 MLX5_TXOFF_CONFIG_TSO | \
57 MLX5_TXOFF_CONFIG_SWP | \
58 MLX5_TXOFF_CONFIG_CSUM | \
59 MLX5_TXOFF_CONFIG_INLINE | \
60 MLX5_TXOFF_CONFIG_VLAN | \
61 MLX5_TXOFF_CONFIG_METADATA)
63 #define MLX5_TXOFF_CONFIG(mask) (olx & MLX5_TXOFF_CONFIG_##mask)
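/*
 * Illustrative sketch (not part of the driver): inside a burst routine
 * instantiated by MLX5_TXOFF_DECL() the olx argument is a compile-time
 * constant, so option checks like the one below are resolved by the
 * compiler and the disabled branches are removed entirely:
 *
 *	if (MLX5_TXOFF_CONFIG(TSO) &&
 *	    loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG) {
 *		... build TSO WQE ...
 *	}
 */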
65 #define MLX5_TXOFF_PRE_DECL(func) \
66 uint16_t mlx5_tx_burst_##func(void *txq, \
67 struct rte_mbuf **pkts, \
70 #define MLX5_TXOFF_DECL(func, olx) \
71 uint16_t mlx5_tx_burst_##func(void *txq, \
72 struct rte_mbuf **pkts, \
75 return mlx5_tx_burst_tmpl((struct mlx5_txq_data *)txq, \
76 pkts, pkts_n, (olx)); \
79 /* Mbuf dynamic flag offset for inline. */
80 extern uint64_t rte_net_mlx5_dynf_inline_mask;
81 #define RTE_MBUF_F_TX_DYNF_NOINLINE rte_net_mlx5_dynf_inline_mask
83 extern uint32_t mlx5_ptype_table[] __rte_cache_aligned;
84 extern uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
85 extern uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
87 struct mlx5_txq_stats {
88 #ifdef MLX5_PMD_SOFT_COUNTERS
89 uint64_t opackets; /**< Total of successfully sent packets. */
90 uint64_t obytes; /**< Total of successfully sent bytes. */
92 uint64_t oerrors; /**< Total number of failed transmitted packets. */
95 /* TX queue send local data. */
97 struct mlx5_txq_local {
98 struct mlx5_wqe *wqe_last; /* last sent WQE pointer. */
99 struct rte_mbuf *mbuf; /* first mbuf to process. */
100 uint16_t pkts_copy; /* packets copied to elts. */
101 uint16_t pkts_sent; /* packets sent. */
102 uint16_t pkts_loop; /* packets sent on loop entry. */
103 uint16_t elts_free; /* available elts remain. */
104 uint16_t wqe_free; /* available wqe remain. */
105 uint16_t mbuf_off; /* data offset in current mbuf. */
106 uint16_t mbuf_nseg; /* number of remaining mbufs. */
107 uint16_t mbuf_free; /* number of inline mbufs to free. */
110 /* TX queue descriptor. */
112 struct mlx5_txq_data {
113 uint16_t elts_head; /* Current counter in (*elts)[]. */
114 uint16_t elts_tail; /* Counter of first element awaiting completion. */
115 uint16_t elts_comp; /* elts index since last completion request. */
116 uint16_t elts_s; /* Number of mbuf elements. */
117 uint16_t elts_m; /* Mask for mbuf elements indices. */
118 /* Fields related to elts mbuf storage. */
119 uint16_t wqe_ci; /* Consumer index for work queue. */
120 uint16_t wqe_pi; /* Producer index for work queue. */
121 uint16_t wqe_s; /* Number of WQ elements. */
122 uint16_t wqe_m; /* Mask for WQ element indices. */
123 uint16_t wqe_comp; /* WQE index since last completion request. */
124 uint16_t wqe_thres; /* WQE threshold to request completion in CQ. */
125 /* WQ related fields. */
126 uint16_t cq_ci; /* Consumer index for completion queue. */
127 uint16_t cq_pi; /* Producer index for completion queue. */
128 uint16_t cqe_s; /* Number of CQ elements. */
129 uint16_t cqe_m; /* Mask for CQ indices. */
130 /* CQ related fields. */
131 uint16_t elts_n:4; /* elts[] length (in log2). */
132 uint16_t cqe_n:4; /* Number of CQ elements (in log2). */
133 uint16_t wqe_n:4; /* Number of WQ elements (in log2). */
134 uint16_t tso_en:1; /* When set hardware TSO is enabled. */
135 uint16_t tunnel_en:1;
136 /* When set TX offload for tunneled packets are supported. */
137 uint16_t swp_en:1; /* Whether SW parser is enabled. */
138 uint16_t vlan_en:1; /* VLAN insertion in WQE is supported. */
139 uint16_t db_nc:1; /* Doorbell mapped to non-cached region. */
140 uint16_t db_heu:1; /* Doorbell heuristic write barrier. */
141 uint16_t fast_free:1; /* mbuf fast free on Tx is enabled. */
142 uint16_t inlen_send; /* Ordinary send data inline size. */
143 uint16_t inlen_empw; /* eMPW max packet size to inline. */
144 uint16_t inlen_mode; /* Minimal data length to inline. */
145 uint32_t qp_num_8s; /* QP number shifted by 8. */
146 uint64_t offloads; /* Offloads for Tx Queue. */
147 struct mlx5_mr_ctrl mr_ctrl; /* MR control descriptor. */
148 struct mlx5_wqe *wqes; /* Work queue. */
149 struct mlx5_wqe *wqes_end; /* Work queue array limit. */
150 #ifdef RTE_LIBRTE_MLX5_DEBUG
151 uint32_t *fcqs; /* Free completion queue (debug extended). */
153 uint16_t *fcqs; /* Free completion queue. */
155 volatile struct mlx5_cqe *cqes; /* Completion queue. */
156 volatile uint32_t *qp_db; /* Work queue doorbell. */
157 volatile uint32_t *cq_db; /* Completion queue doorbell. */
158 uint16_t port_id; /* Port ID of device. */
159 uint16_t idx; /* Queue index. */
160 uint64_t ts_mask; /* Timestamp flag dynamic mask. */
161 int32_t ts_offset; /* Timestamp field dynamic offset. */
162 struct mlx5_dev_ctx_shared *sh; /* Shared context. */
163 struct mlx5_txq_stats stats; /* TX queue counters. */
164 struct mlx5_uar_data uar_data;
165 struct rte_mbuf *elts[0];
166 /* Storage for queued packets, must be the last field. */
167 } __rte_cache_aligned;
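/*
 * Note on indexing (derived from the routines below): elts_head/elts_tail
 * and wqe_ci/wqe_pi are free-running 16-bit counters, the actual ring slot
 * is obtained by masking, e.g.:
 *
 *	txq->elts[txq->elts_head & txq->elts_m]
 *	wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
 */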
170 MLX5_TXQ_TYPE_STANDARD, /* Standard Tx queue. */
171 MLX5_TXQ_TYPE_HAIRPIN, /* Hairpin Tx queue. */
174 /* TX queue control descriptor. */
175 struct mlx5_txq_ctrl {
176 LIST_ENTRY(mlx5_txq_ctrl) next; /* Pointer to the next element. */
177 uint32_t refcnt; /* Reference counter. */
178 unsigned int socket; /* CPU socket ID for allocations. */
179 enum mlx5_txq_type type; /* The txq ctrl type. */
180 unsigned int max_inline_data; /* Max inline data. */
181 unsigned int max_tso_header; /* Max TSO header size. */
182 struct mlx5_txq_obj *obj; /* Verbs/DevX queue object. */
183 struct mlx5_priv *priv; /* Back pointer to private data. */
184 off_t uar_mmap_offset; /* UAR mmap offset for non-primary process. */
185 uint16_t dump_file_n; /* Number of dump files. */
186 struct rte_eth_hairpin_conf hairpin_conf; /* Hairpin configuration. */
187 uint32_t hairpin_status; /* Hairpin binding status. */
188 struct mlx5_txq_data txq; /* Data path structure. */
189 /* Must be the last field in the structure, contains elts[]. */
194 int mlx5_tx_queue_start(struct rte_eth_dev *dev, uint16_t queue_id);
195 int mlx5_tx_queue_stop(struct rte_eth_dev *dev, uint16_t queue_id);
196 int mlx5_tx_queue_start_primary(struct rte_eth_dev *dev, uint16_t queue_id);
197 int mlx5_tx_queue_stop_primary(struct rte_eth_dev *dev, uint16_t queue_id);
198 int mlx5_tx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
199 unsigned int socket, const struct rte_eth_txconf *conf);
200 int mlx5_tx_hairpin_queue_setup
201 (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
202 const struct rte_eth_hairpin_conf *hairpin_conf);
203 void mlx5_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid);
204 int mlx5_tx_uar_init_secondary(struct rte_eth_dev *dev, int fd);
205 void mlx5_tx_uar_uninit_secondary(struct rte_eth_dev *dev);
206 int mlx5_txq_obj_verify(struct rte_eth_dev *dev);
207 struct mlx5_txq_ctrl *mlx5_txq_new(struct rte_eth_dev *dev, uint16_t idx,
208 uint16_t desc, unsigned int socket,
209 const struct rte_eth_txconf *conf);
210 struct mlx5_txq_ctrl *mlx5_txq_hairpin_new
211 (struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
212 const struct rte_eth_hairpin_conf *hairpin_conf);
213 struct mlx5_txq_ctrl *mlx5_txq_get(struct rte_eth_dev *dev, uint16_t idx);
214 int mlx5_txq_release(struct rte_eth_dev *dev, uint16_t idx);
215 int mlx5_txq_releasable(struct rte_eth_dev *dev, uint16_t idx);
216 int mlx5_txq_verify(struct rte_eth_dev *dev);
217 void txq_alloc_elts(struct mlx5_txq_ctrl *txq_ctrl);
218 void txq_free_elts(struct mlx5_txq_ctrl *txq_ctrl);
219 uint64_t mlx5_get_tx_port_offloads(struct rte_eth_dev *dev);
220 void mlx5_txq_dynf_timestamp_set(struct rte_eth_dev *dev);
224 void mlx5_tx_handle_completion(struct mlx5_txq_data *__rte_restrict txq,
225 unsigned int olx __rte_unused);
226 int mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset);
227 void mlx5_txq_info_get(struct rte_eth_dev *dev, uint16_t queue_id,
228 struct rte_eth_txq_info *qinfo);
229 int mlx5_tx_burst_mode_get(struct rte_eth_dev *dev, uint16_t tx_queue_id,
230 struct rte_eth_burst_mode *mode);
234 MLX5_TXOFF_PRE_DECL(full_empw);
235 MLX5_TXOFF_PRE_DECL(none_empw);
236 MLX5_TXOFF_PRE_DECL(md_empw);
237 MLX5_TXOFF_PRE_DECL(mt_empw);
238 MLX5_TXOFF_PRE_DECL(mtsc_empw);
239 MLX5_TXOFF_PRE_DECL(mti_empw);
240 MLX5_TXOFF_PRE_DECL(mtv_empw);
241 MLX5_TXOFF_PRE_DECL(mtiv_empw);
242 MLX5_TXOFF_PRE_DECL(sc_empw);
243 MLX5_TXOFF_PRE_DECL(sci_empw);
244 MLX5_TXOFF_PRE_DECL(scv_empw);
245 MLX5_TXOFF_PRE_DECL(sciv_empw);
246 MLX5_TXOFF_PRE_DECL(i_empw);
247 MLX5_TXOFF_PRE_DECL(v_empw);
248 MLX5_TXOFF_PRE_DECL(iv_empw);
250 /* mlx5_tx_nompw.c */
252 MLX5_TXOFF_PRE_DECL(full);
253 MLX5_TXOFF_PRE_DECL(none);
254 MLX5_TXOFF_PRE_DECL(md);
255 MLX5_TXOFF_PRE_DECL(mt);
256 MLX5_TXOFF_PRE_DECL(mtsc);
257 MLX5_TXOFF_PRE_DECL(mti);
258 MLX5_TXOFF_PRE_DECL(mtv);
259 MLX5_TXOFF_PRE_DECL(mtiv);
260 MLX5_TXOFF_PRE_DECL(sc);
261 MLX5_TXOFF_PRE_DECL(sci);
262 MLX5_TXOFF_PRE_DECL(scv);
263 MLX5_TXOFF_PRE_DECL(sciv);
264 MLX5_TXOFF_PRE_DECL(i);
265 MLX5_TXOFF_PRE_DECL(v);
266 MLX5_TXOFF_PRE_DECL(iv);
270 MLX5_TXOFF_PRE_DECL(full_ts_nompw);
271 MLX5_TXOFF_PRE_DECL(full_ts_nompwi);
272 MLX5_TXOFF_PRE_DECL(full_ts);
273 MLX5_TXOFF_PRE_DECL(full_ts_noi);
274 MLX5_TXOFF_PRE_DECL(none_ts);
275 MLX5_TXOFF_PRE_DECL(mdi_ts);
276 MLX5_TXOFF_PRE_DECL(mti_ts);
277 MLX5_TXOFF_PRE_DECL(mtiv_ts);
281 MLX5_TXOFF_PRE_DECL(none_mpw);
282 MLX5_TXOFF_PRE_DECL(mci_mpw);
283 MLX5_TXOFF_PRE_DECL(mc_mpw);
284 MLX5_TXOFF_PRE_DECL(i_mpw);
286 static __rte_always_inline struct mlx5_uar_data *
287 mlx5_tx_bfreg(struct mlx5_txq_data *txq)
289 return &MLX5_PROC_PRIV(txq->port_id)->uar_table[txq->idx];
293 * Ring TX queue doorbell and flush the update by write memory barrier.
296 * Pointer to TX queue structure.
298 * Pointer to the last WQE posted in the NIC.
300 static __rte_always_inline void
301 mlx5_tx_dbrec(struct mlx5_txq_data *txq, volatile struct mlx5_wqe *wqe)
303 mlx5_doorbell_ring(mlx5_tx_bfreg(txq), *(volatile uint64_t *)wqe,
304 txq->wqe_ci, txq->qp_db, 1);
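/*
 * Usage sketch (illustrative, not taken verbatim from the burst code):
 * WQEs are built and wqe_ci is advanced first, then the doorbell is rung
 * once per burst with the last posted WQE:
 *
 *	loc.wqe_last = wqe;
 *	txq->wqe_ci += (ds + 3) / 4;
 *	...
 *	mlx5_tx_dbrec(txq, loc.wqe_last);
 */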
308 * Convert timestamp from mbuf format to linear counter
309 * of Clock Queue completions (24 bits).
312 * Pointer to the device shared context to fetch Tx
313 * packet pacing timestamp and parameters.
315 * Timestamp from mbuf to convert.
317 * positive or zero value - completion ID to wait.
318 * negative value - conversion error.
320 static __rte_always_inline int32_t
321 mlx5_txpp_convert_tx_ts(struct mlx5_dev_ctx_shared *sh, uint64_t mts)
328 * Atomically read the two uint64_t fields and compare the LSB bits.
329 * If there is no match, the timestamp was updated in
330 * the service thread and the data should be re-read.
332 rte_compiler_barrier();
333 ci = __atomic_load_n(&sh->txpp.ts.ci_ts, __ATOMIC_RELAXED);
334 ts = __atomic_load_n(&sh->txpp.ts.ts, __ATOMIC_RELAXED);
335 rte_compiler_barrier();
336 if (!((ts ^ ci) << (64 - MLX5_CQ_INDEX_WIDTH)))
339 /* Perform the skew correction, positive value to send earlier. */
340 mts -= sh->txpp.skew;
342 if (unlikely(mts >= UINT64_MAX / 2)) {
343 /* The delta is negative, the timestamp is in the past. */
344 __atomic_fetch_add(&sh->txpp.err_ts_past,
345 1, __ATOMIC_RELAXED);
348 tick = sh->txpp.tick;
350 /* Convert delta to completions, round up. */
351 mts = (mts + tick - 1) / tick;
352 if (unlikely(mts >= (1 << MLX5_CQ_INDEX_WIDTH) / 2 - 1)) {
353 /* The timestamp is too far in the future. */
354 __atomic_fetch_add(&sh->txpp.err_ts_future,
355 1, __ATOMIC_RELAXED);
358 mts <<= 64 - MLX5_CQ_INDEX_WIDTH;
360 ci >>= 64 - MLX5_CQ_INDEX_WIDTH;
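/*
 * Worked example for the delta-to-completions step above (a sketch,
 * values are illustrative): with tick = 1000 ns and a skew-corrected
 * delta mts = 49500 ns, mts = (49500 + 1000 - 1) / 1000 = 50, i.e. the
 * packet should be sent 50 Clock Queue completions after the current
 * completion index ci.
 */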
365 * Set Software Parser flags and offsets in Ethernet Segment of WQE.
366 * Flags must be initialized to zero beforehand.
369 * Pointer to burst routine local context.
371 * Pointer to store Software Parser flags.
373 * Configured Tx offloads mask. It is fully defined at
374 * compile time and may be used for optimization.
377 * Software Parser offsets packed in dword.
378 * Software Parser flags are set by pointer.
380 static __rte_always_inline uint32_t
381 txq_mbuf_to_swp(struct mlx5_txq_local *__rte_restrict loc,
386 unsigned int idx, off;
389 if (!MLX5_TXOFF_CONFIG(SWP))
391 ol = loc->mbuf->ol_flags;
392 tunnel = ol & RTE_MBUF_F_TX_TUNNEL_MASK;
394 * Check whether Software Parser is required.
395 * Only custom tunnels may require it.
397 if (likely(tunnel != RTE_MBUF_F_TX_TUNNEL_UDP && tunnel != RTE_MBUF_F_TX_TUNNEL_IP))
400 * The index should have:
401 * bit[0:1] = RTE_MBUF_F_TX_L4_MASK
402 * bit[4] = RTE_MBUF_F_TX_IPV6
403 * bit[8] = RTE_MBUF_F_TX_OUTER_IPV6
404 * bit[9] = RTE_MBUF_F_TX_OUTER_UDP
406 idx = (ol & (RTE_MBUF_F_TX_L4_MASK | RTE_MBUF_F_TX_IPV6 | RTE_MBUF_F_TX_OUTER_IPV6)) >> 52;
407 idx |= (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP) ? (1 << 9) : 0;
408 *swp_flags = mlx5_swp_types_table[idx];
410 * Set offsets for SW parser. Since ConnectX-5, SW parser just
411 * complements HW parser. SW parser starts to engage only if HW parser
412 * can't reach a header. For the older devices, HW parser will not kick
413 * in if any of SWP offsets is set. Therefore, all of the L3 offsets
414 * should be set regardless of HW offload.
416 off = loc->mbuf->outer_l2_len;
417 if (MLX5_TXOFF_CONFIG(VLAN) && ol & RTE_MBUF_F_TX_VLAN)
418 off += sizeof(struct rte_vlan_hdr);
419 set = (off >> 1) << 8; /* Outer L3 offset. */
420 off += loc->mbuf->outer_l3_len;
421 if (tunnel == RTE_MBUF_F_TX_TUNNEL_UDP)
422 set |= off >> 1; /* Outer L4 offset. */
423 if (ol & (RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IPV6)) { /* Inner IP. */
424 const uint64_t csum = ol & RTE_MBUF_F_TX_L4_MASK;
425 off += loc->mbuf->l2_len;
426 set |= (off >> 1) << 24; /* Inner L3 offset. */
427 if (csum == RTE_MBUF_F_TX_TCP_CKSUM ||
428 csum == RTE_MBUF_F_TX_UDP_CKSUM ||
429 (MLX5_TXOFF_CONFIG(TSO) && ol & RTE_MBUF_F_TX_TCP_SEG)) {
430 off += loc->mbuf->l3_len;
431 set |= (off >> 1) << 16; /* Inner L4 offset. */
434 set = rte_cpu_to_le_32(set);
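/*
 * Layout of the returned Software Parser offsets dword (as packed by the
 * code above), all offsets are in 2-byte words from the frame start:
 *   bits  7..0  - outer L4 offset (UDP tunnel only)
 *   bits 15..8  - outer L3 offset
 *   bits 23..16 - inner L4 offset (L4 checksum or TSO requested)
 *   bits 31..24 - inner L3 offset
 */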
439 * Convert the Checksum offloads to Verbs.
442 * Pointer to the mbuf.
445 * Converted checksum flags.
447 static __rte_always_inline uint8_t
448 txq_ol_cksum_to_cs(struct rte_mbuf *buf)
451 uint8_t is_tunnel = !!(buf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK);
452 const uint64_t ol_flags_mask = RTE_MBUF_F_TX_TCP_SEG | RTE_MBUF_F_TX_L4_MASK |
453 RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_OUTER_IP_CKSUM;
456 * The index should have:
457 * bit[0] = RTE_MBUF_F_TX_TCP_SEG
458 * bit[2:3] = RTE_MBUF_F_TX_UDP_CKSUM, RTE_MBUF_F_TX_TCP_CKSUM
459 * bit[4] = RTE_MBUF_F_TX_IP_CKSUM
460 * bit[8] = RTE_MBUF_F_TX_OUTER_IP_CKSUM
463 idx = ((buf->ol_flags & ol_flags_mask) >> 50) | (!!is_tunnel << 9);
464 return mlx5_cksum_table[idx];
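/*
 * Worked example (assuming the standard DPDK Tx flag bit positions):
 * RTE_MBUF_F_TX_TCP_CKSUM (bit 52) and RTE_MBUF_F_TX_IP_CKSUM (bit 54)
 * on a non-tunnel packet give idx = (1 << 2) | (1 << 4) = 0x14, which
 * selects the corresponding entry of mlx5_cksum_table[].
 */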
468 * Free the mbufs from the linear array of pointers.
471 * Pointer to Tx queue structure.
473 * Pointer to the array of packets to be freed.
475 * Number of packets to be freed.
477 * Configured Tx offloads mask. It is fully defined at
478 * compile time and may be used for optimization.
480 static __rte_always_inline void
481 mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
482 struct rte_mbuf **__rte_restrict pkts,
484 unsigned int olx __rte_unused)
486 struct rte_mempool *pool = NULL;
487 struct rte_mbuf **p_free = NULL;
488 struct rte_mbuf *mbuf;
489 unsigned int n_free = 0;
492 * The implemented algorithm eliminates
493 * copying pointers to a temporary array
494 * for rte_mempool_put_bulk() calls.
499 * Free mbufs directly to the pool in bulk
500 * if fast free offload is engaged
502 if (!MLX5_TXOFF_CONFIG(MULTI) && txq->fast_free) {
505 rte_mempool_put_bulk(pool, (void *)pkts, pkts_n);
511 * Decrement mbuf reference counter, detach
512 * indirect and external buffers if needed.
514 mbuf = rte_pktmbuf_prefree_seg(*pkts);
515 if (likely(mbuf != NULL)) {
516 MLX5_ASSERT(mbuf == *pkts);
517 if (likely(n_free != 0)) {
518 if (unlikely(pool != mbuf->pool))
519 /* From different pool. */
522 /* Start new scan array. */
529 if (unlikely(pkts_n == 0)) {
535 * This happens if mbuf is still referenced.
536 * We can't put it back to the pool, skip.
540 if (unlikely(n_free != 0))
541 /* There is some array to free.*/
543 if (unlikely(pkts_n == 0))
544 /* Last mbuf, nothing to free. */
550 * This loop is implemented to avoid multiple
551 * inlining of rte_mempool_put_bulk().
557 * Free the array of pre-freed mbufs
558 * belonging to the same memory pool.
560 rte_mempool_put_bulk(pool, (void *)p_free, n_free);
561 if (unlikely(mbuf != NULL)) {
562 /* There is the request to start new scan. */
567 if (likely(pkts_n != 0))
570 * This is the last mbuf to be freed.
571 * Do one more loop iteration to complete.
572 * This is a rare case of the last unique mbuf.
577 if (likely(pkts_n == 0))
586 * Not-inlined version of the mbuf free routine, for an optimal call
587 * on tx_burst completion.
589 static __rte_noinline void
590 __mlx5_tx_free_mbuf(struct mlx5_txq_data *__rte_restrict txq,
591 struct rte_mbuf **__rte_restrict pkts,
593 unsigned int olx __rte_unused)
595 mlx5_tx_free_mbuf(txq, pkts, pkts_n, olx);
599 * Free the mbufs from the elts ring buffer up to the new tail.
602 * Pointer to Tx queue structure.
604 * Index in elts to free up to, becomes new elts tail.
606 * Configured Tx offloads mask. It is fully defined at
607 * compile time and may be used for optimization.
609 static __rte_always_inline void
610 mlx5_tx_free_elts(struct mlx5_txq_data *__rte_restrict txq,
612 unsigned int olx __rte_unused)
614 uint16_t n_elts = tail - txq->elts_tail;
617 MLX5_ASSERT(n_elts <= txq->elts_s);
619 * Implement a loop to support ring buffer wraparound
620 * with single inlining of mlx5_tx_free_mbuf().
625 part = txq->elts_s - (txq->elts_tail & txq->elts_m);
626 part = RTE_MIN(part, n_elts);
628 MLX5_ASSERT(part <= txq->elts_s);
629 mlx5_tx_free_mbuf(txq,
630 &txq->elts[txq->elts_tail & txq->elts_m],
632 txq->elts_tail += part;
638 * Store the mbufs being sent into the elts ring buffer.
639 * On Tx completion these mbufs will be freed.
642 * Pointer to Tx queue structure.
644 * Pointer to array of packets to be stored.
646 * Number of packets to be stored.
648 * Configured Tx offloads mask. It is fully defined at
649 * compile time and may be used for optimization.
651 static __rte_always_inline void
652 mlx5_tx_copy_elts(struct mlx5_txq_data *__rte_restrict txq,
653 struct rte_mbuf **__rte_restrict pkts,
655 unsigned int olx __rte_unused)
658 struct rte_mbuf **elts = (struct rte_mbuf **)txq->elts;
662 part = txq->elts_s - (txq->elts_head & txq->elts_m);
664 MLX5_ASSERT(part <= txq->elts_s);
665 /* This code is a good candidate for vectorizing with SIMD. */
666 rte_memcpy((void *)(elts + (txq->elts_head & txq->elts_m)),
668 RTE_MIN(part, pkts_n) * sizeof(struct rte_mbuf *));
669 txq->elts_head += pkts_n;
670 if (unlikely(part < pkts_n))
671 /* The copy is wrapping around the elts array. */
672 rte_memcpy((void *)elts, (void *)(pkts + part),
673 (pkts_n - part) * sizeof(struct rte_mbuf *));
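/*
 * Wraparound example (a sketch): with elts_s = 256, elts_m = 255,
 * elts_head = 250 and pkts_n = 10, the first copy stores 6 pointers
 * up to the end of elts[], the second stores the remaining 4 at the
 * beginning, and elts_head advances to 260 (slot 260 & 255 = 4).
 */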
677 * Check if the completion request flag should be set in the last WQE.
678 * Both pushed mbufs and WQEs are monitored and the completion request
679 * flag is set if any of the thresholds is reached.
682 * Pointer to TX queue structure.
684 * Pointer to burst routine local context.
686 * Configured Tx offloads mask. It is fully defined at
687 * compile time and may be used for optimization.
689 static __rte_always_inline void
690 mlx5_tx_request_completion(struct mlx5_txq_data *__rte_restrict txq,
691 struct mlx5_txq_local *__rte_restrict loc,
694 uint16_t head = txq->elts_head;
697 part = MLX5_TXOFF_CONFIG(INLINE) ?
698 0 : loc->pkts_sent - loc->pkts_copy;
700 if ((uint16_t)(head - txq->elts_comp) >= MLX5_TX_COMP_THRESH ||
701 (MLX5_TXOFF_CONFIG(INLINE) &&
702 (uint16_t)(txq->wqe_ci - txq->wqe_comp) >= txq->wqe_thres)) {
703 volatile struct mlx5_wqe *last = loc->wqe_last;
706 txq->elts_comp = head;
707 if (MLX5_TXOFF_CONFIG(INLINE))
708 txq->wqe_comp = txq->wqe_ci;
709 /* Request unconditional completion on last WQE. */
710 last->cseg.flags = RTE_BE32(MLX5_COMP_ALWAYS <<
711 MLX5_COMP_MODE_OFFSET);
712 /* Save elts_head in dedicated free on completion queue. */
713 #ifdef RTE_LIBRTE_MLX5_DEBUG
714 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head |
715 (last->cseg.opcode >> 8) << 16;
717 txq->fcqs[txq->cq_pi++ & txq->cqe_m] = head;
719 /* A CQE slot must always be available. */
720 MLX5_ASSERT((txq->cq_pi - txq->cq_ci) <= txq->cqe_s);
725 * Build the Control Segment with specified opcode:
727 * - MLX5_OPCODE_ENHANCED_MPSW
731 * Pointer to TX queue structure.
733 * Pointer to burst routine local context.
735 * Pointer to WQE to fill with built Control Segment.
737 * Supposed length of WQE in segments.
739 * SQ WQE opcode to put into Control Segment.
741 * Configured Tx offloads mask. It is fully defined at
742 * compile time and may be used for optimization.
744 static __rte_always_inline void
745 mlx5_tx_cseg_init(struct mlx5_txq_data *__rte_restrict txq,
746 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
747 struct mlx5_wqe *__rte_restrict wqe,
750 unsigned int olx __rte_unused)
752 struct mlx5_wqe_cseg *__rte_restrict cs = &wqe->cseg;
754 /* For legacy MPW replace the EMPW by TSO with modifier. */
755 if (MLX5_TXOFF_CONFIG(MPW) && opcode == MLX5_OPCODE_ENHANCED_MPSW)
756 opcode = MLX5_OPCODE_TSO | MLX5_OPC_MOD_MPW << 24;
757 cs->opcode = rte_cpu_to_be_32((txq->wqe_ci << 8) | opcode);
758 cs->sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
759 cs->flags = RTE_BE32(MLX5_COMP_ONLY_FIRST_ERR <<
760 MLX5_COMP_MODE_OFFSET);
761 cs->misc = RTE_BE32(0);
765 * Build the Synchronize Queue Segment with specified completion index.
768 * Pointer to TX queue structure.
770 * Pointer to burst routine local context.
772 * Pointer to WQE to fill with built Control Segment.
774 * Completion index in Clock Queue to wait.
776 * Configured Tx offloads mask. It is fully defined at
777 * compile time and may be used for optimization.
779 static __rte_always_inline void
780 mlx5_tx_wseg_init(struct mlx5_txq_data *restrict txq,
781 struct mlx5_txq_local *restrict loc __rte_unused,
782 struct mlx5_wqe *restrict wqe,
784 unsigned int olx __rte_unused)
786 struct mlx5_wqe_qseg *qs;
788 qs = RTE_PTR_ADD(wqe, MLX5_WSEG_SIZE);
789 qs->max_index = rte_cpu_to_be_32(wci);
790 qs->qpn_cqn = rte_cpu_to_be_32(txq->sh->txpp.clock_queue.cq_obj.cq->id);
791 qs->reserved0 = RTE_BE32(0);
792 qs->reserved1 = RTE_BE32(0);
796 * Build the Ethernet Segment without inlined data.
797 * Supports Software Parser, Checksums and VLAN insertion Tx offload features.
800 * Pointer to TX queue structure.
802 * Pointer to burst routine local context.
804 * Pointer to WQE to fill with built Ethernet Segment.
806 * Configured Tx offloads mask. It is fully defined at
807 * compile time and may be used for optimization.
809 static __rte_always_inline void
810 mlx5_tx_eseg_none(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
811 struct mlx5_txq_local *__rte_restrict loc,
812 struct mlx5_wqe *__rte_restrict wqe,
815 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
819 * Calculate and set check sum flags first, dword field
820 * in segment may be shared with Software Parser flags.
822 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
823 es->flags = rte_cpu_to_le_32(csum);
825 * Calculate and set Software Parser offsets and flags.
826 * These flags are set for custom UDP and IP tunnel packets.
828 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
829 /* Fill metadata field if needed. */
830 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
831 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
832 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
834 /* Engage VLAN tag insertion feature if requested. */
835 if (MLX5_TXOFF_CONFIG(VLAN) &&
836 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
838 * We should get here only if the device supports
839 * this feature correctly.
841 MLX5_ASSERT(txq->vlan_en);
842 es->inline_hdr = rte_cpu_to_be_32(MLX5_ETH_WQE_VLAN_INSERT |
843 loc->mbuf->vlan_tci);
845 es->inline_hdr = RTE_BE32(0);
850 * Build the Ethernet Segment with minimal inlined data
851 * of MLX5_ESEG_MIN_INLINE_SIZE bytes length. This is
852 * used to fill the gap in single WQEBB WQEs.
853 * Supports Software Parser, Checksums and VLAN
854 * insertion Tx offload features.
857 * Pointer to TX queue structure.
859 * Pointer to burst routine local context.
861 * Pointer to WQE to fill with built Ethernet Segment.
863 * Length of VLAN tag insertion if any.
865 * Configured Tx offloads mask. It is fully defined at
866 * compile time and may be used for optimization.
868 static __rte_always_inline void
869 mlx5_tx_eseg_dmin(struct mlx5_txq_data *__rte_restrict txq __rte_unused,
870 struct mlx5_txq_local *__rte_restrict loc,
871 struct mlx5_wqe *__rte_restrict wqe,
875 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
877 uint8_t *psrc, *pdst;
880 * Calculate and set check sum flags first, dword field
881 * in segment may be shared with Software Parser flags.
883 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
884 es->flags = rte_cpu_to_le_32(csum);
886 * Calculate and set Software Parser offsets and flags.
887 * These flags are set for custom UDP and IP tunnel packets.
889 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
890 /* Fill metadata field if needed. */
891 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
892 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
893 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
895 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
896 es->inline_hdr_sz = RTE_BE16(MLX5_ESEG_MIN_INLINE_SIZE);
897 es->inline_data = *(unaligned_uint16_t *)psrc;
898 psrc += sizeof(uint16_t);
899 pdst = (uint8_t *)(es + 1);
900 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
901 /* Implement VLAN tag insertion as part of the inline data. */
902 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
903 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
904 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
905 /* Insert VLAN ethertype + VLAN tag. */
906 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
907 ((RTE_ETHER_TYPE_VLAN << 16) |
908 loc->mbuf->vlan_tci);
909 pdst += sizeof(struct rte_vlan_hdr);
910 /* Copy the remaining two bytes from the packet data. */
911 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
912 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
914 /* Fill the gap in the title WQEBB with inline data. */
915 rte_mov16(pdst, psrc);
920 * Build the Ethernet Segment with entire packet data inlining. Checks the
921 * boundary of WQEBB and ring buffer wrapping, supports Software Parser,
922 * Checksums and VLAN insertion Tx offload features.
925 * Pointer to TX queue structure.
927 * Pointer to burst routine local context.
929 * Pointer to WQE to fill with built Ethernet Segment.
931 * Length of VLAN tag insertion if any.
933 * Length of data to inline (VLAN included, if any).
935 * TSO flag, set mss field from the packet.
937 * Configured Tx offloads mask. It is fully defined at
938 * compile time and may be used for optimization.
941 * Pointer to the next Data Segment (aligned and wrapped around).
943 static __rte_always_inline struct mlx5_wqe_dseg *
944 mlx5_tx_eseg_data(struct mlx5_txq_data *__rte_restrict txq,
945 struct mlx5_txq_local *__rte_restrict loc,
946 struct mlx5_wqe *__rte_restrict wqe,
952 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
954 uint8_t *psrc, *pdst;
958 * Calculate and set check sum flags first, dword field
959 * in segment may be shared with Software Parser flags.
961 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
964 csum |= loc->mbuf->tso_segsz;
965 es->flags = rte_cpu_to_be_32(csum);
967 es->flags = rte_cpu_to_le_32(csum);
970 * Calculate and set Software Parser offsets and flags.
971 * These flags are set for custom UDP and IP tunnel packets.
973 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
974 /* Fill metadata field if needed. */
975 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
976 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
977 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
979 psrc = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
980 es->inline_hdr_sz = rte_cpu_to_be_16(inlen);
981 es->inline_data = *(unaligned_uint16_t *)psrc;
982 psrc += sizeof(uint16_t);
983 pdst = (uint8_t *)(es + 1);
984 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
985 /* Implement VLAN tag insertion as part of the inline data. */
986 memcpy(pdst, psrc, 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t));
987 pdst += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
988 psrc += 2 * RTE_ETHER_ADDR_LEN - sizeof(uint16_t);
989 /* Insert VLAN ethertype + VLAN tag. */
990 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
991 ((RTE_ETHER_TYPE_VLAN << 16) |
992 loc->mbuf->vlan_tci);
993 pdst += sizeof(struct rte_vlan_hdr);
995 /* Copy the remaining two bytes from the packet data. */
995 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, sizeof(uint16_t)));
996 *(uint16_t *)pdst = *(unaligned_uint16_t *)psrc;
997 psrc += sizeof(uint16_t);
999 /* Fill the gap in the title WQEBB with inline data. */
1000 rte_mov16(pdst, psrc);
1001 psrc += sizeof(rte_v128u32_t);
1003 pdst = (uint8_t *)(es + 2);
1004 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1005 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1006 inlen -= MLX5_ESEG_MIN_INLINE_SIZE;
1008 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1009 return (struct mlx5_wqe_dseg *)pdst;
1012 * The WQEBB space availability is checked by caller.
1013 * Here we should be aware of WQE ring buffer wraparound only.
1015 part = (uint8_t *)txq->wqes_end - pdst;
1016 part = RTE_MIN(part, inlen);
1018 rte_memcpy(pdst, psrc, part);
1020 if (likely(!inlen)) {
1022 * If return value is not used by the caller
1023 * the code below will be optimized out.
1026 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1027 if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1028 pdst = (uint8_t *)txq->wqes;
1029 return (struct mlx5_wqe_dseg *)pdst;
1031 pdst = (uint8_t *)txq->wqes;
1038 * Copy data from a chain of mbufs to the specified linear buffer.
1039 * Supports Checksums and VLAN insertion Tx offload features. If the data
1040 * from some mbuf is copied completely, this mbuf is freed. The local
1041 * structure is used to keep the byte stream state.
1044 * Pointer to the destination linear buffer.
1046 * Pointer to burst routine local context.
1048 * Length of data to be copied.
1050 * Length of data to be copied ignoring no inline hint.
1052 * Configured Tx offloads mask. It is fully defined at
1053 * compile time and may be used for optimization.
1056 * Number of actually copied data bytes. This is always greater than or
1057 * equal to the must parameter and might be less than len if the no-inline
1058 * hint flag is encountered.
1060 static __rte_always_inline unsigned int
1061 mlx5_tx_mseg_memcpy(uint8_t *pdst,
1062 struct mlx5_txq_local *__rte_restrict loc,
1065 unsigned int olx __rte_unused)
1067 struct rte_mbuf *mbuf;
1068 unsigned int part, dlen, copy = 0;
1073 /* Allow zero length packets, must check first. */
1074 dlen = rte_pktmbuf_data_len(loc->mbuf);
1075 if (dlen <= loc->mbuf_off) {
1076 /* Exhausted packet, just free. */
1078 loc->mbuf = mbuf->next;
1079 rte_pktmbuf_free_seg(mbuf);
1081 MLX5_ASSERT(loc->mbuf_nseg > 1);
1082 MLX5_ASSERT(loc->mbuf);
1084 if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
1089 * We already copied the minimal
1090 * requested amount of data.
1095 if (diff <= rte_pktmbuf_data_len(loc->mbuf)) {
1097 * Copy only the minimal required
1098 * part of the data buffer. Limit amount
1099 * of data to be copied to the length of
1102 len = RTE_MIN(len, diff);
1107 dlen -= loc->mbuf_off;
1108 psrc = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1110 part = RTE_MIN(len, dlen);
1111 rte_memcpy(pdst, psrc, part);
1113 loc->mbuf_off += part;
1116 if (loc->mbuf_off >= rte_pktmbuf_data_len(loc->mbuf)) {
1118 /* Exhausted packet, just free. */
1120 loc->mbuf = mbuf->next;
1121 rte_pktmbuf_free_seg(mbuf);
1123 MLX5_ASSERT(loc->mbuf_nseg >= 1);
1133 * Build the Ethernet Segment with inlined data from multi-segment packet.
1134 * Checks the boundary of WQEBB and ring buffer wrapping, supports Software
1135 * Parser, Checksums and VLAN insertion Tx offload features.
1138 * Pointer to TX queue structure.
1140 * Pointer to burst routine local context.
1142 * Pointer to WQE to fill with built Ethernet Segment.
1144 * Length of VLAN tag insertion if any.
1146 * Length of data to inline (VLAN included, if any).
1148 * TSO flag, set mss field from the packet.
1150 * Configured Tx offloads mask. It is fully defined at
1151 * compile time and may be used for optimization.
1154 * Pointer to the next Data Segment (aligned and possibly NOT wrapped
1155 * around - caller should do wrapping check on its own).
1157 static __rte_always_inline struct mlx5_wqe_dseg *
1158 mlx5_tx_eseg_mdat(struct mlx5_txq_data *__rte_restrict txq,
1159 struct mlx5_txq_local *__rte_restrict loc,
1160 struct mlx5_wqe *__rte_restrict wqe,
1166 struct mlx5_wqe_eseg *__rte_restrict es = &wqe->eseg;
1169 unsigned int part, tlen = 0;
1172 * Calculate and set check sum flags first, uint32_t field
1173 * in segment may be shared with Software Parser flags.
1175 csum = MLX5_TXOFF_CONFIG(CSUM) ? txq_ol_cksum_to_cs(loc->mbuf) : 0;
1178 csum |= loc->mbuf->tso_segsz;
1179 es->flags = rte_cpu_to_be_32(csum);
1181 es->flags = rte_cpu_to_le_32(csum);
1184 * Calculate and set Software Parser offsets and flags.
1185 * These flags are set for custom UDP and IP tunnel packets.
1187 es->swp_offs = txq_mbuf_to_swp(loc, &es->swp_flags, olx);
1188 /* Fill metadata field if needed. */
1189 es->metadata = MLX5_TXOFF_CONFIG(METADATA) ?
1190 loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
1191 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) :
1193 MLX5_ASSERT(inlen >= MLX5_ESEG_MIN_INLINE_SIZE);
1194 pdst = (uint8_t *)&es->inline_data;
1195 if (MLX5_TXOFF_CONFIG(VLAN) && vlan) {
1196 /* Implement VLAN tag insertion as part of the inline data. */
1197 mlx5_tx_mseg_memcpy(pdst, loc,
1198 2 * RTE_ETHER_ADDR_LEN,
1199 2 * RTE_ETHER_ADDR_LEN, olx);
1200 pdst += 2 * RTE_ETHER_ADDR_LEN;
1201 *(unaligned_uint32_t *)pdst = rte_cpu_to_be_32
1202 ((RTE_ETHER_TYPE_VLAN << 16) |
1203 loc->mbuf->vlan_tci);
1204 pdst += sizeof(struct rte_vlan_hdr);
1205 tlen += 2 * RTE_ETHER_ADDR_LEN + sizeof(struct rte_vlan_hdr);
1207 MLX5_ASSERT(pdst < (uint8_t *)txq->wqes_end);
1209 * The WQEBB space availability is checked by caller.
1210 * Here we should be aware of WQE ring buffer wraparound only.
1212 part = (uint8_t *)txq->wqes_end - pdst;
1213 part = RTE_MIN(part, inlen - tlen);
1219 * Copying may be interrupted inside the routine
1220 * if it runs into the no-inline hint flag.
1222 copy = tso ? inlen : txq->inlen_mode;
1223 copy = tlen >= copy ? 0 : (copy - tlen);
1224 copy = mlx5_tx_mseg_memcpy(pdst, loc, part, copy, olx);
1226 if (likely(inlen <= tlen) || copy < part) {
1227 es->inline_hdr_sz = rte_cpu_to_be_16(tlen);
1229 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1230 return (struct mlx5_wqe_dseg *)pdst;
1232 pdst = (uint8_t *)txq->wqes;
1233 part = inlen - tlen;
1238 * Build the Data Segment of pointer type.
1241 * Pointer to TX queue structure.
1243 * Pointer to burst routine local context.
1245 * Pointer to WQE to fill with built Data Segment.
1247 * Data buffer to point.
1249 * Data buffer length.
1251 * Configured Tx offloads mask. It is fully defined at
1252 * compile time and may be used for optimization.
1254 static __rte_always_inline void
1255 mlx5_tx_dseg_ptr(struct mlx5_txq_data *__rte_restrict txq,
1256 struct mlx5_txq_local *__rte_restrict loc,
1257 struct mlx5_wqe_dseg *__rte_restrict dseg,
1260 unsigned int olx __rte_unused)
1264 dseg->bcount = rte_cpu_to_be_32(len);
1265 dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1266 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1270 * Build the Data Segment of pointer type, or of inline type if the data
1271 * length does not exceed the minimal Data Segment buffer size.
1274 * Pointer to TX queue structure.
1276 * Pointer to burst routine local context.
1278 * Pointer to WQE to fill with built Data Segment.
1280 * Data buffer to point.
1282 * Data buffer length.
1284 * Configured Tx offloads mask. It is fully defined at
1285 * compile time and may be used for optimization.
1287 static __rte_always_inline void
1288 mlx5_tx_dseg_iptr(struct mlx5_txq_data *__rte_restrict txq,
1289 struct mlx5_txq_local *__rte_restrict loc,
1290 struct mlx5_wqe_dseg *__rte_restrict dseg,
1293 unsigned int olx __rte_unused)
1299 if (len > MLX5_DSEG_MIN_INLINE_SIZE) {
1300 dseg->bcount = rte_cpu_to_be_32(len);
1301 dseg->lkey = mlx5_mr_mb2mr(&txq->mr_ctrl, loc->mbuf);
1302 dseg->pbuf = rte_cpu_to_be_64((uintptr_t)buf);
1306 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1307 /* Unrolled implementation of generic rte_memcpy. */
1308 dst = (uintptr_t)&dseg->inline_data[0];
1309 src = (uintptr_t)buf;
1311 #ifdef RTE_ARCH_STRICT_ALIGN
1312 MLX5_ASSERT(dst == RTE_PTR_ALIGN(dst, sizeof(uint32_t)));
1313 *(uint32_t *)dst = *(unaligned_uint32_t *)src;
1314 dst += sizeof(uint32_t);
1315 src += sizeof(uint32_t);
1316 *(uint32_t *)dst = *(unaligned_uint32_t *)src;
1317 dst += sizeof(uint32_t);
1318 src += sizeof(uint32_t);
1320 *(uint64_t *)dst = *(unaligned_uint64_t *)src;
1321 dst += sizeof(uint64_t);
1322 src += sizeof(uint64_t);
1326 *(uint32_t *)dst = *(unaligned_uint32_t *)src;
1327 dst += sizeof(uint32_t);
1328 src += sizeof(uint32_t);
1331 *(uint16_t *)dst = *(unaligned_uint16_t *)src;
1332 dst += sizeof(uint16_t);
1333 src += sizeof(uint16_t);
1336 *(uint8_t *)dst = *(uint8_t *)src;
1340 * Build the Data Segment of inlined data from single
1341 * segment packet, no VLAN insertion.
1344 * Pointer to TX queue structure.
1346 * Pointer to burst routine local context.
1348 * Pointer to WQE to fill with built Data Segment.
1350 * Data buffer to point.
1352 * Data buffer length.
1354 * Configured Tx offloads mask. It is fully defined at
1355 * compile time and may be used for optimization.
1358 * Pointer to the next Data Segment after inlined data.
1359 * Ring buffer wraparound check is needed. We do not do it here because it
1360 * may not be needed for the last packet in the eMPW session.
1362 static __rte_always_inline struct mlx5_wqe_dseg *
1363 mlx5_tx_dseg_empw(struct mlx5_txq_data *__rte_restrict txq,
1364 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1365 struct mlx5_wqe_dseg *__rte_restrict dseg,
1368 unsigned int olx __rte_unused)
1373 if (!MLX5_TXOFF_CONFIG(MPW)) {
1374 /* Store the descriptor byte counter for eMPW sessions. */
1375 dseg->bcount = rte_cpu_to_be_32(len | MLX5_ETH_WQE_DATA_INLINE);
1376 pdst = &dseg->inline_data[0];
1378 /* The entire legacy MPW session counter is stored on close. */
1379 pdst = (uint8_t *)dseg;
1382 * The WQEBB space availability is checked by caller.
1383 * Here we should be aware of WQE ring buffer wraparound only.
1385 part = (uint8_t *)txq->wqes_end - pdst;
1386 part = RTE_MIN(part, len);
1388 rte_memcpy(pdst, buf, part);
1392 if (!MLX5_TXOFF_CONFIG(MPW))
1393 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1394 /* Note: no final wraparound check here. */
1395 return (struct mlx5_wqe_dseg *)pdst;
1397 pdst = (uint8_t *)txq->wqes;
1404 * Build the Data Segment of inlined data from single
1405 * segment packet with VLAN insertion.
1408 * Pointer to TX queue structure.
1410 * Pointer to burst routine local context.
1412 * Pointer to the dseg to fill with built Data Segment.
1414 * Data buffer to point.
1416 * Data buffer length.
1418 * Configured Tx offloads mask. It is fully defined at
1419 * compile time and may be used for optimization.
1422 * Pointer to the next Data Segment after inlined data.
1423 * Ring buffer wraparound check is needed.
1425 static __rte_always_inline struct mlx5_wqe_dseg *
1426 mlx5_tx_dseg_vlan(struct mlx5_txq_data *__rte_restrict txq,
1427 struct mlx5_txq_local *__rte_restrict loc __rte_unused,
1428 struct mlx5_wqe_dseg *__rte_restrict dseg,
1431 unsigned int olx __rte_unused)
1437 MLX5_ASSERT(len > MLX5_ESEG_MIN_INLINE_SIZE);
1438 if (!MLX5_TXOFF_CONFIG(MPW)) {
1439 /* Store the descriptor byte counter for eMPW sessions. */
1440 dseg->bcount = rte_cpu_to_be_32
1441 ((len + sizeof(struct rte_vlan_hdr)) |
1442 MLX5_ETH_WQE_DATA_INLINE);
1443 pdst = &dseg->inline_data[0];
1445 /* The entire legacy MPW session counter is stored on close. */
1446 pdst = (uint8_t *)dseg;
1448 memcpy(pdst, buf, MLX5_DSEG_MIN_INLINE_SIZE);
1449 buf += MLX5_DSEG_MIN_INLINE_SIZE;
1450 pdst += MLX5_DSEG_MIN_INLINE_SIZE;
1451 len -= MLX5_DSEG_MIN_INLINE_SIZE;
1452 /* Insert VLAN ethertype + VLAN tag. Pointer is aligned. */
1453 MLX5_ASSERT(pdst == RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE));
1454 if (unlikely(pdst >= (uint8_t *)txq->wqes_end))
1455 pdst = (uint8_t *)txq->wqes;
1456 *(uint32_t *)pdst = rte_cpu_to_be_32((RTE_ETHER_TYPE_VLAN << 16) |
1457 loc->mbuf->vlan_tci);
1458 pdst += sizeof(struct rte_vlan_hdr);
1460 * The WQEBB space availability is checked by caller.
1461 * Here we should be aware of WQE ring buffer wraparound only.
1463 part = (uint8_t *)txq->wqes_end - pdst;
1464 part = RTE_MIN(part, len);
1466 rte_memcpy(pdst, buf, part);
1470 if (!MLX5_TXOFF_CONFIG(MPW))
1471 pdst = RTE_PTR_ALIGN(pdst, MLX5_WSEG_SIZE);
1472 /* Note: no final wraparound check here. */
1473 return (struct mlx5_wqe_dseg *)pdst;
1475 pdst = (uint8_t *)txq->wqes;
1482 * Build the Ethernet Segment with optionally inlined data with
1483 * VLAN insertion and following Data Segments (if any) from
1484 * multi-segment packet. Used by ordinary send and TSO.
1487 * Pointer to TX queue structure.
1489 * Pointer to burst routine local context.
1491 * Pointer to WQE to fill with built Ethernet/Data Segments.
1493 * Length of VLAN header to insert, 0 means no VLAN insertion.
1495 * Data length to inline. For TSO this parameter specifies the exact value,
1496 * for the ordinary send routine it can be aligned by the caller to provide
1497 * better WQE space saving and data buffer start address alignment.
1498 * This length includes the VLAN header being inserted.
1500 * Zero means ordinary send, inlined data can be extended,
1501 * otherwise this is TSO, inlined data length is fixed.
1503 * Configured Tx offloads mask. It is fully defined at
1504 * compile time and may be used for optimization.
1507 * Actual size of built WQE in segments.
1509 static __rte_always_inline unsigned int
1510 mlx5_tx_mseg_build(struct mlx5_txq_data *__rte_restrict txq,
1511 struct mlx5_txq_local *__rte_restrict loc,
1512 struct mlx5_wqe *__rte_restrict wqe,
1516 unsigned int olx __rte_unused)
1518 struct mlx5_wqe_dseg *__rte_restrict dseg;
1521 MLX5_ASSERT((rte_pktmbuf_pkt_len(loc->mbuf) + vlan) >= inlen);
1522 loc->mbuf_nseg = NB_SEGS(loc->mbuf);
1525 dseg = mlx5_tx_eseg_mdat(txq, loc, wqe, vlan, inlen, tso, olx);
1526 if (!loc->mbuf_nseg)
1529 * There are still some mbufs remaining, not inlined.
1530 * The first mbuf may be partially inlined and we
1531 * must process the possible non-zero data offset.
1533 if (loc->mbuf_off) {
1538 * Exhausted packets must have been dropped before.
1539 * A non-zero offset means there is some data
1540 * remaining in the packet.
1542 MLX5_ASSERT(loc->mbuf_off < rte_pktmbuf_data_len(loc->mbuf));
1543 MLX5_ASSERT(rte_pktmbuf_data_len(loc->mbuf));
1544 dptr = rte_pktmbuf_mtod_offset(loc->mbuf, uint8_t *,
1546 dlen = rte_pktmbuf_data_len(loc->mbuf) - loc->mbuf_off;
1548 * Build the pointer/minimal Data Segment.
1549 * Do ring buffer wrapping check in advance.
1551 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1552 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1553 mlx5_tx_dseg_iptr(txq, loc, dseg, dptr, dlen, olx);
1554 /* Store the mbuf to be freed on completion. */
1555 MLX5_ASSERT(loc->elts_free);
1556 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1559 if (--loc->mbuf_nseg == 0)
1561 loc->mbuf = loc->mbuf->next;
1565 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
1566 struct rte_mbuf *mbuf;
1568 /* Zero length segment found, just skip. */
1570 loc->mbuf = loc->mbuf->next;
1571 rte_pktmbuf_free_seg(mbuf);
1572 if (--loc->mbuf_nseg == 0)
1575 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1576 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1579 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
1580 rte_pktmbuf_data_len(loc->mbuf), olx);
1581 MLX5_ASSERT(loc->elts_free);
1582 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1585 if (--loc->mbuf_nseg == 0)
1587 loc->mbuf = loc->mbuf->next;
1592 /* Calculate actual segments used from the dseg pointer. */
1593 if ((uintptr_t)wqe < (uintptr_t)dseg)
1594 ds = ((uintptr_t)dseg - (uintptr_t)wqe) / MLX5_WSEG_SIZE;
1596 ds = (((uintptr_t)dseg - (uintptr_t)wqe) +
1597 txq->wqe_s * MLX5_WQE_SIZE) / MLX5_WSEG_SIZE;
1602 * The routine checks the timestamp flag in the current packet,
1603 * and pushes a WAIT WQE into the queue if scheduling is required.
1606 * Pointer to TX queue structure.
1608 * Pointer to burst routine local context.
1610 * Configured Tx offloads mask. It is fully defined at
1611 * compile time and may be used for optimization.
1614 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1615 * MLX5_TXCMP_CODE_SINGLE - continue processing with the packet.
1616 * MLX5_TXCMP_CODE_MULTI - the WAIT WQE was inserted, continue processing.
1617 * Local context variables partially updated.
1619 static __rte_always_inline enum mlx5_txcmp_code
1620 mlx5_tx_schedule_send(struct mlx5_txq_data *restrict txq,
1621 struct mlx5_txq_local *restrict loc,
1624 if (MLX5_TXOFF_CONFIG(TXPP) &&
1625 loc->mbuf->ol_flags & txq->ts_mask) {
1626 struct mlx5_wqe *wqe;
1631 * Estimate the required space quickly and roughly.
1632 * We would like to ensure the packet can be pushed
1633 * to the queue and we won't get an orphan WAIT WQE.
1635 if (loc->wqe_free <= MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE ||
1636 loc->elts_free < NB_SEGS(loc->mbuf))
1637 return MLX5_TXCMP_CODE_EXIT;
1638 /* Convert the timestamp into the completion index to wait for. */
1639 ts = *RTE_MBUF_DYNFIELD(loc->mbuf, txq->ts_offset, uint64_t *);
1640 wci = mlx5_txpp_convert_tx_ts(txq->sh, ts);
1641 if (unlikely(wci < 0))
1642 return MLX5_TXCMP_CODE_SINGLE;
1643 /* Build the WAIT WQE with specified completion. */
1644 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1645 mlx5_tx_cseg_init(txq, loc, wqe, 2, MLX5_OPCODE_WAIT, olx);
1646 mlx5_tx_wseg_init(txq, loc, wqe, wci, olx);
1649 return MLX5_TXCMP_CODE_MULTI;
1651 return MLX5_TXCMP_CODE_SINGLE;
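/*
 * Sizing note (derived from the code above): the WAIT WQE is a Control
 * Segment (MLX5_OPCODE_WAIT, ds = 2) followed by a Synchronize Queue
 * Segment, i.e. (2 + 3) / 4 = 1 WQEBB with the rounding convention used
 * by the burst routines below.
 */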
1655 * Tx one packet function for multi-segment TSO. Supports all
1656 * types of Tx offloads, uses MLX5_OPCODE_TSO to build WQEs,
1657 * sends one packet per WQE.
1659 * This routine is responsible for storing the processed mbuf
1660 * into the elts ring buffer and updating elts_head.
1663 * Pointer to TX queue structure.
1665 * Pointer to burst routine local context.
1667 * Configured Tx offloads mask. It is fully defined at
1668 * compile time and may be used for optimization.
1671 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1672 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1673 * Local context variables partially updated.
1675 static __rte_always_inline enum mlx5_txcmp_code
1676 mlx5_tx_packet_multi_tso(struct mlx5_txq_data *__rte_restrict txq,
1677 struct mlx5_txq_local *__rte_restrict loc,
1680 struct mlx5_wqe *__rte_restrict wqe;
1681 unsigned int ds, dlen, inlen, ntcp, vlan = 0;
1683 if (MLX5_TXOFF_CONFIG(TXPP)) {
1684 enum mlx5_txcmp_code wret;
1686 /* Generate WAIT for scheduling if requested. */
1687 wret = mlx5_tx_schedule_send(txq, loc, olx);
1688 if (wret == MLX5_TXCMP_CODE_EXIT)
1689 return MLX5_TXCMP_CODE_EXIT;
1690 if (wret == MLX5_TXCMP_CODE_ERROR)
1691 return MLX5_TXCMP_CODE_ERROR;
1694 * Calculate data length to be inlined to estimate
1695 * the required space in WQE ring buffer.
1697 dlen = rte_pktmbuf_pkt_len(loc->mbuf);
1698 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1699 vlan = sizeof(struct rte_vlan_hdr);
1700 inlen = loc->mbuf->l2_len + vlan +
1701 loc->mbuf->l3_len + loc->mbuf->l4_len;
1702 if (unlikely((!inlen || !loc->mbuf->tso_segsz)))
1703 return MLX5_TXCMP_CODE_ERROR;
1704 if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
1705 inlen += loc->mbuf->outer_l2_len + loc->mbuf->outer_l3_len;
1706 /* Packet must contain all TSO headers. */
1707 if (unlikely(inlen > MLX5_MAX_TSO_HEADER ||
1708 inlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
1709 inlen > (dlen + vlan)))
1710 return MLX5_TXCMP_CODE_ERROR;
1712 * Check whether there are enough free WQEBBs:
1714 * - Ethernet Segment
1715 * - First Segment of inlined Ethernet data
1716 * - ... data continued ...
1717 * - Data Segments of pointer/min inline type
1719 ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
1720 MLX5_ESEG_MIN_INLINE_SIZE +
1722 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
1723 if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1724 return MLX5_TXCMP_CODE_EXIT;
1725 /* Check for maximal WQE size. */
1726 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
1727 return MLX5_TXCMP_CODE_ERROR;
1728 #ifdef MLX5_PMD_SOFT_COUNTERS
1729 /* Update sent data bytes/packets counters. */
1730 ntcp = (dlen - (inlen - vlan) + loc->mbuf->tso_segsz - 1) /
1731 loc->mbuf->tso_segsz;
1733 * One will be added for mbuf itself at the end of the mlx5_tx_burst
1734 * from loc->pkts_sent field.
1737 txq->stats.opackets += ntcp;
1738 txq->stats.obytes += dlen + vlan + ntcp * inlen;
1740 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1741 loc->wqe_last = wqe;
1742 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_TSO, olx);
1743 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 1, olx);
1744 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
1745 txq->wqe_ci += (ds + 3) / 4;
1746 loc->wqe_free -= (ds + 3) / 4;
1747 return MLX5_TXCMP_CODE_MULTI;
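/*
 * WQE size example for the TSO path above (a sketch; assumes
 * MLX5_WSEG_SIZE is 16 bytes and MLX5_ESEG_MIN_INLINE_SIZE is 18 bytes):
 * a 3-segment packet with inlen = 82 bytes of headers gives
 * ds = 3 + 2 + (82 - 18 + 15) / 16 = 9 segments, i.e. (9 + 3) / 4 = 3
 * WQEBBs consumed from loc->wqe_free.
 */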
1751 * Tx one packet function for multi-segment SEND. Supports all types of Tx
1752 * offloads, uses MLX5_OPCODE_SEND to build WQEs, sends one packet per WQE,
1753 * without any data inlining in Ethernet Segment.
1755 * This routine is responsible for storing the processed mbuf
1756 * into the elts ring buffer and updating elts_head.
1759 * Pointer to TX queue structure.
1761 * Pointer to burst routine local context.
1763 * Configured Tx offloads mask. It is fully defined at
1764 * compile time and may be used for optimization.
1767 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1768 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1769 * Local context variables partially updated.
1771 static __rte_always_inline enum mlx5_txcmp_code
1772 mlx5_tx_packet_multi_send(struct mlx5_txq_data *__rte_restrict txq,
1773 struct mlx5_txq_local *__rte_restrict loc,
1776 struct mlx5_wqe_dseg *__rte_restrict dseg;
1777 struct mlx5_wqe *__rte_restrict wqe;
1778 unsigned int ds, nseg;
1780 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
1781 if (MLX5_TXOFF_CONFIG(TXPP)) {
1782 enum mlx5_txcmp_code wret;
1784 /* Generate WAIT for scheduling if requested. */
1785 wret = mlx5_tx_schedule_send(txq, loc, olx);
1786 if (wret == MLX5_TXCMP_CODE_EXIT)
1787 return MLX5_TXCMP_CODE_EXIT;
1788 if (wret == MLX5_TXCMP_CODE_ERROR)
1789 return MLX5_TXCMP_CODE_ERROR;
1792 * No inlining at all: it means that saving CPU cycles was prioritized at
1793 * configuration time, so we should not copy any packet data to the WQE.
1795 nseg = NB_SEGS(loc->mbuf);
1797 if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
1798 return MLX5_TXCMP_CODE_EXIT;
1799 /* Check for maximal WQE size. */
1800 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
1801 return MLX5_TXCMP_CODE_ERROR;
1803 * Some Tx offloads may cause an error if the packet is not long enough,
1804 * check against assumed minimal length.
1806 if (rte_pktmbuf_pkt_len(loc->mbuf) <= MLX5_ESEG_MIN_INLINE_SIZE)
1807 return MLX5_TXCMP_CODE_ERROR;
1808 #ifdef MLX5_PMD_SOFT_COUNTERS
1809 /* Update sent data bytes counter. */
1810 txq->stats.obytes += rte_pktmbuf_pkt_len(loc->mbuf);
1811 if (MLX5_TXOFF_CONFIG(VLAN) &&
1812 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1813 txq->stats.obytes += sizeof(struct rte_vlan_hdr);
1816 * SEND WQE, one WQEBB:
1817 * - Control Segment, SEND opcode
1818 * - Ethernet Segment, optional VLAN, no inline
1819 * - Data Segments, pointer only type
1821 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
1822 loc->wqe_last = wqe;
1823 mlx5_tx_cseg_init(txq, loc, wqe, ds, MLX5_OPCODE_SEND, olx);
1824 mlx5_tx_eseg_none(txq, loc, wqe, olx);
1825 dseg = &wqe->dseg[0];
1827 if (unlikely(!rte_pktmbuf_data_len(loc->mbuf))) {
1828 struct rte_mbuf *mbuf;
1831 * Zero length segment found, have to correct total
1832 * size of WQE in segments.
1833 * It is supposed to be a rare occasion, so in the normal
1834 * case (no zero length segments) we avoid extra
1835 * writing to the Control Segment.
1838 wqe->cseg.sq_ds -= RTE_BE32(1);
1840 loc->mbuf = mbuf->next;
1841 rte_pktmbuf_free_seg(mbuf);
1847 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
1848 rte_pktmbuf_data_len(loc->mbuf), olx);
1849 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
1854 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
1855 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
1856 loc->mbuf = loc->mbuf->next;
1859 txq->wqe_ci += (ds + 3) / 4;
1860 loc->wqe_free -= (ds + 3) / 4;
1861 return MLX5_TXCMP_CODE_MULTI;
1865 * Tx one packet function for multi-segment SEND. Supports all
1866 * types of Tx offloads, uses MLX5_OPCODE_SEND to build WQEs,
1867 * sends one packet per WQE, with data inlining in
1868 * Ethernet Segment and minimal Data Segments.
1870 * This routine is responsible for storing the processed mbuf
1871 * into the elts ring buffer and updating elts_head.
1874 * Pointer to TX queue structure.
1876 * Pointer to burst routine local context.
1878 * Configured Tx offloads mask. It is fully defined at
1879 * compile time and may be used for optimization.
1882 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
1883 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
1884 * Local context variables partially updated.
1886 static __rte_always_inline enum mlx5_txcmp_code
1887 mlx5_tx_packet_multi_inline(struct mlx5_txq_data *__rte_restrict txq,
1888 struct mlx5_txq_local *__rte_restrict loc,
1891 struct mlx5_wqe *__rte_restrict wqe;
1892 unsigned int ds, inlen, dlen, vlan = 0;
1894 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
1895 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
1896 if (MLX5_TXOFF_CONFIG(TXPP)) {
1897 enum mlx5_txcmp_code wret;
1899 /* Generate WAIT for scheduling if requested. */
1900 wret = mlx5_tx_schedule_send(txq, loc, olx);
1901 if (wret == MLX5_TXCMP_CODE_EXIT)
1902 return MLX5_TXCMP_CODE_EXIT;
1903 if (wret == MLX5_TXCMP_CODE_ERROR)
1904 return MLX5_TXCMP_CODE_ERROR;
1907 * First calculate data length to be inlined
1908 * to estimate the required space for WQE.
1910 dlen = rte_pktmbuf_pkt_len(loc->mbuf);
1911 if (MLX5_TXOFF_CONFIG(VLAN) && loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
1912 vlan = sizeof(struct rte_vlan_hdr);
1913 inlen = dlen + vlan;
1914 /* Check against minimal length. */
1915 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
1916 return MLX5_TXCMP_CODE_ERROR;
1917 MLX5_ASSERT(txq->inlen_send >= MLX5_ESEG_MIN_INLINE_SIZE);
1918 if (inlen > txq->inlen_send ||
1919 loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE) {
1920 struct rte_mbuf *mbuf;
1925 nxlen = rte_pktmbuf_data_len(mbuf);
1927 * Packet length exceeds the allowed inline data length,
1928 * check whether the minimal inlining is required.
1930 if (txq->inlen_mode) {
1931 MLX5_ASSERT(txq->inlen_mode >=
1932 MLX5_ESEG_MIN_INLINE_SIZE);
1933 MLX5_ASSERT(txq->inlen_mode <= txq->inlen_send);
1934 inlen = RTE_MIN(txq->inlen_mode, inlen);
1935 } else if (vlan && !txq->vlan_en) {
1937 * VLAN insertion is requested and hardware does not
1938 * support the offload, will do with software inline.
1940 inlen = MLX5_ESEG_MIN_INLINE_SIZE;
1941 } else if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE ||
1942 nxlen > txq->inlen_send) {
1943 return mlx5_tx_packet_multi_send(txq, loc, olx);
1947 if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
1950 * Now we know the minimal amount of data is requested
1951 * to inline. Check whether we should inline the buffers
1952 * from the chain beginning to eliminate some mbufs.
1954 if (unlikely(nxlen <= txq->inlen_send)) {
1955 /* We can inline first mbuf at least. */
1956 if (nxlen < inlen) {
1959 /* Scan mbufs till inlen filled. */
1964 nxlen = rte_pktmbuf_data_len(mbuf);
1966 } while (unlikely(nxlen < inlen));
1967 if (unlikely(nxlen > txq->inlen_send)) {
1968 /* We cannot inline entire mbuf. */
1969 smlen = inlen - smlen;
1970 start = rte_pktmbuf_mtod_offset
1971 (mbuf, uintptr_t, smlen);
1979 /* This should not be the end of the packet. */
1981 if (mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
1983 nxlen = inlen + rte_pktmbuf_data_len(mbuf);
1984 } while (unlikely(nxlen < txq->inlen_send));
1986 start = rte_pktmbuf_mtod(mbuf, uintptr_t);
1988 * Check whether we can do inline to align start
1989 * address of data buffer to cacheline.
1992 start = (~start + 1) & (RTE_CACHE_LINE_SIZE - 1);
1993 if (unlikely(start)) {
1995 if (start <= txq->inlen_send)
2000 * Check whether there are enough free WQEBBs:
2002 * - Ethernet Segment
2003 * - First Segment of inlined Ethernet data
2004 * - ... data continued ...
2005 * - Data Segments of pointer/min inline type
2007 * Estimate the number of Data Segments conservatively,
2008 * supposing that no mbufs are freed during inlining.
2011 MLX5_ASSERT(inlen <= txq->inlen_send);
2012 ds = NB_SEGS(loc->mbuf) + 2 + (inlen -
2013 MLX5_ESEG_MIN_INLINE_SIZE +
2015 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2016 if (unlikely(loc->wqe_free < ((ds + 3) / 4)))
2017 return MLX5_TXCMP_CODE_EXIT;
2018 /* Check for maximal WQE size. */
2019 if (unlikely((MLX5_WQE_SIZE_MAX / MLX5_WSEG_SIZE) < ((ds + 3) / 4)))
2020 return MLX5_TXCMP_CODE_ERROR;
2021 #ifdef MLX5_PMD_SOFT_COUNTERS
2022 /* Update sent data bytes/packets counters. */
2023 txq->stats.obytes += dlen + vlan;
2025 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2026 loc->wqe_last = wqe;
2027 mlx5_tx_cseg_init(txq, loc, wqe, 0, MLX5_OPCODE_SEND, olx);
2028 ds = mlx5_tx_mseg_build(txq, loc, wqe, vlan, inlen, 0, olx);
2029 wqe->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2030 txq->wqe_ci += (ds + 3) / 4;
2031 loc->wqe_free -= (ds + 3) / 4;
2032 return MLX5_TXCMP_CODE_MULTI;
2036 * Tx burst function for multi-segment packets. Supports all
2037 * types of Tx offloads, uses MLX5_OPCODE_SEND/TSO to build WQEs,
2038 * sends one packet per WQE. Function stops sending if it
2039 * encounters the single-segment packet.
2041 * This routine is responsible for storing processed mbuf
2042 * into the elts ring buffer and updating elts_head.
2045 * Pointer to TX queue structure.
2047 * Packets to transmit.
2049 * Number of packets in array.
2051 * Pointer to burst routine local context.
2053 * Configured Tx offloads mask. It is fully defined at
2054 * compile time and may be used for optimization.
2057 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2058 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2059 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2060 * MLX5_TXCMP_CODE_TSO - TSO single-segment packet encountered.
2061 * Local context variables updated.
2063 static __rte_always_inline enum mlx5_txcmp_code
2064 mlx5_tx_burst_mseg(struct mlx5_txq_data *__rte_restrict txq,
2065 struct rte_mbuf **__rte_restrict pkts,
2066 unsigned int pkts_n,
2067 struct mlx5_txq_local *__rte_restrict loc,
2070 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2071 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2072 pkts += loc->pkts_sent + 1;
2073 pkts_n -= loc->pkts_sent;
2075 enum mlx5_txcmp_code ret;
2077 MLX5_ASSERT(NB_SEGS(loc->mbuf) > 1);
2079 * Estimate the number of free elts quickly but conservatively.
2080 * Some segment may be fully inlined and freed,
2081 * ignore this here - precise estimation is costly.
2083 if (loc->elts_free < NB_SEGS(loc->mbuf))
2084 return MLX5_TXCMP_CODE_EXIT;
2085 if (MLX5_TXOFF_CONFIG(TSO) &&
2086 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
2087 /* Proceed with multi-segment TSO. */
2088 ret = mlx5_tx_packet_multi_tso(txq, loc, olx);
2089 } else if (MLX5_TXOFF_CONFIG(INLINE)) {
2090 /* Proceed with multi-segment SEND with inlining. */
2091 ret = mlx5_tx_packet_multi_inline(txq, loc, olx);
2093 /* Proceed with multi-segment SEND w/o inlining. */
2094 ret = mlx5_tx_packet_multi_send(txq, loc, olx);
2096 if (ret == MLX5_TXCMP_CODE_EXIT)
2097 return MLX5_TXCMP_CODE_EXIT;
2098 if (ret == MLX5_TXCMP_CODE_ERROR)
2099 return MLX5_TXCMP_CODE_ERROR;
2100 /* WQE is built, go to the next packet. */
2103 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2104 return MLX5_TXCMP_CODE_EXIT;
2105 loc->mbuf = *pkts++;
2107 rte_prefetch0(*pkts);
2108 if (likely(NB_SEGS(loc->mbuf) > 1))
2110 /* Here ends the series of multi-segment packets. */
2111 if (MLX5_TXOFF_CONFIG(TSO) &&
2112 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2113 return MLX5_TXCMP_CODE_TSO;
2114 return MLX5_TXCMP_CODE_SINGLE;
2120 * Tx burst function for single-segment packets with TSO.
2121 * Supports all types of Tx offloads, except multi-packets.
2122 * Uses MLX5_OPCODE_TSO to build WQEs, sends one packet per WQE.
2123 * Function stops sending if it encounters the multi-segment
2124 * packet or packet without TSO requested.
2126 * The routine is responsible for storing the processed mbuf into the elts ring buffer
2127 * and updating elts_head if the inline offload is requested, due to possible early
2128 * freeing of the inlined mbufs (the pkts array cannot be stored in elts as a batch).
2131 * Pointer to TX queue structure.
2133 * Packets to transmit.
2135 * Number of packets in array.
2137 * Pointer to burst routine local context.
2139 * Configured Tx offloads mask. It is fully defined at
2140 * compile time and may be used for optimization.
2143 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2144 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2145 * MLX5_TXCMP_CODE_SINGLE - single-segment packet encountered.
2146 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2147 * Local context variables updated.
2149 static __rte_always_inline enum mlx5_txcmp_code
2150 mlx5_tx_burst_tso(struct mlx5_txq_data *__rte_restrict txq,
2151 struct rte_mbuf **__rte_restrict pkts,
2152 unsigned int pkts_n,
2153 struct mlx5_txq_local *__rte_restrict loc,
2156 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2157 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2158 pkts += loc->pkts_sent + 1;
2159 pkts_n -= loc->pkts_sent;
2161 struct mlx5_wqe_dseg *__rte_restrict dseg;
2162 struct mlx5_wqe *__rte_restrict wqe;
2163 unsigned int ds, dlen, hlen, ntcp, vlan = 0;
2166 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2167 if (MLX5_TXOFF_CONFIG(TXPP)) {
2168 enum mlx5_txcmp_code wret;
2170 /* Generate WAIT for scheduling if requested. */
2171 wret = mlx5_tx_schedule_send(txq, loc, olx);
2172 if (wret == MLX5_TXCMP_CODE_EXIT)
2173 return MLX5_TXCMP_CODE_EXIT;
2174 if (wret == MLX5_TXCMP_CODE_ERROR)
2175 return MLX5_TXCMP_CODE_ERROR;
2177 dlen = rte_pktmbuf_data_len(loc->mbuf);
2178 if (MLX5_TXOFF_CONFIG(VLAN) &&
2179 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
2180 vlan = sizeof(struct rte_vlan_hdr);
2183 * First calculate the WQE size to check
2184 * whether we have enough space in ring buffer.
2186 hlen = loc->mbuf->l2_len + vlan +
2187 loc->mbuf->l3_len + loc->mbuf->l4_len;
2188 if (unlikely((!hlen || !loc->mbuf->tso_segsz)))
2189 return MLX5_TXCMP_CODE_ERROR;
2190 if (loc->mbuf->ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)
2191 hlen += loc->mbuf->outer_l2_len +
2192 loc->mbuf->outer_l3_len;
2193 /* Segment must contain all TSO headers. */
2194 if (unlikely(hlen > MLX5_MAX_TSO_HEADER ||
2195 hlen <= MLX5_ESEG_MIN_INLINE_SIZE ||
2196 hlen > (dlen + vlan)))
2197 return MLX5_TXCMP_CODE_ERROR;
2199 * Check whether there are enough free WQEBBs:
2201 * - Ethernet Segment
2202 * - First Segment of inlined Ethernet data
2203 * - ... data continued ...
2204 * - Finishing Data Segment of pointer type
2206 ds = 4 + (hlen - MLX5_ESEG_MIN_INLINE_SIZE +
2207 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
2208 if (loc->wqe_free < ((ds + 3) / 4))
2209 return MLX5_TXCMP_CODE_EXIT;
2210 #ifdef MLX5_PMD_SOFT_COUNTERS
2211 /* Update sent data bytes/packets counters. */
2212 ntcp = (dlen + vlan - hlen +
2213 loc->mbuf->tso_segsz - 1) /
2214 loc->mbuf->tso_segsz;
2216 * One will be added for the mbuf itself at the end
2217 * of mlx5_tx_burst from the loc->pkts_sent field.
2220 txq->stats.opackets += ntcp;
2221 txq->stats.obytes += dlen + vlan + ntcp * hlen;
2224 * Build the TSO WQE:
2226 * - Ethernet Segment with hlen bytes inlined
2227 * - Data Segment of pointer type
2229 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2230 loc->wqe_last = wqe;
2231 mlx5_tx_cseg_init(txq, loc, wqe, ds,
2232 MLX5_OPCODE_TSO, olx);
2233 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan, hlen, 1, olx);
2234 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) + hlen - vlan;
2235 dlen -= hlen - vlan;
2236 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
2238 * WQE is built, update the loop parameters
2239 * and go to the next packet.
2241 txq->wqe_ci += (ds + 3) / 4;
2242 loc->wqe_free -= (ds + 3) / 4;
2243 if (MLX5_TXOFF_CONFIG(INLINE))
2244 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2248 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2249 return MLX5_TXCMP_CODE_EXIT;
2250 loc->mbuf = *pkts++;
2252 rte_prefetch0(*pkts);
2253 if (MLX5_TXOFF_CONFIG(MULTI) &&
2254 unlikely(NB_SEGS(loc->mbuf) > 1))
2255 return MLX5_TXCMP_CODE_MULTI;
2256 if (likely(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)))
2257 return MLX5_TXCMP_CODE_SINGLE;
2258 /* Continue with the next TSO packet. */
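/*
 * Illustrative sketch (not part of the PMD, a hypothetical helper): the soft
 * counters above estimate how many TCP segments the NIC emits for a TSO
 * packet and the resulting bytes on the wire, the headers being replicated
 * for every emitted segment. Plain parameters are assumed, with tso_segsz
 * known to be non-zero (it is checked earlier in the routine).
 */
static __rte_always_inline uint64_t
mlx5_tx_tso_wire_bytes_sketch(unsigned int dlen, unsigned int vlan,
                              unsigned int hlen, unsigned int tso_segsz)
{
        /* Number of TCP segments produced by the hardware, rounded up. */
        unsigned int ntcp = (dlen + vlan - hlen + tso_segsz - 1) / tso_segsz;

        /* Payload plus one replicated header per emitted segment. */
        return (uint64_t)(dlen + vlan) + (uint64_t)ntcp * hlen;
}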
2264 * Analyze the packet and select the best method to send.
2267 * Pointer to TX queue structure.
2269 * Pointer to burst routine local context.
2271 * Configured Tx offloads mask. It is fully defined at
2272 * compile time and may be used for optimization.
2274 * The predefined flag whether to do the complete check for
2275 * multi-segment packets and TSO.
2278 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2279 * MLX5_TXCMP_CODE_TSO - TSO required, use TSO/LSO.
2280 * MLX5_TXCMP_CODE_SINGLE - single-segment packet, use SEND.
2281 * MLX5_TXCMP_CODE_EMPW - single-segment packet, use MPW.
2283 static __rte_always_inline enum mlx5_txcmp_code
2284 mlx5_tx_able_to_empw(struct mlx5_txq_data *__rte_restrict txq,
2285 struct mlx5_txq_local *__rte_restrict loc,
2289 /* Check for multi-segment packet. */
2291 MLX5_TXOFF_CONFIG(MULTI) &&
2292 unlikely(NB_SEGS(loc->mbuf) > 1))
2293 return MLX5_TXCMP_CODE_MULTI;
2294 /* Check for TSO packet. */
2296 MLX5_TXOFF_CONFIG(TSO) &&
2297 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG))
2298 return MLX5_TXCMP_CODE_TSO;
2299 /* Check if eMPW is enabled at all. */
2300 if (!MLX5_TXOFF_CONFIG(EMPW))
2301 return MLX5_TXCMP_CODE_SINGLE;
2302 /* Check if eMPW can be engaged. */
2303 if (MLX5_TXOFF_CONFIG(VLAN) &&
2304 unlikely(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) &&
2305 (!MLX5_TXOFF_CONFIG(INLINE) ||
2306 unlikely((rte_pktmbuf_data_len(loc->mbuf) +
2307 sizeof(struct rte_vlan_hdr)) > txq->inlen_empw))) {
2309 * eMPW does not support the VLAN insertion offload; we would have to
2310 * inline the entire packet, but the packet is too long for inlining.
2312 return MLX5_TXCMP_CODE_SINGLE;
2314 return MLX5_TXCMP_CODE_EMPW;
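/*
 * Illustrative sketch (not part of the PMD, a hypothetical predicate): eMPW
 * has no VLAN insertion support, so a VLAN-tagged packet may stay on the
 * eMPW path only if inlining is enabled and the packet together with the
 * software-built VLAN header fits into the eMPW inline limit, mirroring the
 * check above. The parameter names are assumptions.
 */
static __rte_always_inline int
mlx5_tx_vlan_empw_ok_sketch(unsigned int data_len, unsigned int vlan_hdr_len,
                            unsigned int inlen_empw, int inline_en)
{
        /* Without inlining the VLAN tag cannot be inserted in eMPW mode. */
        if (!inline_en)
                return 0;
        /* The tagged packet must fit entirely into the eMPW inline limit. */
        return data_len + vlan_hdr_len <= inlen_empw;
}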
2318 * Check the next packet attributes to match with the eMPW batch ones.
2319 * In addition, for legacy MPW the packet length is checked as well.
2322 * Pointer to TX queue structure.
2324 * Pointer to Ethernet Segment of eMPW batch.
2326 * Pointer to burst routine local context.
2328 * Length of previous packet in MPW descriptor.
2330 * Configured Tx offloads mask. It is fully defined at
2331 * compile time and may be used for optimization.
2334 * true - packet matches the eMPW batch attributes.
2335 * false - no match, eMPW should be restarted.
2337 static __rte_always_inline bool
2338 mlx5_tx_match_empw(struct mlx5_txq_data *__rte_restrict txq,
2339 struct mlx5_wqe_eseg *__rte_restrict es,
2340 struct mlx5_txq_local *__rte_restrict loc,
2344 uint8_t swp_flags = 0;
2346 /* Compare the checksum flags, if any. */
2347 if (MLX5_TXOFF_CONFIG(CSUM) &&
2348 txq_ol_cksum_to_cs(loc->mbuf) != es->cs_flags)
2350 /* Compare the Software Parser offsets and flags. */
2351 if (MLX5_TXOFF_CONFIG(SWP) &&
2352 (es->swp_offs != txq_mbuf_to_swp(loc, &swp_flags, olx) ||
2353 es->swp_flags != swp_flags))
2355 /* Fill metadata field if needed. */
2356 if (MLX5_TXOFF_CONFIG(METADATA) &&
2357 es->metadata != (loc->mbuf->ol_flags & RTE_MBUF_DYNFLAG_TX_METADATA ?
2358 rte_cpu_to_be_32(*RTE_FLOW_DYNF_METADATA(loc->mbuf)) : 0))
2360 /* Legacy MPW can send packets with the same length only. */
2361 if (MLX5_TXOFF_CONFIG(MPW) &&
2362 dlen != rte_pktmbuf_data_len(loc->mbuf))
2364 /* There must be no VLAN packets in eMPW loop. */
2365 if (MLX5_TXOFF_CONFIG(VLAN))
2366 MLX5_ASSERT(!(loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN));
2367 /* Check if the scheduling is requested. */
2368 if (MLX5_TXOFF_CONFIG(TXPP) &&
2369 loc->mbuf->ol_flags & txq->ts_mask)
2375 * Update send loop variables and WQE for eMPW loop without data inlining.
2376 * Number of Data Segments is equal to the number of sent packets.
2379 * Pointer to TX queue structure.
2381 * Pointer to burst routine local context.
2383 * Number of packets (one Data Segment per packet).
2385 * Accumulated statistics, bytes sent.
2387 * Configured Tx offloads mask. It is fully defined at
2388 * compile time and may be used for optimization.
2391 * true - packet matches the eMPW batch attributes.
2392 * false - no match, eMPW should be restarted.
2394 static __rte_always_inline void
2395 mlx5_tx_sdone_empw(struct mlx5_txq_data *__rte_restrict txq,
2396 struct mlx5_txq_local *__rte_restrict loc,
2399 unsigned int olx __rte_unused)
2401 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2402 #ifdef MLX5_PMD_SOFT_COUNTERS
2403 /* Update sent data bytes counter. */
2404 txq->stats.obytes += slen;
2408 loc->elts_free -= ds;
2409 loc->pkts_sent += ds;
2411 loc->wqe_last->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | ds);
2412 txq->wqe_ci += (ds + 3) / 4;
2413 loc->wqe_free -= (ds + 3) / 4;
2417 * Update send loop variables and WQE for eMPW loop with data inlining.
2418 * Gets the size of pushed descriptors and data to the WQE.
2421 * Pointer to TX queue structure.
2423 * Pointer to burst routine local context.
2425 * Total size of descriptor/data in bytes.
2427 * Accumulated statistics, data bytes sent.
2429 * The base WQE for the eMPW/MPW descriptor.
2431 * Configured Tx offloads mask. It is fully defined at
2432 * compile time and may be used for optimization.
2435 * true - packet matches the eMPW batch attributes.
2436 * false - no match, eMPW should be restarted.
2438 static __rte_always_inline void
2439 mlx5_tx_idone_empw(struct mlx5_txq_data *__rte_restrict txq,
2440 struct mlx5_txq_local *__rte_restrict loc,
2443 struct mlx5_wqe *__rte_restrict wqem,
2444 unsigned int olx __rte_unused)
2446 struct mlx5_wqe_dseg *dseg = &wqem->dseg[0];
2448 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2449 #ifdef MLX5_PMD_SOFT_COUNTERS
2450 /* Update sent data bytes counter. */
2451 txq->stats.obytes += slen;
2455 if (MLX5_TXOFF_CONFIG(MPW) && dseg->bcount == RTE_BE32(0)) {
2457 * If the legacy MPW session contains inline packets,
2458 * we should set the length of the only inline data segment
2459 * and align the total length to the segment size.
2461 MLX5_ASSERT(len > sizeof(dseg->bcount));
2462 dseg->bcount = rte_cpu_to_be_32((len - sizeof(dseg->bcount)) |
2463 MLX5_ETH_WQE_DATA_INLINE);
2464 len = (len + MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE + 2;
2467 * The session is not legacy MPW, or it contains
2468 * data buffer pointer segments.
2470 MLX5_ASSERT((len % MLX5_WSEG_SIZE) == 0);
2471 len = len / MLX5_WSEG_SIZE + 2;
2473 wqem->cseg.sq_ds = rte_cpu_to_be_32(txq->qp_num_8s | len);
2474 txq->wqe_ci += (len + 3) / 4;
2475 loc->wqe_free -= (len + 3) / 4;
2476 loc->wqe_last = wqem;
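/*
 * Illustrative sketch (not part of the PMD, a hypothetical helper): the
 * routine above converts the accumulated Data Segment byte count into the
 * WQE DS field, adding two 16-byte segments for the Control and Ethernet
 * Segments of the eMPW title. The length is rounded up only for the legacy
 * MPW inline session; wseg_size is assumed to be MLX5_WSEG_SIZE.
 */
static __rte_always_inline unsigned int
mlx5_tx_empw_len_to_ds_sketch(unsigned int len, unsigned int wseg_size,
                              int inline_mpw)
{
        if (inline_mpw)
                /* Inline data may end in the middle of a segment, round up. */
                return (len + wseg_size - 1) / wseg_size + 2;
        /* Pointer Data Segments always occupy whole 16-byte segments. */
        return len / wseg_size + 2;
}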
2480 * The set of Tx burst functions for single-segment packets without TSO
2481 * and with Multi-Packet Writing feature support.
2482 * Supports all types of Tx offloads, except multi-packets and TSO.
2484 * Uses MLX5_OPCODE_EMPW to build WQEs if possible and sends as many packets
2485 * per WQE as it can. If eMPW is not configured or the packet cannot be sent with
2486 * eMPW (VLAN insertion), the ordinary SEND opcode is used and only one packet
2489 * The functions stop sending if they encounter a multi-segment packet or a packet
2490 * with TSO requested.
2492 * The routines are responsible for storing the processed mbuf into the elts ring buffer
2493 * and updating elts_head if the inlining offload is requested. Otherwise copying
2494 * mbufs to elts can be postponed and completed at the end of the burst routine.
2497 * Pointer to TX queue structure.
2499 * Packets to transmit.
2501 * Number of packets in array.
2503 * Pointer to burst routine local context.
2505 * Configured Tx offloads mask. It is fully defined at
2506 * compile time and may be used for optimization.
2509 * MLX5_TXCMP_CODE_EXIT - sending is done or impossible.
2510 * MLX5_TXCMP_CODE_ERROR - some unrecoverable error occurred.
2511 * MLX5_TXCMP_CODE_MULTI - multi-segment packet encountered.
2512 * MLX5_TXCMP_CODE_TSO - TSO packet encountered.
2513 * MLX5_TXCMP_CODE_SINGLE - used inside functions set.
2514 * MLX5_TXCMP_CODE_EMPW - used inside functions set.
2516 * Local context variables updated.
2519 * The routine sends packets with MLX5_OPCODE_EMPW
2520 * without inlining; this is a dedicated optimized branch.
2521 * No VLAN insertion is supported.
2523 static __rte_always_inline enum mlx5_txcmp_code
2524 mlx5_tx_burst_empw_simple(struct mlx5_txq_data *__rte_restrict txq,
2525 struct rte_mbuf **__rte_restrict pkts,
2526 unsigned int pkts_n,
2527 struct mlx5_txq_local *__rte_restrict loc,
2531 * This subroutine is part of mlx5_tx_burst_single() and sends
2532 * single-segment packets with the eMPW opcode without data inlining.
2534 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
2535 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2536 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2537 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2538 pkts += loc->pkts_sent + 1;
2539 pkts_n -= loc->pkts_sent;
2541 struct mlx5_wqe_dseg *__rte_restrict dseg;
2542 struct mlx5_wqe_eseg *__rte_restrict eseg;
2543 enum mlx5_txcmp_code ret;
2544 unsigned int part, loop;
2545 unsigned int slen = 0;
2548 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2549 if (MLX5_TXOFF_CONFIG(TXPP)) {
2550 enum mlx5_txcmp_code wret;
2552 /* Generate WAIT for scheduling if requested. */
2553 wret = mlx5_tx_schedule_send(txq, loc, olx);
2554 if (wret == MLX5_TXCMP_CODE_EXIT)
2555 return MLX5_TXCMP_CODE_EXIT;
2556 if (wret == MLX5_TXCMP_CODE_ERROR)
2557 return MLX5_TXCMP_CODE_ERROR;
2559 part = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2560 MLX5_MPW_MAX_PACKETS :
2561 MLX5_EMPW_MAX_PACKETS);
2562 if (unlikely(loc->elts_free < part)) {
2563 /* We do not have enough elts to save all mbufs. */
2564 if (unlikely(loc->elts_free < MLX5_EMPW_MIN_PACKETS))
2565 return MLX5_TXCMP_CODE_EXIT;
2566 /* But we are still able to send at least a minimal eMPW. */
2567 part = loc->elts_free;
2569 /* Check whether we have enough WQEs */
2570 if (unlikely(loc->wqe_free < ((2 + part + 3) / 4))) {
2571 if (unlikely(loc->wqe_free <
2572 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2573 return MLX5_TXCMP_CODE_EXIT;
2574 part = (loc->wqe_free * 4) - 2;
2576 if (likely(part > 1))
2577 rte_prefetch0(*pkts);
2578 loc->wqe_last = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2580 * Build eMPW title WQEBB:
2581 * - Control Segment, eMPW opcode
2582 * - Ethernet Segment, no inline
2584 mlx5_tx_cseg_init(txq, loc, loc->wqe_last, part + 2,
2585 MLX5_OPCODE_ENHANCED_MPSW, olx);
2586 mlx5_tx_eseg_none(txq, loc, loc->wqe_last,
2587 olx & ~MLX5_TXOFF_CONFIG_VLAN);
2588 eseg = &loc->wqe_last->eseg;
2589 dseg = &loc->wqe_last->dseg[0];
2591 /* Store the packet length for legacy MPW. */
2592 if (MLX5_TXOFF_CONFIG(MPW))
2593 eseg->mss = rte_cpu_to_be_16
2594 (rte_pktmbuf_data_len(loc->mbuf));
2596 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2597 #ifdef MLX5_PMD_SOFT_COUNTERS
2598 /* Update sent data bytes counter. */
2603 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
2605 if (unlikely(--loop == 0))
2607 loc->mbuf = *pkts++;
2608 if (likely(loop > 1))
2609 rte_prefetch0(*pkts);
2610 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2612 * Unroll the completion code to avoid
2613 * returning a variable value - it results in
2614 * unoptimized sequential checking in the caller.
2616 if (ret == MLX5_TXCMP_CODE_MULTI) {
2618 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2619 if (unlikely(!loc->elts_free ||
2621 return MLX5_TXCMP_CODE_EXIT;
2622 return MLX5_TXCMP_CODE_MULTI;
2624 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2625 if (ret == MLX5_TXCMP_CODE_TSO) {
2627 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2628 if (unlikely(!loc->elts_free ||
2630 return MLX5_TXCMP_CODE_EXIT;
2631 return MLX5_TXCMP_CODE_TSO;
2633 if (ret == MLX5_TXCMP_CODE_SINGLE) {
2635 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2636 if (unlikely(!loc->elts_free ||
2638 return MLX5_TXCMP_CODE_EXIT;
2639 return MLX5_TXCMP_CODE_SINGLE;
2641 if (ret != MLX5_TXCMP_CODE_EMPW) {
2644 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2645 return MLX5_TXCMP_CODE_ERROR;
2648 * Check whether packet parameters coincide
2649 * within assumed eMPW batch:
2650 * - checksum settings
2652 * - software parser settings
2653 * - packet length (legacy MPW only)
2654 * - scheduling is not required
2656 if (!mlx5_tx_match_empw(txq, eseg, loc, dlen, olx)) {
2659 mlx5_tx_sdone_empw(txq, loc, part, slen, olx);
2660 if (unlikely(!loc->elts_free ||
2662 return MLX5_TXCMP_CODE_EXIT;
2666 /* Packet attributes match, continue the same eMPW. */
2668 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2669 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2671 /* eMPW is built successfully, update loop parameters. */
2673 MLX5_ASSERT(pkts_n >= part);
2674 #ifdef MLX5_PMD_SOFT_COUNTERS
2675 /* Update sent data bytes counter. */
2676 txq->stats.obytes += slen;
2678 loc->elts_free -= part;
2679 loc->pkts_sent += part;
2680 txq->wqe_ci += (2 + part + 3) / 4;
2681 loc->wqe_free -= (2 + part + 3) / 4;
2683 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
2684 return MLX5_TXCMP_CODE_EXIT;
2685 loc->mbuf = *pkts++;
2686 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2687 if (unlikely(ret != MLX5_TXCMP_CODE_EMPW))
2689 /* Continue sending eMPW batches. */
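/*
 * Illustrative sketch (not part of the PMD, a hypothetical helper): a
 * non-inline eMPW session occupies two 16-byte segments for the title
 * (Control + Ethernet) plus one pointer Data Segment per packet, hence the
 * (2 + part + 3) / 4 WQEBB accounting used above.
 */
static __rte_always_inline unsigned int
mlx5_tx_empw_wqebb_sketch(unsigned int npkts)
{
        /* Title segments plus one DSEG per packet, rounded up to WQEBBs. */
        return (2 + npkts + 3) / 4;
}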
2695 * The routine sends packets with MLX5_OPCODE_EMPW
2696 * with inlining; VLAN insertion is optionally supported.
2698 static __rte_always_inline enum mlx5_txcmp_code
2699 mlx5_tx_burst_empw_inline(struct mlx5_txq_data *__rte_restrict txq,
2700 struct rte_mbuf **__rte_restrict pkts,
2701 unsigned int pkts_n,
2702 struct mlx5_txq_local *__rte_restrict loc,
2706 * This subroutine is part of mlx5_tx_burst_single() and sends
2707 * single-segment packets with the eMPW opcode with data inlining.
2709 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
2710 MLX5_ASSERT(MLX5_TXOFF_CONFIG(EMPW));
2711 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
2712 MLX5_ASSERT(pkts_n > loc->pkts_sent);
2713 pkts += loc->pkts_sent + 1;
2714 pkts_n -= loc->pkts_sent;
2716 struct mlx5_wqe_dseg *__rte_restrict dseg;
2717 struct mlx5_wqe *__rte_restrict wqem;
2718 enum mlx5_txcmp_code ret;
2719 unsigned int room, part, nlim;
2720 unsigned int slen = 0;
2722 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2723 if (MLX5_TXOFF_CONFIG(TXPP)) {
2724 enum mlx5_txcmp_code wret;
2726 /* Generate WAIT for scheduling if requested. */
2727 wret = mlx5_tx_schedule_send(txq, loc, olx);
2728 if (wret == MLX5_TXCMP_CODE_EXIT)
2729 return MLX5_TXCMP_CODE_EXIT;
2730 if (wret == MLX5_TXCMP_CODE_ERROR)
2731 return MLX5_TXCMP_CODE_ERROR;
2734 * Limit the number of packets in one WQE
2735 * to improve CQE generation latency.
2737 nlim = RTE_MIN(pkts_n, MLX5_TXOFF_CONFIG(MPW) ?
2738 MLX5_MPW_INLINE_MAX_PACKETS :
2739 MLX5_EMPW_MAX_PACKETS);
2740 /* Check whether we have the minimal amount of WQEs. */
2741 if (unlikely(loc->wqe_free <
2742 ((2 + MLX5_EMPW_MIN_PACKETS + 3) / 4)))
2743 return MLX5_TXCMP_CODE_EXIT;
2744 if (likely(pkts_n > 1))
2745 rte_prefetch0(*pkts);
2746 wqem = txq->wqes + (txq->wqe_ci & txq->wqe_m);
2748 * Build eMPW title WQEBB:
2749 * - Control Segment, eMPW opcode, zero DS
2750 * - Ethernet Segment, no inline
2752 mlx5_tx_cseg_init(txq, loc, wqem, 0,
2753 MLX5_OPCODE_ENHANCED_MPSW, olx);
2754 mlx5_tx_eseg_none(txq, loc, wqem,
2755 olx & ~MLX5_TXOFF_CONFIG_VLAN);
2756 dseg = &wqem->dseg[0];
2757 /* Store the packet length for legacy MPW. */
2758 if (MLX5_TXOFF_CONFIG(MPW))
2759 wqem->eseg.mss = rte_cpu_to_be_16
2760 (rte_pktmbuf_data_len(loc->mbuf));
2761 room = RTE_MIN(MLX5_WQE_SIZE_MAX / MLX5_WQE_SIZE,
2762 loc->wqe_free) * MLX5_WQE_SIZE -
2763 MLX5_WQE_CSEG_SIZE -
2765 /* Limit the room for legacy MPW sessions for performance. */
2766 if (MLX5_TXOFF_CONFIG(MPW))
2767 room = RTE_MIN(room,
2768 RTE_MAX(txq->inlen_empw +
2769 sizeof(dseg->bcount) +
2770 (MLX5_TXOFF_CONFIG(VLAN) ?
2771 sizeof(struct rte_vlan_hdr) : 0),
2772 MLX5_MPW_INLINE_MAX_PACKETS *
2773 MLX5_WQE_DSEG_SIZE));
2774 /* Build WQEs while we have space, packets and resources. */
2777 uint32_t dlen = rte_pktmbuf_data_len(loc->mbuf);
2778 uint8_t *dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *);
2781 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
2782 MLX5_ASSERT((room % MLX5_WQE_DSEG_SIZE) == 0);
2783 MLX5_ASSERT((uintptr_t)dseg < (uintptr_t)txq->wqes_end);
2785 * Some Tx offloads may cause an error if the packet is not
2786 * long enough; check against the assumed minimal length.
2788 if (unlikely(dlen <= MLX5_ESEG_MIN_INLINE_SIZE)) {
2790 if (unlikely(!part))
2791 return MLX5_TXCMP_CODE_ERROR;
2793 * We have some successfully built
2794 * packet Data Segments to send.
2796 mlx5_tx_idone_empw(txq, loc, part,
2798 return MLX5_TXCMP_CODE_ERROR;
2800 /* Inline or not inline - that's the Question. */
2801 if (dlen > txq->inlen_empw ||
2802 loc->mbuf->ol_flags & RTE_MBUF_F_TX_DYNF_NOINLINE)
2804 if (MLX5_TXOFF_CONFIG(MPW)) {
2805 if (dlen > txq->inlen_send)
2809 /* Open new inline MPW session. */
2810 tlen += sizeof(dseg->bcount);
2811 dseg->bcount = RTE_BE32(0);
2813 (dseg, sizeof(dseg->bcount));
2816 * No pointer and inline descriptor
2817 * intermix for legacy MPW sessions.
2819 if (wqem->dseg[0].bcount)
2823 tlen = sizeof(dseg->bcount) + dlen;
2825 /* Inline entire packet, optional VLAN insertion. */
2826 if (MLX5_TXOFF_CONFIG(VLAN) &&
2827 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
2829 * The packet length must be checked in
2830 * mlx5_tx_able_to_empw(); the packet is
2831 * guaranteed to fit into the inline length.
2834 sizeof(struct rte_vlan_hdr)) <=
2836 tlen += sizeof(struct rte_vlan_hdr);
2839 dseg = mlx5_tx_dseg_vlan(txq, loc, dseg,
2841 #ifdef MLX5_PMD_SOFT_COUNTERS
2842 /* Update sent data bytes counter. */
2843 slen += sizeof(struct rte_vlan_hdr);
2848 dseg = mlx5_tx_dseg_empw(txq, loc, dseg,
2851 if (!MLX5_TXOFF_CONFIG(MPW))
2852 tlen = RTE_ALIGN(tlen, MLX5_WSEG_SIZE);
2853 MLX5_ASSERT(room >= tlen);
2856 * Packet data are completely inlined,
2857 * we can try to free the packet.
2859 if (likely(loc->pkts_sent == loc->mbuf_free)) {
2861 * All the packets from the burst beginning
2862 * are inlined, we can free the mbufs directly
2863 * from the original array on tx_burst() exit.
2869 * In order not to call rte_pktmbuf_free_seg() here,
2870 * in the innermost loop (which might be very
2871 * expensive), we just save the mbuf in elts.
2873 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2878 * No pointer and inline descriptor
2879 * intermix for legacy MPW sessions.
2881 if (MLX5_TXOFF_CONFIG(MPW) &&
2883 wqem->dseg[0].bcount == RTE_BE32(0))
2886 * Not inlinable VLAN packets are
2887 * processed outside of this routine.
2889 MLX5_ASSERT(room >= MLX5_WQE_DSEG_SIZE);
2890 if (MLX5_TXOFF_CONFIG(VLAN))
2891 MLX5_ASSERT(!(loc->mbuf->ol_flags &
2892 RTE_MBUF_F_TX_VLAN));
2893 mlx5_tx_dseg_ptr(txq, loc, dseg, dptr, dlen, olx);
2894 /* We have to store the mbuf in elts. */
2895 txq->elts[txq->elts_head++ & txq->elts_m] = loc->mbuf;
2897 room -= MLX5_WQE_DSEG_SIZE;
2898 /* Ring buffer wraparound is checked at the loop end. */
2901 #ifdef MLX5_PMD_SOFT_COUNTERS
2902 /* Update sent data bytes counter. */
2907 if (unlikely(!pkts_n || !loc->elts_free)) {
2909 * We have no resources/packets to
2910 * continue building descriptors.
2913 mlx5_tx_idone_empw(txq, loc, part,
2915 return MLX5_TXCMP_CODE_EXIT;
2917 loc->mbuf = *pkts++;
2918 if (likely(pkts_n > 1))
2919 rte_prefetch0(*pkts);
2920 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
2922 * Unroll the completion code to avoid
2923 * returning a variable value - it results in
2924 * unoptimized sequential checking in the caller.
2926 if (ret == MLX5_TXCMP_CODE_MULTI) {
2928 mlx5_tx_idone_empw(txq, loc, part,
2930 if (unlikely(!loc->elts_free ||
2932 return MLX5_TXCMP_CODE_EXIT;
2933 return MLX5_TXCMP_CODE_MULTI;
2935 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
2936 if (ret == MLX5_TXCMP_CODE_TSO) {
2938 mlx5_tx_idone_empw(txq, loc, part,
2940 if (unlikely(!loc->elts_free ||
2942 return MLX5_TXCMP_CODE_EXIT;
2943 return MLX5_TXCMP_CODE_TSO;
2945 if (ret == MLX5_TXCMP_CODE_SINGLE) {
2947 mlx5_tx_idone_empw(txq, loc, part,
2949 if (unlikely(!loc->elts_free ||
2951 return MLX5_TXCMP_CODE_EXIT;
2952 return MLX5_TXCMP_CODE_SINGLE;
2954 if (ret != MLX5_TXCMP_CODE_EMPW) {
2957 mlx5_tx_idone_empw(txq, loc, part,
2959 return MLX5_TXCMP_CODE_ERROR;
2961 /* Check if we have minimal room left. */
2963 if (unlikely(!nlim || room < MLX5_WQE_DSEG_SIZE))
2966 * Check whether packet parameters coincide
2967 * within assumed eMPW batch:
2968 * - checksum settings
2970 * - software parser settings
2971 * - packet length (legacy MPW only)
2972 * - scheduling is not required
2974 if (!mlx5_tx_match_empw(txq, &wqem->eseg,
2977 /* Packet attributes match, continue the same eMPW. */
2978 if ((uintptr_t)dseg >= (uintptr_t)txq->wqes_end)
2979 dseg = (struct mlx5_wqe_dseg *)txq->wqes;
2982 * We get here to close an existing eMPW
2983 * session and start a new one.
2985 MLX5_ASSERT(pkts_n);
2987 if (unlikely(!part))
2988 return MLX5_TXCMP_CODE_EXIT;
2989 mlx5_tx_idone_empw(txq, loc, part, slen, wqem, olx);
2990 if (unlikely(!loc->elts_free ||
2992 return MLX5_TXCMP_CODE_EXIT;
2993 /* Continue the loop with new eMPW session. */
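/*
 * Illustrative sketch (not part of the PMD, a hypothetical helper): in the
 * loops above the Data Segment pointer is advanced linearly and wrapped back
 * to the start of the WQE ring once it reaches wqes_end.
 */
static __rte_always_inline struct mlx5_wqe_dseg *
mlx5_tx_dseg_wrap_sketch(struct mlx5_wqe_dseg *dseg,
                         void *wqes, void *wqes_end)
{
        /* Wrap around to the ring beginning when passing the end. */
        if ((uintptr_t)dseg >= (uintptr_t)wqes_end)
                dseg = (struct mlx5_wqe_dseg *)wqes;
        return dseg;
}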
2999 * The routine sends packets with ordinary MLX5_OPCODE_SEND.
3000 * Data inlining and VLAN insertion are supported.
3002 static __rte_always_inline enum mlx5_txcmp_code
3003 mlx5_tx_burst_single_send(struct mlx5_txq_data *__rte_restrict txq,
3004 struct rte_mbuf **__rte_restrict pkts,
3005 unsigned int pkts_n,
3006 struct mlx5_txq_local *__rte_restrict loc,
3010 * This subroutine is part of mlx5_tx_burst_single()
3011 * and sends single-segment packets with the SEND opcode.
3013 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3014 MLX5_ASSERT(pkts_n > loc->pkts_sent);
3015 pkts += loc->pkts_sent + 1;
3016 pkts_n -= loc->pkts_sent;
3018 struct mlx5_wqe *__rte_restrict wqe;
3019 enum mlx5_txcmp_code ret;
3021 MLX5_ASSERT(NB_SEGS(loc->mbuf) == 1);
3022 if (MLX5_TXOFF_CONFIG(TXPP)) {
3023 enum mlx5_txcmp_code wret;
3025 /* Generate WAIT for scheduling if requested. */
3026 wret = mlx5_tx_schedule_send(txq, loc, olx);
3027 if (wret == MLX5_TXCMP_CODE_EXIT)
3028 return MLX5_TXCMP_CODE_EXIT;
3029 if (wret == MLX5_TXCMP_CODE_ERROR)
3030 return MLX5_TXCMP_CODE_ERROR;
3032 if (MLX5_TXOFF_CONFIG(INLINE)) {
3033 unsigned int inlen, vlan = 0;
3035 inlen = rte_pktmbuf_data_len(loc->mbuf);
3036 if (MLX5_TXOFF_CONFIG(VLAN) &&
3037 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN) {
3038 vlan = sizeof(struct rte_vlan_hdr);
3042 * If inlining is enabled at configuration time,
3043 * the limit must not be less than the minimal size.
3044 * Otherwise we would need an extra check for the data
3045 * size to avoid crashes due to length overflow.
3047 MLX5_ASSERT(txq->inlen_send >=
3048 MLX5_ESEG_MIN_INLINE_SIZE);
3049 if (inlen <= txq->inlen_send) {
3050 unsigned int seg_n, wqe_n;
3052 rte_prefetch0(rte_pktmbuf_mtod
3053 (loc->mbuf, uint8_t *));
3054 /* Check against minimal length. */
3055 if (inlen <= MLX5_ESEG_MIN_INLINE_SIZE)
3056 return MLX5_TXCMP_CODE_ERROR;
3057 if (loc->mbuf->ol_flags &
3058 RTE_MBUF_F_TX_DYNF_NOINLINE) {
3060 * The hint flag not to inline packet
3061 * data is set. Check whether we can
3064 if ((!MLX5_TXOFF_CONFIG(EMPW) &&
3066 (MLX5_TXOFF_CONFIG(MPW) &&
3068 if (inlen <= txq->inlen_send)
3071 * The hardware requires the
3072 * minimal inline data header.
3074 goto single_min_inline;
3076 if (MLX5_TXOFF_CONFIG(VLAN) &&
3077 vlan && !txq->vlan_en) {
3079 * We must insert VLAN tag
3080 * by software means.
3082 goto single_part_inline;
3084 goto single_no_inline;
3088 * Completely inlined packet data WQE:
3089 * - Control Segment, SEND opcode
3090 * - Ethernet Segment, no VLAN insertion
3091 * - Data inlined, VLAN optionally inserted
3092 * - Alignment to MLX5_WSEG_SIZE
3093 * Have to estimate amount of WQEBBs
3095 seg_n = (inlen + 3 * MLX5_WSEG_SIZE -
3096 MLX5_ESEG_MIN_INLINE_SIZE +
3097 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3098 /* Check if there are enough WQEBBs. */
3099 wqe_n = (seg_n + 3) / 4;
3100 if (wqe_n > loc->wqe_free)
3101 return MLX5_TXCMP_CODE_EXIT;
3102 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3103 loc->wqe_last = wqe;
3104 mlx5_tx_cseg_init(txq, loc, wqe, seg_n,
3105 MLX5_OPCODE_SEND, olx);
3106 mlx5_tx_eseg_data(txq, loc, wqe,
3107 vlan, inlen, 0, olx);
3108 txq->wqe_ci += wqe_n;
3109 loc->wqe_free -= wqe_n;
3111 * Packet data are completely inlined,
3112 * free the packet immediately.
3114 rte_pktmbuf_free_seg(loc->mbuf);
3115 } else if ((!MLX5_TXOFF_CONFIG(EMPW) ||
3116 MLX5_TXOFF_CONFIG(MPW)) &&
3119 * If minimal inlining is requested, the eMPW
3120 * feature should be disabled because the data is
3121 * inlined into the Ethernet Segment, which
3122 * cannot contain inlined data for eMPW since the
3123 * segment is shared by all packets.
3125 struct mlx5_wqe_dseg *__rte_restrict dseg;
3130 * The inline-mode settings require
3131 * inlining the specified amount of
3132 * data bytes into the Ethernet Segment.
3133 * We should check the free space in the
3134 * WQE ring buffer to inline partially.
3137 MLX5_ASSERT(txq->inlen_send >= txq->inlen_mode);
3138 MLX5_ASSERT(inlen > txq->inlen_mode);
3139 MLX5_ASSERT(txq->inlen_mode >=
3140 MLX5_ESEG_MIN_INLINE_SIZE);
3142 * Check whether there are enough free WQEBBs:
3144 * - Ethernet Segment
3145 * - First Segment of inlined Ethernet data
3146 * - ... data continued ...
3147 * - Finishing Data Segment of pointer type
3149 ds = (MLX5_WQE_CSEG_SIZE +
3150 MLX5_WQE_ESEG_SIZE +
3151 MLX5_WQE_DSEG_SIZE +
3153 MLX5_ESEG_MIN_INLINE_SIZE +
3154 MLX5_WQE_DSEG_SIZE +
3155 MLX5_WSEG_SIZE - 1) / MLX5_WSEG_SIZE;
3156 if (loc->wqe_free < ((ds + 3) / 4))
3157 return MLX5_TXCMP_CODE_EXIT;
3159 * Build the ordinary SEND WQE:
3161 * - Ethernet Segment, inline inlen_mode bytes
3162 * - Data Segment of pointer type
3164 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3165 loc->wqe_last = wqe;
3166 mlx5_tx_cseg_init(txq, loc, wqe, ds,
3167 MLX5_OPCODE_SEND, olx);
3168 dseg = mlx5_tx_eseg_data(txq, loc, wqe, vlan,
3171 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3172 txq->inlen_mode - vlan;
3173 inlen -= txq->inlen_mode;
3174 mlx5_tx_dseg_ptr(txq, loc, dseg,
3177 * WQE is built, update the loop parameters
3178 * and go to the next packet.
3180 txq->wqe_ci += (ds + 3) / 4;
3181 loc->wqe_free -= (ds + 3) / 4;
3182 /* We have to store the mbuf in elts. */
3183 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3184 txq->elts[txq->elts_head++ & txq->elts_m] =
3192 * Partially inlined packet data WQE: we have
3193 * some space in the title WQEBB, we can fill it
3194 * with some packet data. It takes one WQEBB,
3195 * which is available; no extra space check:
3196 * - Control Segment, SEND opcode
3197 * - Ethernet Segment, no VLAN insertion
3198 * - MLX5_ESEG_MIN_INLINE_SIZE bytes of Data
3199 * - Data Segment, pointer type
3201 * We also get here if VLAN insertion is not
3202 * supported by HW but inlining is enabled.
3205 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3206 loc->wqe_last = wqe;
3207 mlx5_tx_cseg_init(txq, loc, wqe, 4,
3208 MLX5_OPCODE_SEND, olx);
3209 mlx5_tx_eseg_dmin(txq, loc, wqe, vlan, olx);
3210 dptr = rte_pktmbuf_mtod(loc->mbuf, uint8_t *) +
3211 MLX5_ESEG_MIN_INLINE_SIZE - vlan;
3213 * The length check is performed above, by
3214 * comparing with txq->inlen_send. We should
3215 * not get overflow here.
3217 MLX5_ASSERT(inlen > MLX5_ESEG_MIN_INLINE_SIZE);
3218 dlen = inlen - MLX5_ESEG_MIN_INLINE_SIZE;
3219 mlx5_tx_dseg_ptr(txq, loc, &wqe->dseg[1],
3223 /* We have to store the mbuf in elts. */
3224 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE));
3225 txq->elts[txq->elts_head++ & txq->elts_m] =
3229 #ifdef MLX5_PMD_SOFT_COUNTERS
3230 /* Update sent data bytes counter. */
3231 txq->stats.obytes += vlan +
3232 rte_pktmbuf_data_len(loc->mbuf);
3236 * No inlining at all; it means that saving CPU cycles
3237 * was prioritized at configuration time, we should not
3238 * copy any packet data to the WQE.
3240 * SEND WQE, one WQEBB:
3241 * - Control Segment, SEND opcode
3242 * - Ethernet Segment, optional VLAN, no inline
3243 * - Data Segment, pointer type
3246 wqe = txq->wqes + (txq->wqe_ci & txq->wqe_m);
3247 loc->wqe_last = wqe;
3248 mlx5_tx_cseg_init(txq, loc, wqe, 3,
3249 MLX5_OPCODE_SEND, olx);
3250 mlx5_tx_eseg_none(txq, loc, wqe, olx);
3252 (txq, loc, &wqe->dseg[0],
3253 rte_pktmbuf_mtod(loc->mbuf, uint8_t *),
3254 rte_pktmbuf_data_len(loc->mbuf), olx);
3258 * We should not store the mbuf pointer in elts
3259 * if no inlining is configured; this is done
3260 * by the calling routine in a batch copy.
3262 MLX5_ASSERT(!MLX5_TXOFF_CONFIG(INLINE));
3264 #ifdef MLX5_PMD_SOFT_COUNTERS
3265 /* Update sent data bytes counter. */
3266 txq->stats.obytes += rte_pktmbuf_data_len(loc->mbuf);
3267 if (MLX5_TXOFF_CONFIG(VLAN) &&
3268 loc->mbuf->ol_flags & RTE_MBUF_F_TX_VLAN)
3269 txq->stats.obytes +=
3270 sizeof(struct rte_vlan_hdr);
3275 if (unlikely(!pkts_n || !loc->elts_free || !loc->wqe_free))
3276 return MLX5_TXCMP_CODE_EXIT;
3277 loc->mbuf = *pkts++;
3279 rte_prefetch0(*pkts);
3280 ret = mlx5_tx_able_to_empw(txq, loc, olx, true);
3281 if (unlikely(ret != MLX5_TXCMP_CODE_SINGLE))
3287 static __rte_always_inline enum mlx5_txcmp_code
3288 mlx5_tx_burst_single(struct mlx5_txq_data *__rte_restrict txq,
3289 struct rte_mbuf **__rte_restrict pkts,
3290 unsigned int pkts_n,
3291 struct mlx5_txq_local *__rte_restrict loc,
3294 enum mlx5_txcmp_code ret;
3296 ret = mlx5_tx_able_to_empw(txq, loc, olx, false);
3297 if (ret == MLX5_TXCMP_CODE_SINGLE)
3299 MLX5_ASSERT(ret == MLX5_TXCMP_CODE_EMPW);
3301 /* Optimize for inline/no inline eMPW send. */
3302 ret = (MLX5_TXOFF_CONFIG(INLINE)) ?
3303 mlx5_tx_burst_empw_inline
3304 (txq, pkts, pkts_n, loc, olx) :
3305 mlx5_tx_burst_empw_simple
3306 (txq, pkts, pkts_n, loc, olx);
3307 if (ret != MLX5_TXCMP_CODE_SINGLE)
3309 /* The resources to send one packet should remain. */
3310 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3312 ret = mlx5_tx_burst_single_send(txq, pkts, pkts_n, loc, olx);
3313 MLX5_ASSERT(ret != MLX5_TXCMP_CODE_SINGLE);
3314 if (ret != MLX5_TXCMP_CODE_EMPW)
3316 /* The resources to send one packet should remain. */
3317 MLX5_ASSERT(loc->elts_free && loc->wqe_free);
3322 * DPDK Tx callback template. This is a configured template used to generate
3323 * routines optimized for the specified offload setup.
3324 * One of these generated functions is chosen at SQ configuration time.
3327 * Generic pointer to TX queue structure.
3329 * Packets to transmit.
3331 * Number of packets in array.
3333 * Configured offloads mask, representing the bits of MLX5_TXOFF_CONFIG_xxx
3334 * values. Should be static to take compile time static configuration
3338 * Number of packets successfully transmitted (<= pkts_n).
3340 static __rte_always_inline uint16_t
3341 mlx5_tx_burst_tmpl(struct mlx5_txq_data *__rte_restrict txq,
3342 struct rte_mbuf **__rte_restrict pkts,
3346 struct mlx5_txq_local loc;
3347 enum mlx5_txcmp_code ret;
3350 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3351 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3352 if (unlikely(!pkts_n))
3354 if (MLX5_TXOFF_CONFIG(INLINE))
3358 loc.wqe_last = NULL;
3361 loc.pkts_loop = loc.pkts_sent;
3363 * Check if there are some CQEs, if any:
3364 * - process encountered errors
3365 * - process the completed WQEs
3366 * - free related mbufs
3367 * - doorbell the NIC about processed CQEs
3369 rte_prefetch0(*(pkts + loc.pkts_sent));
3370 mlx5_tx_handle_completion(txq, olx);
3372 * Calculate the number of available resources - elts and WQEs.
3373 * There are two possible different scenarios:
3374 * - no data inlining into WQEs, one WQEBB may contain up to
3375 * four packets, in this case elts become the scarce resource
3376 * - data inlining into WQEs, one packet may require multiple
3377 * WQEBBs, the WQEs become the limiting factor.
3379 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3380 loc.elts_free = txq->elts_s -
3381 (uint16_t)(txq->elts_head - txq->elts_tail);
3382 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3383 loc.wqe_free = txq->wqe_s -
3384 (uint16_t)(txq->wqe_ci - txq->wqe_pi);
3385 if (unlikely(!loc.elts_free || !loc.wqe_free))
3389 * Fetch the packet from the array. Usually this is the first
3390 * packet in a series of multi/single segment packets.
3392 loc.mbuf = *(pkts + loc.pkts_sent);
3393 /* Dedicated branch for multi-segment packets. */
3394 if (MLX5_TXOFF_CONFIG(MULTI) &&
3395 unlikely(NB_SEGS(loc.mbuf) > 1)) {
3397 * Multi-segment packet encountered.
3398 * Hardware is able to process it only
3399 * with SEND/TSO opcodes, one packet
3400 * per WQE; do it in a dedicated routine.
3403 MLX5_ASSERT(loc.pkts_sent >= loc.pkts_copy);
3404 part = loc.pkts_sent - loc.pkts_copy;
3405 if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3407 * There are some single-segment mbufs not
3408 * stored in elts. The mbufs must be in the
3409 * same order as WQEs, so we must copy the
3410 * mbufs to elts here, before the coming
3411 * multi-segment packet mbufs are appended.
3413 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy,
3415 loc.pkts_copy = loc.pkts_sent;
3417 MLX5_ASSERT(pkts_n > loc.pkts_sent);
3418 ret = mlx5_tx_burst_mseg(txq, pkts, pkts_n, &loc, olx);
3419 if (!MLX5_TXOFF_CONFIG(INLINE))
3420 loc.pkts_copy = loc.pkts_sent;
3422 * These return code checks are supposed
3423 * to be optimized out due to routine inlining.
3425 if (ret == MLX5_TXCMP_CODE_EXIT) {
3427 * The routine returns this code when
3428 * all packets are sent or there are not
3429 * enough resources to complete the request.
3433 if (ret == MLX5_TXCMP_CODE_ERROR) {
3435 * The routine returns this code when some error
3436 * in the incoming packet format occurred.
3438 txq->stats.oerrors++;
3441 if (ret == MLX5_TXCMP_CODE_SINGLE) {
3443 * The single-segment packet was encountered
3444 * in the array; try to send it in the
3445 * best optimized way, possibly engaging eMPW.
3447 goto enter_send_single;
3449 if (MLX5_TXOFF_CONFIG(TSO) &&
3450 ret == MLX5_TXCMP_CODE_TSO) {
3452 * The single-segment TSO packet was
3453 * encountered in the array.
3455 goto enter_send_tso;
3457 /* We must not get here. Something is going wrong. */
3459 txq->stats.oerrors++;
3462 /* Dedicated branch for single-segment TSO packets. */
3463 if (MLX5_TXOFF_CONFIG(TSO) &&
3464 unlikely(loc.mbuf->ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
3466 * TSO might require a special way of inlining
3467 * (dedicated parameters) and is sent with
3468 * the MLX5_OPCODE_TSO opcode only; provide this
3469 * in a dedicated branch.
3472 MLX5_ASSERT(NB_SEGS(loc.mbuf) == 1);
3473 MLX5_ASSERT(pkts_n > loc.pkts_sent);
3474 ret = mlx5_tx_burst_tso(txq, pkts, pkts_n, &loc, olx);
3476 * These return code checks are supposed
3477 * to be optimized out due to routine inlining.
3479 if (ret == MLX5_TXCMP_CODE_EXIT)
3481 if (ret == MLX5_TXCMP_CODE_ERROR) {
3482 txq->stats.oerrors++;
3485 if (ret == MLX5_TXCMP_CODE_SINGLE)
3486 goto enter_send_single;
3487 if (MLX5_TXOFF_CONFIG(MULTI) &&
3488 ret == MLX5_TXCMP_CODE_MULTI) {
3490 * The multi-segment packet was
3491 * encountered in the array.
3493 goto enter_send_multi;
3495 /* We must not get here. Something is going wrong. */
3497 txq->stats.oerrors++;
3501 * The dedicated branch for the single-segment packets
3502 * without TSO. Often these can be sent using
3503 * MLX5_OPCODE_EMPW with multiple packets in one WQE.
3504 * The routine builds the WQEs till it encounters
3505 * a TSO or multi-segment packet (if these
3506 * offloads are requested at SQ configuration time).
3509 MLX5_ASSERT(pkts_n > loc.pkts_sent);
3510 ret = mlx5_tx_burst_single(txq, pkts, pkts_n, &loc, olx);
3512 * These return code checks are supposed
3513 * to be optimized out due to routine inlining.
3515 if (ret == MLX5_TXCMP_CODE_EXIT)
3517 if (ret == MLX5_TXCMP_CODE_ERROR) {
3518 txq->stats.oerrors++;
3521 if (MLX5_TXOFF_CONFIG(MULTI) &&
3522 ret == MLX5_TXCMP_CODE_MULTI) {
3524 * The multi-segment packet was
3525 * encountered in the array.
3527 goto enter_send_multi;
3529 if (MLX5_TXOFF_CONFIG(TSO) &&
3530 ret == MLX5_TXCMP_CODE_TSO) {
3532 * The single-segment TSO packet was
3533 * encountered in the array.
3535 goto enter_send_tso;
3537 /* We must not get here. Something is going wrong. */
3539 txq->stats.oerrors++;
3543 * Main Tx loop is completed, do the rest:
3544 * - set completion request if thresholds are reached
3545 * - doorbell the hardware
3546 * - copy the rest of mbufs to elts (if any)
3548 MLX5_ASSERT(MLX5_TXOFF_CONFIG(INLINE) ||
3549 loc.pkts_sent >= loc.pkts_copy);
3550 /* Take a shortcut if nothing is sent. */
3551 if (unlikely(loc.pkts_sent == loc.pkts_loop))
3553 /* Request CQE generation if limits are reached. */
3554 mlx5_tx_request_completion(txq, &loc, olx);
3556 * Ring QP doorbell immediately after WQE building completion
3557 * to improve latencies. The purely software-related data treatment
3558 * can be completed after the doorbell. Tx CQEs for this SQ are
3559 * processed in this thread only by polling.
3561 * The rdma core library can map doorbell register in two ways,
3562 * depending on the environment variable "MLX5_SHUT_UP_BF":
3564 * - as regular cached memory, the variable is either missing or
3565 * set to zero. This type of mapping may cause significant
3566 * doorbell register write latency and requires an explicit memory
3567 * write barrier to mitigate this issue and prevent write combining.
3569 * - as non-cached memory, the variable is present and set to a non-zero
3570 * value. This type of mapping may cause a performance impact under
3571 * heavy load conditions, but the explicit write memory barrier is
3572 * not required and it may improve core performance.
3574 * - the legacy behaviour (prior to the 19.08 release) was to use some
3575 * heuristics to decide whether the write memory barrier should
3576 * be performed. This behavior is supported by specifying
3577 * tx_db_nc=2; the write barrier is skipped if the application provides
3578 * the full recommended burst of packets, assuming the next
3579 * packets are coming and the write barrier will be issued on
3580 * the next burst (after descriptor writing, at least).
3582 mlx5_doorbell_ring(mlx5_tx_bfreg(txq),
3583 *(volatile uint64_t *)loc.wqe_last, txq->wqe_ci,
3584 txq->qp_db, !txq->db_nc &&
3585 (!txq->db_heu || pkts_n % MLX5_TX_DEFAULT_BURST));
3586 /* Not all of the mbufs may be stored into elts yet. */
3587 part = MLX5_TXOFF_CONFIG(INLINE) ? 0 : loc.pkts_sent - loc.pkts_copy;
3588 if (!MLX5_TXOFF_CONFIG(INLINE) && part) {
3590 * There are some single-segment mbufs not stored in elts.
3591 * This can only happen if the last packet was single-segment.
3592 * The copying is gathered into one place since it is
3593 * a good opportunity to optimize it with SIMD.
3594 * Unfortunately, if inlining is enabled, gaps in the pointer
3595 * array may appear due to early freeing of the inlined mbufs.
3597 mlx5_tx_copy_elts(txq, pkts + loc.pkts_copy, part, olx);
3598 loc.pkts_copy = loc.pkts_sent;
3600 MLX5_ASSERT(txq->elts_s >= (uint16_t)(txq->elts_head - txq->elts_tail));
3601 MLX5_ASSERT(txq->wqe_s >= (uint16_t)(txq->wqe_ci - txq->wqe_pi));
3602 if (pkts_n > loc.pkts_sent) {
3604 * If the burst size is large there might be not enough CQEs
3605 * fetched from the completion queue and not enough resources
3606 * freed to send all the packets.
3611 #ifdef MLX5_PMD_SOFT_COUNTERS
3612 /* Increment sent packets counter. */
3613 txq->stats.opackets += loc.pkts_sent;
3615 if (MLX5_TXOFF_CONFIG(INLINE) && loc.mbuf_free)
3616 __mlx5_tx_free_mbuf(txq, pkts, loc.mbuf_free, olx);
3617 return loc.pkts_sent;
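/*
 * Illustrative sketch (not part of the PMD, a hypothetical helper): the last
 * argument of mlx5_doorbell_ring() above decides whether the memory write
 * barrier is issued. This spells out the mapping described in the comment,
 * assuming db_nc/db_heu reflect the tx_db_nc device argument and burst is
 * MLX5_TX_DEFAULT_BURST.
 */
static __rte_always_inline int
mlx5_tx_db_barrier_needed_sketch(int db_nc, int db_heu,
                                 unsigned int pkts_n, unsigned int burst)
{
        /* The non-cached doorbell mapping never needs the explicit barrier. */
        if (db_nc)
                return 0;
        /* Heuristic mode: skip the barrier on a full recommended burst only. */
        if (db_heu && (pkts_n % burst) == 0)
                return 0;
        return 1;
}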
3620 #endif /* RTE_PMD_MLX5_TX_H_ */