1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(C) 2021 Marvell.
9 #define NIX_TX_OFFLOAD_NONE (0)
10 #define NIX_TX_OFFLOAD_L3_L4_CSUM_F BIT(0)
11 #define NIX_TX_OFFLOAD_OL3_OL4_CSUM_F BIT(1)
12 #define NIX_TX_OFFLOAD_VLAN_QINQ_F BIT(2)
13 #define NIX_TX_OFFLOAD_MBUF_NOFF_F BIT(3)
14 #define NIX_TX_OFFLOAD_TSO_F BIT(4)
15 #define NIX_TX_OFFLOAD_TSTAMP_F BIT(5)
17 /* Flags to control the xmit_prepare function.
18 * Defined from the MSB end to denote that they are not used as
19 * offload flags to pick the Tx burst function.
21 #define NIX_TX_MULTI_SEG_F BIT(15)
23 #define NIX_TX_NEED_SEND_HDR_W1 \
24 (NIX_TX_OFFLOAD_L3_L4_CSUM_F | NIX_TX_OFFLOAD_OL3_OL4_CSUM_F | \
25 NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)
27 #define NIX_TX_NEED_EXT_HDR \
28 (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F | \
29 NIX_TX_OFFLOAD_TSO_F)
31 #define NIX_XMIT_FC_OR_RETURN(txq, pkts) \
33 /* Cached value is low; update fc_cache_pkts */ \
34 if (unlikely((txq)->fc_cache_pkts < (pkts))) { \
35 /* Multiply with sqe_per_sqb to express in pkts */ \
36 (txq)->fc_cache_pkts = \
37 ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) \
38 << (txq)->sqes_per_sqb_log2; \
39 /* Check again whether there is room */ \
40 if (unlikely((txq)->fc_cache_pkts < (pkts))) \
41 return 0; \
45 /* Macro mapping an encoded number of segments to a number of dwords;
46 * each value of nb_segs is encoded as a 4-bit nibble.
48 #define NIX_SEGDW_MAGIC 0x76654432210ULL
50 #define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)
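/* Worked example of the nibble lookup above (illustrative): each nibble of
 * NIX_SEGDW_MAGIC, indexed by nb_segs, gives the SG dword count, e.g.
 *   NIX_NB_SEGS_TO_SEGDW(1) = (0x76654432210ULL >> 4)  & 0xF = 1
 *   NIX_NB_SEGS_TO_SEGDW(3) = (0x76654432210ULL >> 12) & 0xF = 2
 *   NIX_NB_SEGS_TO_SEGDW(4) = (0x76654432210ULL >> 16) & 0xF = 3
 * i.e. one SG header word per three pointers, rounded up to whole dwords.
 */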
52 #define LMT_OFF(lmt_addr, lmt_num, offset) \
53 (void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
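/* Usage sketch for LMT_OFF (assuming 128 B LMT lines, i.e.
 * ROC_LMT_LINE_SIZE_LOG2 == 7): LMT_OFF(base, 2, 16) evaluates to
 * base + 2 * 128 + 16, byte 16 of this core's third LMT line.
 */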
55 /* Function to determine the number of Tx sub-descriptors required
56 * when the extension sub-descriptor is enabled.
58 static __rte_always_inline int
59 cn10k_nix_tx_ext_subs(const uint16_t flags)
61 return (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 2 : ((flags &
64 (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)) ? 1 : 0);
69 static __rte_always_inline uint8_t
70 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
72 return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
73 << ROC_LMT_LINES_PER_CORE_LOG2;
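/* Illustration (assuming ROC_LMT_LINES_PER_CORE_LOG2 == 5, i.e. 32 LMT
 * lines per core): without the extension header four 32 B commands fit in
 * a 128 B LMT line, giving a burst of 4 << 5 = 128 packets; with it only
 * two 48 B (or 64 B with timestamp) commands fit, giving 2 << 5 = 64.
 */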
76 static __rte_always_inline uint8_t
77 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
79 return (flags & NIX_TX_NEED_EXT_HDR) ?
80 ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
81 8;
84 static __rte_always_inline uint64_t
85 cn10k_nix_tx_steor_data(const uint16_t flags)
87 const uint64_t dw_m1 = cn10k_nix_tx_ext_subs(flags) + 1;
90 /* This will be moved to addr area */
92 /* 15 vector sizes for single seg */
112 static __rte_always_inline uint8_t
113 cn10k_nix_tx_dwords_per_line_seg(const uint16_t flags)
115 return ((flags & NIX_TX_NEED_EXT_HDR) ?
116 (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6 :
120 static __rte_always_inline uint64_t
121 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
123 const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
126 /* This will be moved to addr area */
128 /* 15 vector sizes for single seg */
148 static __rte_always_inline void
149 cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
150 const uint16_t flags)
153 cmd[0] = txq->send_hdr_w0;
157 /* Send ext if present */
158 if (flags & NIX_TX_NEED_EXT_HDR) {
159 *(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
168 static __rte_always_inline void
169 cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
171 uint64_t mask, ol_flags = m->ol_flags;
173 if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
174 uintptr_t mdata = rte_pktmbuf_mtod(m, uintptr_t);
175 uint16_t *iplen, *oiplen, *oudplen;
176 uint16_t lso_sb, paylen;
178 mask = -!!(ol_flags & (PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_IPV6));
179 lso_sb = (mask & (m->outer_l2_len + m->outer_l3_len)) +
180 m->l2_len + m->l3_len + m->l4_len;
182 /* Reduce payload len from base headers */
183 paylen = m->pkt_len - lso_sb;
185 /* Get iplen position assuming no tunnel hdr */
186 iplen = (uint16_t *)(mdata + m->l2_len +
187 (2 << !!(ol_flags & PKT_TX_IPV6)));
188 /* Handle tunnel tso */
189 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
190 (ol_flags & PKT_TX_TUNNEL_MASK)) {
191 const uint8_t is_udp_tun =
192 (CNXK_NIX_UDP_TUN_BITMASK >>
193 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
196 oiplen = (uint16_t *)(mdata + m->outer_l2_len +
197 (2 << !!(ol_flags &
198 PKT_TX_OUTER_IPV6)));
199 *oiplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*oiplen) -
200 paylen);
202 /* Update format for UDP tunneled packet */
203 if (is_udp_tun) {
204 oudplen = (uint16_t *)(mdata + m->outer_l2_len +
205 m->outer_l3_len + 4);
206 *oudplen = rte_cpu_to_be_16(
207 rte_be_to_cpu_16(*oudplen) - paylen);
210 /* Update iplen position to inner ip hdr */
211 iplen = (uint16_t *)(mdata + lso_sb - m->l3_len -
212 m->l4_len +
213 (2 << !!(ol_flags & PKT_TX_IPV6)));
216 *iplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*iplen) - paylen);
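/* Worked example of the header math above (hypothetical sizes): a TCP/IPv4
 * packet with pkt_len = 1514, l2_len = 14, l3_len = 20, l4_len = 20 and no
 * tunnel gives lso_sb = 54 and paylen = 1460, so the IP total-length field
 * drops from 1500 to 40 (headers only); HW re-adds each segment's payload
 * length while segmenting.
 */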
220 static __rte_always_inline void
221 cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, uintptr_t lmt_addr,
222 const uint16_t flags, const uint64_t lso_tun_fmt)
224 struct nix_send_ext_s *send_hdr_ext;
225 struct nix_send_hdr_s *send_hdr;
226 uint64_t ol_flags = 0, mask;
227 union nix_send_hdr_w1_u w1;
228 union nix_send_sg_s *sg;
230 send_hdr = (struct nix_send_hdr_s *)cmd;
231 if (flags & NIX_TX_NEED_EXT_HDR) {
232 send_hdr_ext = (struct nix_send_ext_s *)(cmd + 2);
233 sg = (union nix_send_sg_s *)(cmd + 4);
234 /* Clear previous markings */
235 send_hdr_ext->w0.lso = 0;
236 send_hdr_ext->w1.u = 0;
237 } else {
238 sg = (union nix_send_sg_s *)(cmd + 2);
241 if (flags & NIX_TX_NEED_SEND_HDR_W1) {
242 ol_flags = m->ol_flags;
246 if (!(flags & NIX_TX_MULTI_SEG_F)) {
247 send_hdr->w0.total = m->data_len;
248 send_hdr->w0.aura =
249 roc_npa_aura_handle_to_aura(m->pool->pool_id);
254 * 3 => IPV4 with csum
256 * L3type and L3ptr need to be set for either
257 * L3 csum or L4 csum or LSO
261 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
262 (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
263 const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
264 const uint8_t ol3type =
265 ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
266 ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
267 !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);
270 w1.ol3type = ol3type;
271 mask = 0xffffull << ((!!ol3type) << 4);
272 w1.ol3ptr = ~mask & m->outer_l2_len;
273 w1.ol4ptr = ~mask & (w1.ol3ptr + m->outer_l3_len);
276 w1.ol4type = csum + (csum << 1);
279 w1.il3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
280 ((!!(ol_flags & PKT_TX_IPV6)) << 2);
281 w1.il3ptr = w1.ol4ptr + m->l2_len;
282 w1.il4ptr = w1.il3ptr + m->l3_len;
283 /* Increment by 1 for IPv4, as type 3 is IPv4 with csum */
284 w1.il3type = w1.il3type + !!(ol_flags & PKT_TX_IP_CKSUM);
287 w1.il4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
289 /* In case there is no tunnel header, shift the
290 * IL3/IL4 fields down into the OL3/OL4 positions
291 * so OL3/OL4 carry the single header's checksum info.
294 w1.u = ((w1.u & 0xFFFFFFFF00000000) >> (mask << 3)) |
295 ((w1.u & 0x00000000FFFFFFFF) >> (mask << 4));
297 } else if (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) {
298 const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
299 const uint8_t outer_l2_len = m->outer_l2_len;
302 w1.ol3ptr = outer_l2_len;
303 w1.ol4ptr = outer_l2_len + m->outer_l3_len;
304 /* Increment by 1 for IPv4, as type 3 is IPv4 with csum */
305 w1.ol3type = ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
306 ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
307 !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);
310 w1.ol4type = csum + (csum << 1);
312 } else if (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) {
313 const uint8_t l2_len = m->l2_len;
315 /* Always use OLXPTR and OLXTYPE when only
316 * one header is present
321 w1.ol4ptr = l2_len + m->l3_len;
322 /* Increment by 1 for IPv4, as type 3 is IPv4 with csum */
323 w1.ol3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
324 ((!!(ol_flags & PKT_TX_IPV6)) << 2) +
325 !!(ol_flags & PKT_TX_IP_CKSUM);
328 w1.ol4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
331 if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
332 send_hdr_ext->w1.vlan1_ins_ena = !!(ol_flags & PKT_TX_VLAN);
333 /* HW will update ptr after vlan0 update */
334 send_hdr_ext->w1.vlan1_ins_ptr = 12;
335 send_hdr_ext->w1.vlan1_ins_tci = m->vlan_tci;
337 send_hdr_ext->w1.vlan0_ins_ena = !!(ol_flags & PKT_TX_QINQ);
338 /* 2B before end of l2 header */
339 send_hdr_ext->w1.vlan0_ins_ptr = 12;
340 send_hdr_ext->w1.vlan0_ins_tci = m->vlan_tci_outer;
343 if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
347 mask = -(!w1.il3type);
348 lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
350 send_hdr_ext->w0.lso_sb = lso_sb;
351 send_hdr_ext->w0.lso = 1;
352 send_hdr_ext->w0.lso_mps = m->tso_segsz;
353 send_hdr_ext->w0.lso_format =
354 NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
355 w1.ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
357 /* Handle tunnel tso */
358 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
359 (ol_flags & PKT_TX_TUNNEL_MASK)) {
360 const uint8_t is_udp_tun =
361 (CNXK_NIX_UDP_TUN_BITMASK >>
362 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
364 uint8_t shift = is_udp_tun ? 32 : 0;
366 shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
367 shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
369 w1.il4type = NIX_SENDL4TYPE_TCP_CKSUM;
370 w1.ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
371 /* Update format for UDP tunneled packet */
372 send_hdr_ext->w0.lso_format = (lso_tun_fmt >> shift);
376 if (flags & NIX_TX_NEED_SEND_HDR_W1)
377 send_hdr->w1.u = w1.u;
379 if (!(flags & NIX_TX_MULTI_SEG_F)) {
380 sg->seg1_size = m->data_len;
381 *(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
383 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
384 /* DF bit = 1 if refcount of current mbuf or parent mbuf is > 1,
386 * DF bit = 0 otherwise
388 send_hdr->w0.df = cnxk_nix_prefree_seg(m);
390 /* Mark mempool object as "put" since it is freed by NIX */
391 if (!send_hdr->w0.df)
392 __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
395 /* With minimal offloads, 'cmd' being local could be optimized out to
396 * registers. In other cases, 'cmd' will be on the stack. The intent is
397 * that 'cmd' stores the content from txq->cmd, which is copied only once.
399 *((struct nix_send_hdr_s *)lmt_addr) = *send_hdr;
400 lmt_addr += 16;
401 if (flags & NIX_TX_NEED_EXT_HDR) {
402 *((struct nix_send_ext_s *)lmt_addr) = *send_hdr_ext;
403 lmt_addr += 16;
405 /* In case of multi-seg, sg template is stored here */
406 *((union nix_send_sg_s *)lmt_addr) = *sg;
407 *(rte_iova_t *)(lmt_addr + 8) = *(rte_iova_t *)(sg + 1);
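/* Resulting LMT line layout for the single-seg case, as a sketch of the
 * stores above: bytes [0..15] SEND_HDR, optionally bytes [16..31] SEND_EXT,
 * then the SG sub-descriptor word with the buffer iova in the following
 * 8 bytes.
 */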
410 static __rte_always_inline void
411 cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
412 const uint64_t ol_flags, const uint16_t no_segdw,
413 const uint16_t flags)
415 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
416 const uint8_t is_ol_tstamp = !(ol_flags & PKT_TX_IEEE1588_TMST);
417 struct nix_send_ext_s *send_hdr_ext =
418 (struct nix_send_ext_s *)(lmt_addr + 16);
419 uint64_t *lmt = (uint64_t *)lmt_addr;
420 uint16_t off = (no_segdw - 1) << 1;
421 struct nix_send_mem_s *send_mem;
423 send_mem = (struct nix_send_mem_s *)(lmt + off);
424 send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
425 send_hdr_ext->w0.tstmp = 1;
426 if (flags & NIX_TX_MULTI_SEG_F) {
427 /* Retrieving the default desc values */
430 /* Using a compiler barrier to avoid violation of C
433 rte_compiler_barrier();
436 /* For packets without PKT_TX_IEEE1588_TMST set, the Tx tstamp
437 * should not be recorded; hence change the alg type to
438 * NIX_SENDMEMALG_SET and also move the send mem addr field to the
439 * next 8 bytes so it does not corrupt the actual Tx tstamp registered
442 send_mem->w0.subdc = NIX_SUBDC_MEM;
443 send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
445 (rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
449 static __rte_always_inline uint16_t
450 cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
452 struct nix_send_hdr_s *send_hdr;
453 union nix_send_sg_s *sg;
454 struct rte_mbuf *m_next;
455 uint64_t *slist, sg_u;
460 send_hdr = (struct nix_send_hdr_s *)cmd;
461 send_hdr->w0.total = m->pkt_len;
462 send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
464 if (flags & NIX_TX_NEED_EXT_HDR)
465 off = 2;
466 else
467 off = 0;
469 sg = (union nix_send_sg_s *)&cmd[2 + off];
470 /* Clear sg->u header before use */
471 sg->u &= 0xFC00000000000000;
472 sg_u = sg->u;
473 slist = &cmd[3 + off];
475 i = 0;
476 nb_segs = m->nb_segs;
478 /* Fill mbuf segments */
481 sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
482 *slist = rte_mbuf_data_iova(m);
483 /* Set invert df if buffer is not to be freed by H/W */
484 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
485 sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
486 /* Mark mempool object as "put" since it is freed by NIX
488 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
489 if (!(sg_u & (1ULL << (i + 55))))
490 __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
495 if (i > 2 && nb_segs) {
497 /* Next SG subdesc */
498 *(uint64_t *)slist = sg_u & 0xFC00000000000000;
501 sg = (union nix_send_sg_s *)slist;
510 segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
511 /* Convert word count to dwords, rounding up */
512 segdw = (segdw >> 1) + (segdw & 0x1);
514 segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
515 send_hdr->w0.sizem1 = segdw - 1;
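/* Worked example (illustrative): nb_segs = 4 with no extension header uses
 * one SG word + 3 iovas plus a second SG word + 1 iova = 6 words, i.e. 3
 * dwords after rounding; adding one dword for SEND_HDR gives sizem1 = 3.
 */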
520 static __rte_always_inline uint16_t
521 cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
522 uint64_t *cmd, const uint16_t flags)
524 struct cn10k_eth_txq *txq = tx_queue;
525 const rte_iova_t io_addr = txq->io_addr;
526 uintptr_t pa, lmt_addr = txq->lmt_base;
527 uint16_t lmt_id, burst, left, i;
528 uint64_t lso_tun_fmt;
531 NIX_XMIT_FC_OR_RETURN(txq, pkts);
533 /* Get cmd skeleton */
534 cn10k_nix_tx_skeleton(txq, cmd, flags);
536 /* Reduce the cached count */
537 txq->fc_cache_pkts -= pkts;
539 if (flags & NIX_TX_OFFLOAD_TSO_F)
540 lso_tun_fmt = txq->lso_tun_fmt;
542 /* Get LMT base address and LMT ID as lcore id */
543 ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
546 burst = left > 32 ? 32 : left;
547 for (i = 0; i < burst; i++) {
548 /* Perform header writes for TSO, barrier at
549 * lmt steorl will suffice.
551 if (flags & NIX_TX_OFFLOAD_TSO_F)
552 cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
554 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
556 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
557 tx_pkts[i]->ol_flags, 4, flags);
558 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
563 data = cn10k_nix_tx_steor_data(flags);
564 pa = io_addr | (data & 0x7) << 4;
566 data |= (15ULL << 12);
567 data |= (uint64_t)lmt_id;
570 roc_lmt_submit_steorl(data, pa);
572 data = cn10k_nix_tx_steor_data(flags);
573 pa = io_addr | (data & 0x7) << 4;
575 data |= ((uint64_t)(burst - 17)) << 12;
576 data |= (uint64_t)(lmt_id + 16);
579 roc_lmt_submit_steorl(data, pa);
581 data = cn10k_nix_tx_steor_data(flags);
582 pa = io_addr | (data & 0x7) << 4;
584 data |= ((uint64_t)(burst - 1)) << 12;
588 roc_lmt_submit_steorl(data, pa);
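/* The STEORL submissions above split a burst of up to 32 LMT lines into at
 * most two LMTST operations of 16 lines each. As used here, the STEORL data
 * word carries the LMT ID in bits [11:0] and (lines - 1) in bits [15:12],
 * the first line's size moves into bits [6:4] of the I/O address, and the
 * sizes of lines 1..15 sit in bits [63:19] of the data word.
 */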
594 /* Start processing another burst */
596 /* Reset lmt base addr */
597 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
598 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
605 static __rte_always_inline uint16_t
606 cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
607 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
609 struct cn10k_eth_txq *txq = tx_queue;
610 uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
611 const rte_iova_t io_addr = txq->io_addr;
612 uint16_t segdw, lmt_id, burst, left, i;
613 uint64_t data0, data1;
614 uint64_t lso_tun_fmt;
618 NIX_XMIT_FC_OR_RETURN(txq, pkts);
620 cn10k_nix_tx_skeleton(txq, cmd, flags);
622 /* Reduce the cached count */
623 txq->fc_cache_pkts -= pkts;
625 if (flags & NIX_TX_OFFLOAD_TSO_F)
626 lso_tun_fmt = txq->lso_tun_fmt;
628 /* Get LMT base address and LMT ID as lcore id */
629 ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
632 burst = left > 32 ? 32 : left;
635 for (i = 0; i < burst; i++) {
636 /* Perform header writes for TSO, barrier at
637 * lmt steorl will suffice.
639 if (flags & NIX_TX_OFFLOAD_TSO_F)
640 cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
642 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
644 /* Store sg list directly on lmt line */
645 segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)lmt_addr,
647 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
648 tx_pkts[i]->ol_flags, segdw,
650 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
651 data128 |= (((__uint128_t)(segdw - 1)) << shft);
655 data0 = (uint64_t)data128;
656 data1 = (uint64_t)(data128 >> 64);
657 /* Make data0 similar to data1 */
661 pa0 = io_addr | (data0 & 0x7) << 4;
663 /* Move lmtst1..15 sz to bits 63:19 */
665 data0 |= (15ULL << 12);
666 data0 |= (uint64_t)lmt_id;
669 roc_lmt_submit_steorl(data0, pa0);
671 pa1 = io_addr | (data1 & 0x7) << 4;
674 data1 |= ((uint64_t)(burst - 17)) << 12;
675 data1 |= (uint64_t)(lmt_id + 16);
678 roc_lmt_submit_steorl(data1, pa1);
680 pa0 = io_addr | (data0 & 0x7) << 4;
682 /* Move lmtst1..15 sz to bits 63:19 */
684 data0 |= ((burst - 1) << 12);
685 data0 |= (uint64_t)lmt_id;
688 roc_lmt_submit_steorl(data0, pa0);
694 /* Start processing another burst */
696 /* Reset lmt base addr */
697 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
698 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
705 #if defined(RTE_ARCH_ARM64)
707 static __rte_always_inline void
708 cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
709 union nix_send_ext_w0_u *w0, uint64_t ol_flags,
710 const uint64_t flags, const uint64_t lso_tun_fmt)
715 if (!(ol_flags & PKT_TX_TCP_SEG))
716 return;
718 mask = -(!w1->il3type);
719 lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
723 w0->lso_mps = m->tso_segsz;
724 w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
725 w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
727 /* Handle tunnel tso */
728 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
729 (ol_flags & PKT_TX_TUNNEL_MASK)) {
730 const uint8_t is_udp_tun =
731 (CNXK_NIX_UDP_TUN_BITMASK >>
732 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
734 uint8_t shift = is_udp_tun ? 32 : 0;
736 shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
737 shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
739 w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
740 w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
741 /* Update format for UDP tunneled packet */
743 w0->lso_format = (lso_tun_fmt >> shift);
747 static __rte_always_inline void
748 cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
749 union nix_send_hdr_w0_u *sh,
750 union nix_send_sg_s *sg, const uint32_t flags)
752 struct rte_mbuf *m_next;
753 uint64_t *slist, sg_u;
757 sh->total = m->pkt_len;
758 /* Clear sg->u header before use */
759 sg->u &= 0xFC00000000000000;
763 sg_u = sg_u | ((uint64_t)m->data_len);
765 nb_segs = m->nb_segs - 1;
768 /* Set invert df if buffer is not to be freed by H/W */
769 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
770 sg_u |= (cnxk_nix_prefree_seg(m) << 55);
771 /* Mark mempool object as "put" since it is freed by NIX */
772 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
773 if (!(sg_u & (1ULL << 55)))
774 __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
779 /* Fill mbuf segments */
782 sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
783 *slist = rte_mbuf_data_iova(m);
784 /* Set invert df if buffer is not to be freed by H/W */
785 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
786 sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
787 /* Mark mempool object as "put" since it is freed by NIX
789 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
790 if (!(sg_u & (1ULL << (i + 55))))
791 __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
797 if (i > 2 && nb_segs) {
799 /* Next SG subdesc */
800 *(uint64_t *)slist = sg_u & 0xFC00000000000000;
803 sg = (union nix_send_sg_s *)slist;
814 static __rte_always_inline void
815 cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
816 uint64x2_t *cmd1, const uint8_t segdw,
817 const uint32_t flags)
819 union nix_send_hdr_w0_u sh;
820 union nix_send_sg_s sg;
822 if (m->nb_segs == 1) {
823 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
824 sg.u = vgetq_lane_u64(cmd1[0], 0);
825 sg.u |= (cnxk_nix_prefree_seg(m) << 55);
826 cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
829 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
830 sg.u = vgetq_lane_u64(cmd1[0], 0);
831 if (!(sg.u & (1ULL << 55)))
832 __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
838 sh.u = vgetq_lane_u64(cmd0[0], 0);
839 sg.u = vgetq_lane_u64(cmd1[0], 0);
841 cn10k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
843 sh.sizem1 = segdw - 1;
844 cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
845 cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
848 #define NIX_DESCS_PER_LOOP 4
850 static __rte_always_inline uint8_t
851 cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
852 uint64x2_t *cmd1, uint64x2_t *cmd2,
853 uint64x2_t *cmd3, uint8_t *segdw,
854 uint64_t *lmt_addr, __uint128_t *data128,
855 uint8_t *shift, const uint16_t flags)
857 uint8_t j, off, lmt_used;
859 if (!(flags & NIX_TX_NEED_EXT_HDR) &&
860 !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
861 /* No segments in 4 consecutive packets. */
862 if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
863 for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
864 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
867 vst1q_u64(lmt_addr, cmd0[0]);
868 vst1q_u64(lmt_addr + 2, cmd1[0]);
869 vst1q_u64(lmt_addr + 4, cmd0[1]);
870 vst1q_u64(lmt_addr + 6, cmd1[1]);
871 vst1q_u64(lmt_addr + 8, cmd0[2]);
872 vst1q_u64(lmt_addr + 10, cmd1[2]);
873 vst1q_u64(lmt_addr + 12, cmd0[3]);
874 vst1q_u64(lmt_addr + 14, cmd1[3]);
876 *data128 |= ((__uint128_t)7) << *shift;
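/* data128 accumulates a 3-bit (dwords - 1) size field per LMT line at
 * successive shift positions; the 7 above records one full 8-dword line
 * holding all four single-seg packets.
 */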
884 for (j = 0; j < NIX_DESCS_PER_LOOP;) {
885 /* Fit consecutive packets in same LMTLINE. */
886 if ((segdw[j] + segdw[j + 1]) <= 8) {
887 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
888 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
891 cn10k_nix_prepare_mseg_vec(mbufs[j + 1], NULL,
894 segdw[j + 1], flags);
895 /* TSTAMP takes 4 each, no segs. */
896 vst1q_u64(lmt_addr, cmd0[j]);
897 vst1q_u64(lmt_addr + 2, cmd2[j]);
898 vst1q_u64(lmt_addr + 4, cmd1[j]);
899 vst1q_u64(lmt_addr + 6, cmd3[j]);
901 vst1q_u64(lmt_addr + 8, cmd0[j + 1]);
902 vst1q_u64(lmt_addr + 10, cmd2[j + 1]);
903 vst1q_u64(lmt_addr + 12, cmd1[j + 1]);
904 vst1q_u64(lmt_addr + 14, cmd3[j + 1]);
905 } else if (flags & NIX_TX_NEED_EXT_HDR) {
906 /* EXT header takes 3 each, space for 2 segs. */
907 cn10k_nix_prepare_mseg_vec(mbufs[j],
911 vst1q_u64(lmt_addr, cmd0[j]);
912 vst1q_u64(lmt_addr + 2, cmd2[j]);
913 vst1q_u64(lmt_addr + 4, cmd1[j]);
916 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
920 segdw[j + 1], flags);
921 vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
922 vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
923 vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
925 cn10k_nix_prepare_mseg_vec(mbufs[j],
929 vst1q_u64(lmt_addr, cmd0[j]);
930 vst1q_u64(lmt_addr + 2, cmd1[j]);
933 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
937 segdw[j + 1], flags);
938 vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
939 vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
941 *data128 |= ((__uint128_t)(segdw[j] + segdw[j + 1]) - 1)
946 if ((flags & NIX_TX_NEED_EXT_HDR) &&
947 (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
948 cn10k_nix_prepare_mseg_vec(mbufs[j],
952 vst1q_u64(lmt_addr, cmd0[j]);
953 vst1q_u64(lmt_addr + 2, cmd2[j]);
954 vst1q_u64(lmt_addr + 4, cmd1[j]);
957 vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
958 } else if (flags & NIX_TX_NEED_EXT_HDR) {
959 cn10k_nix_prepare_mseg_vec(mbufs[j],
963 vst1q_u64(lmt_addr, cmd0[j]);
964 vst1q_u64(lmt_addr + 2, cmd2[j]);
965 vst1q_u64(lmt_addr + 4, cmd1[j]);
967 cn10k_nix_prepare_mseg_vec(mbufs[j],
971 vst1q_u64(lmt_addr, cmd0[j]);
972 vst1q_u64(lmt_addr + 2, cmd1[j]);
974 *data128 |= ((__uint128_t)(segdw[j]) - 1) << *shift;
985 static __rte_always_inline uint16_t
986 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
987 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
989 uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
990 uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
991 uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
992 cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
993 uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, pa;
994 uint64x2_t senddesc01_w0, senddesc23_w0;
995 uint64x2_t senddesc01_w1, senddesc23_w1;
996 uint16_t left, scalar, burst, i, lmt_id;
997 uint64x2_t sendext01_w0, sendext23_w0;
998 uint64x2_t sendext01_w1, sendext23_w1;
999 uint64x2_t sendmem01_w0, sendmem23_w0;
1000 uint64x2_t sendmem01_w1, sendmem23_w1;
1001 uint8_t segdw[NIX_DESCS_PER_LOOP + 1];
1002 uint64x2_t sgdesc01_w0, sgdesc23_w0;
1003 uint64x2_t sgdesc01_w1, sgdesc23_w1;
1004 struct cn10k_eth_txq *txq = tx_queue;
1005 uintptr_t laddr = txq->lmt_base;
1006 rte_iova_t io_addr = txq->io_addr;
1007 uint64x2_t ltypes01, ltypes23;
1008 uint64x2_t xtmp128, ytmp128;
1009 uint64x2_t xmask01, xmask23;
1010 uint8_t lnum, shift;
1012 __uint128_t data128;
1016 NIX_XMIT_FC_OR_RETURN(txq, pkts);
1018 scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
1019 pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
1021 /* Reduce the cached count */
1022 txq->fc_cache_pkts -= pkts;
1023 /* Perform header writes before barrier for TSO */
1024 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1025 for (i = 0; i < pkts; i++)
1026 cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
1029 senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
1030 senddesc23_w0 = senddesc01_w0;
1031 senddesc01_w1 = vdupq_n_u64(0);
1032 senddesc23_w1 = senddesc01_w1;
1033 sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
1034 sgdesc23_w0 = sgdesc01_w0;
1036 /* Load command defaults into vector variables. */
1037 if (flags & NIX_TX_NEED_EXT_HDR) {
1038 sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
1039 sendext23_w0 = sendext01_w0;
1040 sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
1041 sendext23_w1 = sendext01_w1;
1042 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1043 sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
1044 sendmem23_w0 = sendmem01_w0;
1045 sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
1046 sendmem23_w1 = sendmem01_w1;
1050 /* Get LMT base address and LMT ID as lcore id */
1051 ROC_LMT_BASE_ID_GET(laddr, lmt_id);
1054 /* Number of packets to prepare depends on offloads enabled. */
1055 burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
1056 cn10k_nix_pkts_per_vec_brst(flags) :
1058 if (flags & NIX_TX_MULTI_SEG_F) {
1064 for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
1065 if (flags & NIX_TX_MULTI_SEG_F) {
1068 for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1069 struct rte_mbuf *m = tx_pkts[j];
1071 /* Get dwords based on nb_segs. */
1072 segdw[j] = NIX_NB_SEGS_TO_SEGDW(m->nb_segs);
1073 /* Add dwords based on offloads. */
1074 segdw[j] += 1 + /* SEND HDR */
1075 !!(flags & NIX_TX_NEED_EXT_HDR) +
1076 !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
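/* e.g. (illustrative) a 3-segment mbuf with both EXT and TSTAMP enabled:
 * segdw = 2 (SG) + 1 (HDR) + 1 (EXT) + 1 (MEM) = 5 dwords.
 */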
1079 /* Check if there are enough LMTLINES for this loop */
1080 if (lnum + 4 > 32) {
1081 uint8_t ldwords_con = 0, lneeded = 0;
1082 for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1083 ldwords_con += segdw[j];
1084 if (ldwords_con > 8) {
1086 ldwords_con = segdw[j];
1090 if (lnum + lneeded > 32) {
1096 /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
1098 vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1099 sgdesc01_w0 = vbicq_u64(sgdesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1101 senddesc23_w0 = senddesc01_w0;
1102 sgdesc23_w0 = sgdesc01_w0;
1104 /* Clear vlan enables. */
1105 if (flags & NIX_TX_NEED_EXT_HDR) {
1106 sendext01_w1 = vbicq_u64(sendext01_w1,
1107 vdupq_n_u64(0x3FFFF00FFFF00));
1108 sendext23_w1 = sendext01_w1;
1111 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1112 /* Reset send mem alg to SETTSTMP from SUB */
1113 sendmem01_w0 = vbicq_u64(sendmem01_w0,
1114 vdupq_n_u64(BIT_ULL(59)));
1115 /* Reset send mem address to default. */
1117 vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
1118 sendmem23_w0 = sendmem01_w0;
1119 sendmem23_w1 = sendmem01_w1;
1122 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1123 /* Clear the LSO enable bit. */
1124 sendext01_w0 = vbicq_u64(sendext01_w0,
1125 vdupq_n_u64(BIT_ULL(14)));
1126 sendext23_w0 = sendext01_w0;
1129 /* Move mbufs to iova */
1130 mbuf0 = (uint64_t *)tx_pkts[0];
1131 mbuf1 = (uint64_t *)tx_pkts[1];
1132 mbuf2 = (uint64_t *)tx_pkts[2];
1133 mbuf3 = (uint64_t *)tx_pkts[3];
1135 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1136 offsetof(struct rte_mbuf, buf_iova));
1137 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1138 offsetof(struct rte_mbuf, buf_iova));
1139 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1140 offsetof(struct rte_mbuf, buf_iova));
1141 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1142 offsetof(struct rte_mbuf, buf_iova));
1144 * Get each mbuf's ol_flags, iova, pktlen, dataoff:
1145 * dataoff_iovaX.D[0] = iova,
1146 * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
1147 * len_olflagsX.D[0] = ol_flags,
1148 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
1150 dataoff_iova0 = vld1q_u64(mbuf0);
1151 len_olflags0 = vld1q_u64(mbuf0 + 2);
1152 dataoff_iova1 = vld1q_u64(mbuf1);
1153 len_olflags1 = vld1q_u64(mbuf1 + 2);
1154 dataoff_iova2 = vld1q_u64(mbuf2);
1155 len_olflags2 = vld1q_u64(mbuf2 + 2);
1156 dataoff_iova3 = vld1q_u64(mbuf3);
1157 len_olflags3 = vld1q_u64(mbuf3 + 2);
1159 /* Move mbuf pointers to the pool field */
1160 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1161 offsetof(struct rte_mbuf, pool) -
1162 offsetof(struct rte_mbuf, buf_iova));
1163 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1164 offsetof(struct rte_mbuf, pool) -
1165 offsetof(struct rte_mbuf, buf_iova));
1166 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1167 offsetof(struct rte_mbuf, pool) -
1168 offsetof(struct rte_mbuf, buf_iova));
1169 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1170 offsetof(struct rte_mbuf, pool) -
1171 offsetof(struct rte_mbuf, buf_iova));
1173 if (flags & (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
1174 NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
1175 /* Get tx_offload for ol2, ol3, l2, l3 lengths */
1177 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1178 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1181 asm volatile("LD1 {%[a].D}[0],[%[in]]\n\t"
1182 : [a] "+w"(senddesc01_w1)
1183 : [in] "r"(mbuf0 + 2)
1186 asm volatile("LD1 {%[a].D}[1],[%[in]]\n\t"
1187 : [a] "+w"(senddesc01_w1)
1188 : [in] "r"(mbuf1 + 2)
1191 asm volatile("LD1 {%[b].D}[0],[%[in]]\n\t"
1192 : [b] "+w"(senddesc23_w1)
1193 : [in] "r"(mbuf2 + 2)
1196 asm volatile("LD1 {%[b].D}[1],[%[in]]\n\t"
1197 : [b] "+w"(senddesc23_w1)
1198 : [in] "r"(mbuf3 + 2)
1201 /* Get pool pointer alone */
1202 mbuf0 = (uint64_t *)*mbuf0;
1203 mbuf1 = (uint64_t *)*mbuf1;
1204 mbuf2 = (uint64_t *)*mbuf2;
1205 mbuf3 = (uint64_t *)*mbuf3;
1207 /* Get pool pointer alone */
1208 mbuf0 = (uint64_t *)*mbuf0;
1209 mbuf1 = (uint64_t *)*mbuf1;
1210 mbuf2 = (uint64_t *)*mbuf2;
1211 mbuf3 = (uint64_t *)*mbuf3;
1214 const uint8x16_t shuf_mask2 = {
1215 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1216 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1218 xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
1219 ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
1221 /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
1222 const uint64x2_t and_mask0 = {
1227 dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
1228 dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
1229 dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
1230 dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
1233 * Pick only the 16 bits of pktlen present at bits 63:32
1234 * and place them at bits 15:0.
1236 xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
1237 ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
1239 /* Add pairwise to get dataoff + iova in sgdesc_w1 */
1240 sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
1241 sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
1243 /* ORR both sgdesc_w0 and senddesc_w0 with 16 bits of
1244 * pktlen at 15:0 position.
1246 sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
1247 sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
1248 senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
1249 senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
1251 /* Move mbuf to point to pool_id. */
1252 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1253 offsetof(struct rte_mempool, pool_id));
1254 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1255 offsetof(struct rte_mempool, pool_id));
1256 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1257 offsetof(struct rte_mempool, pool_id));
1258 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1259 offsetof(struct rte_mempool, pool_id));
1261 if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1262 !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1264 * Lookup table to translate ol_flags to
1265 * il3/il4 types. But we still use ol3/ol4 types in
1266 * senddesc_w1, since only single-header processing is enabled.
1268 const uint8x16_t tbl = {
1269 /* [0-15] = il4type:il3type */
1270 0x04, /* none (IPv6 assumed) */
1271 0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
1272 0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
1273 0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
1274 0x03, /* PKT_TX_IP_CKSUM */
1275 0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
1276 0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
1277 0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
1278 0x02, /* PKT_TX_IPV4 */
1279 0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
1280 0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
1281 0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
1282 0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
1283 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1286 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1289 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1294 /* Extract olflags to translate to iltypes */
1295 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1296 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1299 * E(47):L3_LEN(9):L2_LEN(7+z)
1300 * E(47):L3_LEN(9):L2_LEN(7+z)
1302 senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
1303 senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
1305 /* Move OLFLAGS bits 55:52 to 51:48
1306 * with zeros prepended on the byte and rest
1309 xtmp128 = vshrq_n_u8(xtmp128, 4);
1310 ytmp128 = vshrq_n_u8(ytmp128, 4);
1312 * E(48):L3_LEN(8):L2_LEN(z+7)
1313 * E(48):L3_LEN(8):L2_LEN(z+7)
1315 const int8x16_t tshft3 = {
1316 -1, 0, 8, 8, 8, 8, 8, 8,
1317 -1, 0, 8, 8, 8, 8, 8, 8,
1320 senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1321 senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1324 ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1325 ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1327 /* Pick only the relevant fields, i.e. bits 48:55 of iltype,
1328 * and place it in ol3/ol4type of senddesc_w1
1330 const uint8x16_t shuf_mask0 = {
1331 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
1332 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
1335 ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1336 ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1338 /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1339 * a [E(32):E(16):OL3(8):OL2(8)]
1341 * a [E(32):E(16):(OL3+OL2):OL2]
1342 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
1344 senddesc01_w1 = vaddq_u8(senddesc01_w1,
1345 vshlq_n_u16(senddesc01_w1, 8));
1346 senddesc23_w1 = vaddq_u8(senddesc23_w1,
1347 vshlq_n_u16(senddesc23_w1, 8));
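/* Byte-wise example of the add-with-shift trick above (hypothetical
 * lengths): with OL2 = 14 and OL3 = 20 packed as [.. 20 14], adding the
 * copy shifted left by 8 within each 16-bit lane yields [.. 34 14], i.e.
 * OL3PTR = 14 and OL4PTR = 34 in a single vector op.
 */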
1349 /* Move ltypes to senddesc*_w1 */
1350 senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1351 senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1352 } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1353 (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1355 * Lookup table to translate ol_flags to
1359 const uint8x16_t tbl = {
1360 /* [0-15] = ol4type:ol3type */
1362 0x03, /* OUTER_IP_CKSUM */
1363 0x02, /* OUTER_IPV4 */
1364 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1365 0x04, /* OUTER_IPV6 */
1366 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1367 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1368 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1371 0x00, /* OUTER_UDP_CKSUM */
1372 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
1373 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
1374 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
1377 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
1378 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1381 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1384 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1385 * OUTER_IPV4 | OUTER_IP_CKSUM
1389 /* Extract olflags to translate to iltypes */
1390 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1391 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1394 * E(47):OL3_LEN(9):OL2_LEN(7+z)
1395 * E(47):OL3_LEN(9):OL2_LEN(7+z)
1397 const uint8x16_t shuf_mask5 = {
1398 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1399 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1401 senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1402 senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1404 /* Extract outer ol flags only */
1405 const uint64x2_t o_cksum_mask = {
1410 xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
1411 ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
1413 /* Extract OUTER_UDP_CKSUM bit 41 and
1417 xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1418 ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1420 /* Shift oltype by 2 to start nibble from BIT(56)
1421 * instead of BIT(58)
1423 xtmp128 = vshrq_n_u8(xtmp128, 2);
1424 ytmp128 = vshrq_n_u8(ytmp128, 2);
1426 * E(48):L3_LEN(8):L2_LEN(z+7)
1427 * E(48):L3_LEN(8):L2_LEN(z+7)
1429 const int8x16_t tshft3 = {
1430 -1, 0, 8, 8, 8, 8, 8, 8,
1431 -1, 0, 8, 8, 8, 8, 8, 8,
1434 senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1435 senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1438 ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1439 ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1441 /* Pick only the relevant fields, i.e. bits 56:63 of oltype,
1442 * and place it in ol3/ol4type of senddesc_w1
1444 const uint8x16_t shuf_mask0 = {
1445 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
1446 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
1449 ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1450 ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1452 /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1453 * a [E(32):E(16):OL3(8):OL2(8)]
1455 * a [E(32):E(16):(OL3+OL2):OL2]
1456 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
1458 senddesc01_w1 = vaddq_u8(senddesc01_w1,
1459 vshlq_n_u16(senddesc01_w1, 8));
1460 senddesc23_w1 = vaddq_u8(senddesc23_w1,
1461 vshlq_n_u16(senddesc23_w1, 8));
1463 /* Move ltypes to senddesc*_w1 */
1464 senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1465 senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1466 } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1467 (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1468 /* Lookup table to translate ol_flags to
1469 * ol4type, ol3type, il4type, il3type of senddesc_w1
1471 const uint8x16x2_t tbl = {{
1473 /* [0-15] = il4type:il3type */
1474 0x04, /* none (IPv6) */
1475 0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
1476 0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
1477 0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
1478 0x03, /* PKT_TX_IP_CKSUM */
1479 0x13, /* PKT_TX_IP_CKSUM |
1482 0x23, /* PKT_TX_IP_CKSUM |
1485 0x33, /* PKT_TX_IP_CKSUM |
1488 0x02, /* PKT_TX_IPV4 */
1489 0x12, /* PKT_TX_IPV4 |
1492 0x22, /* PKT_TX_IPV4 |
1495 0x32, /* PKT_TX_IPV4 |
1498 0x03, /* PKT_TX_IPV4 |
1501 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1504 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1507 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1513 /* [16-31] = ol4type:ol3type */
1515 0x03, /* OUTER_IP_CKSUM */
1516 0x02, /* OUTER_IPV4 */
1517 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1518 0x04, /* OUTER_IPV6 */
1519 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1520 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1521 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1524 0x00, /* OUTER_UDP_CKSUM */
1525 0x33, /* OUTER_UDP_CKSUM |
1528 0x32, /* OUTER_UDP_CKSUM |
1531 0x33, /* OUTER_UDP_CKSUM |
1532 * OUTER_IPV4 | OUTER_IP_CKSUM
1534 0x34, /* OUTER_UDP_CKSUM |
1537 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1540 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1543 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1544 * OUTER_IPV4 | OUTER_IP_CKSUM
1549 /* Extract olflags to translate to oltype & iltype */
1550 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1551 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1554 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1555 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1557 const uint32x4_t tshft_4 = {
1563 senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
1564 senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
1567 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1568 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1570 const uint8x16_t shuf_mask5 = {
1571 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
1572 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1574 senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1575 senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1577 /* Extract outer and inner header ol_flags */
1578 const uint64x2_t oi_cksum_mask = {
1583 xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
1584 ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
1586 /* Extract OUTER_UDP_CKSUM bit 41 and
1590 xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1591 ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1593 /* Shift right oltype by 2 and iltype by 4
1594 * so that the oltype nibble starts from BIT(56)
1595 * instead of BIT(58) and the iltype nibble from BIT(48)
1596 * instead of BIT(52).
1598 const int8x16_t tshft5 = {
1599 8, 8, 8, 8, 8, 8, -4, -2,
1600 8, 8, 8, 8, 8, 8, -4, -2,
1603 xtmp128 = vshlq_u8(xtmp128, tshft5);
1604 ytmp128 = vshlq_u8(ytmp128, tshft5);
1606 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1607 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1609 const int8x16_t tshft3 = {
1610 -1, 0, -1, 0, 0, 0, 0, 0,
1611 -1, 0, -1, 0, 0, 0, 0, 0,
1614 senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1615 senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1617 /* Mark Bit(4) of oltype */
1618 const uint64x2_t oi_cksum_mask2 = {
1623 xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
1624 ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
1627 ltypes01 = vqtbl2q_u8(tbl, xtmp128);
1628 ltypes23 = vqtbl2q_u8(tbl, ytmp128);
1630 /* Pick only the relevant fields, i.e. bits 48:55 of iltype and
1631 * Bit 56:63 of oltype and place it in corresponding
1632 * place in senddesc_w1.
1634 const uint8x16_t shuf_mask0 = {
1635 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
1636 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
1639 ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1640 ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1642 /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
1643 * l3len, l2len, ol3len, ol2len.
1644 * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
1646 * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
1648 * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
1649 * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
1651 senddesc01_w1 = vaddq_u8(senddesc01_w1,
1652 vshlq_n_u32(senddesc01_w1, 8));
1653 senddesc23_w1 = vaddq_u8(senddesc23_w1,
1654 vshlq_n_u32(senddesc23_w1, 8));
1656 /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
1657 senddesc01_w1 = vaddq_u8(
1658 senddesc01_w1, vshlq_n_u32(senddesc01_w1, 16));
1659 senddesc23_w1 = vaddq_u8(
1660 senddesc23_w1, vshlq_n_u32(senddesc23_w1, 16));
1662 /* Move ltypes to senddesc*_w1 */
1663 senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1664 senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1667 xmask01 = vdupq_n_u64(0);
1669 asm volatile("LD1 {%[a].H}[0],[%[in]]\n\t"
1674 asm volatile("LD1 {%[a].H}[4],[%[in]]\n\t"
1679 asm volatile("LD1 {%[b].H}[0],[%[in]]\n\t"
1684 asm volatile("LD1 {%[b].H}[4],[%[in]]\n\t"
1688 xmask01 = vshlq_n_u64(xmask01, 20);
1689 xmask23 = vshlq_n_u64(xmask23, 20);
1691 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1692 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1694 if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
1695 /* Tx ol_flag for vlan. */
1696 const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
1697 /* Bit enable for VLAN1 */
1698 const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
1699 /* Tx ol_flag for QnQ. */
1700 const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
1701 /* Bit enable for VLAN0 */
1702 const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
1703 /* Load vlan values from packet. Outer is VLAN 0 */
1704 uint64x2_t ext01 = {
1705 ((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
1706 ((uint64_t)tx_pkts[0]->vlan_tci) << 32,
1707 ((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
1708 ((uint64_t)tx_pkts[1]->vlan_tci) << 32,
1710 uint64x2_t ext23 = {
1711 ((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
1712 ((uint64_t)tx_pkts[2]->vlan_tci) << 32,
1713 ((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
1714 ((uint64_t)tx_pkts[3]->vlan_tci) << 32,
1717 /* Get ol_flags of the packets. */
1718 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1719 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1721 /* ORR vlan outer/inner values into cmd. */
1722 sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
1723 sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
1725 /* Test for offload enable bits and generate masks. */
1726 xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
1728 vandq_u64(vtstq_u64(xtmp128, olq),
1730 ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
1732 vandq_u64(vtstq_u64(ytmp128, olq),
1735 /* Set vlan enable bits into cmd based on mask. */
1736 sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
1737 sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
1740 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1741 /* Tx ol_flag for timestamp. */
1742 const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
1743 PKT_TX_IEEE1588_TMST};
1744 /* Set send mem alg to SUB. */
1745 const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
1746 /* Increment send mem address by 8. */
1747 const uint64x2_t addr = {0x8, 0x8};
1749 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1750 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1752 /* Check if timestamp is requested and generate inverted
1753 * mask as we need not make any changes to default cmd
1756 xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
1757 ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
1759 /* Change send mem address to an 8 byte offset when
1760 * TSTMP is disabled.
1762 sendmem01_w1 = vaddq_u64(sendmem01_w1,
1763 vandq_u64(xtmp128, addr));
1764 sendmem23_w1 = vaddq_u64(sendmem23_w1,
1765 vandq_u64(ytmp128, addr));
1766 /* Change send mem alg to SUB when TSTMP is disabled. */
1767 sendmem01_w0 = vorrq_u64(sendmem01_w0,
1768 vandq_u64(xtmp128, alg));
1769 sendmem23_w0 = vorrq_u64(sendmem23_w0,
1770 vandq_u64(ytmp128, alg));
1772 cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
1773 cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
1774 cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
1775 cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
1778 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1779 const uint64_t lso_fmt = txq->lso_tun_fmt;
1780 uint64_t sx_w0[NIX_DESCS_PER_LOOP];
1781 uint64_t sd_w1[NIX_DESCS_PER_LOOP];
1783 /* Extract SD W1 as we need to set L4 types. */
1784 vst1q_u64(sd_w1, senddesc01_w1);
1785 vst1q_u64(sd_w1 + 2, senddesc23_w1);
1787 /* Extract SX W0 as we need to set LSO fields. */
1788 vst1q_u64(sx_w0, sendext01_w0);
1789 vst1q_u64(sx_w0 + 2, sendext23_w0);
1791 /* Extract ol_flags. */
1792 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1793 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1795 /* Prepare individual mbufs. */
1796 cn10k_nix_prepare_tso(tx_pkts[0],
1797 (union nix_send_hdr_w1_u *)&sd_w1[0],
1798 (union nix_send_ext_w0_u *)&sx_w0[0],
1799 vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
1801 cn10k_nix_prepare_tso(tx_pkts[1],
1802 (union nix_send_hdr_w1_u *)&sd_w1[1],
1803 (union nix_send_ext_w0_u *)&sx_w0[1],
1804 vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
1806 cn10k_nix_prepare_tso(tx_pkts[2],
1807 (union nix_send_hdr_w1_u *)&sd_w1[2],
1808 (union nix_send_ext_w0_u *)&sx_w0[2],
1809 vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
1811 cn10k_nix_prepare_tso(tx_pkts[3],
1812 (union nix_send_hdr_w1_u *)&sd_w1[3],
1813 (union nix_send_ext_w0_u *)&sx_w0[3],
1814 vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
1816 senddesc01_w1 = vld1q_u64(sd_w1);
1817 senddesc23_w1 = vld1q_u64(sd_w1 + 2);
1819 sendext01_w0 = vld1q_u64(sx_w0);
1820 sendext23_w0 = vld1q_u64(sx_w0 + 2);
1823 if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
1824 !(flags & NIX_TX_MULTI_SEG_F)) {
1825 /* Set don't free bit if reference count > 1 */
1826 xmask01 = vdupq_n_u64(0);
1829 /* Move mbufs to iova */
1830 mbuf0 = (uint64_t *)tx_pkts[0];
1831 mbuf1 = (uint64_t *)tx_pkts[1];
1832 mbuf2 = (uint64_t *)tx_pkts[2];
1833 mbuf3 = (uint64_t *)tx_pkts[3];
1835 if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
1836 xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
1838 __mempool_check_cookies(
1839 ((struct rte_mbuf *)mbuf0)->pool,
1840 (void **)&mbuf0, 1, 0);
1842 if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
1843 xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
1845 __mempool_check_cookies(
1846 ((struct rte_mbuf *)mbuf1)->pool,
1847 (void **)&mbuf1, 1, 0);
1849 if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
1850 xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
1852 __mempool_check_cookies(
1853 ((struct rte_mbuf *)mbuf2)->pool,
1854 (void **)&mbuf2, 1, 0);
1856 if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
1857 xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
1859 __mempool_check_cookies(
1860 ((struct rte_mbuf *)mbuf3)->pool,
1861 (void **)&mbuf3, 1, 0);
1862 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1863 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1864 } else if (!(flags & NIX_TX_MULTI_SEG_F)) {
1865 /* Move mbufs to iova */
1866 mbuf0 = (uint64_t *)tx_pkts[0];
1867 mbuf1 = (uint64_t *)tx_pkts[1];
1868 mbuf2 = (uint64_t *)tx_pkts[2];
1869 mbuf3 = (uint64_t *)tx_pkts[3];
1871 /* Mark mempool object as "put" since
1872 * it is freed by NIX
1874 __mempool_check_cookies(
1875 ((struct rte_mbuf *)mbuf0)->pool,
1876 (void **)&mbuf0, 1, 0);
1878 __mempool_check_cookies(
1879 ((struct rte_mbuf *)mbuf1)->pool,
1880 (void **)&mbuf1, 1, 0);
1882 __mempool_check_cookies(
1883 ((struct rte_mbuf *)mbuf2)->pool,
1884 (void **)&mbuf2, 1, 0);
1886 __mempool_check_cookies(
1887 ((struct rte_mbuf *)mbuf3)->pool,
1888 (void **)&mbuf3, 1, 0);
1891 /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
1892 cmd0[0] = vzip1q_u64(senddesc01_w0, senddesc01_w1);
1893 cmd0[1] = vzip2q_u64(senddesc01_w0, senddesc01_w1);
1894 cmd0[2] = vzip1q_u64(senddesc23_w0, senddesc23_w1);
1895 cmd0[3] = vzip2q_u64(senddesc23_w0, senddesc23_w1);
1897 cmd1[0] = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
1898 cmd1[1] = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
1899 cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
1900 cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
1902 if (flags & NIX_TX_NEED_EXT_HDR) {
1903 cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
1904 cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
1905 cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
1906 cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
1909 if (flags & NIX_TX_MULTI_SEG_F) {
1913 j = cn10k_nix_prep_lmt_mseg_vector(tx_pkts, cmd0, cmd1,
1916 LMT_OFF(laddr, lnum,
1918 &wd.data128, &shift,
1921 } else if (flags & NIX_TX_NEED_EXT_HDR) {
1922 /* Store the prepared send desc to LMT lines */
1923 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1924 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1925 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1926 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1927 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
1928 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
1929 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
1930 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
1931 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
1933 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1934 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1935 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1936 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
1937 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
1938 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
1939 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
1940 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
1942 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1943 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1944 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1945 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
1946 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
1947 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
1949 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1950 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1951 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1952 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
1953 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
1954 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
1958 /* Store the prepared send desc to LMT lines */
1959 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1960 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
1961 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
1962 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
1963 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
1964 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
1965 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
1966 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
1970 tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
1973 if (flags & NIX_TX_MULTI_SEG_F)
1978 if (!(flags & NIX_TX_MULTI_SEG_F))
1979 wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
1981 pa = io_addr | (wd.data[0] & 0x7) << 4;
1982 wd.data[0] &= ~0x7ULL;
1984 if (flags & NIX_TX_MULTI_SEG_F)
1987 wd.data[0] |= (15ULL << 12);
1988 wd.data[0] |= (uint64_t)lmt_id;
1991 roc_lmt_submit_steorl(wd.data[0], pa);
1993 if (!(flags & NIX_TX_MULTI_SEG_F))
1994 wd.data[1] = cn10k_nix_tx_steor_vec_data(flags);
1996 pa = io_addr | (wd.data[1] & 0x7) << 4;
1997 wd.data[1] &= ~0x7ULL;
1999 if (flags & NIX_TX_MULTI_SEG_F)
2002 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
2003 wd.data[1] |= (uint64_t)(lmt_id + 16);
2006 roc_lmt_submit_steorl(wd.data[1], pa);
2008 if (!(flags & NIX_TX_MULTI_SEG_F))
2009 wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
2011 pa = io_addr | (wd.data[0] & 0x7) << 4;
2012 wd.data[0] &= ~0x7ULL;
2014 if (flags & NIX_TX_MULTI_SEG_F)
2017 wd.data[0] |= ((uint64_t)(lnum - 1)) << 12;
2018 wd.data[0] |= lmt_id;
2021 roc_lmt_submit_steorl(wd.data[0], pa);
2029 if (unlikely(scalar)) {
2030 if (flags & NIX_TX_MULTI_SEG_F)
2031 pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
2032 scalar, cmd, flags);
2034 pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
2042 static __rte_always_inline uint16_t
2043 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
2044 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
2046 RTE_SET_USED(tx_queue);
2047 RTE_SET_USED(tx_pkts);
2050 RTE_SET_USED(flags);
2055 #define L3L4CSUM_F NIX_TX_OFFLOAD_L3_L4_CSUM_F
2056 #define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
2057 #define VLAN_F NIX_TX_OFFLOAD_VLAN_QINQ_F
2058 #define NOFF_F NIX_TX_OFFLOAD_MBUF_NOFF_F
2059 #define TSO_F NIX_TX_OFFLOAD_TSO_F
2060 #define TSP_F NIX_TX_OFFLOAD_TSTAMP_F
2062 /* [TSP] [TSO] [NOFF] [VLAN] [OL3OL4CSUM] [L3L4CSUM] */
2063 #define NIX_TX_FASTPATH_MODES \
2064 T(no_offload, 0, 0, 0, 0, 0, 0, 4, \
2065 NIX_TX_OFFLOAD_NONE) \
2066 T(l3l4csum, 0, 0, 0, 0, 0, 1, 4, \
2068 T(ol3ol4csum, 0, 0, 0, 0, 1, 0, 4, \
2070 T(ol3ol4csum_l3l4csum, 0, 0, 0, 0, 1, 1, 4, \
2071 OL3OL4CSUM_F | L3L4CSUM_F) \
2072 T(vlan, 0, 0, 0, 1, 0, 0, 6, \
2074 T(vlan_l3l4csum, 0, 0, 0, 1, 0, 1, 6, \
2075 VLAN_F | L3L4CSUM_F) \
2076 T(vlan_ol3ol4csum, 0, 0, 0, 1, 1, 0, 6, \
2077 VLAN_F | OL3OL4CSUM_F) \
2078 T(vlan_ol3ol4csum_l3l4csum, 0, 0, 0, 1, 1, 1, 6, \
2079 VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2080 T(noff, 0, 0, 1, 0, 0, 0, 4, \
2082 T(noff_l3l4csum, 0, 0, 1, 0, 0, 1, 4, \
2083 NOFF_F | L3L4CSUM_F) \
2084 T(noff_ol3ol4csum, 0, 0, 1, 0, 1, 0, 4, \
2085 NOFF_F | OL3OL4CSUM_F) \
2086 T(noff_ol3ol4csum_l3l4csum, 0, 0, 1, 0, 1, 1, 4, \
2087 NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2088 T(noff_vlan, 0, 0, 1, 1, 0, 0, 6, \
2090 T(noff_vlan_l3l4csum, 0, 0, 1, 1, 0, 1, 6, \
2091 NOFF_F | VLAN_F | L3L4CSUM_F) \
2092 T(noff_vlan_ol3ol4csum, 0, 0, 1, 1, 1, 0, 6, \
2093 NOFF_F | VLAN_F | OL3OL4CSUM_F) \
2094 T(noff_vlan_ol3ol4csum_l3l4csum, 0, 0, 1, 1, 1, 1, 6, \
2095 NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2096 T(tso, 0, 1, 0, 0, 0, 0, 6, \
2098 T(tso_l3l4csum, 0, 1, 0, 0, 0, 1, 6, \
2099 TSO_F | L3L4CSUM_F) \
2100 T(tso_ol3ol4csum, 0, 1, 0, 0, 1, 0, 6, \
2101 TSO_F | OL3OL4CSUM_F) \
2102 T(tso_ol3ol4csum_l3l4csum, 0, 1, 0, 0, 1, 1, 6, \
2103 TSO_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2104 T(tso_vlan, 0, 1, 0, 1, 0, 0, 6, \
2106 T(tso_vlan_l3l4csum, 0, 1, 0, 1, 0, 1, 6, \
2107 TSO_F | VLAN_F | L3L4CSUM_F) \
2108 T(tso_vlan_ol3ol4csum, 0, 1, 0, 1, 1, 0, 6, \
2109 TSO_F | VLAN_F | OL3OL4CSUM_F) \
2110 T(tso_vlan_ol3ol4csum_l3l4csum, 0, 1, 0, 1, 1, 1, 6, \
2111 TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2112 T(tso_noff, 0, 1, 1, 0, 0, 0, 6, \
2114 T(tso_noff_l3l4csum, 0, 1, 1, 0, 0, 1, 6, \
2115 TSO_F | NOFF_F | L3L4CSUM_F) \
2116 T(tso_noff_ol3ol4csum, 0, 1, 1, 0, 1, 0, 6, \
2117 TSO_F | NOFF_F | OL3OL4CSUM_F) \
2118 T(tso_noff_ol3ol4csum_l3l4csum, 0, 1, 1, 0, 1, 1, 6, \
2119 TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2120 T(tso_noff_vlan, 0, 1, 1, 1, 0, 0, 6, \
2121 TSO_F | NOFF_F | VLAN_F) \
2122 T(tso_noff_vlan_l3l4csum, 0, 1, 1, 1, 0, 1, 6, \
2123 TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F) \
2124 T(tso_noff_vlan_ol3ol4csum, 0, 1, 1, 1, 1, 0, 6, \
2125 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F) \
2126 T(tso_noff_vlan_ol3ol4csum_l3l4csum, 0, 1, 1, 1, 1, 1, 6, \
2127 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2128 T(ts, 1, 0, 0, 0, 0, 0, 8, \
2130 T(ts_l3l4csum, 1, 0, 0, 0, 0, 1, 8, \
2131 TSP_F | L3L4CSUM_F) \
2132 T(ts_ol3ol4csum, 1, 0, 0, 0, 1, 0, 8, \
2133 TSP_F | OL3OL4CSUM_F) \
2134 T(ts_ol3ol4csum_l3l4csum, 1, 0, 0, 0, 1, 1, 8, \
2135 TSP_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2136 T(ts_vlan, 1, 0, 0, 1, 0, 0, 8, \
2138 T(ts_vlan_l3l4csum, 1, 0, 0, 1, 0, 1, 8, \
2139 TSP_F | VLAN_F | L3L4CSUM_F) \
2140 T(ts_vlan_ol3ol4csum, 1, 0, 0, 1, 1, 0, 8, \
2141 TSP_F | VLAN_F | OL3OL4CSUM_F) \
2142 T(ts_vlan_ol3ol4csum_l3l4csum, 1, 0, 0, 1, 1, 1, 8, \
2143 TSP_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2144 T(ts_noff, 1, 0, 1, 0, 0, 0, 8, \
2146 T(ts_noff_l3l4csum, 1, 0, 1, 0, 0, 1, 8, \
2147 TSP_F | NOFF_F | L3L4CSUM_F) \
2148 T(ts_noff_ol3ol4csum, 1, 0, 1, 0, 1, 0, 8, \
2149 TSP_F | NOFF_F | OL3OL4CSUM_F) \
2150 T(ts_noff_ol3ol4csum_l3l4csum, 1, 0, 1, 0, 1, 1, 8, \
2151 TSP_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2152 T(ts_noff_vlan, 1, 0, 1, 1, 0, 0, 8, \
2153 TSP_F | NOFF_F | VLAN_F) \
2154 T(ts_noff_vlan_l3l4csum, 1, 0, 1, 1, 0, 1, 8, \
2155 TSP_F | NOFF_F | VLAN_F | L3L4CSUM_F) \
2156 T(ts_noff_vlan_ol3ol4csum, 1, 0, 1, 1, 1, 0, 8, \
2157 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F) \
2158 T(ts_noff_vlan_ol3ol4csum_l3l4csum, 1, 0, 1, 1, 1, 1, 8, \
2159 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2160 T(ts_tso, 1, 1, 0, 0, 0, 0, 8, \
2162 T(ts_tso_l3l4csum, 1, 1, 0, 0, 0, 1, 8, \
2163 TSP_F | TSO_F | L3L4CSUM_F) \
2164 T(ts_tso_ol3ol4csum, 1, 1, 0, 0, 1, 0, 8, \
2165 TSP_F | TSO_F | OL3OL4CSUM_F) \
2166 T(ts_tso_ol3ol4csum_l3l4csum, 1, 1, 0, 0, 1, 1, 8, \
2167 TSP_F | TSO_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2168 T(ts_tso_vlan, 1, 1, 0, 1, 0, 0, 8, \
2169 TSP_F | TSO_F | VLAN_F) \
2170 T(ts_tso_vlan_l3l4csum, 1, 1, 0, 1, 0, 1, 8, \
2171 TSP_F | TSO_F | VLAN_F | L3L4CSUM_F) \
2172 T(ts_tso_vlan_ol3ol4csum, 1, 1, 0, 1, 1, 0, 8, \
2173 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F) \
2174 T(ts_tso_vlan_ol3ol4csum_l3l4csum, 1, 1, 0, 1, 1, 1, 8, \
2175 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2176 T(ts_tso_noff, 1, 1, 1, 0, 0, 0, 8, \
2177 TSP_F | TSO_F | NOFF_F) \
2178 T(ts_tso_noff_l3l4csum, 1, 1, 1, 0, 0, 1, 8, \
2179 TSP_F | TSO_F | NOFF_F | L3L4CSUM_F) \
2180 T(ts_tso_noff_ol3ol4csum, 1, 1, 1, 0, 1, 0, 8, \
2181 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F) \
2182 T(ts_tso_noff_ol3ol4csum_l3l4csum, 1, 1, 1, 0, 1, 1, 8, \
2183 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
2184 T(ts_tso_noff_vlan, 1, 1, 1, 1, 0, 0, 8, \
2185 TSP_F | TSO_F | NOFF_F | VLAN_F) \
2186 T(ts_tso_noff_vlan_l3l4csum, 1, 1, 1, 1, 0, 1, 8, \
2187 TSP_F | TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F) \
2188 T(ts_tso_noff_vlan_ol3ol4csum, 1, 1, 1, 1, 1, 0, 8, \
2189 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F) \
2190 T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum, 1, 1, 1, 1, 1, 1, 8, \
2191 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)
2193 #define T(name, f5, f4, f3, f2, f1, f0, sz, flags) \
2194 uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_##name( \
2195 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts); \
2197 uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_mseg_##name( \
2198 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts); \
2200 uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name( \
2201 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts); \
2203 uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
2204 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts); \
2206 NIX_TX_FASTPATH_MODES
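/* Each T() entry above expands the four prototypes once per mode; e.g. the
 * "tso" entry declares cn10k_nix_xmit_pkts_tso(), ..._mseg_tso(),
 * ..._vec_tso() and ..._vec_mseg_tso(), each specialised at compile time
 * with flags = TSO_F.
 */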
2209 #endif /* __CN10K_TX_H__ */