/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2021 Marvell.
 */
#ifndef __CN10K_TX_H__
#define __CN10K_TX_H__
#define NIX_TX_OFFLOAD_NONE	      (0)
#define NIX_TX_OFFLOAD_L3_L4_CSUM_F   BIT(0)
#define NIX_TX_OFFLOAD_OL3_OL4_CSUM_F BIT(1)
#define NIX_TX_OFFLOAD_VLAN_QINQ_F    BIT(2)
#define NIX_TX_OFFLOAD_MBUF_NOFF_F    BIT(3)
#define NIX_TX_OFFLOAD_TSO_F	      BIT(4)
#define NIX_TX_OFFLOAD_TSTAMP_F	      BIT(5)
/* Flags to control xmit_prepare function.
 * Defined from the top bit (15) downwards to denote that it is
 * not an offload flag used to pick the Tx function.
 */
#define NIX_TX_MULTI_SEG_F BIT(15)
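
/* Illustrative only (not part of the driver): a fastpath variant is
 * specialized by OR'ing offload flags into a compile-time constant,
 * e.g. a VLAN + inner checksum build would be selected with:
 *
 *   const uint16_t flags = NIX_TX_OFFLOAD_VLAN_QINQ_F |
 *                          NIX_TX_OFFLOAD_L3_L4_CSUM_F;
 */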
#define NIX_TX_NEED_SEND_HDR_W1                                                \
	(NIX_TX_OFFLOAD_L3_L4_CSUM_F | NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |        \
	 NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)
#define NIX_TX_NEED_EXT_HDR                                                    \
	(NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |               \
	 NIX_TX_OFFLOAD_TSO_F)
#define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
	do {                                                                   \
		/* Cached value is low, update fc_cache_pkts */                \
		if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
			/* Multiply by sqes_per_sqb to express in packets */   \
			(txq)->fc_cache_pkts =                                 \
				((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
				<< (txq)->sqes_per_sqb_log2;                   \
			/* Check again for room */                             \
			if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
				return 0;                                      \
		}                                                              \
	} while (0)
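
/* Worked example with made-up values: if nb_sqb_bufs_adj = 512,
 * *fc_mem = 500 and sqes_per_sqb_log2 = 5, the refreshed cache is
 * (512 - 500) << 5 = 384 packets worth of room; a request for more
 * than 384 packets would then return 0 without transmitting.
 */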
#define LMT_OFF(lmt_addr, lmt_num, offset)                                     \
	(void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
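
/* Example (assuming 128-byte LMT lines, i.e. ROC_LMT_LINE_SIZE_LOG2 = 7):
 * LMT_OFF(lmt_addr, 2, 16) = lmt_addr + (2 << 7) + 16, the second 16-byte
 * word of the third LMT line.
 */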
/* Function to determine the number of Tx sub-descriptors required
 * when the ext sub-descriptor is enabled.
 */
static __rte_always_inline int
cn10k_nix_tx_ext_subs(const uint16_t flags)
{
	return (flags & NIX_TX_OFFLOAD_TSTAMP_F)
		       ? 2
		       : !!(flags &
			    (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F));
}
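
/* Returns 2 when TSTAMP is enabled (SEND_EXT + SEND_MEM sub-descriptors),
 * 1 when only VLAN/QinQ or TSO is enabled (SEND_EXT alone), and 0 when no
 * extension sub-descriptor is needed.
 */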
static __rte_always_inline uint8_t
cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
{
	RTE_SET_USED(flags);
	/* We can pack up to 4 packets per LMTLINE if there are no offloads. */
	return 4 << ROC_LMT_LINES_PER_CORE_LOG2;
}
static __rte_always_inline uint64_t
cn10k_nix_tx_steor_data(const uint16_t flags)
{
	const uint64_t dw_m1 = cn10k_nix_tx_ext_subs(flags) + 1;
	uint64_t data;

	/* This will be moved to addr area */
	data = dw_m1;
	/* 15 vector sizes for single seg */
	data |= dw_m1 << 19;
	data |= dw_m1 << 22;
	data |= dw_m1 << 25;
	data |= dw_m1 << 28;
	data |= dw_m1 << 31;
	data |= dw_m1 << 34;
	data |= dw_m1 << 37;
	data |= dw_m1 << 40;
	data |= dw_m1 << 43;
	data |= dw_m1 << 46;
	data |= dw_m1 << 49;
	data |= dw_m1 << 52;
	data |= dw_m1 << 55;
	data |= dw_m1 << 58;
	data |= dw_m1 << 61;

	return data;
}
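
/* Sketch of the resulting encoding (assuming the layout above): with no
 * offloads each command is SEND_HDR + SEND_SG = four dwords, i.e. two
 * 128-bit words, so dw_m1 = 1. The first line's size travels in the I/O
 * address (bits 6:4); the remaining 15 line sizes sit at 3-bit intervals
 * starting at bit 19 of the STEOR data word.
 */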
static __rte_always_inline uint64_t
cn10k_nix_tx_steor_vec_data(const uint16_t flags)
{
	const uint64_t dw_m1 = 0x7;
	uint64_t data;

	RTE_SET_USED(flags);
	/* This will be moved to addr area */
	data = dw_m1;
	/* 15 vector sizes for single seg */
	data |= dw_m1 << 19;
	data |= dw_m1 << 22;
	data |= dw_m1 << 25;
	data |= dw_m1 << 28;
	data |= dw_m1 << 31;
	data |= dw_m1 << 34;
	data |= dw_m1 << 37;
	data |= dw_m1 << 40;
	data |= dw_m1 << 43;
	data |= dw_m1 << 46;
	data |= dw_m1 << 49;
	data |= dw_m1 << 52;
	data |= dw_m1 << 55;
	data |= dw_m1 << 58;
	data |= dw_m1 << 61;

	return data;
}
static __rte_always_inline void
cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
		      const uint16_t flags)
{
	/* Send hdr */
	cmd[0] = txq->send_hdr_w0;
	cmd[1] = 0;
	cmd += 2;

	/* Send ext if present */
	if (flags & NIX_TX_NEED_EXT_HDR) {
		*(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
		cmd += 2;
	}

	/* Send sg */
	cmd[0] = txq->sg_w0;
	cmd[1] = 0;
}
static __rte_always_inline void
cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
{
	uint64_t mask, ol_flags = m->ol_flags;

	if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
		uintptr_t mdata = rte_pktmbuf_mtod(m, uintptr_t);
		uint16_t *iplen, *oiplen, *oudplen;
		uint16_t lso_sb, paylen;

		mask = -!!(ol_flags & (PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_IPV6));
		lso_sb = (mask & (m->outer_l2_len + m->outer_l3_len)) +
			 m->l2_len + m->l3_len + m->l4_len;

		/* Reduce payload len by the pre-segmentation headers */
		paylen = m->pkt_len - lso_sb;
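
		/* Illustrative numbers: for a 1514B packet with 14B L2,
		 * 20B IPv4 and 20B TCP headers, lso_sb = 54 and
		 * paylen = 1460, the TCP payload segmented by hardware.
		 */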
		/* Get iplen position assuming no tunnel hdr */
		iplen = (uint16_t *)(mdata + m->l2_len +
				     (2 << !!(ol_flags & PKT_TX_IPV6)));
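		/* (2 << !!IPv6) lands on the length field: IPv4 total-length
		 * lives at byte offset 2 of the IP header, IPv6
		 * payload-length at byte offset 4.
		 */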
		/* Handle tunnel TSO */
		if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
		    (ol_flags & PKT_TX_TUNNEL_MASK)) {
			const uint8_t is_udp_tun =
				(CNXK_NIX_UDP_TUN_BITMASK >>
				 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
				0x1;

			oiplen = (uint16_t *)(mdata + m->outer_l2_len +
					      (2 << !!(ol_flags &
						       PKT_TX_OUTER_IPV6)));
			*oiplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*oiplen) -
						   paylen);

			/* Update outer UDP len for UDP tunneled packet */
			if (is_udp_tun) {
				oudplen = (uint16_t *)(mdata + m->outer_l2_len +
						       m->outer_l3_len + 4);
				*oudplen = rte_cpu_to_be_16(
					rte_be_to_cpu_16(*oudplen) - paylen);
			}

			/* Update iplen position to inner ip hdr */
			iplen = (uint16_t *)(mdata + lso_sb - m->l3_len -
					     m->l4_len +
					     (2 << !!(ol_flags & PKT_TX_IPV6)));
		}

		*iplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*iplen) - paylen);
	}
}
static __rte_always_inline void
cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, uintptr_t lmt_addr,
		       const uint16_t flags, const uint64_t lso_tun_fmt)
{
	struct nix_send_ext_s *send_hdr_ext;
	struct nix_send_hdr_s *send_hdr;
	uint64_t ol_flags = 0, mask;
	union nix_send_hdr_w1_u w1;
	union nix_send_sg_s *sg;

	send_hdr = (struct nix_send_hdr_s *)cmd;
	if (flags & NIX_TX_NEED_EXT_HDR) {
		send_hdr_ext = (struct nix_send_ext_s *)(cmd + 2);
		sg = (union nix_send_sg_s *)(cmd + 4);
		/* Clear previous markings */
		send_hdr_ext->w0.lso = 0;
		send_hdr_ext->w1.u = 0;
	} else {
		sg = (union nix_send_sg_s *)(cmd + 2);
	}
	if (flags & NIX_TX_NEED_SEND_HDR_W1) {
		ol_flags = m->ol_flags;
		w1.u = 0;
	}

	if (!(flags & NIX_TX_MULTI_SEG_F)) {
		send_hdr->w0.total = m->data_len;
		send_hdr->w0.aura =
			roc_npa_aura_handle_to_aura(m->pool->pool_id);
	}
	/*
	 * L3type:  2 => IPV4
	 *          3 => IPV4 with csum
	 *          4 => IPV6
	 * L3type and L3ptr need to be set for either
	 * L3 csum or L4 csum or LSO
	 */
	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
	    (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
		const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
		const uint8_t ol3type =
			((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
			((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
			!!(ol_flags & PKT_TX_OUTER_IP_CKSUM);

		/* Outer L3 */
		w1.ol3type = ol3type;
		mask = 0xffffull << ((!!ol3type) << 4);
		w1.ol3ptr = ~mask & m->outer_l2_len;
		w1.ol4ptr = ~mask & (w1.ol3ptr + m->outer_l3_len);

		/* Outer L4 */
		w1.ol4type = csum + (csum << 1);

		/* Inner L3 */
		w1.il3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
			     ((!!(ol_flags & PKT_TX_IPV6)) << 2);
		w1.il3ptr = w1.ol4ptr + m->l2_len;
		w1.il4ptr = w1.il3ptr + m->l3_len;
		/* Increment it by 1 if it is IPV4 as 3 is with csum */
		w1.il3type = w1.il3type + !!(ol_flags & PKT_TX_IP_CKSUM);

		/* Inner L4 */
		w1.il4type = (ol_flags & PKT_TX_L4_MASK) >> 52;

		/* When there is no outer (tunnel) header, shift the
		 * IL3/IL4 fields down so that OL3/OL4 carry the header
		 * checksum information.
		 */
		mask = !ol3type;
		w1.u = ((w1.u & 0xFFFFFFFF00000000) >> (mask << 3)) |
		       ((w1.u & 0X00000000FFFFFFFF) >> (mask << 4));
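
		/* Example: with no outer header flags, ol3type is 0 and mask
		 * is 1, so the whole word shifts: il3type/il4type (bits
		 * 47:40) drop into ol3type/ol4type (bits 39:32) and
		 * il3ptr/il4ptr (bits 31:16) drop into ol3ptr/ol4ptr
		 * (bits 15:0).
		 */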
	} else if (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) {
		const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
		const uint8_t outer_l2_len = m->outer_l2_len;

		/* Outer L3 */
		w1.ol3ptr = outer_l2_len;
		w1.ol4ptr = outer_l2_len + m->outer_l3_len;
		/* Increment it by 1 if it is IPV4 as 3 is with csum */
		w1.ol3type = ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
			     ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
			     !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);

		/* Outer L4 */
		w1.ol4type = csum + (csum << 1);
	} else if (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) {
		const uint8_t l2_len = m->l2_len;

		/* Always use OLXPTR and OLXTYPE when only
		 * one header is present
		 */

		/* Outer L3 */
		w1.ol3ptr = l2_len;
		w1.ol4ptr = l2_len + m->l3_len;
		/* Increment it by 1 if it is IPV4 as 3 is with csum */
		w1.ol3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
			     ((!!(ol_flags & PKT_TX_IPV6)) << 2) +
			     !!(ol_flags & PKT_TX_IP_CKSUM);

		/* Outer L4 */
		w1.ol4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
	}
	if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
		send_hdr_ext->w1.vlan1_ins_ena = !!(ol_flags & PKT_TX_VLAN);
		/* HW will update ptr after vlan0 update */
		send_hdr_ext->w1.vlan1_ins_ptr = 12;
		send_hdr_ext->w1.vlan1_ins_tci = m->vlan_tci;

		send_hdr_ext->w1.vlan0_ins_ena = !!(ol_flags & PKT_TX_QINQ);
		/* 2B before end of l2 header */
		send_hdr_ext->w1.vlan0_ins_ptr = 12;
		send_hdr_ext->w1.vlan0_ins_tci = m->vlan_tci_outer;
	}
	if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
		uint16_t lso_sb;

		mask = -(!w1.il3type);
		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;

		send_hdr_ext->w0.lso_sb = lso_sb;
		send_hdr_ext->w0.lso = 1;
		send_hdr_ext->w0.lso_mps = m->tso_segsz;
		send_hdr_ext->w0.lso_format =
			NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
		w1.ol4type = NIX_SENDL4TYPE_TCP_CKSUM;

		/* Handle tunnel TSO */
		if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
		    (ol_flags & PKT_TX_TUNNEL_MASK)) {
			const uint8_t is_udp_tun =
				(CNXK_NIX_UDP_TUN_BITMASK >>
				 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
				0x1;
			uint8_t shift = is_udp_tun ? 32 : 0;

			shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
			shift += (!!(ol_flags & PKT_TX_IPV6) << 3);

			w1.il4type = NIX_SENDL4TYPE_TCP_CKSUM;
			w1.ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
			/* Update format for UDP tunneled packet */
			send_hdr_ext->w0.lso_format = (lso_tun_fmt >> shift);
		}
	}
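
	/* Example shift into lso_tun_fmt (an 8-byte table of LSO format
	 * indexes prepared per tunnel/IP combination): a UDP tunnel (32)
	 * with outer IPv6 (16) and inner IPv4 (0) gives shift = 48, i.e.
	 * byte 6 of the table.
	 */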
	if (flags & NIX_TX_NEED_SEND_HDR_W1)
		send_hdr->w1.u = w1.u;

	if (!(flags & NIX_TX_MULTI_SEG_F)) {
		sg->seg1_size = m->data_len;
		*(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);

		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
			/* DF bit = 1 if refcount of current mbuf or parent
			 * mbuf is greater than 1
			 * DF bit = 0 otherwise
			 */
			send_hdr->w0.df = cnxk_nix_prefree_seg(m);
		}
		/* Mark mempool object as "put" since it is freed by NIX */
		if (!send_hdr->w0.df)
			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
	}
	/* With minimal offloads, the local 'cmd' can be optimized into
	 * registers; otherwise it lives on the stack. The intent is that
	 * 'cmd' holds the content copied from txq->cmd exactly once.
	 */
	*((struct nix_send_hdr_s *)lmt_addr) = *send_hdr;
	lmt_addr += 16;
	if (flags & NIX_TX_NEED_EXT_HDR) {
		*((struct nix_send_ext_s *)lmt_addr) = *send_hdr_ext;
		lmt_addr += 16;
	}
	/* In case of multi-seg, sg template is stored here */
	*((union nix_send_sg_s *)lmt_addr) = *sg;
	*(rte_iova_t *)(lmt_addr + 8) = *(rte_iova_t *)(sg + 1);
}
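
/* Note: after cn10k_nix_xmit_prepare() one LMT line holds the command:
 * bytes 0-15 SEND_HDR, optionally bytes 16-31 SEND_EXT, then the SEND_SG
 * template with the buffer iova in the adjacent 8 bytes.
 */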
static __rte_always_inline void
cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
			      const uint64_t ol_flags, const uint16_t no_segdw,
			      const uint16_t flags)
{
	if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
		const uint8_t is_ol_tstamp = !(ol_flags & PKT_TX_IEEE1588_TMST);
		struct nix_send_ext_s *send_hdr_ext =
			(struct nix_send_ext_s *)(lmt_addr + 16);
		uint64_t *lmt = (uint64_t *)lmt_addr;
		uint16_t off = (no_segdw - 1) << 1;
		struct nix_send_mem_s *send_mem;

		send_mem = (struct nix_send_mem_s *)(lmt + off);
		send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
		send_hdr_ext->w0.tstmp = 1;
		if (flags & NIX_TX_MULTI_SEG_F) {
			/* Retrieve the default desc value */
			lmt[off] = cmd[2];

			/* Use a compiler barrier to avoid violating C
			 * aliasing rules.
			 */
			rte_compiler_barrier();
		}

		/* For packets without PKT_TX_IEEE1588_TMST, the Tx timestamp
		 * must not be recorded; switch the alg type to
		 * NIX_SENDMEMALG_SET and bump the send mem addr field to the
		 * next 8 bytes so the actual Tx timestamp register is not
		 * corrupted.
		 */
		send_mem->w0.subdc = NIX_SUBDC_MEM;
		send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
		send_mem->addr =
			(rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
	}
}
static __rte_always_inline uint16_t
cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
{
	struct nix_send_hdr_s *send_hdr;
	union nix_send_sg_s *sg;
	struct rte_mbuf *m_next;
	uint64_t *slist, sg_u;
	uint64_t nb_segs;
	uint64_t segdw;
	uint8_t off, i;

	send_hdr = (struct nix_send_hdr_s *)cmd;
	send_hdr->w0.total = m->pkt_len;
	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

	if (flags & NIX_TX_NEED_EXT_HDR)
		off = 2;
	else
		off = 0;

	sg = (union nix_send_sg_s *)&cmd[2 + off];
	/* Clear sg->u header before use */
	sg->u &= 0xFC00000000000000;
	sg_u = sg->u;
	slist = &cmd[3 + off];

	i = 0;
	nb_segs = m->nb_segs;

	/* Fill mbuf segments */
	do {
		m_next = m->next;
		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
		*slist = rte_mbuf_data_iova(m);
		/* Set invert df if buffer is not to be freed by H/W */
		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
		/* Mark mempool object as "put" since it is freed by NIX
		 */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
		if (!(sg_u & (1ULL << (i + 55))))
			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
#endif
		slist++;
		i++;
		nb_segs--;
		if (i > 2 && nb_segs) {
			i = 0;
			/* Next SG subdesc */
			*(uint64_t *)slist = sg_u & 0xFC00000000000000;
			sg->u = sg_u;
			sg->segs = 3;
			sg = (union nix_send_sg_s *)slist;
			sg_u = sg->u;
			slist++;
		}
		m = m_next;
	} while (nb_segs);

	sg->u = sg_u;
	sg->segs = i;

	segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
	/* Roundup extra dwords to multiple of 2 */
	segdw = (segdw >> 1) + (segdw & 0x1);
	/* Default dwords */
	segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
	send_hdr->w0.sizem1 = segdw - 1;

	return segdw;
}
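
/* Worked example: a 3-segment mbuf with the ext header enabled (off = 2)
 * uses one SG sub-descriptor plus three iova dwords, so the extra dwords
 * round up to (4 >> 1) = 2; adding (off >> 1) + 1 = 2 default dwords gives
 * segdw = 4, i.e. sizem1 = 3 when no timestamp is enabled.
 */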
static __rte_always_inline uint16_t
cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
		    uint64_t *cmd, const uint16_t flags)
{
	struct cn10k_eth_txq *txq = tx_queue;
	const rte_iova_t io_addr = txq->io_addr;
	uintptr_t pa, lmt_addr = txq->lmt_base;
	uint16_t lmt_id, burst, left, i;
	uint64_t lso_tun_fmt;
	uint64_t data;

	NIX_XMIT_FC_OR_RETURN(txq, pkts);

	/* Get cmd skeleton */
	cn10k_nix_tx_skeleton(txq, cmd, flags);

	/* Reduce the cached count */
	txq->fc_cache_pkts -= pkts;

	if (flags & NIX_TX_OFFLOAD_TSO_F)
		lso_tun_fmt = txq->lso_tun_fmt;

	/* Get LMT base address and LMT ID as lcore id */
	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
	left = pkts;
again:
	burst = left > 32 ? 32 : left;
	for (i = 0; i < burst; i++) {
		/* Perform header writes for TSO, barrier at
		 * lmt steorl will suffice.
		 */
		if (flags & NIX_TX_OFFLOAD_TSO_F)
			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

		cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
				       lso_tun_fmt);
		cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
					      tx_pkts[i]->ol_flags, 4, flags);
		lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
	}

	/* Trigger LMTSTs */
	if (burst > 16) {
		data = cn10k_nix_tx_steor_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= (15ULL << 12);
		data |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data, pa);
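
		/* A burst of up to 32 lines is flushed as two STEORLs: the
		 * first covers LMT lines 0-15 (count field 15), the second
		 * covers the remainder starting at lmt_id + 16.
		 */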
		data = cn10k_nix_tx_steor_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= ((uint64_t)(burst - 17)) << 12;
		data |= (uint64_t)(lmt_id + 16);

		/* STEOR1 */
		roc_lmt_submit_steorl(data, pa);
	} else if (burst) {
		data = cn10k_nix_tx_steor_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= ((uint64_t)(burst - 1)) << 12;
		data |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data, pa);
	}

	left -= burst;
	if (left) {
		/* Start processing another burst */
		tx_pkts += burst;
		/* Reset lmt base addr */
		lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
		lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
		goto again;
	}

	return pkts;
}
static __rte_always_inline uint16_t
cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
	struct cn10k_eth_txq *txq = tx_queue;
	uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
	const rte_iova_t io_addr = txq->io_addr;
	uint16_t segdw, lmt_id, burst, left, i;
	uint64_t data0, data1;
	uint64_t lso_tun_fmt;
	__uint128_t data128;
	uint16_t shft;

	NIX_XMIT_FC_OR_RETURN(txq, pkts);

	cn10k_nix_tx_skeleton(txq, cmd, flags);

	/* Reduce the cached count */
	txq->fc_cache_pkts -= pkts;

	if (flags & NIX_TX_OFFLOAD_TSO_F)
		lso_tun_fmt = txq->lso_tun_fmt;

	/* Get LMT base address and LMT ID as lcore id */
	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
	left = pkts;
again:
	burst = left > 32 ? 32 : left;
	shft = 16;
	data128 = 0;
	for (i = 0; i < burst; i++) {
		/* Perform header writes for TSO, barrier at
		 * lmt steorl will suffice.
		 */
		if (flags & NIX_TX_OFFLOAD_TSO_F)
			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

		cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
				       lso_tun_fmt);
		/* Store sg list directly on lmt line */
		segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)lmt_addr,
					       flags);
		cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
					      tx_pkts[i]->ol_flags, segdw,
					      flags);
		lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
		data128 |= (((__uint128_t)(segdw - 1)) << shft);
		shft += 3;
	}

	data0 = (uint64_t)data128;
	data1 = (uint64_t)(data128 >> 64);
	/* Make data0 similar to data1 */
	data0 >>= 16;

	if (burst > 16) {
		pa0 = io_addr | (data0 & 0x7) << 4;
		/* Move lmtst1..15 sz to bits 63:19 */
		data0 <<= 16;
		data0 |= (15ULL << 12);
		data0 |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data0, pa0);

		pa1 = io_addr | (data1 & 0x7) << 4;
		/* Move lmtst17..31 sz to bits 63:19 */
		data1 <<= 16;
		data1 |= ((uint64_t)(burst - 17)) << 12;
		data1 |= (uint64_t)(lmt_id + 16);

		/* STEOR1 */
		roc_lmt_submit_steorl(data1, pa1);
	} else if (burst) {
		pa0 = io_addr | (data0 & 0x7) << 4;
		/* Move lmtst1..15 sz to bits 63:19 */
		data0 <<= 16;
		data0 |= ((burst - 1) << 12);
		data0 |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data0, pa0);
	}

	left -= burst;
	if (left) {
		/* Start processing another burst */
		tx_pkts += burst;
		/* Reset lmt base addr */
		lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
		lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
		goto again;
	}

	return pkts;
}
#if defined(RTE_ARCH_ARM64)

#define NIX_DESCS_PER_LOOP 4
static __rte_always_inline uint16_t
cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
	uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
	uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
	uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP];
	uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
	uint64x2_t senddesc01_w0, senddesc23_w0;
	uint64x2_t senddesc01_w1, senddesc23_w1;
	uint16_t left, scalar, burst, i, lmt_id;
	uint64x2_t sgdesc01_w0, sgdesc23_w0;
	uint64x2_t sgdesc01_w1, sgdesc23_w1;
	struct cn10k_eth_txq *txq = tx_queue;
	uintptr_t laddr = txq->lmt_base;
	rte_iova_t io_addr = txq->io_addr;
	uint64x2_t ltypes01, ltypes23;
	uint64x2_t xtmp128, ytmp128;
	uint64x2_t xmask01, xmask23;
	uint8_t lnum;

	NIX_XMIT_FC_OR_RETURN(txq, pkts);

	scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
	pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);

	/* Reduce the cached count */
	txq->fc_cache_pkts -= pkts;

	senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
	senddesc23_w0 = senddesc01_w0;
	senddesc01_w1 = vdupq_n_u64(0);
	senddesc23_w1 = senddesc01_w1;
	sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
	sgdesc23_w0 = sgdesc01_w0;

	/* Get LMT base address and LMT ID as lcore id */
	ROC_LMT_BASE_ID_GET(laddr, lmt_id);
	left = pkts;
again:
	/* Number of packets to prepare depends on offloads enabled. */
	burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
			      cn10k_nix_pkts_per_vec_brst(flags) :
			      left;
	lnum = 0;
	for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
		/* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
		senddesc01_w0 =
			vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
		sgdesc01_w0 = vbicq_u64(sgdesc01_w0, vdupq_n_u64(0xFFFFFFFF));

		senddesc23_w0 = senddesc01_w0;
		sgdesc23_w0 = sgdesc01_w0;

		/* Move mbufs to iova */
		mbuf0 = (uint64_t *)tx_pkts[0];
		mbuf1 = (uint64_t *)tx_pkts[1];
		mbuf2 = (uint64_t *)tx_pkts[2];
		mbuf3 = (uint64_t *)tx_pkts[3];

		mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
				     offsetof(struct rte_mbuf, buf_iova));
		mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
				     offsetof(struct rte_mbuf, buf_iova));
		mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
				     offsetof(struct rte_mbuf, buf_iova));
		mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
				     offsetof(struct rte_mbuf, buf_iova));
		/*
		 * Get mbufs' ol_flags, iova, pktlen, dataoff:
		 * dataoff_iovaX.D[0] = iova,
		 * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
		 * len_olflagsX.D[0] = ol_flags,
		 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
		 */
		dataoff_iova0 = vld1q_u64(mbuf0);
		len_olflags0 = vld1q_u64(mbuf0 + 2);
		dataoff_iova1 = vld1q_u64(mbuf1);
		len_olflags1 = vld1q_u64(mbuf1 + 2);
		dataoff_iova2 = vld1q_u64(mbuf2);
		len_olflags2 = vld1q_u64(mbuf2 + 2);
		dataoff_iova3 = vld1q_u64(mbuf3);
		len_olflags3 = vld1q_u64(mbuf3 + 2);

		/* Move mbuf pointers to point to the pool */
		mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
				     offsetof(struct rte_mbuf, pool) -
				     offsetof(struct rte_mbuf, buf_iova));
		mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
				     offsetof(struct rte_mbuf, pool) -
				     offsetof(struct rte_mbuf, buf_iova));
		mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
				     offsetof(struct rte_mbuf, pool) -
				     offsetof(struct rte_mbuf, buf_iova));
		mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
				     offsetof(struct rte_mbuf, pool) -
				     offsetof(struct rte_mbuf, buf_iova));

		if (flags & (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
			     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
			/* Get tx_offload for ol2, ol3, l2, l3 lengths */
			/*
			 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
			 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
			 */
			asm volatile("LD1 {%[a].D}[0],[%[in]]\n\t"
				     : [a] "+w"(senddesc01_w1)
				     : [in] "r"(mbuf0 + 2)
				     : "memory");

			asm volatile("LD1 {%[a].D}[1],[%[in]]\n\t"
				     : [a] "+w"(senddesc01_w1)
				     : [in] "r"(mbuf1 + 2)
				     : "memory");

			asm volatile("LD1 {%[b].D}[0],[%[in]]\n\t"
				     : [b] "+w"(senddesc23_w1)
				     : [in] "r"(mbuf2 + 2)
				     : "memory");

			asm volatile("LD1 {%[b].D}[1],[%[in]]\n\t"
				     : [b] "+w"(senddesc23_w1)
				     : [in] "r"(mbuf3 + 2)
				     : "memory");

			/* Get pool pointer alone */
			mbuf0 = (uint64_t *)*mbuf0;
			mbuf1 = (uint64_t *)*mbuf1;
			mbuf2 = (uint64_t *)*mbuf2;
			mbuf3 = (uint64_t *)*mbuf3;
		} else {
			/* Get pool pointer alone */
			mbuf0 = (uint64_t *)*mbuf0;
			mbuf1 = (uint64_t *)*mbuf1;
			mbuf2 = (uint64_t *)*mbuf2;
			mbuf3 = (uint64_t *)*mbuf3;
		}

		const uint8x16_t shuf_mask2 = {
			0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
			0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
		};
		xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
		ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

		/* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
		const uint64x2_t and_mask0 = {
			0xFFFFFFFFFFFFFFFF,
			0x000000000000FFFF,
		};

		dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
		dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
		dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
		dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

		/*
		 * Pick only 16 bits of pktlen present at bits 63:32
		 * and place them at bits 15:0.
		 */
		xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
		ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

		/* Add pairwise to get dataoff + iova in sgdesc_w1 */
		sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
		sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);

		/* Orr both sgdesc_w0 and senddesc_w0 with 16 bits of
		 * pktlen at 15:0 position.
		 */
		sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
		sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
		senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
		senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);

		/* Move mbuf to point to pool_id. */
		mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
				     offsetof(struct rte_mempool, pool_id));
		mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
				     offsetof(struct rte_mempool, pool_id));
		mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
				     offsetof(struct rte_mempool, pool_id));
		mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
				     offsetof(struct rte_mempool, pool_id));

		if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
		    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
			/*
			 * Lookup table to translate ol_flags to
			 * il3/il4 types. We still use ol3/ol4 types in
			 * senddesc_w1 as only one header processing is
			 * enabled.
			 */
			const uint8x16_t tbl = {
				/* [0-15] = il4type:il3type */
				0x04, /* none (IPv6 assumed) */
				0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
				0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
				0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
				0x03, /* PKT_TX_IP_CKSUM */
				0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
				0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
				0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
				0x02, /* PKT_TX_IPV4 */
				0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
				0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
				0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
				0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
				0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
				       * PKT_TX_TCP_CKSUM
				       */
				0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
				       * PKT_TX_SCTP_CKSUM
				       */
				0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
				       * PKT_TX_UDP_CKSUM
				       */
			};

			/* Extract olflags to translate to iltypes */
			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

			/*
			 * E(47):L3_LEN(9):L2_LEN(7+z)
			 * E(47):L3_LEN(9):L2_LEN(7+z)
			 */
			senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
			senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

			/* Move OLFLAGS bits 55:52 to 51:48
			 * with zeros prepended on the byte and rest
			 * of the bits zeroed.
			 */
			xtmp128 = vshrq_n_u8(xtmp128, 4);
			ytmp128 = vshrq_n_u8(ytmp128, 4);
			/*
			 * E(48):L3_LEN(8):L2_LEN(z+7)
			 * E(48):L3_LEN(8):L2_LEN(z+7)
			 */
			const int8x16_t tshft3 = {
				-1, 0, 8, 8, 8, 8, 8, 8,
				-1, 0, 8, 8, 8, 8, 8, 8,
			};

			senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
			senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

			/* Do the lookup */
			ltypes01 = vqtbl1q_u8(tbl, xtmp128);
			ltypes23 = vqtbl1q_u8(tbl, ytmp128);

			/* Pick only relevant fields i.e Bit 48:55 of iltype
			 * and place it in ol3/ol4type of senddesc_w1
			 */
			const uint8x16_t shuf_mask0 = {
				0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
				0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
			};

			ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
			ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

			/* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
			 * a [E(32):E(16):OL3(8):OL2(8)]
			 * a = a + (a << 8)
			 * a [E(32):E(16):(OL3+OL2):OL2]
			 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
			 */
			senddesc01_w1 = vaddq_u8(senddesc01_w1,
						 vshlq_n_u16(senddesc01_w1, 8));
			senddesc23_w1 = vaddq_u8(senddesc23_w1,
						 vshlq_n_u16(senddesc23_w1, 8));

			/* Move ltypes to senddesc*_w1 */
			senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
			senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
		} else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
			   (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
			/*
			 * Lookup table to translate ol_flags to
			 * ol3/ol4 types.
			 */

			const uint8x16_t tbl = {
				/* [0-15] = ol4type:ol3type */
				0x00, /* none */
				0x03, /* OUTER_IP_CKSUM */
				0x02, /* OUTER_IPV4 */
				0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
				0x04, /* OUTER_IPV6 */
				0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
				0x00, /* OUTER_IPV6 | OUTER_IPV4 */
				0x00, /* OUTER_IPV6 | OUTER_IPV4 |
				       * OUTER_IP_CKSUM
				       */
				0x00, /* OUTER_UDP_CKSUM */
				0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
				0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
				0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
				       * OUTER_IP_CKSUM
				       */
				0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
				0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
				       * OUTER_IP_CKSUM
				       */
				0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
				       * OUTER_IPV4
				       */
				0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
				       * OUTER_IPV4 | OUTER_IP_CKSUM
				       */
			};

			/* Extract olflags to translate to oltypes */
			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

			/*
			 * E(47):OL3_LEN(9):OL2_LEN(7+z)
			 * E(47):OL3_LEN(9):OL2_LEN(7+z)
			 */
			const uint8x16_t shuf_mask5 = {
				0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
				0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
			};
			senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
			senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

			/* Extract outer ol flags only */
			const uint64x2_t o_cksum_mask = {
				0x1C00020000000000,
				0x1C00020000000000,
			};

			xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
			ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

			/* Extract OUTER_UDP_CKSUM bit 41 and
			 * move it to bit 61.
			 */

			xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
			ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

			/* Shift oltype by 2 to start nibble from BIT(56)
			 * instead of BIT(58)
			 */
			xtmp128 = vshrq_n_u8(xtmp128, 2);
			ytmp128 = vshrq_n_u8(ytmp128, 2);
			/*
			 * E(48):L3_LEN(8):L2_LEN(z+7)
			 * E(48):L3_LEN(8):L2_LEN(z+7)
			 */
			const int8x16_t tshft3 = {
				-1, 0, 8, 8, 8, 8, 8, 8,
				-1, 0, 8, 8, 8, 8, 8, 8,
			};

			senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
			senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

			/* Do the lookup */
			ltypes01 = vqtbl1q_u8(tbl, xtmp128);
			ltypes23 = vqtbl1q_u8(tbl, ytmp128);

			/* Pick only relevant fields i.e Bit 56:63 of oltype
			 * and place it in ol3/ol4type of senddesc_w1
			 */
			const uint8x16_t shuf_mask0 = {
				0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
				0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
			};

			ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
			ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

			/* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
			 * a [E(32):E(16):OL3(8):OL2(8)]
			 * a = a + (a << 8)
			 * a [E(32):E(16):(OL3+OL2):OL2]
			 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
			 */
			senddesc01_w1 = vaddq_u8(senddesc01_w1,
						 vshlq_n_u16(senddesc01_w1, 8));
			senddesc23_w1 = vaddq_u8(senddesc23_w1,
						 vshlq_n_u16(senddesc23_w1, 8));

			/* Move ltypes to senddesc*_w1 */
			senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
			senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
		} else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
			   (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
			/* Lookup table to translate ol_flags to
			 * ol4type, ol3type, il4type, il3type of senddesc_w1
			 */
			const uint8x16x2_t tbl = {{
				{
					/* [0-15] = il4type:il3type */
					0x04, /* none (IPv6) */
					0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
					0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
					0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
					0x03, /* PKT_TX_IP_CKSUM */
					0x13, /* PKT_TX_IP_CKSUM |
					       * PKT_TX_TCP_CKSUM
					       */
					0x23, /* PKT_TX_IP_CKSUM |
					       * PKT_TX_SCTP_CKSUM
					       */
					0x33, /* PKT_TX_IP_CKSUM |
					       * PKT_TX_UDP_CKSUM
					       */
					0x02, /* PKT_TX_IPV4 */
					0x12, /* PKT_TX_IPV4 |
					       * PKT_TX_TCP_CKSUM
					       */
					0x22, /* PKT_TX_IPV4 |
					       * PKT_TX_SCTP_CKSUM
					       */
					0x32, /* PKT_TX_IPV4 |
					       * PKT_TX_UDP_CKSUM
					       */
					0x03, /* PKT_TX_IPV4 |
					       * PKT_TX_IP_CKSUM
					       */
					0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
					       * PKT_TX_TCP_CKSUM
					       */
					0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
					       * PKT_TX_SCTP_CKSUM
					       */
					0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
					       * PKT_TX_UDP_CKSUM
					       */
				},

				{
					/* [16-31] = ol4type:ol3type */
					0x00, /* none */
					0x03, /* OUTER_IP_CKSUM */
					0x02, /* OUTER_IPV4 */
					0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
					0x04, /* OUTER_IPV6 */
					0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
					0x00, /* OUTER_IPV6 | OUTER_IPV4 */
					0x00, /* OUTER_IPV6 | OUTER_IPV4 |
					       * OUTER_IP_CKSUM
					       */
					0x00, /* OUTER_UDP_CKSUM */
					0x33, /* OUTER_UDP_CKSUM |
					       * OUTER_IP_CKSUM
					       */
					0x32, /* OUTER_UDP_CKSUM |
					       * OUTER_IPV4
					       */
					0x33, /* OUTER_UDP_CKSUM |
					       * OUTER_IPV4 | OUTER_IP_CKSUM
					       */
					0x34, /* OUTER_UDP_CKSUM |
					       * OUTER_IPV6
					       */
					0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
					       * OUTER_IP_CKSUM
					       */
					0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
					       * OUTER_IPV4
					       */
					0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
					       * OUTER_IPV4 | OUTER_IP_CKSUM
					       */
				},
			}};

			/* Extract olflags to translate to oltype & iltype */
			xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
			ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

			/*
			 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
			 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
			 */
			const uint32x4_t tshft_4 = {
				1, 0,
				1, 0,
			};
			senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
			senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

			/*
			 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
			 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
			 */
			const uint8x16_t shuf_mask5 = {
				0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
				0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
			};
			senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
			senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

			/* Extract outer and inner header ol_flags */
			const uint64x2_t oi_cksum_mask = {
				0x1FF0020000000000,
				0x1FF0020000000000,
			};

			xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
			ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

			/* Extract OUTER_UDP_CKSUM bit 41 and
			 * move it to bit 61.
			 */

			xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
			ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

			/* Shift right oltype by 2 and iltype by 4
			 * to start the oltype nibble at BIT(56)
			 * instead of BIT(58) and the iltype nibble at BIT(48)
			 * instead of BIT(52).
			 */
			const int8x16_t tshft5 = {
				8, 8, 8, 8, 8, 8, -4, -2,
				8, 8, 8, 8, 8, 8, -4, -2,
			};

			xtmp128 = vshlq_u8(xtmp128, tshft5);
			ytmp128 = vshlq_u8(ytmp128, tshft5);
			/*
			 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
			 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
			 */
			const int8x16_t tshft3 = {
				-1, 0, -1, 0, 0, 0, 0, 0,
				-1, 0, -1, 0, 0, 0, 0, 0,
			};

			senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
			senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

			/* Mark Bit(4) of oltype */
			const uint64x2_t oi_cksum_mask2 = {
				0x1000000000000000,
				0x1000000000000000,
			};

			xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
			ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

			/* Do the lookup */
			ltypes01 = vqtbl2q_u8(tbl, xtmp128);
			ltypes23 = vqtbl2q_u8(tbl, ytmp128);

			/* Pick only relevant fields i.e Bit 48:55 of iltype and
			 * Bit 56:63 of oltype and place it in corresponding
			 * place in senddesc_w1.
			 */
			const uint8x16_t shuf_mask0 = {
				0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
				0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
			};

			ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
			ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

			/* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
			 * l3len, l2len, ol3len, ol2len.
			 * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
			 * a = a + (a << 8)
			 * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
			 * a = a + (a << 16)
			 * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
			 * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
			 */
			senddesc01_w1 = vaddq_u8(senddesc01_w1,
						 vshlq_n_u32(senddesc01_w1, 8));
			senddesc23_w1 = vaddq_u8(senddesc23_w1,
						 vshlq_n_u32(senddesc23_w1, 8));

			/* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
			senddesc01_w1 = vaddq_u8(
				senddesc01_w1, vshlq_n_u32(senddesc01_w1, 16));
			senddesc23_w1 = vaddq_u8(
				senddesc23_w1, vshlq_n_u32(senddesc23_w1, 16));

			/* Move ltypes to senddesc*_w1 */
			senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
			senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
		}

		xmask01 = vdupq_n_u64(0);
		xmask23 = xmask01;
		asm volatile("LD1 {%[a].H}[0],[%[in]]\n\t"
			     : [a] "+w"(xmask01)
			     : [in] "r"(mbuf0)
			     : "memory");

		asm volatile("LD1 {%[a].H}[4],[%[in]]\n\t"
			     : [a] "+w"(xmask01)
			     : [in] "r"(mbuf1)
			     : "memory");

		asm volatile("LD1 {%[b].H}[0],[%[in]]\n\t"
			     : [b] "+w"(xmask23)
			     : [in] "r"(mbuf2)
			     : "memory");

		asm volatile("LD1 {%[b].H}[4],[%[in]]\n\t"
			     : [b] "+w"(xmask23)
			     : [in] "r"(mbuf3)
			     : "memory");
		xmask01 = vshlq_n_u64(xmask01, 20);
		xmask23 = vshlq_n_u64(xmask23, 20);

		senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
		senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
			/* Set don't free bit if reference count > 1 */
			xmask01 = vdupq_n_u64(0);
			xmask23 = xmask01;

			/* Move mbufs to iova */
			mbuf0 = (uint64_t *)tx_pkts[0];
			mbuf1 = (uint64_t *)tx_pkts[1];
			mbuf2 = (uint64_t *)tx_pkts[2];
			mbuf3 = (uint64_t *)tx_pkts[3];

			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
				xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
			else
				__mempool_check_cookies(
					((struct rte_mbuf *)mbuf0)->pool,
					(void **)&mbuf0, 1, 0);

			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
				xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
			else
				__mempool_check_cookies(
					((struct rte_mbuf *)mbuf1)->pool,
					(void **)&mbuf1, 1, 0);

			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
				xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
			else
				__mempool_check_cookies(
					((struct rte_mbuf *)mbuf2)->pool,
					(void **)&mbuf2, 1, 0);

			if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
				xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
			else
				__mempool_check_cookies(
					((struct rte_mbuf *)mbuf3)->pool,
					(void **)&mbuf3, 1, 0);
			senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
			senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
		} else {
			/* Move mbufs to iova */
			mbuf0 = (uint64_t *)tx_pkts[0];
			mbuf1 = (uint64_t *)tx_pkts[1];
			mbuf2 = (uint64_t *)tx_pkts[2];
			mbuf3 = (uint64_t *)tx_pkts[3];

			/* Mark mempool objects as "put" since
			 * they are freed by NIX
			 */
			__mempool_check_cookies(
				((struct rte_mbuf *)mbuf0)->pool,
				(void **)&mbuf0, 1, 0);

			__mempool_check_cookies(
				((struct rte_mbuf *)mbuf1)->pool,
				(void **)&mbuf1, 1, 0);

			__mempool_check_cookies(
				((struct rte_mbuf *)mbuf2)->pool,
				(void **)&mbuf2, 1, 0);

			__mempool_check_cookies(
				((struct rte_mbuf *)mbuf3)->pool,
				(void **)&mbuf3, 1, 0);
		}

		/* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
		cmd0[0] = vzip1q_u64(senddesc01_w0, senddesc01_w1);
		cmd0[1] = vzip2q_u64(senddesc01_w0, senddesc01_w1);
		cmd0[2] = vzip1q_u64(senddesc23_w0, senddesc23_w1);
		cmd0[3] = vzip2q_u64(senddesc23_w0, senddesc23_w1);

		cmd1[0] = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
		cmd1[1] = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
		cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
		cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

		/* Store the prepared send desc to LMT lines */
		vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
		vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
		vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
		vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
		vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
		vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
		vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
		vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
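		/* The eight stores above emit 4 packets x 32B of
		 * (SEND_HDR|SEND_SG) pairs, exactly filling the 128-byte
		 * LMT line lnum.
		 */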
		lnum += 1;

		tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
	}

	/* Trigger LMTSTs */
	if (lnum > 16) {
		data = cn10k_nix_tx_steor_vec_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= (15ULL << 12);
		data |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data, pa);

		data = cn10k_nix_tx_steor_vec_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= ((uint64_t)(lnum - 17)) << 12;
		data |= (uint64_t)(lmt_id + 16);

		/* STEOR1 */
		roc_lmt_submit_steorl(data, pa);
	} else if (lnum) {
		data = cn10k_nix_tx_steor_vec_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= ((uint64_t)(lnum - 1)) << 12;
		data |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data, pa);
	}

	left -= burst;
	if (left)
		goto again;

	if (unlikely(scalar))
		pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar, cmd,
					    flags);

	return pkts;
}

#else
static __rte_always_inline uint16_t
cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
			   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
	RTE_SET_USED(tx_queue);
	RTE_SET_USED(tx_pkts);
	RTE_SET_USED(pkts);
	RTE_SET_USED(cmd);
	RTE_SET_USED(flags);
	return 0;
}
#endif
#define L3L4CSUM_F   NIX_TX_OFFLOAD_L3_L4_CSUM_F
#define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
#define VLAN_F	     NIX_TX_OFFLOAD_VLAN_QINQ_F
#define NOFF_F	     NIX_TX_OFFLOAD_MBUF_NOFF_F
#define TSO_F	     NIX_TX_OFFLOAD_TSO_F
#define TSP_F	     NIX_TX_OFFLOAD_TSTAMP_F

/* [TSP] [TSO] [NOFF] [VLAN] [OL3OL4CSUM] [L3L4CSUM] */
#define NIX_TX_FASTPATH_MODES \
T(no_offload,				0, 0, 0, 0, 0, 0, 4, \
		NIX_TX_OFFLOAD_NONE) \
T(l3l4csum,				0, 0, 0, 0, 0, 1, 4, \
		L3L4CSUM_F) \
T(ol3ol4csum,				0, 0, 0, 0, 1, 0, 4, \
		OL3OL4CSUM_F) \
T(ol3ol4csum_l3l4csum,			0, 0, 0, 0, 1, 1, 4, \
		OL3OL4CSUM_F | L3L4CSUM_F) \
T(vlan,					0, 0, 0, 1, 0, 0, 6, \
		VLAN_F) \
T(vlan_l3l4csum,			0, 0, 0, 1, 0, 1, 6, \
		VLAN_F | L3L4CSUM_F) \
T(vlan_ol3ol4csum,			0, 0, 0, 1, 1, 0, 6, \
		VLAN_F | OL3OL4CSUM_F) \
T(vlan_ol3ol4csum_l3l4csum,		0, 0, 0, 1, 1, 1, 6, \
		VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(noff,					0, 0, 1, 0, 0, 0, 4, \
		NOFF_F) \
T(noff_l3l4csum,			0, 0, 1, 0, 0, 1, 4, \
		NOFF_F | L3L4CSUM_F) \
T(noff_ol3ol4csum,			0, 0, 1, 0, 1, 0, 4, \
		NOFF_F | OL3OL4CSUM_F) \
T(noff_ol3ol4csum_l3l4csum,		0, 0, 1, 0, 1, 1, 4, \
		NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(noff_vlan,				0, 0, 1, 1, 0, 0, 6, \
		NOFF_F | VLAN_F) \
T(noff_vlan_l3l4csum,			0, 0, 1, 1, 0, 1, 6, \
		NOFF_F | VLAN_F | L3L4CSUM_F) \
T(noff_vlan_ol3ol4csum,			0, 0, 1, 1, 1, 0, 6, \
		NOFF_F | VLAN_F | OL3OL4CSUM_F) \
T(noff_vlan_ol3ol4csum_l3l4csum,	0, 0, 1, 1, 1, 1, 6, \
		NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(tso,					0, 1, 0, 0, 0, 0, 6, \
		TSO_F) \
T(tso_l3l4csum,				0, 1, 0, 0, 0, 1, 6, \
		TSO_F | L3L4CSUM_F) \
T(tso_ol3ol4csum,			0, 1, 0, 0, 1, 0, 6, \
		TSO_F | OL3OL4CSUM_F) \
T(tso_ol3ol4csum_l3l4csum,		0, 1, 0, 0, 1, 1, 6, \
		TSO_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(tso_vlan,				0, 1, 0, 1, 0, 0, 6, \
		TSO_F | VLAN_F) \
T(tso_vlan_l3l4csum,			0, 1, 0, 1, 0, 1, 6, \
		TSO_F | VLAN_F | L3L4CSUM_F) \
T(tso_vlan_ol3ol4csum,			0, 1, 0, 1, 1, 0, 6, \
		TSO_F | VLAN_F | OL3OL4CSUM_F) \
T(tso_vlan_ol3ol4csum_l3l4csum,		0, 1, 0, 1, 1, 1, 6, \
		TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(tso_noff,				0, 1, 1, 0, 0, 0, 6, \
		TSO_F | NOFF_F) \
T(tso_noff_l3l4csum,			0, 1, 1, 0, 0, 1, 6, \
		TSO_F | NOFF_F | L3L4CSUM_F) \
T(tso_noff_ol3ol4csum,			0, 1, 1, 0, 1, 0, 6, \
		TSO_F | NOFF_F | OL3OL4CSUM_F) \
T(tso_noff_ol3ol4csum_l3l4csum,		0, 1, 1, 0, 1, 1, 6, \
		TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(tso_noff_vlan,			0, 1, 1, 1, 0, 0, 6, \
		TSO_F | NOFF_F | VLAN_F) \
T(tso_noff_vlan_l3l4csum,		0, 1, 1, 1, 0, 1, 6, \
		TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F) \
T(tso_noff_vlan_ol3ol4csum,		0, 1, 1, 1, 1, 0, 6, \
		TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F) \
T(tso_noff_vlan_ol3ol4csum_l3l4csum,	0, 1, 1, 1, 1, 1, 6, \
		TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts,					1, 0, 0, 0, 0, 0, 8, \
		TSP_F) \
T(ts_l3l4csum,				1, 0, 0, 0, 0, 1, 8, \
		TSP_F | L3L4CSUM_F) \
T(ts_ol3ol4csum,			1, 0, 0, 0, 1, 0, 8, \
		TSP_F | OL3OL4CSUM_F) \
T(ts_ol3ol4csum_l3l4csum,		1, 0, 0, 0, 1, 1, 8, \
		TSP_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts_vlan,				1, 0, 0, 1, 0, 0, 8, \
		TSP_F | VLAN_F) \
T(ts_vlan_l3l4csum,			1, 0, 0, 1, 0, 1, 8, \
		TSP_F | VLAN_F | L3L4CSUM_F) \
T(ts_vlan_ol3ol4csum,			1, 0, 0, 1, 1, 0, 8, \
		TSP_F | VLAN_F | OL3OL4CSUM_F) \
T(ts_vlan_ol3ol4csum_l3l4csum,		1, 0, 0, 1, 1, 1, 8, \
		TSP_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts_noff,				1, 0, 1, 0, 0, 0, 8, \
		TSP_F | NOFF_F) \
T(ts_noff_l3l4csum,			1, 0, 1, 0, 0, 1, 8, \
		TSP_F | NOFF_F | L3L4CSUM_F) \
T(ts_noff_ol3ol4csum,			1, 0, 1, 0, 1, 0, 8, \
		TSP_F | NOFF_F | OL3OL4CSUM_F) \
T(ts_noff_ol3ol4csum_l3l4csum,		1, 0, 1, 0, 1, 1, 8, \
		TSP_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts_noff_vlan,				1, 0, 1, 1, 0, 0, 8, \
		TSP_F | NOFF_F | VLAN_F) \
T(ts_noff_vlan_l3l4csum,		1, 0, 1, 1, 0, 1, 8, \
		TSP_F | NOFF_F | VLAN_F | L3L4CSUM_F) \
T(ts_noff_vlan_ol3ol4csum,		1, 0, 1, 1, 1, 0, 8, \
		TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F) \
T(ts_noff_vlan_ol3ol4csum_l3l4csum,	1, 0, 1, 1, 1, 1, 8, \
		TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts_tso,				1, 1, 0, 0, 0, 0, 8, \
		TSP_F | TSO_F) \
T(ts_tso_l3l4csum,			1, 1, 0, 0, 0, 1, 8, \
		TSP_F | TSO_F | L3L4CSUM_F) \
T(ts_tso_ol3ol4csum,			1, 1, 0, 0, 1, 0, 8, \
		TSP_F | TSO_F | OL3OL4CSUM_F) \
T(ts_tso_ol3ol4csum_l3l4csum,		1, 1, 0, 0, 1, 1, 8, \
		TSP_F | TSO_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts_tso_vlan,				1, 1, 0, 1, 0, 0, 8, \
		TSP_F | TSO_F | VLAN_F) \
T(ts_tso_vlan_l3l4csum,			1, 1, 0, 1, 0, 1, 8, \
		TSP_F | TSO_F | VLAN_F | L3L4CSUM_F) \
T(ts_tso_vlan_ol3ol4csum,		1, 1, 0, 1, 1, 0, 8, \
		TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F) \
T(ts_tso_vlan_ol3ol4csum_l3l4csum,	1, 1, 0, 1, 1, 1, 8, \
		TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts_tso_noff,				1, 1, 1, 0, 0, 0, 8, \
		TSP_F | TSO_F | NOFF_F) \
T(ts_tso_noff_l3l4csum,			1, 1, 1, 0, 0, 1, 8, \
		TSP_F | TSO_F | NOFF_F | L3L4CSUM_F) \
T(ts_tso_noff_ol3ol4csum,		1, 1, 1, 0, 1, 0, 8, \
		TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F) \
T(ts_tso_noff_ol3ol4csum_l3l4csum,	1, 1, 1, 0, 1, 1, 8, \
		TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F) \
T(ts_tso_noff_vlan,			1, 1, 1, 1, 0, 0, 8, \
		TSP_F | TSO_F | NOFF_F | VLAN_F) \
T(ts_tso_noff_vlan_l3l4csum,		1, 1, 1, 1, 0, 1, 8, \
		TSP_F | TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F) \
T(ts_tso_noff_vlan_ol3ol4csum,		1, 1, 1, 1, 1, 0, 8, \
		TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F) \
T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum,	1, 1, 1, 1, 1, 1, 8, \
		TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)

#define T(name, f5, f4, f3, f2, f1, f0, sz, flags) \
	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_##name( \
		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts); \
\
	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_mseg_##name( \
		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts); \
\
	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name( \
		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);

NIX_TX_FASTPATH_MODES
#undef T
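
/* Illustrative expansion: the "vlan" row above declares, among others,
 *
 *   uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vlan(
 *           void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
 *
 * whose definition is expected to invoke the inline sender with
 * flags = NIX_TX_OFFLOAD_VLAN_QINQ_F.
 */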

#endif /* __CN10K_TX_H__ */