net/cnxk: support Rx security offload on cn10k
[dpdk.git] drivers/net/cnxk/cn10k_tx.h
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(C) 2021 Marvell.
3  */
4 #ifndef __CN10K_TX_H__
5 #define __CN10K_TX_H__
6
7 #include <rte_vect.h>
8
9 #define NIX_TX_OFFLOAD_NONE           (0)
10 #define NIX_TX_OFFLOAD_L3_L4_CSUM_F   BIT(0)
11 #define NIX_TX_OFFLOAD_OL3_OL4_CSUM_F BIT(1)
12 #define NIX_TX_OFFLOAD_VLAN_QINQ_F    BIT(2)
13 #define NIX_TX_OFFLOAD_MBUF_NOFF_F    BIT(3)
14 #define NIX_TX_OFFLOAD_TSO_F          BIT(4)
15 #define NIX_TX_OFFLOAD_TSTAMP_F       BIT(5)
16 #define NIX_TX_OFFLOAD_SECURITY_F     BIT(6)
17
18 /* Flags to control xmit_prepare function.
19  * Defined from the MSB end to denote that these are
20  * not used as offload flags when picking the send function.
21  */
22 #define NIX_TX_VWQE_F      BIT(14)
23 #define NIX_TX_MULTI_SEG_F BIT(15)
24
25 #define NIX_TX_NEED_SEND_HDR_W1                                                \
26         (NIX_TX_OFFLOAD_L3_L4_CSUM_F | NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |         \
27          NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)
28
29 #define NIX_TX_NEED_EXT_HDR                                                    \
30         (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |                \
31          NIX_TX_OFFLOAD_TSO_F)
32
33 #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
34         do {                                                                   \
35                 /* Cached value is low, Update the fc_cache_pkts */            \
36                 if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
37                         /* Multiply with sqe_per_sqb to express in pkts */     \
38                         (txq)->fc_cache_pkts =                                 \
39                                 ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
40                                 << (txq)->sqes_per_sqb_log2;                   \
41                         /* Check it again for the room */                      \
42                         if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
43                                 return 0;                                      \
44                 }                                                              \
45         } while (0)
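
/* Worked example of the flow-control refresh above (illustrative values,
 * not from the source): with sqes_per_sqb_log2 = 5 and nb_sqb_bufs_adj =
 * 1024, a *fc_mem reading of 1000 refreshes fc_cache_pkts to
 * (1024 - 1000) << 5 = 768 SQEs worth of room before admitting the burst.
 */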
46
47 /* Macro to convert an encoded number of segments to the number of dwords;
48  * each value of nb_segs is encoded as 4 bits.
49  */
50 #define NIX_SEGDW_MAGIC 0x76654432210ULL
51
52 #define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)
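
/* For illustration: nibble x of NIX_SEGDW_MAGIC is the dword count for
 * nb_segs == x, e.g. NIX_NB_SEGS_TO_SEGDW(3) = (0x76654432210ULL >> 12) & 0xF
 * = 2 and NIX_NB_SEGS_TO_SEGDW(4) = 3, i.e. SG headers plus pointers rounded
 * up to 16B units.
 */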
53
54 /* Function to determine the number of Tx subdescriptors required when the
55  * extended subdescriptor is enabled.
56  */
57 static __rte_always_inline int
58 cn10k_nix_tx_ext_subs(const uint16_t flags)
59 {
60         return (flags & NIX_TX_OFFLOAD_TSTAMP_F)
61                        ? 2
62                        : ((flags &
63                            (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F))
64                                   ? 1
65                                   : 0);
66 }
67
68 static __rte_always_inline uint8_t
69 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
70 {
71         return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
72                << ROC_LMT_LINES_PER_CORE_LOG2;
73 }
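
/* E.g., assuming the current ROC definition ROC_LMT_LINES_PER_CORE_LOG2 == 5
 * (32 LMT lines per core), this yields a burst of 64 packets when an
 * extension header is needed (2 packets per LMT line) and 128 otherwise
 * (4 packets per LMT line).
 */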
74
75 static __rte_always_inline uint8_t
76 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
77 {
78         return (flags & NIX_TX_NEED_EXT_HDR) ?
79                              ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
80                              8;
81 }
82
83 static __rte_always_inline uint64_t
84 cn10k_nix_tx_steor_data(const uint16_t flags)
85 {
86         const uint64_t dw_m1 = cn10k_nix_tx_ext_subs(flags) + 1;
87         uint64_t data;
88
89         /* This will be moved to addr area */
90         data = dw_m1;
91         /* 15 vector sizes for single seg */
92         data |= dw_m1 << 19;
93         data |= dw_m1 << 22;
94         data |= dw_m1 << 25;
95         data |= dw_m1 << 28;
96         data |= dw_m1 << 31;
97         data |= dw_m1 << 34;
98         data |= dw_m1 << 37;
99         data |= dw_m1 << 40;
100         data |= dw_m1 << 43;
101         data |= dw_m1 << 46;
102         data |= dw_m1 << 49;
103         data |= dw_m1 << 52;
104         data |= dw_m1 << 55;
105         data |= dw_m1 << 58;
106         data |= dw_m1 << 61;
107
108         return data;
109 }
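
/* Layout note on the STEOR data built above: bits [2:0] carry the first LMT
 * line's size (folded into the I/O address by the caller) and each 3-bit
 * field at bit 19 + 3 * n carries (dwords - 1) for LMT line n + 1, covering
 * up to 15 further lines per STEORL.
 */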
110
111 static __rte_always_inline uint8_t
112 cn10k_nix_tx_dwords_per_line_seg(const uint16_t flags)
113 {
114         return ((flags & NIX_TX_NEED_EXT_HDR) ?
115                               (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6 :
116                               4);
117 }
118
119 static __rte_always_inline uint64_t
120 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
121 {
122         const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
123         uint64_t data;
124
125         /* This will be moved to addr area */
126         data = dw_m1;
127         /* 15 vector sizes for single seg */
128         data |= dw_m1 << 19;
129         data |= dw_m1 << 22;
130         data |= dw_m1 << 25;
131         data |= dw_m1 << 28;
132         data |= dw_m1 << 31;
133         data |= dw_m1 << 34;
134         data |= dw_m1 << 37;
135         data |= dw_m1 << 40;
136         data |= dw_m1 << 43;
137         data |= dw_m1 << 46;
138         data |= dw_m1 << 49;
139         data |= dw_m1 << 52;
140         data |= dw_m1 << 55;
141         data |= dw_m1 << 58;
142         data |= dw_m1 << 61;
143
144         return data;
145 }
146
147 static __rte_always_inline void
148 cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
149                       const uint16_t flags)
150 {
151         /* Send hdr */
152         cmd[0] = txq->send_hdr_w0;
153         cmd[1] = 0;
154         cmd += 2;
155
156         /* Send ext if present */
157         if (flags & NIX_TX_NEED_EXT_HDR) {
158                 *(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
159                 cmd += 2;
160         }
161
162         /* Send sg */
163         cmd[0] = txq->sg_w0;
164         cmd[1] = 0;
165 }
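
/* After this, 'cmd' holds the default descriptor in LMT-line order:
 * SEND_HDR W0/W1, then SEND_EXT W0/W1 when an extension header is needed,
 * then SG W0/W1. Per-packet fields are patched on top of this skeleton.
 */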
166
167 static __rte_always_inline void
168 cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
169 {
170         uint64_t mask, ol_flags = m->ol_flags;
171
172         if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
173                 uintptr_t mdata = rte_pktmbuf_mtod(m, uintptr_t);
174                 uint16_t *iplen, *oiplen, *oudplen;
175                 uint16_t lso_sb, paylen;
176
177                 mask = -!!(ol_flags & (PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_IPV6));
178                 lso_sb = (mask & (m->outer_l2_len + m->outer_l3_len)) +
179                          m->l2_len + m->l3_len + m->l4_len;
180
181                 /* Reduce payload len from base headers */
182                 paylen = m->pkt_len - lso_sb;
183
184                 /* Get iplen position assuming no tunnel hdr */
185                 iplen = (uint16_t *)(mdata + m->l2_len +
186                                      (2 << !!(ol_flags & PKT_TX_IPV6)));
187                 /* Handle tunnel tso */
188                 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
189                     (ol_flags & PKT_TX_TUNNEL_MASK)) {
190                         const uint8_t is_udp_tun =
191                                 (CNXK_NIX_UDP_TUN_BITMASK >>
192                                  ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
193                                 0x1;
194
195                         oiplen = (uint16_t *)(mdata + m->outer_l2_len +
196                                               (2 << !!(ol_flags &
197                                                        PKT_TX_OUTER_IPV6)));
198                         *oiplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*oiplen) -
199                                                    paylen);
200
201                         /* Update format for UDP tunneled packet */
202                         if (is_udp_tun) {
203                                 oudplen = (uint16_t *)(mdata + m->outer_l2_len +
204                                                        m->outer_l3_len + 4);
205                                 *oudplen = rte_cpu_to_be_16(
206                                         rte_be_to_cpu_16(*oudplen) - paylen);
207                         }
208
209                         /* Update iplen position to inner ip hdr */
210                         iplen = (uint16_t *)(mdata + lso_sb - m->l3_len -
211                                              m->l4_len +
212                                              (2 << !!(ol_flags & PKT_TX_IPV6)));
213                 }
214
215                 *iplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*iplen) - paylen);
216         }
217 }
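
/* Worked example for the adjustment above (plain TCPv4 TSO, no tunnel):
 * l2_len = 14, l3_len = 20, l4_len = 20 and pkt_len = 1514 give lso_sb = 54
 * and paylen = 1460, so the IPv4 total-length field at mdata + 14 + 2 is
 * reduced by 1460; (2 << !!(ol_flags & PKT_TX_IPV6)) selects offset 2 for
 * the IPv4 total length vs offset 4 for the IPv6 payload length.
 */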
218
219 static __rte_always_inline void
220 cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, uintptr_t lmt_addr,
221                        const uint16_t flags, const uint64_t lso_tun_fmt)
222 {
223         struct nix_send_ext_s *send_hdr_ext;
224         struct nix_send_hdr_s *send_hdr;
225         uint64_t ol_flags = 0, mask;
226         union nix_send_hdr_w1_u w1;
227         union nix_send_sg_s *sg;
228
229         send_hdr = (struct nix_send_hdr_s *)cmd;
230         if (flags & NIX_TX_NEED_EXT_HDR) {
231                 send_hdr_ext = (struct nix_send_ext_s *)(cmd + 2);
232                 sg = (union nix_send_sg_s *)(cmd + 4);
233                 /* Clear previous markings */
234                 send_hdr_ext->w0.lso = 0;
235                 send_hdr_ext->w1.u = 0;
236         } else {
237                 sg = (union nix_send_sg_s *)(cmd + 2);
238         }
239
240         if (flags & NIX_TX_NEED_SEND_HDR_W1) {
241                 ol_flags = m->ol_flags;
242                 w1.u = 0;
243         }
244
245         if (!(flags & NIX_TX_MULTI_SEG_F)) {
246                 send_hdr->w0.total = m->data_len;
247                 send_hdr->w0.aura =
248                         roc_npa_aura_handle_to_aura(m->pool->pool_id);
249         }
250
251         /*
252          * L3type:  2 => IPV4
253          *          3 => IPV4 with csum
254          *          4 => IPV6
255          * L3type and L3ptr need to be set for either
256          * L3 csum or L4 csum or LSO.
257          *
258          */
259
260         if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
261             (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
262                 const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
263                 const uint8_t ol3type =
264                         ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
265                         ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
266                         !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);
267
268                 /* Outer L3 */
269                 w1.ol3type = ol3type;
270                 mask = 0xffffull << ((!!ol3type) << 4);
271                 w1.ol3ptr = ~mask & m->outer_l2_len;
272                 w1.ol4ptr = ~mask & (w1.ol3ptr + m->outer_l3_len);
273
274                 /* Outer L4 */
275                 w1.ol4type = csum + (csum << 1);
276
277                 /* Inner L3 */
278                 w1.il3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
279                              ((!!(ol_flags & PKT_TX_IPV6)) << 2);
280                 w1.il3ptr = w1.ol4ptr + m->l2_len;
281                 w1.il4ptr = w1.il3ptr + m->l3_len;
282                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
283                 w1.il3type = w1.il3type + !!(ol_flags & PKT_TX_IP_CKSUM);
284
285                 /* Inner L4 */
286                 w1.il4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
287
288                 /* In case there is no tunnel header, shift the
289                  * IL3/IL4 fields into the OL3/OL4 positions so that
290                  * OL3/OL4 are used for the header checksum.
291                  */
292                 mask = !ol3type;
293                 w1.u = ((w1.u & 0xFFFFFFFF00000000) >> (mask << 3)) |
294                        ((w1.u & 0X00000000FFFFFFFF) >> (mask << 4));
295
296         } else if (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) {
297                 const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
298                 const uint8_t outer_l2_len = m->outer_l2_len;
299
300                 /* Outer L3 */
301                 w1.ol3ptr = outer_l2_len;
302                 w1.ol4ptr = outer_l2_len + m->outer_l3_len;
303                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
304                 w1.ol3type = ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
305                              ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
306                              !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);
307
308                 /* Outer L4 */
309                 w1.ol4type = csum + (csum << 1);
310
311         } else if (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) {
312                 const uint8_t l2_len = m->l2_len;
313
314                 /* Always use OLXPTR and OLXTYPE when only
315                  * one header is present.
316                  */
317
318                 /* Inner L3 */
319                 w1.ol3ptr = l2_len;
320                 w1.ol4ptr = l2_len + m->l3_len;
321                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
322                 w1.ol3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
323                              ((!!(ol_flags & PKT_TX_IPV6)) << 2) +
324                              !!(ol_flags & PKT_TX_IP_CKSUM);
325
326                 /* Inner L4 */
327                 w1.ol4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
328         }
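
        /* Example of the resulting W1 for an inner-only IPv4 + TCP checksum
         * request: ol3type = 3 (IPv4 with csum), ol3ptr = l2_len, ol4ptr =
         * l2_len + l3_len and ol4type = NIX_SENDL4TYPE_TCP_CKSUM, i.e. the
         * OL fields double up for the inner headers when no outer header is
         * present.
         */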
329
330         if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
331                 send_hdr_ext->w1.vlan1_ins_ena = !!(ol_flags & PKT_TX_VLAN);
332                 /* HW will update ptr after vlan0 update */
333                 send_hdr_ext->w1.vlan1_ins_ptr = 12;
334                 send_hdr_ext->w1.vlan1_ins_tci = m->vlan_tci;
335
336                 send_hdr_ext->w1.vlan0_ins_ena = !!(ol_flags & PKT_TX_QINQ);
337                 /* 2B before end of l2 header */
338                 send_hdr_ext->w1.vlan0_ins_ptr = 12;
339                 send_hdr_ext->w1.vlan0_ins_tci = m->vlan_tci_outer;
340         }
341
342         if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
343                 uint16_t lso_sb;
344                 uint64_t mask;
345
346                 mask = -(!w1.il3type);
347                 lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
348
349                 send_hdr_ext->w0.lso_sb = lso_sb;
350                 send_hdr_ext->w0.lso = 1;
351                 send_hdr_ext->w0.lso_mps = m->tso_segsz;
352                 send_hdr_ext->w0.lso_format =
353                         NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
354                 w1.ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
355
356                 /* Handle tunnel tso */
357                 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
358                     (ol_flags & PKT_TX_TUNNEL_MASK)) {
359                         const uint8_t is_udp_tun =
360                                 (CNXK_NIX_UDP_TUN_BITMASK >>
361                                  ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
362                                 0x1;
363                         uint8_t shift = is_udp_tun ? 32 : 0;
364
365                         shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
366                         shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
367
368                         w1.il4type = NIX_SENDL4TYPE_TCP_CKSUM;
369                         w1.ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
370                         /* Update format for UDP tunneled packet */
371                         send_hdr_ext->w0.lso_format = (lso_tun_fmt >> shift);
372                 }
373         }
374
375         if (flags & NIX_TX_NEED_SEND_HDR_W1)
376                 send_hdr->w1.u = w1.u;
377
378         if (!(flags & NIX_TX_MULTI_SEG_F)) {
379                 sg->seg1_size = m->data_len;
380                 *(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
381
382                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
383                         /* DF bit = 1 if refcount of current mbuf or parent mbuf
384                          *              is greater than 1
385                          * DF bit = 0 otherwise
386                          */
387                         send_hdr->w0.df = cnxk_nix_prefree_seg(m);
388                 }
389                 /* Mark mempool object as "put" since it is freed by NIX */
390                 if (!send_hdr->w0.df)
391                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
392         }
393
394         /* With minimal offloads, 'cmd' being local could be optimized out to
395  * registers. In other cases, 'cmd' will be on the stack. The intent is
396  * that 'cmd' stores content from txq->cmd, which is copied only once.
397          */
398         *((struct nix_send_hdr_s *)lmt_addr) = *send_hdr;
399         lmt_addr += 16;
400         if (flags & NIX_TX_NEED_EXT_HDR) {
401                 *((struct nix_send_ext_s *)lmt_addr) = *send_hdr_ext;
402                 lmt_addr += 16;
403         }
404         /* In case of multi-seg, sg template is stored here */
405         *((union nix_send_sg_s *)lmt_addr) = *sg;
406         *(rte_iova_t *)(lmt_addr + 8) = *(rte_iova_t *)(sg + 1);
407 }
408
409 static __rte_always_inline void
410 cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
411                               const uint64_t ol_flags, const uint16_t no_segdw,
412                               const uint16_t flags)
413 {
414         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
415                 const uint8_t is_ol_tstamp = !(ol_flags & PKT_TX_IEEE1588_TMST);
416                 struct nix_send_ext_s *send_hdr_ext =
417                                         (struct nix_send_ext_s *)(lmt_addr + 16);
418                 uint64_t *lmt = (uint64_t *)lmt_addr;
419                 uint16_t off = (no_segdw - 1) << 1;
420                 struct nix_send_mem_s *send_mem;
421
422                 send_mem = (struct nix_send_mem_s *)(lmt + off);
423                 send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
424                 send_hdr_ext->w0.tstmp = 1;
425                 if (flags & NIX_TX_MULTI_SEG_F) {
426                         /* Retrieving the default desc values */
427                         lmt[off] = cmd[2];
428
429                         /* Use a compiler barrier to avoid violation of C
430                          * aliasing rules.
431                          */
432                         rte_compiler_barrier();
433                 }
434
435                 /* For packets without PKT_TX_IEEE1588_TMST set, the Tx tstamp
436                  * should not be recorded; hence change the alg type to
437                  * NIX_SENDMEMALG_SET and also move the send mem addr field to
438                  * the next 8 bytes so that it does not corrupt the actual
439                  * registered Tx tstamp address.
440                  */
441                 send_mem->w0.subdc = NIX_SUBDC_MEM;
442                 send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
443                 send_mem->addr =
444                         (rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
445         }
446 }
447
448 static __rte_always_inline uint16_t
449 cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
450 {
451         struct nix_send_hdr_s *send_hdr;
452         union nix_send_sg_s *sg;
453         struct rte_mbuf *m_next;
454         uint64_t *slist, sg_u;
455         uint64_t nb_segs;
456         uint64_t segdw;
457         uint8_t off, i;
458
459         send_hdr = (struct nix_send_hdr_s *)cmd;
460         send_hdr->w0.total = m->pkt_len;
461         send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
462
463         if (flags & NIX_TX_NEED_EXT_HDR)
464                 off = 2;
465         else
466                 off = 0;
467
468         sg = (union nix_send_sg_s *)&cmd[2 + off];
469         /* Clear sg->u header before use */
470         sg->u &= 0xFC00000000000000;
471         sg_u = sg->u;
472         slist = &cmd[3 + off];
473
474         i = 0;
475         nb_segs = m->nb_segs;
476
477         /* Fill mbuf segments */
478         do {
479                 m_next = m->next;
480                 sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
481                 *slist = rte_mbuf_data_iova(m);
482                 /* Set invert df if buffer is not to be freed by H/W */
483                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
484                         sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
485                         /* Mark mempool object as "put" since it is freed by NIX
486                          */
487 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
488                 if (!(sg_u & (1ULL << (i + 55))))
489                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
490 #endif
491                 slist++;
492                 i++;
493                 nb_segs--;
494                 if (i > 2 && nb_segs) {
495                         i = 0;
496                         /* Next SG subdesc */
497                         *(uint64_t *)slist = sg_u & 0xFC00000000000000;
498                         sg->u = sg_u;
499                         sg->segs = 3;
500                         sg = (union nix_send_sg_s *)slist;
501                         sg_u = sg->u;
502                         slist++;
503                 }
504                 m = m_next;
505         } while (nb_segs);
506
507         sg->u = sg_u;
508         sg->segs = i;
509         segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
510         /* Roundup extra dwords to multiple of 2 */
511         segdw = (segdw >> 1) + (segdw & 0x1);
512         /* Default dwords */
513         segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
514         send_hdr->w0.sizem1 = segdw - 1;
515
516         return segdw;
517 }
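
/* Worked example: a 3-segment mbuf with an extension header (off = 2) fills
 * one SG subdesc plus three pointers, so slist advances to 4 raw dwords ->
 * rounded to 2 16B units; adding (off >> 1) + 1 for the SEND_HDR/SEND_EXT
 * units gives segdw = 4, or 5 when the timestamp descriptor is enabled.
 */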
518
519 static __rte_always_inline uint16_t
520 cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
521                     uint64_t *cmd, uintptr_t base, const uint16_t flags)
522 {
523         struct cn10k_eth_txq *txq = tx_queue;
524         const rte_iova_t io_addr = txq->io_addr;
525         uintptr_t pa, lmt_addr = txq->lmt_base;
526         uint16_t lmt_id, burst, left, i;
527         uint64_t lso_tun_fmt;
528         uint64_t data;
529
530         if (!(flags & NIX_TX_VWQE_F)) {
531                 NIX_XMIT_FC_OR_RETURN(txq, pkts);
532                 /* Reduce the cached count */
533                 txq->fc_cache_pkts -= pkts;
534         }
535
536         /* Get cmd skeleton */
537         cn10k_nix_tx_skeleton(txq, cmd, flags);
538
539         if (flags & NIX_TX_OFFLOAD_TSO_F)
540                 lso_tun_fmt = txq->lso_tun_fmt;
541
542         /* Get LMT base address and LMT ID as lcore id */
543         ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
544         left = pkts;
545 again:
546         burst = left > 32 ? 32 : left;
547         for (i = 0; i < burst; i++) {
548                 /* Perform header writes for TSO, barrier at
549                  * lmt steorl will suffice.
550                  */
551                 if (flags & NIX_TX_OFFLOAD_TSO_F)
552                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
553
554                 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
555                                        lso_tun_fmt);
556                 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
557                                               tx_pkts[i]->ol_flags, 4, flags);
558                 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
559         }
560
561         if (flags & NIX_TX_VWQE_F)
562                 roc_sso_hws_head_wait(base);
563
564         /* Trigger LMTST */
565         if (burst > 16) {
566                 data = cn10k_nix_tx_steor_data(flags);
567                 pa = io_addr | (data & 0x7) << 4;
568                 data &= ~0x7ULL;
569                 data |= (15ULL << 12);
570                 data |= (uint64_t)lmt_id;
571
572                 /* STEOR0 */
573                 roc_lmt_submit_steorl(data, pa);
574
575                 data = cn10k_nix_tx_steor_data(flags);
576                 pa = io_addr | (data & 0x7) << 4;
577                 data &= ~0x7ULL;
578                 data |= ((uint64_t)(burst - 17)) << 12;
579                 data |= (uint64_t)(lmt_id + 16);
580
581                 /* STEOR1 */
582                 roc_lmt_submit_steorl(data, pa);
583         } else if (burst) {
584                 data = cn10k_nix_tx_steor_data(flags);
585                 pa = io_addr | (data & 0x7) << 4;
586                 data &= ~0x7ULL;
587                 data |= ((uint64_t)(burst - 1)) << 12;
588                 data |= lmt_id;
589
590                 /* STEOR0 */
591                 roc_lmt_submit_steorl(data, pa);
592         }
593
594         left -= burst;
595         rte_io_wmb();
596         if (left) {
597                 /* Start processing another burst */
598                 tx_pkts += burst;
599                 /* Reset lmt base addr */
600                 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
601                 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
602                 goto again;
603         }
604
605         return pkts;
606 }
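
/* Note on the LMTST trigger above: bursts larger than 16 are split across
 * two STEORLs, e.g. burst = 32 issues STEOR0 for LMT IDs lmt_id..lmt_id + 15
 * (count - 1 = 15 placed at bit 12) and STEOR1 for the remaining 16 lines
 * starting at lmt_id + 16 (encoded as burst - 17).
 */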
607
608 static __rte_always_inline uint16_t
609 cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
610                          uint16_t pkts, uint64_t *cmd, uintptr_t base,
611                          const uint16_t flags)
612 {
613         struct cn10k_eth_txq *txq = tx_queue;
614         uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
615         const rte_iova_t io_addr = txq->io_addr;
616         uint16_t segdw, lmt_id, burst, left, i;
617         uint64_t data0, data1;
618         uint64_t lso_tun_fmt;
619         __uint128_t data128;
620         uint16_t shft;
621
622         NIX_XMIT_FC_OR_RETURN(txq, pkts);
623
624         cn10k_nix_tx_skeleton(txq, cmd, flags);
625
626         /* Reduce the cached count */
627         txq->fc_cache_pkts -= pkts;
628
629         if (flags & NIX_TX_OFFLOAD_TSO_F)
630                 lso_tun_fmt = txq->lso_tun_fmt;
631
632         /* Get LMT base address and LMT ID as lcore id */
633         ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
634         left = pkts;
635 again:
636         burst = left > 32 ? 32 : left;
637         shft = 16;
638         data128 = 0;
639         for (i = 0; i < burst; i++) {
640                 /* Perform header writes for TSO, barrier at
641                  * lmt steorl will suffice.
642                  */
643                 if (flags & NIX_TX_OFFLOAD_TSO_F)
644                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
645
646                 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
647                                        lso_tun_fmt);
648                 /* Store sg list directly on lmt line */
649                 segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)lmt_addr,
650                                                flags);
651                 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
652                                               tx_pkts[i]->ol_flags, segdw,
653                                               flags);
654                 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
655                 data128 |= (((__uint128_t)(segdw - 1)) << shft);
656                 shft += 3;
657         }
658
659         if (flags & NIX_TX_VWQE_F)
660                 roc_sso_hws_head_wait(base);
661
662         data0 = (uint64_t)data128;
663         data1 = (uint64_t)(data128 >> 64);
664         /* Make data0 similar to data1 */
665         data0 >>= 16;
666         /* Trigger LMTST */
667         if (burst > 16) {
668                 pa0 = io_addr | (data0 & 0x7) << 4;
669                 data0 &= ~0x7ULL;
670                 /* Move lmtst1..15 sz to bits 63:19 */
671                 data0 <<= 16;
672                 data0 |= (15ULL << 12);
673                 data0 |= (uint64_t)lmt_id;
674
675                 /* STEOR0 */
676                 roc_lmt_submit_steorl(data0, pa0);
677
678                 pa1 = io_addr | (data1 & 0x7) << 4;
679                 data1 &= ~0x7ULL;
680                 data1 <<= 16;
681                 data1 |= ((uint64_t)(burst - 17)) << 12;
682                 data1 |= (uint64_t)(lmt_id + 16);
683
684                 /* STEOR1 */
685                 roc_lmt_submit_steorl(data1, pa1);
686         } else if (burst) {
687                 pa0 = io_addr | (data0 & 0x7) << 4;
688                 data0 &= ~0x7ULL;
689                 /* Move lmtst1..15 sz to bits 63:19 */
690                 data0 <<= 16;
691                 data0 |= ((burst - 1) << 12);
692                 data0 |= (uint64_t)lmt_id;
693
694                 /* STEOR0 */
695                 roc_lmt_submit_steorl(data0, pa0);
696         }
697
698         left -= burst;
699         rte_io_wmb();
700         if (left) {
701                 /* Start processing another burst */
702                 tx_pkts += burst;
703                 /* Reset lmt base addr */
704                 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
705                 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
706                 goto again;
707         }
708
709         return pkts;
710 }
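
/* In the multi-seg path above, per-line sizes vary, so they are accumulated
 * in data128 as 3-bit (segdw - 1) fields starting at bit 16, then split into
 * data0/data1 for the two STEORLs; data0 >>= 16 drops the unused low bits so
 * that both halves share the layout consumed by the shifts above.
 */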
711
712 #if defined(RTE_ARCH_ARM64)
713
714 static __rte_always_inline void
715 cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
716                       union nix_send_ext_w0_u *w0, uint64_t ol_flags,
717                       const uint64_t flags, const uint64_t lso_tun_fmt)
718 {
719         uint16_t lso_sb;
720         uint64_t mask;
721
722         if (!(ol_flags & PKT_TX_TCP_SEG))
723                 return;
724
725         mask = -(!w1->il3type);
726         lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
727
728         w0->u |= BIT(14);
729         w0->lso_sb = lso_sb;
730         w0->lso_mps = m->tso_segsz;
731         w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
732         w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
733
734         /* Handle tunnel tso */
735         if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
736             (ol_flags & PKT_TX_TUNNEL_MASK)) {
737                 const uint8_t is_udp_tun =
738                         (CNXK_NIX_UDP_TUN_BITMASK >>
739                          ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
740                         0x1;
741                 uint8_t shift = is_udp_tun ? 32 : 0;
742
743                 shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
744                 shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
745
746                 w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
747                 w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
748                 /* Update format for UDP tunneled packet */
749
750                 w0->lso_format = (lso_tun_fmt >> shift);
751         }
752 }
753
754 static __rte_always_inline void
755 cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
756                                 union nix_send_hdr_w0_u *sh,
757                                 union nix_send_sg_s *sg, const uint32_t flags)
758 {
759         struct rte_mbuf *m_next;
760         uint64_t *slist, sg_u;
761         uint16_t nb_segs;
762         int i = 1;
763
764         sh->total = m->pkt_len;
765         /* Clear sg->u header before use */
766         sg->u &= 0xFC00000000000000;
767         sg_u = sg->u;
768         slist = &cmd[0];
769
770         sg_u = sg_u | ((uint64_t)m->data_len);
771
772         nb_segs = m->nb_segs - 1;
773         m_next = m->next;
774
775         /* Set invert df if buffer is not to be freed by H/W */
776         if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
777                 sg_u |= (cnxk_nix_prefree_seg(m) << 55);
778                 /* Mark mempool object as "put" since it is freed by NIX */
779 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
780         if (!(sg_u & (1ULL << 55)))
781                 __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
782         rte_io_wmb();
783 #endif
784
785         m = m_next;
786         /* Fill mbuf segments */
787         do {
788                 m_next = m->next;
789                 sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
790                 *slist = rte_mbuf_data_iova(m);
791                 /* Set invert df if buffer is not to be freed by H/W */
792                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
793                         sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
794                         /* Mark mempool object as "put" since it is freed by NIX
795                          */
796 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
797                 if (!(sg_u & (1ULL << (i + 55))))
798                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
799                 rte_io_wmb();
800 #endif
801                 slist++;
802                 i++;
803                 nb_segs--;
804                 if (i > 2 && nb_segs) {
805                         i = 0;
806                         /* Next SG subdesc */
807                         *(uint64_t *)slist = sg_u & 0xFC00000000000000;
808                         sg->u = sg_u;
809                         sg->segs = 3;
810                         sg = (union nix_send_sg_s *)slist;
811                         sg_u = sg->u;
812                         slist++;
813                 }
814                 m = m_next;
815         } while (nb_segs);
816
817         sg->u = sg_u;
818         sg->segs = i;
819 }
820
821 static __rte_always_inline void
822 cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
823                            uint64x2_t *cmd1, const uint8_t segdw,
824                            const uint32_t flags)
825 {
826         union nix_send_hdr_w0_u sh;
827         union nix_send_sg_s sg;
828
829         if (m->nb_segs == 1) {
830                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
831                         sg.u = vgetq_lane_u64(cmd1[0], 0);
832                         sg.u |= (cnxk_nix_prefree_seg(m) << 55);
833                         cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
834                 }
835
836 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
837                 sg.u = vgetq_lane_u64(cmd1[0], 0);
838                 if (!(sg.u & (1ULL << 55)))
839                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
840                 rte_io_wmb();
841 #endif
842                 return;
843         }
844
845         sh.u = vgetq_lane_u64(cmd0[0], 0);
846         sg.u = vgetq_lane_u64(cmd1[0], 0);
847
848         cn10k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
849
850         sh.sizem1 = segdw - 1;
851         cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
852         cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
853 }
854
855 #define NIX_DESCS_PER_LOOP 4
856
857 static __rte_always_inline uint8_t
858 cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
859                                uint64x2_t *cmd1, uint64x2_t *cmd2,
860                                uint64x2_t *cmd3, uint8_t *segdw,
861                                uint64_t *lmt_addr, __uint128_t *data128,
862                                uint8_t *shift, const uint16_t flags)
863 {
864         uint8_t j, off, lmt_used;
865
866         if (!(flags & NIX_TX_NEED_EXT_HDR) &&
867             !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
868                 /* None of the 4 consecutive packets are multi-segment. */
869                 if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
870                         for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
871                                 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
872                                                            &cmd0[j], &cmd1[j],
873                                                            segdw[j], flags);
874                         vst1q_u64(lmt_addr, cmd0[0]);
875                         vst1q_u64(lmt_addr + 2, cmd1[0]);
876                         vst1q_u64(lmt_addr + 4, cmd0[1]);
877                         vst1q_u64(lmt_addr + 6, cmd1[1]);
878                         vst1q_u64(lmt_addr + 8, cmd0[2]);
879                         vst1q_u64(lmt_addr + 10, cmd1[2]);
880                         vst1q_u64(lmt_addr + 12, cmd0[3]);
881                         vst1q_u64(lmt_addr + 14, cmd1[3]);
882
883                         *data128 |= ((__uint128_t)7) << *shift;
884                         *shift += 3;
885
886                         return 1;
887                 }
888         }
889
890         lmt_used = 0;
891         for (j = 0; j < NIX_DESCS_PER_LOOP;) {
892                 /* Fit consecutive packets in same LMTLINE. */
893                 if ((segdw[j] + segdw[j + 1]) <= 8) {
894                         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
895                                 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
896                                                            &cmd0[j], &cmd1[j],
897                                                            segdw[j], flags);
898                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1], NULL,
899                                                            &cmd0[j + 1],
900                                                            &cmd1[j + 1],
901                                                            segdw[j + 1], flags);
902                                 /* TSTAMP takes 4 each, no segs. */
903                                 vst1q_u64(lmt_addr, cmd0[j]);
904                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
905                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
906                                 vst1q_u64(lmt_addr + 6, cmd3[j]);
907
908                                 vst1q_u64(lmt_addr + 8, cmd0[j + 1]);
909                                 vst1q_u64(lmt_addr + 10, cmd2[j + 1]);
910                                 vst1q_u64(lmt_addr + 12, cmd1[j + 1]);
911                                 vst1q_u64(lmt_addr + 14, cmd3[j + 1]);
912                         } else if (flags & NIX_TX_NEED_EXT_HDR) {
913                                 /* EXT header takes 3 each, space for 2 segs. */
914                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
915                                                            lmt_addr + 6,
916                                                            &cmd0[j], &cmd1[j],
917                                                            segdw[j], flags);
918                                 vst1q_u64(lmt_addr, cmd0[j]);
919                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
920                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
921                                 off = segdw[j] - 3;
922                                 off <<= 1;
923                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
924                                                            lmt_addr + 12 + off,
925                                                            &cmd0[j + 1],
926                                                            &cmd1[j + 1],
927                                                            segdw[j + 1], flags);
928                                 vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
929                                 vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
930                                 vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
931                         } else {
932                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
933                                                            lmt_addr + 4,
934                                                            &cmd0[j], &cmd1[j],
935                                                            segdw[j], flags);
936                                 vst1q_u64(lmt_addr, cmd0[j]);
937                                 vst1q_u64(lmt_addr + 2, cmd1[j]);
938                                 off = segdw[j] - 2;
939                                 off <<= 1;
940                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
941                                                            lmt_addr + 8 + off,
942                                                            &cmd0[j + 1],
943                                                            &cmd1[j + 1],
944                                                            segdw[j + 1], flags);
945                                 vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
946                                 vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
947                         }
948                         *data128 |= ((__uint128_t)(segdw[j] + segdw[j + 1]) - 1)
949                                     << *shift;
950                         *shift += 3;
951                         j += 2;
952                 } else {
953                         if ((flags & NIX_TX_NEED_EXT_HDR) &&
954                             (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
955                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
956                                                            lmt_addr + 6,
957                                                            &cmd0[j], &cmd1[j],
958                                                            segdw[j], flags);
959                                 vst1q_u64(lmt_addr, cmd0[j]);
960                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
961                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
962                                 off = segdw[j] - 4;
963                                 off <<= 1;
964                                 vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
965                         } else if (flags & NIX_TX_NEED_EXT_HDR) {
966                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
967                                                            lmt_addr + 6,
968                                                            &cmd0[j], &cmd1[j],
969                                                            segdw[j], flags);
970                                 vst1q_u64(lmt_addr, cmd0[j]);
971                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
972                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
973                         } else {
974                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
975                                                            lmt_addr + 4,
976                                                            &cmd0[j], &cmd1[j],
977                                                            segdw[j], flags);
978                                 vst1q_u64(lmt_addr, cmd0[j]);
979                                 vst1q_u64(lmt_addr + 2, cmd1[j]);
980                         }
981                         *data128 |= ((__uint128_t)(segdw[j]) - 1) << *shift;
982                         *shift += 3;
983                         j++;
984                 }
985                 lmt_used++;
986                 lmt_addr += 16;
987         }
988
989         return lmt_used;
990 }
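
/* Each LMT line filled above contributes one 3-bit (total dwords - 1) field
 * to *data128 at *shift; a line may carry either two small packets or a
 * single larger one, and the return value is the number of lines consumed.
 */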
991
992 static __rte_always_inline uint16_t
993 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
994                            uint16_t pkts, uint64_t *cmd, uintptr_t base,
995                            const uint16_t flags)
996 {
997         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
998         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
999         uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
1000                 cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
1001         uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, pa;
1002         uint64x2_t senddesc01_w0, senddesc23_w0;
1003         uint64x2_t senddesc01_w1, senddesc23_w1;
1004         uint16_t left, scalar, burst, i, lmt_id;
1005         uint64x2_t sendext01_w0, sendext23_w0;
1006         uint64x2_t sendext01_w1, sendext23_w1;
1007         uint64x2_t sendmem01_w0, sendmem23_w0;
1008         uint64x2_t sendmem01_w1, sendmem23_w1;
1009         uint8_t segdw[NIX_DESCS_PER_LOOP + 1];
1010         uint64x2_t sgdesc01_w0, sgdesc23_w0;
1011         uint64x2_t sgdesc01_w1, sgdesc23_w1;
1012         struct cn10k_eth_txq *txq = tx_queue;
1013         uintptr_t laddr = txq->lmt_base;
1014         rte_iova_t io_addr = txq->io_addr;
1015         uint64x2_t ltypes01, ltypes23;
1016         uint64x2_t xtmp128, ytmp128;
1017         uint64x2_t xmask01, xmask23;
1018         uint8_t lnum, shift;
1019         union wdata {
1020                 __uint128_t data128;
1021                 uint64_t data[2];
1022         } wd;
1023
1024         if (!(flags & NIX_TX_VWQE_F)) {
1025                 NIX_XMIT_FC_OR_RETURN(txq, pkts);
1026                 scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
1027                 pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
1028                 /* Reduce the cached count */
1029                 txq->fc_cache_pkts -= pkts;
1030         } else {
1031                 scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
1032                 pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
1033         }
1034
1035         /* Perform header writes before barrier for TSO */
1036         if (flags & NIX_TX_OFFLOAD_TSO_F) {
1037                 for (i = 0; i < pkts; i++)
1038                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
1039         }
1040
1041         senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
1042         senddesc23_w0 = senddesc01_w0;
1043         senddesc01_w1 = vdupq_n_u64(0);
1044         senddesc23_w1 = senddesc01_w1;
1045         sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
1046         sgdesc23_w0 = sgdesc01_w0;
1047
1048         /* Load command defaults into vector variables. */
1049         if (flags & NIX_TX_NEED_EXT_HDR) {
1050                 sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
1051                 sendext23_w0 = sendext01_w0;
1052                 sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
1053                 sendext23_w1 = sendext01_w1;
1054                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1055                         sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
1056                         sendmem23_w0 = sendmem01_w0;
1057                         sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
1058                         sendmem23_w1 = sendmem01_w1;
1059                 }
1060         }
1061
1062         /* Get LMT base address and LMT ID as lcore id */
1063         ROC_LMT_BASE_ID_GET(laddr, lmt_id);
1064         left = pkts;
1065 again:
1066         /* Number of packets to prepare depends on offloads enabled. */
1067         burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
1068                               cn10k_nix_pkts_per_vec_brst(flags) :
1069                               left;
1070         if (flags & NIX_TX_MULTI_SEG_F) {
1071                 wd.data128 = 0;
1072                 shift = 16;
1073         }
1074         lnum = 0;
1075
1076         for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
1077                 if (flags & NIX_TX_MULTI_SEG_F) {
1078                         uint8_t j;
1079
1080                         for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1081                                 struct rte_mbuf *m = tx_pkts[j];
1082
1083                                 /* Get dwords based on nb_segs. */
1084                                 segdw[j] = NIX_NB_SEGS_TO_SEGDW(m->nb_segs);
1085                                 /* Add dwords based on offloads. */
1086                                 segdw[j] += 1 + /* SEND HDR */
1087                                             !!(flags & NIX_TX_NEED_EXT_HDR) +
1088                                             !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
1089                         }
1090
1091                         /* Check if there are enough LMTLINES for this loop */
1092                         if (lnum + 4 > 32) {
1093                                 uint8_t ldwords_con = 0, lneeded = 0;
1094                                 for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1095                                         ldwords_con += segdw[j];
1096                                         if (ldwords_con > 8) {
1097                                                 lneeded += 1;
1098                                                 ldwords_con = segdw[j];
1099                                         }
1100                                 }
1101                                 lneeded += 1;
1102                                 if (lnum + lneeded > 32) {
1103                                         burst = i;
1104                                         break;
1105                                 }
1106                         }
1107                 }
1108                 /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
1109                 senddesc01_w0 =
1110                         vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1111                 sgdesc01_w0 = vbicq_u64(sgdesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1112
1113                 senddesc23_w0 = senddesc01_w0;
1114                 sgdesc23_w0 = sgdesc01_w0;
1115
1116                 /* Clear vlan enables. */
1117                 if (flags & NIX_TX_NEED_EXT_HDR) {
1118                         sendext01_w1 = vbicq_u64(sendext01_w1,
1119                                                  vdupq_n_u64(0x3FFFF00FFFF00));
1120                         sendext23_w1 = sendext01_w1;
1121                 }
1122
1123                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1124                         /* Reset send mem alg to SETTSTMP from SUB */
1125                         sendmem01_w0 = vbicq_u64(sendmem01_w0,
1126                                                  vdupq_n_u64(BIT_ULL(59)));
1127                         /* Reset send mem address to default. */
1128                         sendmem01_w1 =
1129                                 vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
1130                         sendmem23_w0 = sendmem01_w0;
1131                         sendmem23_w1 = sendmem01_w1;
1132                 }
1133
1134                 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1135                         /* Clear the LSO enable bit. */
1136                         sendext01_w0 = vbicq_u64(sendext01_w0,
1137                                                  vdupq_n_u64(BIT_ULL(14)));
1138                         sendext23_w0 = sendext01_w0;
1139                 }
1140
1141                 /* Move mbufs to iova */
1142                 mbuf0 = (uint64_t *)tx_pkts[0];
1143                 mbuf1 = (uint64_t *)tx_pkts[1];
1144                 mbuf2 = (uint64_t *)tx_pkts[2];
1145                 mbuf3 = (uint64_t *)tx_pkts[3];
1146
1147                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1148                                      offsetof(struct rte_mbuf, buf_iova));
1149                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1150                                      offsetof(struct rte_mbuf, buf_iova));
1151                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1152                                      offsetof(struct rte_mbuf, buf_iova));
1153                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1154                                      offsetof(struct rte_mbuf, buf_iova));
1155                 /*
1156                  * Get each mbuf's ol_flags, iova, pktlen, dataoff:
1157                  * dataoff_iovaX.D[0] = iova,
1158                  * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
1159                  * len_olflagsX.D[0] = ol_flags,
1160                  * len_olflagsX.D[1](63:32) = mbuf->pkt_len
1161                  */
1162                 dataoff_iova0 = vld1q_u64(mbuf0);
1163                 len_olflags0 = vld1q_u64(mbuf0 + 2);
1164                 dataoff_iova1 = vld1q_u64(mbuf1);
1165                 len_olflags1 = vld1q_u64(mbuf1 + 2);
1166                 dataoff_iova2 = vld1q_u64(mbuf2);
1167                 len_olflags2 = vld1q_u64(mbuf2 + 2);
1168                 dataoff_iova3 = vld1q_u64(mbuf3);
1169                 len_olflags3 = vld1q_u64(mbuf3 + 2);
1170
1171                 /* Move mbufs to point to pool */
1172                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1173                                      offsetof(struct rte_mbuf, pool) -
1174                                      offsetof(struct rte_mbuf, buf_iova));
1175                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1176                                      offsetof(struct rte_mbuf, pool) -
1177                                      offsetof(struct rte_mbuf, buf_iova));
1178                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1179                                      offsetof(struct rte_mbuf, pool) -
1180                                      offsetof(struct rte_mbuf, buf_iova));
1181                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1182                                      offsetof(struct rte_mbuf, pool) -
1183                                      offsetof(struct rte_mbuf, buf_iova));
1184
1185                 if (flags & (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
1186                              NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
1187                         /* Get tx_offload for ol2, ol3, l2, l3 lengths */
1188                         /*
1189                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1190                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1191                          */
1192
1193                         asm volatile("LD1 {%[a].D}[0],[%[in]]\n\t"
1194                                      : [a] "+w"(senddesc01_w1)
1195                                      : [in] "r"(mbuf0 + 2)
1196                                      : "memory");
1197
1198                         asm volatile("LD1 {%[a].D}[1],[%[in]]\n\t"
1199                                      : [a] "+w"(senddesc01_w1)
1200                                      : [in] "r"(mbuf1 + 2)
1201                                      : "memory");
1202
1203                         asm volatile("LD1 {%[b].D}[0],[%[in]]\n\t"
1204                                      : [b] "+w"(senddesc23_w1)
1205                                      : [in] "r"(mbuf2 + 2)
1206                                      : "memory");
1207
1208                         asm volatile("LD1 {%[b].D}[1],[%[in]]\n\t"
1209                                      : [b] "+w"(senddesc23_w1)
1210                                      : [in] "r"(mbuf3 + 2)
1211                                      : "memory");
1212
1213                         /* Get pool pointer alone */
1214                         mbuf0 = (uint64_t *)*mbuf0;
1215                         mbuf1 = (uint64_t *)*mbuf1;
1216                         mbuf2 = (uint64_t *)*mbuf2;
1217                         mbuf3 = (uint64_t *)*mbuf3;
1218                 } else {
1219                         /* Get pool pointer alone */
1220                         mbuf0 = (uint64_t *)*mbuf0;
1221                         mbuf1 = (uint64_t *)*mbuf1;
1222                         mbuf2 = (uint64_t *)*mbuf2;
1223                         mbuf3 = (uint64_t *)*mbuf3;
1224                 }
1225
1226                 const uint8x16_t shuf_mask2 = {
1227                         0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1228                         0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1229                 };
1230                 xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
1231                 ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
1232
1233                 /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
1234                 const uint64x2_t and_mask0 = {
1235                         0xFFFFFFFFFFFFFFFF,
1236                         0x000000000000FFFF,
1237                 };
1238
1239                 dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
1240                 dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
1241                 dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
1242                 dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
1243
1244                 /*
1245                  * Pick only 16 bits of pktlen preset at bits 63:32
1246                  * and place them at bits 15:0.
1247                  */
1248                 xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
1249                 ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
1250
1251                 /* Add pairwise to get dataoff + iova in sgdesc_w1 */
1252                 sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
1253                 sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
1254
1255                 /* OR both sgdesc_w0 and senddesc_w0 with the 16 bits of
1256                  * pktlen at position 15:0.
1257                  */
1258                 sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
1259                 sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
1260                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
1261                 senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
1262
1263                 /* Move mbuf to point to pool_id. */
1264                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1265                                      offsetof(struct rte_mempool, pool_id));
1266                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1267                                      offsetof(struct rte_mempool, pool_id));
1268                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1269                                      offsetof(struct rte_mempool, pool_id));
1270                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1271                                      offsetof(struct rte_mempool, pool_id));
1272
1273                 if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1274                     !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1275                         /*
1276                          * Lookup table to translate ol_flags to
1277                          * il3/il4 types. The result is still written to the
1278                          * ol3/ol4 type fields of senddesc_w1, since only one
1279                          * level of header processing is enabled.
1280                          */
1280                         const uint8x16_t tbl = {
1281                                 /* [0-15] = il4type:il3type */
1282                                 0x04, /* none (IPv6 assumed) */
1283                                 0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
1284                                 0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
1285                                 0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
1286                                 0x03, /* PKT_TX_IP_CKSUM */
1287                                 0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
1288                                 0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
1289                                 0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
1290                                 0x02, /* PKT_TX_IPV4  */
1291                                 0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
1292                                 0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
1293                                 0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
1294                                 0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
1295                                 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1296                                        * PKT_TX_TCP_CKSUM
1297                                        */
1298                                 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1299                                        * PKT_TX_SCTP_CKSUM
1300                                        */
1301                                 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1302                                        * PKT_TX_UDP_CKSUM
1303                                        */
1304                         };
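                        /* The 4-bit table index is formed from ol_flags
                         * bits 55:52, i.e. PKT_TX_IPV4(55),
                         * PKT_TX_IP_CKSUM(54) and PKT_TX_L4_MASK(53:52),
                         * isolated by the byte shift below.
                         */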
1305
1306                         /* Extract olflags to translate to iltypes */
1307                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1308                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1309
1310                         /*
1311                          * E(47):L3_LEN(9):L2_LEN(7+z)
1312                          * E(47):L3_LEN(9):L2_LEN(7+z)
1313                          */
1314                         senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
1315                         senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
1316
1317                         /* Move OLFLAGS bits 55:52 to 51:48
1318                          * with zeros prepended on the byte; the rest is
1319                          * don't care
1320                          */
1321                         xtmp128 = vshrq_n_u8(xtmp128, 4);
1322                         ytmp128 = vshrq_n_u8(ytmp128, 4);
1323                         /*
1324                          * E(48):L3_LEN(8):L2_LEN(z+7)
1325                          * E(48):L3_LEN(8):L2_LEN(z+7)
1326                          */
1327                         const int8x16_t tshft3 = {
1328                                 -1, 0, 8, 8, 8, 8, 8, 8,
1329                                 -1, 0, 8, 8, 8, 8, 8, 8,
1330                         };
1331
1332                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1333                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
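                        /* vshlq_u8 with a per-lane signed count shifts right
                         * for negative counts, while a count of 8 pushes the
                         * byte out entirely, clearing the don't-care lanes.
                         */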
1334
1335                         /* Do the lookup */
1336                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1337                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1338
1339                         /* Pick only relevant fields, i.e. bits 48:55 of
1340                          * iltype, and place them in ol3/ol4type of
1341                          * senddesc_w1
1342                          */
1342                         const uint8x16_t shuf_mask0 = {
1343                                 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
1344                                 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
1345                         };
1346
1347                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1348                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1349
1350                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1351                          * a [E(32):E(16):OL3(8):OL2(8)]
1352                          * a = a + (a << 8)
1353                          * a [E(32):E(16):(OL3+OL2):OL2]
1354                          * => E(32):E(16):OL4PTR(8):OL3PTR(8)
1355                          */
1356                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1357                                                  vshlq_n_u16(senddesc01_w1, 8));
1358                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1359                                                  vshlq_n_u16(senddesc23_w1, 8));
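                        /* E.g. with l2_len = 14 and l3_len = 20 this yields
                         * ol3ptr = 14 and ol4ptr = 34, the header offsets
                         * the descriptor expects.
                         */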
1360
1361                         /* Move ltypes to senddesc*_w1 */
1362                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1363                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1364                 } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1365                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1366                         /*
1367                          * Lookup table to translate ol_flags to
1368                          * ol3/ol4 types.
1369                          */
1370
1371                         const uint8x16_t tbl = {
1372                                 /* [0-15] = ol4type:ol3type */
1373                                 0x00, /* none */
1374                                 0x03, /* OUTER_IP_CKSUM */
1375                                 0x02, /* OUTER_IPV4 */
1376                                 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1377                                 0x04, /* OUTER_IPV6 */
1378                                 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1379                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1380                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1381                                        * OUTER_IP_CKSUM
1382                                        */
1383                                 0x00, /* OUTER_UDP_CKSUM */
1384                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
1385                                 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
1386                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
1387                                        * OUTER_IP_CKSUM
1388                                        */
1389                                 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
1390                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1391                                        * OUTER_IP_CKSUM
1392                                        */
1393                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1394                                        * OUTER_IPV4
1395                                        */
1396                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1397                                        * OUTER_IPV4 | OUTER_IP_CKSUM
1398                                        */
1399                         };
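                        /* The 4-bit index here is OUTER_UDP_CKSUM (moved to
                         * bit 61 below) : OUTER_IPV6(60) : OUTER_IPV4(59) :
                         * OUTER_IP_CKSUM(58); combinations mapping to 0x00
                         * leave the type fields untouched.
                         */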
1400
1401                         /* Extract olflags to translate to oltypes */
1402                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1403                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1404
1405                         /*
1406                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1407                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1408                          */
1409                         const uint8x16_t shuf_mask5 = {
1410                                 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1411                                 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1412                         };
1413                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1414                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1415
1416                         /* Extract outer ol flags only */
1417                         const uint64x2_t o_cksum_mask = {
1418                                 0x1C00020000000000,
1419                                 0x1C00020000000000,
1420                         };
1421
1422                         xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
1423                         ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
1424
1425                         /* Extract OUTER_UDP_CKSUM bit 41 and
1426                          * move it to bit 61
1427                          */
1428
1429                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1430                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
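                        /* After masking, bit 61 is clear, so OR-ing in
                         * (x << 20) simply copies OUTER_UDP_CKSUM from
                         * bit 41 into bit 61, next to the other outer
                         * checksum bits.
                         */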
1431
1432                         /* Shift oltype right by 2 to start nibble from
1433                          * BIT(56) instead of BIT(58)
1434                          */
1435                         xtmp128 = vshrq_n_u8(xtmp128, 2);
1436                         ytmp128 = vshrq_n_u8(ytmp128, 2);
1437                         /*
1438                          * E(48):OL3_LEN(8):OL2_LEN(z+7)
1439                          * E(48):OL3_LEN(8):OL2_LEN(z+7)
1440                          */
1441                         const int8x16_t tshft3 = {
1442                                 -1, 0, 8, 8, 8, 8, 8, 8,
1443                                 -1, 0, 8, 8, 8, 8, 8, 8,
1444                         };
1445
1446                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1447                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1448
1449                         /* Do the lookup */
1450                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1451                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1452
1453                         /* Pick only relevant fields, i.e. bits 56:63 of
1454                          * oltype, and place them in ol3/ol4type of
1455                          * senddesc_w1
1456                          */
1456                         const uint8x16_t shuf_mask0 = {
1457                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
1458                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
1459                         };
1460
1461                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1462                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1463
1464                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1465                          * a [E(32):E(16):OL3(8):OL2(8)]
1466                          * a = a + (a << 8)
1467                          * a [E(32):E(16):(OL3+OL2):OL2]
1468                          * => E(32):E(16):OL4PTR(8):OL3PTR(8)
1469                          */
1470                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1471                                                  vshlq_n_u16(senddesc01_w1, 8));
1472                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1473                                                  vshlq_n_u16(senddesc23_w1, 8));
1474
1475                         /* Move ltypes to senddesc*_w1 */
1476                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1477                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1478                 } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1479                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1480                         /* Lookup table to translate ol_flags to
1481                          * ol4type, ol3type, il4type, il3type of senddesc_w1
1482                          */
1483                         const uint8x16x2_t tbl = {{
1484                                 {
1485                                         /* [0-15] = il4type:il3type */
1486                                         0x04, /* none (IPv6) */
1487                                         0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
1488                                         0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
1489                                         0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
1490                                         0x03, /* PKT_TX_IP_CKSUM */
1491                                         0x13, /* PKT_TX_IP_CKSUM |
1492                                                * PKT_TX_TCP_CKSUM
1493                                                */
1494                                         0x23, /* PKT_TX_IP_CKSUM |
1495                                                * PKT_TX_SCTP_CKSUM
1496                                                */
1497                                         0x33, /* PKT_TX_IP_CKSUM |
1498                                                * PKT_TX_UDP_CKSUM
1499                                                */
1500                                         0x02, /* PKT_TX_IPV4 */
1501                                         0x12, /* PKT_TX_IPV4 |
1502                                                * PKT_TX_TCP_CKSUM
1503                                                */
1504                                         0x22, /* PKT_TX_IPV4 |
1505                                                * PKT_TX_SCTP_CKSUM
1506                                                */
1507                                         0x32, /* PKT_TX_IPV4 |
1508                                                * PKT_TX_UDP_CKSUM
1509                                                */
1510                                         0x03, /* PKT_TX_IPV4 |
1511                                                * PKT_TX_IP_CKSUM
1512                                                */
1513                                         0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1514                                                * PKT_TX_TCP_CKSUM
1515                                                */
1516                                         0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1517                                                * PKT_TX_SCTP_CKSUM
1518                                                */
1519                                         0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1520                                                * PKT_TX_UDP_CKSUM
1521                                                */
1522                                 },
1523
1524                                 {
1525                                         /* [16-31] = ol4type:ol3type */
1526                                         0x00, /* none */
1527                                         0x03, /* OUTER_IP_CKSUM */
1528                                         0x02, /* OUTER_IPV4 */
1529                                         0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1530                                         0x04, /* OUTER_IPV6 */
1531                                         0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1532                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1533                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1534                                                * OUTER_IP_CKSUM
1535                                                */
1536                                         0x00, /* OUTER_UDP_CKSUM */
1537                                         0x33, /* OUTER_UDP_CKSUM |
1538                                                * OUTER_IP_CKSUM
1539                                                */
1540                                         0x32, /* OUTER_UDP_CKSUM |
1541                                                * OUTER_IPV4
1542                                                */
1543                                         0x33, /* OUTER_UDP_CKSUM |
1544                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1545                                                */
1546                                         0x34, /* OUTER_UDP_CKSUM |
1547                                                * OUTER_IPV6
1548                                                */
1549                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1550                                                * OUTER_IP_CKSUM
1551                                                */
1552                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1553                                                * OUTER_IPV4
1554                                                */
1555                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1556                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1557                                                */
1558                                 },
1559                         }};
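                        /* vqtbl2q_u8 treats tbl as one 32-entry table:
                         * index bits 3:0 come from the ol_flags nibbles
                         * prepared below, and index bit 4 (set via
                         * oi_cksum_mask2) steers the oltype byte to the
                         * [16-31] half.
                         */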
1560
1561                         /* Extract olflags to translate to oltype & iltype */
1562                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1563                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1564
1565                         /*
1566                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1567                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1568                          */
1569                         const uint32x4_t tshft_4 = {
1570                                 1,
1571                                 0,
1572                                 1,
1573                                 0,
1574                         };
1575                         senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
1576                         senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
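                        /* Per-lane 32-bit shift counts {1, 0, 1, 0}
                         * normalize only the inner L2_LEN in the low word of
                         * each 64-bit lane, leaving the outer lengths in the
                         * high word untouched.
                         */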
1577
1578                         /*
1579                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1580                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1581                          */
1582                         const uint8x16_t shuf_mask5 = {
1583                                 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
1584                                 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1585                         };
1586                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1587                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1588
1589                         /* Extract outer and inner header ol_flags */
1590                         const uint64x2_t oi_cksum_mask = {
1591                                 0x1CF0020000000000,
1592                                 0x1CF0020000000000,
1593                         };
1594
1595                         xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
1596                         ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
1597
1598                         /* Extract OUTER_UDP_CKSUM bit 41 and
1599                          * move it to bit 61
1600                          */
1601
1602                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1603                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1604
1605                         /* Shift right oltype by 2 and iltype by 4
1606                          * to start oltype nibble from BIT(56)
1607                          * instead of BIT(58) and iltype nibble from BIT(48)
1608                          * instead of BIT(52).
1609                          */
1610                         const int8x16_t tshft5 = {
1611                                 8, 8, 8, 8, 8, 8, -4, -2,
1612                                 8, 8, 8, 8, 8, 8, -4, -2,
1613                         };
1614
1615                         xtmp128 = vshlq_u8(xtmp128, tshft5);
1616                         ytmp128 = vshlq_u8(ytmp128, tshft5);
1617                         /*
1618                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1619                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1620                          */
1621                         const int8x16_t tshft3 = {
1622                                 -1, 0, -1, 0, 0, 0, 0, 0,
1623                                 -1, 0, -1, 0, 0, 0, 0, 0,
1624                         };
1625
1626                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1627                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1628
1629                         /* Set BIT(4) of the oltype index to select the
1630                          * ol4type:ol3type ([16-31]) half of the table.
1631                          */
1630                         const uint64x2_t oi_cksum_mask2 = {
1631                                 0x1000000000000000,
1632                                 0x1000000000000000,
1633                         };
1634
1635                         xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
1636                         ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
1637
1638                         /* Do the lookup */
1639                         ltypes01 = vqtbl2q_u8(tbl, xtmp128);
1640                         ltypes23 = vqtbl2q_u8(tbl, ytmp128);
1641
1642                         /* Pick only relevant fields, i.e. bits 48:55 of
1643                          * iltype and bits 56:63 of oltype, and place them
1644                          * in their corresponding places in senddesc_w1.
1645                          */
1646                         const uint8x16_t shuf_mask0 = {
1647                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
1648                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
1649                         };
1650
1651                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1652                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1653
1654                         /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
1655                          * l3len, l2len, ol3len, ol2len.
1656                          * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
1657                          * a = a + (a << 8)
1658                          * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
1659                          * a = a + (a << 16)
1660                          * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
1661                          * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
1662                          */
1663                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1664                                                  vshlq_n_u32(senddesc01_w1, 8));
1665                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1666                                                  vshlq_n_u32(senddesc23_w1, 8));
1667
1668                         /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
1669                         senddesc01_w1 = vaddq_u8(
1670                                 senddesc01_w1, vshlq_n_u32(senddesc01_w1, 16));
1671                         senddesc23_w1 = vaddq_u8(
1672                                 senddesc23_w1, vshlq_n_u32(senddesc23_w1, 16));
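                        /* E.g. OL2 = 14, OL3 = 20, L2 = 14, L3 = 20 gives
                         * OL3PTR = 14, OL4PTR = 34, IL3PTR = 48 and
                         * IL4PTR = 68.
                         */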
1673
1674                         /* Move ltypes to senddesc*_w1 */
1675                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1676                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1677                 }
1678
1679                 xmask01 = vdupq_n_u64(0);
1680                 xmask23 = xmask01;
1681                 asm volatile("LD1 {%[a].H}[0],[%[in]]\n\t"
1682                              : [a] "+w"(xmask01)
1683                              : [in] "r"(mbuf0)
1684                              : "memory");
1685
1686                 asm volatile("LD1 {%[a].H}[4],[%[in]]\n\t"
1687                              : [a] "+w"(xmask01)
1688                              : [in] "r"(mbuf1)
1689                              : "memory");
1690
1691                 asm volatile("LD1 {%[b].H}[0],[%[in]]\n\t"
1692                              : [b] "+w"(xmask23)
1693                              : [in] "r"(mbuf2)
1694                              : "memory");
1695
1696                 asm volatile("LD1 {%[b].H}[4],[%[in]]\n\t"
1697                              : [b] "+w"(xmask23)
1698                              : [in] "r"(mbuf3)
1699                              : "memory");
1700                 xmask01 = vshlq_n_u64(xmask01, 20);
1701                 xmask23 = vshlq_n_u64(xmask23, 20);
1702
1703                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1704                 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
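                /* The 16-bit LD1s above load the low bits of each mempool's
                 * pool_id, which for cnxk's HW mempool carries the NPA aura
                 * handle; the shift by 20 places it in the aura field of
                 * SEND_HDR_W0.
                 */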
1705
1706                 if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
1707                         /* Tx ol_flag for vlan. */
1708                         const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
1709                         /* Bit enable for VLAN1 */
1710                         const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
1711                         /* Tx ol_flag for QnQ. */
1712                         const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
1713                         /* Bit enable for VLAN0 */
1714                         const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
1715                         /* Load VLAN values from packet; outer is VLAN0 */
1716                         uint64x2_t ext01 = {
1717                                 ((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
1718                                         ((uint64_t)tx_pkts[0]->vlan_tci) << 32,
1719                                 ((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
1720                                         ((uint64_t)tx_pkts[1]->vlan_tci) << 32,
1721                         };
1722                         uint64x2_t ext23 = {
1723                                 ((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
1724                                         ((uint64_t)tx_pkts[2]->vlan_tci) << 32,
1725                                 ((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
1726                                         ((uint64_t)tx_pkts[3]->vlan_tci) << 32,
1727                         };
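                        /* vlan_tci_outer lands in the VLAN0 TCI field
                         * (bits 23:8) of SEND_EXT_W1 and vlan_tci in the
                         * VLAN1 TCI field (bits 47:32).
                         */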
1728
1729                         /* Get ol_flags of the packets. */
1730                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1731                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1732
1733                         /* OR vlan outer/inner values into cmd. */
1734                         sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
1735                         sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
1736
1737                         /* Test for offload enable bits and generate masks. */
1738                         xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
1739                                                       mlv),
1740                                             vandq_u64(vtstq_u64(xtmp128, olq),
1741                                                       mlq));
1742                         ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
1743                                                       mlv),
1744                                             vandq_u64(vtstq_u64(ytmp128, olq),
1745                                                       mlq));
1746
1747                         /* Set vlan enable bits into cmd based on mask. */
1748                         sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
1749                         sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
1750                 }
1751
1752                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1753                         /* Tx ol_flag for timestamp. */
1754                         const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
1755                                                 PKT_TX_IEEE1588_TMST};
1756                         /* Set send mem alg to SUB. */
1757                         const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
1758                         /* Increment send mem address by 8. */
1759                         const uint64x2_t addr = {0x8, 0x8};
1760
1761                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1762                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1763
1764                         /* Check if a timestamp is requested and generate
1765                          * an inverted mask, since the default cmd value
1766                          * already handles the enabled case; the fixups
1767                          * below then apply only where it is disabled.
1768                          */
1768                         xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
1769                         ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
1770
1771                         /* Change send mem address to an 8-byte offset when
1772                          * TSTMP is disabled.
1773                          */
1774                         sendmem01_w1 = vaddq_u64(sendmem01_w1,
1775                                                  vandq_u64(xtmp128, addr));
1776                         sendmem23_w1 = vaddq_u64(sendmem23_w1,
1777                                                  vandq_u64(ytmp128, addr));
1778                         /* Change send mem alg to SUB when TSTMP is disabled. */
1779                         sendmem01_w0 = vorrq_u64(sendmem01_w0,
1780                                                  vandq_u64(xtmp128, alg));
1781                         sendmem23_w0 = vorrq_u64(sendmem23_w0,
1782                                                  vandq_u64(ytmp128, alg));
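                        /* A SEND_MEM descriptor always executes; when no
                         * timestamp is requested the address is bumped by 8
                         * and the alg switched to SUB, presumably steering
                         * the write to an adjacent scratch word instead of
                         * the live timestamp slot.
                         */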
1783
1784                         cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
1785                         cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
1786                         cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
1787                         cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
1788                 }
1789
1790                 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1791                         const uint64_t lso_fmt = txq->lso_tun_fmt;
1792                         uint64_t sx_w0[NIX_DESCS_PER_LOOP];
1793                         uint64_t sd_w1[NIX_DESCS_PER_LOOP];
1794
1795                         /* Extract SD W1 as we need to set L4 types. */
1796                         vst1q_u64(sd_w1, senddesc01_w1);
1797                         vst1q_u64(sd_w1 + 2, senddesc23_w1);
1798
1799                         /* Extract SX W0 as we need to set LSO fields. */
1800                         vst1q_u64(sx_w0, sendext01_w0);
1801                         vst1q_u64(sx_w0 + 2, sendext23_w0);
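                        /* TSO fixups are per-packet and branchy, so the
                         * vectors are spilled to scalar arrays, patched by
                         * cn10k_nix_prepare_tso() and reloaded below.
                         */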
1802
1803                         /* Extract ol_flags. */
1804                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1805                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1806
1807                         /* Prepare individual mbufs. */
1808                         cn10k_nix_prepare_tso(tx_pkts[0],
1809                                 (union nix_send_hdr_w1_u *)&sd_w1[0],
1810                                 (union nix_send_ext_w0_u *)&sx_w0[0],
1811                                 vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
1812
1813                         cn10k_nix_prepare_tso(tx_pkts[1],
1814                                 (union nix_send_hdr_w1_u *)&sd_w1[1],
1815                                 (union nix_send_ext_w0_u *)&sx_w0[1],
1816                                 vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
1817
1818                         cn10k_nix_prepare_tso(tx_pkts[2],
1819                                 (union nix_send_hdr_w1_u *)&sd_w1[2],
1820                                 (union nix_send_ext_w0_u *)&sx_w0[2],
1821                                 vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
1822
1823                         cn10k_nix_prepare_tso(tx_pkts[3],
1824                                 (union nix_send_hdr_w1_u *)&sd_w1[3],
1825                                 (union nix_send_ext_w0_u *)&sx_w0[3],
1826                                 vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
1827
1828                         senddesc01_w1 = vld1q_u64(sd_w1);
1829                         senddesc23_w1 = vld1q_u64(sd_w1 + 2);
1830
1831                         sendext01_w0 = vld1q_u64(sx_w0);
1832                         sendext23_w0 = vld1q_u64(sx_w0 + 2);
1833                 }
1834
1835                 if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
1836                     !(flags & NIX_TX_MULTI_SEG_F)) {
1837                         /* Set don't free bit if reference count > 1 */
1838                         xmask01 = vdupq_n_u64(0);
1839                         xmask23 = xmask01;
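                        /* 0x80000 is BIT(19), the SEND_HDR_W0 don't-free
                         * (DF) bit; with DF set NIX transmits the buffer but
                         * does not return it to the aura.
                         */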
1840
1841                         /* Reset pointers back to the mbufs */
1842                         mbuf0 = (uint64_t *)tx_pkts[0];
1843                         mbuf1 = (uint64_t *)tx_pkts[1];
1844                         mbuf2 = (uint64_t *)tx_pkts[2];
1845                         mbuf3 = (uint64_t *)tx_pkts[3];
1846
1847                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
1848                                 xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
1849                         else
1850                                 __mempool_check_cookies(
1851                                         ((struct rte_mbuf *)mbuf0)->pool,
1852                                         (void **)&mbuf0, 1, 0);
1853
1854                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
1855                                 xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
1856                         else
1857                                 __mempool_check_cookies(
1858                                         ((struct rte_mbuf *)mbuf1)->pool,
1859                                         (void **)&mbuf1, 1, 0);
1860
1861                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
1862                                 xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
1863                         else
1864                                 __mempool_check_cookies(
1865                                         ((struct rte_mbuf *)mbuf2)->pool,
1866                                         (void **)&mbuf2, 1, 0);
1867
1868                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
1869                                 xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
1870                         else
1871                                 __mempool_check_cookies(
1872                                         ((struct rte_mbuf *)mbuf3)->pool,
1873                                         (void **)&mbuf3, 1, 0);
1874                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1875                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1876                 } else if (!(flags & NIX_TX_MULTI_SEG_F)) {
1877                         /* Reset pointers back to the mbufs */
1878                         mbuf0 = (uint64_t *)tx_pkts[0];
1879                         mbuf1 = (uint64_t *)tx_pkts[1];
1880                         mbuf2 = (uint64_t *)tx_pkts[2];
1881                         mbuf3 = (uint64_t *)tx_pkts[3];
1882
1883                         /* Mark mempool object as "put" since
1884                          * it is freed by NIX
1885                          */
1886                         __mempool_check_cookies(
1887                                 ((struct rte_mbuf *)mbuf0)->pool,
1888                                 (void **)&mbuf0, 1, 0);
1889
1890                         __mempool_check_cookies(
1891                                 ((struct rte_mbuf *)mbuf1)->pool,
1892                                 (void **)&mbuf1, 1, 0);
1893
1894                         __mempool_check_cookies(
1895                                 ((struct rte_mbuf *)mbuf2)->pool,
1896                                 (void **)&mbuf2, 1, 0);
1897
1898                         __mempool_check_cookies(
1899                                 ((struct rte_mbuf *)mbuf3)->pool,
1900                                 (void **)&mbuf3, 1, 0);
1901                 }
1902
1903                 /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
1904                 cmd0[0] = vzip1q_u64(senddesc01_w0, senddesc01_w1);
1905                 cmd0[1] = vzip2q_u64(senddesc01_w0, senddesc01_w1);
1906                 cmd0[2] = vzip1q_u64(senddesc23_w0, senddesc23_w1);
1907                 cmd0[3] = vzip2q_u64(senddesc23_w0, senddesc23_w1);
1908
1909                 cmd1[0] = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
1910                 cmd1[1] = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
1911                 cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
1912                 cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
1913
1914                 if (flags & NIX_TX_NEED_EXT_HDR) {
1915                         cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
1916                         cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
1917                         cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
1918                         cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
1919                 }
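                /* Sub-descriptors must land in HW order: SEND_HDR, then
                 * SEND_EXT, then SG, then SEND_MEM, which is why the stores
                 * below interleave cmd0/cmd2/cmd1/cmd3 rather than keeping
                 * them contiguous.
                 */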
1920
1921                 if (flags & NIX_TX_MULTI_SEG_F) {
1922                         uint8_t j;
1923
1924                         segdw[4] = 8;
1925                         j = cn10k_nix_prep_lmt_mseg_vector(tx_pkts, cmd0, cmd1,
1926                                                           cmd2, cmd3, segdw,
1927                                                           (uint64_t *)
1928                                                           LMT_OFF(laddr, lnum,
1929                                                                   0),
1930                                                           &wd.data128, &shift,
1931                                                           flags);
1932                         lnum += j;
1933                 } else if (flags & NIX_TX_NEED_EXT_HDR) {
1934                         /* Store the prepared send desc to LMT lines */
1935                         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1936                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1937                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1938                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1939                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
1940                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
1941                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
1942                                 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
1943                                 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
1944                                 lnum += 1;
1945                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1946                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1947                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1948                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
1949                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
1950                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
1951                                 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
1952                                 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
1953                         } else {
1954                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1955                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1956                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1957                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
1958                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
1959                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
1960                                 lnum += 1;
1961                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1962                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1963                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1964                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
1965                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
1966                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
1967                         }
1968                         lnum += 1;
1969                 } else {
1970                         /* Store the prepared send desc to LMT lines */
1971                         vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1972                         vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
1973                         vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
1974                         vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
1975                         vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
1976                         vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
1977                         vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
1978                         vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
1979                         lnum += 1;
1980                 }
1981
1982                 tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
1983         }
1984
1985         if (flags & NIX_TX_MULTI_SEG_F)
1986                 wd.data[0] >>= 16;
1987
1988         if (flags & NIX_TX_VWQE_F)
1989                 roc_sso_hws_head_wait(base);
1990
1991         /* Trigger LMTST */
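        /* A single LMTST burst (STEORL) covers at most 16 LMT lines, so
         * larger bursts are split into the two submissions below.
         */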
1992         if (lnum > 16) {
1993                 if (!(flags & NIX_TX_MULTI_SEG_F))
1994                         wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
1995
1996                 pa = io_addr | (wd.data[0] & 0x7) << 4;
1997                 wd.data[0] &= ~0x7ULL;
1998
1999                 if (flags & NIX_TX_MULTI_SEG_F)
2000                         wd.data[0] <<= 16;
2001
2002                 wd.data[0] |= (15ULL << 12);
2003                 wd.data[0] |= (uint64_t)lmt_id;
2004
2005                 /* STEOR0 */
2006                 roc_lmt_submit_steorl(wd.data[0], pa);
2007
2008                 if (!(flags & NIX_TX_MULTI_SEG_F))
2009                         wd.data[1] = cn10k_nix_tx_steor_vec_data(flags);
2010
2011                 pa = io_addr | (wd.data[1] & 0x7) << 4;
2012                 wd.data[1] &= ~0x7ULL;
2013
2014                 if (flags & NIX_TX_MULTI_SEG_F)
2015                         wd.data[1] <<= 16;
2016
2017                 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
2018                 wd.data[1] |= (uint64_t)(lmt_id + 16);
2019
2020                 /* STEOR1 */
2021                 roc_lmt_submit_steorl(wd.data[1], pa);
2022         } else if (lnum) {
2023                 if (!(flags & NIX_TX_MULTI_SEG_F))
2024                         wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
2025
2026                 pa = io_addr | (wd.data[0] & 0x7) << 4;
2027                 wd.data[0] &= ~0x7ULL;
2028
2029                 if (flags & NIX_TX_MULTI_SEG_F)
2030                         wd.data[0] <<= 16;
2031
2032                 wd.data[0] |= ((uint64_t)(lnum - 1)) << 12;
2033                 wd.data[0] |= (uint64_t)lmt_id;
2034
2035                 /* STEOR0 */
2036                 roc_lmt_submit_steorl(wd.data[0], pa);
2037         }
2038
2039         left -= burst;
2040         rte_io_wmb();
2041         if (left)
2042                 goto again;
2043
2044         if (unlikely(scalar)) {
2045                 if (flags & NIX_TX_MULTI_SEG_F)
2046                         pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
2047                                                          scalar, cmd, base,
2048                                                          flags);
2049                 else
2050                         pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
2051                                                     cmd, base, flags);
2052         }
2053
2054         return pkts;
2055 }
2056
2057 #else
2058 static __rte_always_inline uint16_t
2059 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
2060                            uint16_t pkts, uint64_t *cmd, uintptr_t base,
2061                            const uint16_t flags)
2062 {
2063         RTE_SET_USED(tx_queue);
2064         RTE_SET_USED(tx_pkts);
2065         RTE_SET_USED(pkts);
2066         RTE_SET_USED(cmd);
2067         RTE_SET_USED(flags);
2068         RTE_SET_USED(base);
2069         return 0;
2070 }
2071 #endif
2072
2073 #define L3L4CSUM_F   NIX_TX_OFFLOAD_L3_L4_CSUM_F
2074 #define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
2075 #define VLAN_F       NIX_TX_OFFLOAD_VLAN_QINQ_F
2076 #define NOFF_F       NIX_TX_OFFLOAD_MBUF_NOFF_F
2077 #define TSO_F        NIX_TX_OFFLOAD_TSO_F
2078 #define TSP_F        NIX_TX_OFFLOAD_TSTAMP_F
2079
2080 /* [TSP] [TSO] [NOFF] [VLAN] [OL3OL4CSUM] [L3L4CSUM] */
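/* In each T() entry below, the six binary columns follow the flag order
 * above, the numeric column is the per-packet command size in dwords
 * (4 = hdr + sg, 6 = + ext hdr, 8 = + ext hdr + timestamp), and the last
 * column is the combined offload flag mask.
 */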
2081 #define NIX_TX_FASTPATH_MODES                                           \
2082 T(no_offload,                           0, 0, 0, 0, 0, 0,       4,      \
2083                 NIX_TX_OFFLOAD_NONE)                                    \
2084 T(l3l4csum,                             0, 0, 0, 0, 0, 1,       4,      \
2085                 L3L4CSUM_F)                                             \
2086 T(ol3ol4csum,                           0, 0, 0, 0, 1, 0,       4,      \
2087                 OL3OL4CSUM_F)                                           \
2088 T(ol3ol4csum_l3l4csum,                  0, 0, 0, 0, 1, 1,       4,      \
2089                 OL3OL4CSUM_F | L3L4CSUM_F)                              \
2090 T(vlan,                                 0, 0, 0, 1, 0, 0,       6,      \
2091                 VLAN_F)                                                 \
2092 T(vlan_l3l4csum,                        0, 0, 0, 1, 0, 1,       6,      \
2093                 VLAN_F | L3L4CSUM_F)                                    \
2094 T(vlan_ol3ol4csum,                      0, 0, 0, 1, 1, 0,       6,      \
2095                 VLAN_F | OL3OL4CSUM_F)                                  \
2096 T(vlan_ol3ol4csum_l3l4csum,             0, 0, 0, 1, 1, 1,       6,      \
2097                 VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
2098 T(noff,                                 0, 0, 1, 0, 0, 0,       4,      \
2099                 NOFF_F)                                                 \
2100 T(noff_l3l4csum,                        0, 0, 1, 0, 0, 1,       4,      \
2101                 NOFF_F | L3L4CSUM_F)                                    \
2102 T(noff_ol3ol4csum,                      0, 0, 1, 0, 1, 0,       4,      \
2103                 NOFF_F | OL3OL4CSUM_F)                                  \
2104 T(noff_ol3ol4csum_l3l4csum,             0, 0, 1, 0, 1, 1,       4,      \
2105                 NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
2106 T(noff_vlan,                            0, 0, 1, 1, 0, 0,       6,      \
2107                 NOFF_F | VLAN_F)                                        \
2108 T(noff_vlan_l3l4csum,                   0, 0, 1, 1, 0, 1,       6,      \
2109                 NOFF_F | VLAN_F | L3L4CSUM_F)                           \
2110 T(noff_vlan_ol3ol4csum,                 0, 0, 1, 1, 1, 0,       6,      \
2111                 NOFF_F | VLAN_F | OL3OL4CSUM_F)                         \
2112 T(noff_vlan_ol3ol4csum_l3l4csum,        0, 0, 1, 1, 1, 1,       6,      \
2113                 NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)            \
2114 T(tso,                                  0, 1, 0, 0, 0, 0,       6,      \
2115                 TSO_F)                                                  \
2116 T(tso_l3l4csum,                         0, 1, 0, 0, 0, 1,       6,      \
2117                 TSO_F | L3L4CSUM_F)                                     \
2118 T(tso_ol3ol4csum,                       0, 1, 0, 0, 1, 0,       6,      \
2119                 TSO_F | OL3OL4CSUM_F)                                   \
2120 T(tso_ol3ol4csum_l3l4csum,              0, 1, 0, 0, 1, 1,       6,      \
2121                 TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
2122 T(tso_vlan,                             0, 1, 0, 1, 0, 0,       6,      \
2123                 TSO_F | VLAN_F)                                         \
2124 T(tso_vlan_l3l4csum,                    0, 1, 0, 1, 0, 1,       6,      \
2125                 TSO_F | VLAN_F | L3L4CSUM_F)                            \
2126 T(tso_vlan_ol3ol4csum,                  0, 1, 0, 1, 1, 0,       6,      \
2127                 TSO_F | VLAN_F | OL3OL4CSUM_F)                          \
2128 T(tso_vlan_ol3ol4csum_l3l4csum,         0, 1, 0, 1, 1, 1,       6,      \
2129                 TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2130 T(tso_noff,                             0, 1, 1, 0, 0, 0,       6,      \
2131                 TSO_F | NOFF_F)                                         \
2132 T(tso_noff_l3l4csum,                    0, 1, 1, 0, 0, 1,       6,      \
2133                 TSO_F | NOFF_F | L3L4CSUM_F)                            \
2134 T(tso_noff_ol3ol4csum,                  0, 1, 1, 0, 1, 0,       6,      \
2135                 TSO_F | NOFF_F | OL3OL4CSUM_F)                          \
2136 T(tso_noff_ol3ol4csum_l3l4csum,         0, 1, 1, 0, 1, 1,       6,      \
2137                 TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2138 T(tso_noff_vlan,                        0, 1, 1, 1, 0, 0,       6,      \
2139                 TSO_F | NOFF_F | VLAN_F)                                \
2140 T(tso_noff_vlan_l3l4csum,               0, 1, 1, 1, 0, 1,       6,      \
2141                 TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
2142 T(tso_noff_vlan_ol3ol4csum,             0, 1, 1, 1, 1, 0,       6,      \
2143                 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
2144 T(tso_noff_vlan_ol3ol4csum_l3l4csum,    0, 1, 1, 1, 1, 1,       6,      \
2145                 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
2146 T(ts,                                   1, 0, 0, 0, 0, 0,       8,      \
2147                 TSP_F)                                                  \
2148 T(ts_l3l4csum,                          1, 0, 0, 0, 0, 1,       8,      \
2149                 TSP_F | L3L4CSUM_F)                                     \
2150 T(ts_ol3ol4csum,                        1, 0, 0, 0, 1, 0,       8,      \
2151                 TSP_F | OL3OL4CSUM_F)                                   \
2152 T(ts_ol3ol4csum_l3l4csum,               1, 0, 0, 0, 1, 1,       8,      \
2153                 TSP_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
2154 T(ts_vlan,                              1, 0, 0, 1, 0, 0,       8,      \
2155                 TSP_F | VLAN_F)                                         \
2156 T(ts_vlan_l3l4csum,                     1, 0, 0, 1, 0, 1,       8,      \
2157                 TSP_F | VLAN_F | L3L4CSUM_F)                            \
2158 T(ts_vlan_ol3ol4csum,                   1, 0, 0, 1, 1, 0,       8,      \
2159                 TSP_F | VLAN_F | OL3OL4CSUM_F)                          \
2160 T(ts_vlan_ol3ol4csum_l3l4csum,          1, 0, 0, 1, 1, 1,       8,      \
2161                 TSP_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2162 T(ts_noff,                              1, 0, 1, 0, 0, 0,       8,      \
2163                 TSP_F | NOFF_F)                                         \
2164 T(ts_noff_l3l4csum,                     1, 0, 1, 0, 0, 1,       8,      \
2165                 TSP_F | NOFF_F | L3L4CSUM_F)                            \
2166 T(ts_noff_ol3ol4csum,                   1, 0, 1, 0, 1, 0,       8,      \
2167                 TSP_F | NOFF_F | OL3OL4CSUM_F)                          \
2168 T(ts_noff_ol3ol4csum_l3l4csum,          1, 0, 1, 0, 1, 1,       8,      \
2169                 TSP_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2170 T(ts_noff_vlan,                         1, 0, 1, 1, 0, 0,       8,      \
2171                 TSP_F | NOFF_F | VLAN_F)                                \
2172 T(ts_noff_vlan_l3l4csum,                1, 0, 1, 1, 0, 1,       8,      \
2173                 TSP_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
2174 T(ts_noff_vlan_ol3ol4csum,              1, 0, 1, 1, 1, 0,       8,      \
2175                 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
2176 T(ts_noff_vlan_ol3ol4csum_l3l4csum,     1, 0, 1, 1, 1, 1,       8,      \
2177                 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
2178 T(ts_tso,                               1, 1, 0, 0, 0, 0,       8,      \
2179                 TSP_F | TSO_F)                                          \
2180 T(ts_tso_l3l4csum,                      1, 1, 0, 0, 0, 1,       8,      \
2181                 TSP_F | TSO_F | L3L4CSUM_F)                             \
2182 T(ts_tso_ol3ol4csum,                    1, 1, 0, 0, 1, 0,       8,      \
2183                 TSP_F | TSO_F | OL3OL4CSUM_F)                           \
2184 T(ts_tso_ol3ol4csum_l3l4csum,           1, 1, 0, 0, 1, 1,       8,      \
2185                 TSP_F | TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)              \
2186 T(ts_tso_vlan,                          1, 1, 0, 1, 0, 0,       8,      \
2187                 TSP_F | TSO_F | VLAN_F)                                 \
2188 T(ts_tso_vlan_l3l4csum,                 1, 1, 0, 1, 0, 1,       8,      \
2189                 TSP_F | TSO_F | VLAN_F | L3L4CSUM_F)                    \
2190 T(ts_tso_vlan_ol3ol4csum,               1, 1, 0, 1, 1, 0,       8,      \
2191                 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F)                  \
2192 T(ts_tso_vlan_ol3ol4csum_l3l4csum,      1, 1, 0, 1, 1, 1,       8,      \
2193                 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
2194 T(ts_tso_noff,                          1, 1, 1, 0, 0, 0,       8,      \
2195                 TSP_F | TSO_F | NOFF_F)                                 \
2196 T(ts_tso_noff_l3l4csum,                 1, 1, 1, 0, 0, 1,       8,      \
2197                 TSP_F | TSO_F | NOFF_F | L3L4CSUM_F)                    \
2198 T(ts_tso_noff_ol3ol4csum,               1, 1, 1, 0, 1, 0,       8,      \
2199                 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F)                  \
2200 T(ts_tso_noff_ol3ol4csum_l3l4csum,      1, 1, 1, 0, 1, 1,       8,      \
2201                 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
2202 T(ts_tso_noff_vlan,                     1, 1, 1, 1, 0, 0,       8,      \
2203                 TSP_F | TSO_F | NOFF_F | VLAN_F)                        \
2204 T(ts_tso_noff_vlan_l3l4csum,            1, 1, 1, 1, 0, 1,       8,      \
2205                 TSP_F | TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)           \
2206 T(ts_tso_noff_vlan_ol3ol4csum,          1, 1, 1, 1, 1, 0,       8,      \
2207                 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)         \
2208 T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum, 1, 1, 1, 1, 1, 1,       8,      \
2209                 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)
2210
2211 #define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
2212         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_##name(          \
2213                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
2214                                                                                \
2215         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_mseg_##name(     \
2216                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
2217                                                                                \
2218         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name(      \
2219                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
2220                                                                                \
2221         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
2222                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
2223
2224 NIX_TX_FASTPATH_MODES
2225 #undef T
2226
2227 #endif /* __CN10K_TX_H__ */