net/cnxk: CN10K Tx burst support
[dpdk.git] / drivers / net / cnxk / cn10k_tx.h
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(C) 2021 Marvell.
3  */
4 #ifndef __CN10K_TX_H__
5 #define __CN10K_TX_H__
6
7 #include <rte_vect.h>
8
/* Per-offload feature bits used to specialize the Tx burst functions at
 * compile time (one generated variant per flag combination).
 */
#define NIX_TX_OFFLOAD_NONE           (0)
#define NIX_TX_OFFLOAD_L3_L4_CSUM_F   BIT(0)
#define NIX_TX_OFFLOAD_OL3_OL4_CSUM_F BIT(1)
#define NIX_TX_OFFLOAD_VLAN_QINQ_F    BIT(2)
#define NIX_TX_OFFLOAD_MBUF_NOFF_F    BIT(3)
#define NIX_TX_OFFLOAD_TSO_F          BIT(4)
#define NIX_TX_OFFLOAD_TSTAMP_F       BIT(5)

/* Flags to control xmit_prepare function.
 * Defining it from backwards to denote its been
 * not used as offload flags to pick function
 */
#define NIX_TX_MULTI_SEG_F BIT(15)

/* Offloads that require the send header W1 word to be filled. */
#define NIX_TX_NEED_SEND_HDR_W1                                                \
	(NIX_TX_OFFLOAD_L3_L4_CSUM_F | NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |         \
	 NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)

/* Offloads that require the NIX send ext sub-descriptor. */
#define NIX_TX_NEED_EXT_HDR                                                    \
	(NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |                \
	 NIX_TX_OFFLOAD_TSO_F)

/* Flow-control gate: refresh the cached free-slot count from *fc_mem and
 * make the calling Tx function return 0 (nothing sent) when the SQ does
 * not have room for 'pkts' packets.
 */
#define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
	do {                                                                   \
		/* Cached value is low, Update the fc_cache_pkts */            \
		if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
			/* Multiply with sqe_per_sqb to express in pkts */     \
			(txq)->fc_cache_pkts =                                 \
				((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
				<< (txq)->sqes_per_sqb_log2;                   \
			/* Check it again for the room */                      \
			if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
				return 0;                                      \
		}                                                              \
	} while (0)

/* Encoded number of segments to number of dwords macro, each value of nb_segs
 * is encoded as 4bits.
 */
#define NIX_SEGDW_MAGIC 0x76654432210ULL

#define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)

/* Address of 'offset' bytes into LMT line 'lmt_num' of the LMT region at
 * 'lmt_addr'.
 */
#define LMT_OFF(lmt_addr, lmt_num, offset)                                     \
	(void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
54
55 /* Function to determine no of tx subdesc required in case ext
56  * sub desc is enabled.
57  */
58 static __rte_always_inline int
59 cn10k_nix_tx_ext_subs(const uint16_t flags)
60 {
61         return (flags & NIX_TX_OFFLOAD_TSTAMP_F)
62                        ? 2
63                        : ((flags &
64                            (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F))
65                                   ? 1
66                                   : 0);
67 }
68
69 static __rte_always_inline uint8_t
70 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
71 {
72         return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
73                << ROC_LMT_LINES_PER_CORE_LOG2;
74 }
75
76 static __rte_always_inline uint8_t
77 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
78 {
79         return (flags & NIX_TX_NEED_EXT_HDR) ?
80                              ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
81                              8;
82 }
83
84 static __rte_always_inline uint64_t
85 cn10k_nix_tx_steor_data(const uint16_t flags)
86 {
87         const uint64_t dw_m1 = cn10k_nix_tx_ext_subs(flags) + 1;
88         uint64_t data;
89
90         /* This will be moved to addr area */
91         data = dw_m1;
92         /* 15 vector sizes for single seg */
93         data |= dw_m1 << 19;
94         data |= dw_m1 << 22;
95         data |= dw_m1 << 25;
96         data |= dw_m1 << 28;
97         data |= dw_m1 << 31;
98         data |= dw_m1 << 34;
99         data |= dw_m1 << 37;
100         data |= dw_m1 << 40;
101         data |= dw_m1 << 43;
102         data |= dw_m1 << 46;
103         data |= dw_m1 << 49;
104         data |= dw_m1 << 52;
105         data |= dw_m1 << 55;
106         data |= dw_m1 << 58;
107         data |= dw_m1 << 61;
108
109         return data;
110 }
111
112 static __rte_always_inline uint8_t
113 cn10k_nix_tx_dwords_per_line_seg(const uint16_t flags)
114 {
115         return ((flags & NIX_TX_NEED_EXT_HDR) ?
116                               (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6 :
117                               4);
118 }
119
120 static __rte_always_inline uint64_t
121 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
122 {
123         const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
124         uint64_t data;
125
126         /* This will be moved to addr area */
127         data = dw_m1;
128         /* 15 vector sizes for single seg */
129         data |= dw_m1 << 19;
130         data |= dw_m1 << 22;
131         data |= dw_m1 << 25;
132         data |= dw_m1 << 28;
133         data |= dw_m1 << 31;
134         data |= dw_m1 << 34;
135         data |= dw_m1 << 37;
136         data |= dw_m1 << 40;
137         data |= dw_m1 << 43;
138         data |= dw_m1 << 46;
139         data |= dw_m1 << 49;
140         data |= dw_m1 << 52;
141         data |= dw_m1 << 55;
142         data |= dw_m1 << 58;
143         data |= dw_m1 << 61;
144
145         return data;
146 }
147
148 static __rte_always_inline void
149 cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
150                       const uint16_t flags)
151 {
152         /* Send hdr */
153         cmd[0] = txq->send_hdr_w0;
154         cmd[1] = 0;
155         cmd += 2;
156
157         /* Send ext if present */
158         if (flags & NIX_TX_NEED_EXT_HDR) {
159                 *(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
160                 cmd += 2;
161         }
162
163         /* Send sg */
164         cmd[0] = txq->sg_w0;
165         cmd[1] = 0;
166 }
167
/* Pre-adjust packet headers for HW TSO: subtract the payload length from
 * the IP total/payload length field (and, for tunnels, from the outer IP
 * and outer UDP length fields) so each HW-generated segment carries
 * correct lengths. Mutates the packet data in place.
 */
static __rte_always_inline void
cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
{
	uint64_t mask, ol_flags = m->ol_flags;

	if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
		uintptr_t mdata = rte_pktmbuf_mtod(m, uintptr_t);
		uint16_t *iplen, *oiplen, *oudplen;
		uint16_t lso_sb, paylen;

		/* mask = all-ones only when an outer IP header is present */
		mask = -!!(ol_flags & (PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_IPV6));
		/* LSO start byte: total header bytes preceding the payload */
		lso_sb = (mask & (m->outer_l2_len + m->outer_l3_len)) +
			 m->l2_len + m->l3_len + m->l4_len;

		/* Reduce payload len from base headers */
		paylen = m->pkt_len - lso_sb;

		/* Get iplen position assuming no tunnel hdr:
		 * offset 2 into IPv4 (total length) or 4 into IPv6
		 * (payload length).
		 */
		iplen = (uint16_t *)(mdata + m->l2_len +
				     (2 << !!(ol_flags & PKT_TX_IPV6)));
		/* Handle tunnel tso */
		if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
		    (ol_flags & PKT_TX_TUNNEL_MASK)) {
			/* Bitmask lookup keyed by the tunnel-type field
			 * (bits starting at 45 of ol_flags): UDP tunnel?
			 */
			const uint8_t is_udp_tun =
				(CNXK_NIX_UDP_TUN_BITMASK >>
				 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
				0x1;

			oiplen = (uint16_t *)(mdata + m->outer_l2_len +
					      (2 << !!(ol_flags &
						       PKT_TX_OUTER_IPV6)));
			*oiplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*oiplen) -
						   paylen);

			/* Update format for UDP tunneled packet */
			if (is_udp_tun) {
				/* UDP length word is 4 bytes into the hdr */
				oudplen = (uint16_t *)(mdata + m->outer_l2_len +
						       m->outer_l3_len + 4);
				*oudplen = rte_cpu_to_be_16(
					rte_be_to_cpu_16(*oudplen) - paylen);
			}

			/* Update iplen position to inner ip hdr */
			iplen = (uint16_t *)(mdata + lso_sb - m->l3_len -
					     m->l4_len +
					     (2 << !!(ol_flags & PKT_TX_IPV6)));
		}

		*iplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*iplen) - paylen);
	}
}
219
/* Build the send descriptor for mbuf 'm' in the scratch buffer 'cmd'
 * (send hdr [+ ext hdr] + sg) and copy the finished words to the LMT
 * line at 'lmt_addr'. In the multi-seg case (NIX_TX_MULTI_SEG_F) only
 * the hdr/ext/sg templates are staged; the SG list itself is filled
 * later directly on the LMT line.
 */
static __rte_always_inline void
cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, uintptr_t lmt_addr,
		       const uint16_t flags, const uint64_t lso_tun_fmt)
{
	struct nix_send_ext_s *send_hdr_ext;
	struct nix_send_hdr_s *send_hdr;
	uint64_t ol_flags = 0, mask;
	union nix_send_hdr_w1_u w1;
	union nix_send_sg_s *sg;

	send_hdr = (struct nix_send_hdr_s *)cmd;
	if (flags & NIX_TX_NEED_EXT_HDR) {
		send_hdr_ext = (struct nix_send_ext_s *)(cmd + 2);
		sg = (union nix_send_sg_s *)(cmd + 4);
		/* Clear previous markings */
		send_hdr_ext->w0.lso = 0;
		send_hdr_ext->w1.u = 0;
	} else {
		sg = (union nix_send_sg_s *)(cmd + 2);
	}

	if (flags & NIX_TX_NEED_SEND_HDR_W1) {
		ol_flags = m->ol_flags;
		w1.u = 0;
	}

	if (!(flags & NIX_TX_MULTI_SEG_F)) {
		/* Single-seg: total length and return aura known here */
		send_hdr->w0.total = m->data_len;
		send_hdr->w0.aura =
			roc_npa_aura_handle_to_aura(m->pool->pool_id);
	}

	/*
	 * L3type:  2 => IPV4
	 *          3 => IPV4 with csum
	 *          4 => IPV6
	 * L3type and L3ptr needs to be set for either
	 * L3 csum or L4 csum or LSO
	 *
	 */

	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
	    (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
		const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
		const uint8_t ol3type =
			((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
			((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
			!!(ol_flags & PKT_TX_OUTER_IP_CKSUM);

		/* Outer L3 */
		w1.ol3type = ol3type;
		/* ~mask zeroes the ptr fields when there is no outer L3 */
		mask = 0xffffull << ((!!ol3type) << 4);
		w1.ol3ptr = ~mask & m->outer_l2_len;
		w1.ol4ptr = ~mask & (w1.ol3ptr + m->outer_l3_len);

		/* Outer L4 */
		w1.ol4type = csum + (csum << 1);

		/* Inner L3 */
		w1.il3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
			     ((!!(ol_flags & PKT_TX_IPV6)) << 2);
		w1.il3ptr = w1.ol4ptr + m->l2_len;
		w1.il4ptr = w1.il3ptr + m->l3_len;
		/* Increment it by 1 if it is IPV4 as 3 is with csum */
		w1.il3type = w1.il3type + !!(ol_flags & PKT_TX_IP_CKSUM);

		/* Inner L4 */
		w1.il4type = (ol_flags & PKT_TX_L4_MASK) >> 52;

		/* In case of no tunnel header use only
		 * shift IL3/IL4 fields a bit to use
		 * OL3/OL4 for header checksum
		 */
		mask = !ol3type;
		w1.u = ((w1.u & 0xFFFFFFFF00000000) >> (mask << 3)) |
		       ((w1.u & 0X00000000FFFFFFFF) >> (mask << 4));

	} else if (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) {
		const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
		const uint8_t outer_l2_len = m->outer_l2_len;

		/* Outer L3 */
		w1.ol3ptr = outer_l2_len;
		w1.ol4ptr = outer_l2_len + m->outer_l3_len;
		/* Increment it by 1 if it is IPV4 as 3 is with csum */
		w1.ol3type = ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
			     ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
			     !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);

		/* Outer L4 */
		w1.ol4type = csum + (csum << 1);

	} else if (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) {
		const uint8_t l2_len = m->l2_len;

		/* Always use OLXPTR and OLXTYPE when only
		 * one header is present
		 */

		/* Inner L3 */
		w1.ol3ptr = l2_len;
		w1.ol4ptr = l2_len + m->l3_len;
		/* Increment it by 1 if it is IPV4 as 3 is with csum */
		w1.ol3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
			     ((!!(ol_flags & PKT_TX_IPV6)) << 2) +
			     !!(ol_flags & PKT_TX_IP_CKSUM);

		/* Inner L4 */
		w1.ol4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
	}

	if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
		send_hdr_ext->w1.vlan1_ins_ena = !!(ol_flags & PKT_TX_VLAN);
		/* HW will update ptr after vlan0 update */
		send_hdr_ext->w1.vlan1_ins_ptr = 12;
		send_hdr_ext->w1.vlan1_ins_tci = m->vlan_tci;

		send_hdr_ext->w1.vlan0_ins_ena = !!(ol_flags & PKT_TX_QINQ);
		/* 2B before end of l2 header */
		send_hdr_ext->w1.vlan0_ins_ptr = 12;
		send_hdr_ext->w1.vlan0_ins_tci = m->vlan_tci_outer;
	}

	/* Note: TSO_F implies NIX_TX_NEED_EXT_HDR, so send_hdr_ext is
	 * always valid in this branch.
	 */
	if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
		uint16_t lso_sb;
		uint64_t mask;

		/* Pick outer or inner L4 ptr depending on tunnel presence */
		mask = -(!w1.il3type);
		lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;

		send_hdr_ext->w0.lso_sb = lso_sb;
		send_hdr_ext->w0.lso = 1;
		send_hdr_ext->w0.lso_mps = m->tso_segsz;
		send_hdr_ext->w0.lso_format =
			NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
		w1.ol4type = NIX_SENDL4TYPE_TCP_CKSUM;

		/* Handle tunnel tso */
		if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
		    (ol_flags & PKT_TX_TUNNEL_MASK)) {
			const uint8_t is_udp_tun =
				(CNXK_NIX_UDP_TUN_BITMASK >>
				 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
				0x1;
			/* Select the 8-bit lane of lso_tun_fmt that holds
			 * the format index for this tunnel/IP combination.
			 */
			uint8_t shift = is_udp_tun ? 32 : 0;

			shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
			shift += (!!(ol_flags & PKT_TX_IPV6) << 3);

			w1.il4type = NIX_SENDL4TYPE_TCP_CKSUM;
			w1.ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
			/* Update format for UDP tunneled packet */
			send_hdr_ext->w0.lso_format = (lso_tun_fmt >> shift);
		}
	}

	if (flags & NIX_TX_NEED_SEND_HDR_W1)
		send_hdr->w1.u = w1.u;

	if (!(flags & NIX_TX_MULTI_SEG_F)) {
		sg->seg1_size = m->data_len;
		*(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);

		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
			/* DF bit = 1 if refcount of current mbuf or parent mbuf
			 *              is greater than 1
			 * DF bit = 0 otherwise
			 */
			send_hdr->w0.df = cnxk_nix_prefree_seg(m);
		}
		/* Mark mempool object as "put" since it is freed by NIX */
		if (!send_hdr->w0.df)
			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
	}

	/* With minimal offloads, 'cmd' being local could be optimized out to
	 * registers. In other cases, 'cmd' will be in stack. Intent is
	 * 'cmd' stores content from txq->cmd which is copied only once.
	 */
	*((struct nix_send_hdr_s *)lmt_addr) = *send_hdr;
	lmt_addr += 16;
	if (flags & NIX_TX_NEED_EXT_HDR) {
		*((struct nix_send_ext_s *)lmt_addr) = *send_hdr_ext;
		lmt_addr += 16;
	}
	/* In case of multi-seg, sg template is stored here */
	*((union nix_send_sg_s *)lmt_addr) = *sg;
	*(rte_iova_t *)(lmt_addr + 8) = *(rte_iova_t *)(sg + 1);
}
409
410 static __rte_always_inline void
411 cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
412                               const uint64_t ol_flags, const uint16_t no_segdw,
413                               const uint16_t flags)
414 {
415         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
416                 const uint8_t is_ol_tstamp = !(ol_flags & PKT_TX_IEEE1588_TMST);
417                 struct nix_send_ext_s *send_hdr_ext =
418                                         (struct nix_send_ext_s *)lmt_addr + 16;
419                 uint64_t *lmt = (uint64_t *)lmt_addr;
420                 uint16_t off = (no_segdw - 1) << 1;
421                 struct nix_send_mem_s *send_mem;
422
423                 send_mem = (struct nix_send_mem_s *)(lmt + off);
424                 send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
425                 send_hdr_ext->w0.tstmp = 1;
426                 if (flags & NIX_TX_MULTI_SEG_F) {
427                         /* Retrieving the default desc values */
428                         lmt[off] = cmd[2];
429
430                         /* Using compiler barier to avoid voilation of C
431                          * aliasing rules.
432                          */
433                         rte_compiler_barrier();
434                 }
435
436                 /* Packets for which PKT_TX_IEEE1588_TMST is not set, tx tstamp
437                  * should not be recorded, hence changing the alg type to
438                  * NIX_SENDMEMALG_SET and also changing send mem addr field to
439                  * next 8 bytes as it corrpt the actual tx tstamp registered
440                  * address.
441                  */
442                 send_mem->w0.subdc = NIX_SUBDC_MEM;
443                 send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
444                 send_mem->addr =
445                         (rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
446         }
447 }
448
/* Fill the SG sub-descriptor chain for a multi-segment mbuf directly at
 * 'cmd' (typically the LMT line) and return the total descriptor size in
 * dword pairs, which is also written to send hdr sizem1.
 */
static __rte_always_inline uint16_t
cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
{
	struct nix_send_hdr_s *send_hdr;
	union nix_send_sg_s *sg;
	struct rte_mbuf *m_next;
	uint64_t *slist, sg_u;
	uint64_t nb_segs;
	uint64_t segdw;
	uint8_t off, i;

	send_hdr = (struct nix_send_hdr_s *)cmd;
	send_hdr->w0.total = m->pkt_len;
	send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);

	/* Ext header, when present, sits between hdr and sg */
	if (flags & NIX_TX_NEED_EXT_HDR)
		off = 2;
	else
		off = 0;

	sg = (union nix_send_sg_s *)&cmd[2 + off];
	/* Clear sg->u header before use (keep subdc/ld_type bits 63:58) */
	sg->u &= 0xFC00000000000000;
	sg_u = sg->u;
	slist = &cmd[3 + off];

	i = 0;
	nb_segs = m->nb_segs;

	/* Fill mbuf segments: each SG subdesc carries up to 3 segment
	 * sizes in 16-bit lanes of sg_u plus 3 iova entries in slist.
	 */
	do {
		m_next = m->next;
		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
		*slist = rte_mbuf_data_iova(m);
		/* Set invert df if buffer is not to be freed by H/W.
		 * NOTE(review): assumes cnxk_nix_prefree_seg() returns a
		 * 64-bit type; a 32-bit return would make '<< (i + 55)'
		 * undefined — confirm against its declaration.
		 */
		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
			/* Mark mempool object as "put" since it is freed by NIX
			 */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
		if (!(sg_u & (1ULL << (i + 55))))
			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
#endif
		slist++;
		i++;
		nb_segs--;
		if (i > 2 && nb_segs) {
			i = 0;
			/* Next SG subdesc */
			*(uint64_t *)slist = sg_u & 0xFC00000000000000;
			sg->u = sg_u;
			sg->segs = 3;
			sg = (union nix_send_sg_s *)slist;
			sg_u = sg->u;
			slist++;
		}
		m = m_next;
	} while (nb_segs);

	sg->u = sg_u;
	sg->segs = i;
	segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
	/* Roundup extra dwords to multiple of 2 */
	segdw = (segdw >> 1) + (segdw & 0x1);
	/* Default dwords: hdr (+ext) pair(s) plus optional mem subdesc */
	segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
	send_hdr->w0.sizem1 = segdw - 1;

	return segdw;
}
519
/* Single-segment scalar Tx burst: stages up to 32 packets per pass, one
 * LMT line each, then issues one or two LMTST STEOR operations (lines
 * 0-15 and 16-31). Returns the number of packets queued.
 */
static __rte_always_inline uint16_t
cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
		    uint64_t *cmd, const uint16_t flags)
{
	struct cn10k_eth_txq *txq = tx_queue;
	const rte_iova_t io_addr = txq->io_addr;
	uintptr_t pa, lmt_addr = txq->lmt_base;
	uint16_t lmt_id, burst, left, i;
	uint64_t lso_tun_fmt;
	uint64_t data;

	NIX_XMIT_FC_OR_RETURN(txq, pkts);

	/* Get cmd skeleton */
	cn10k_nix_tx_skeleton(txq, cmd, flags);

	/* Reduce the cached count */
	txq->fc_cache_pkts -= pkts;

	/* Only read under TSO_F; otherwise intentionally left unset */
	if (flags & NIX_TX_OFFLOAD_TSO_F)
		lso_tun_fmt = txq->lso_tun_fmt;

	/* Get LMT base address and LMT ID as lcore id */
	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
	left = pkts;
again:
	burst = left > 32 ? 32 : left;
	for (i = 0; i < burst; i++) {
		/* Perform header writes for TSO, barrier at
		 * lmt steorl will suffice.
		 */
		if (flags & NIX_TX_OFFLOAD_TSO_F)
			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

		cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
				       lso_tun_fmt);
		cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
					      tx_pkts[i]->ol_flags, 4, flags);
		lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
	}

	/* Trigger LMTST: size-1 of line 0 (low 3 bits of data) moves into
	 * I/O address bits 6:4; count-1 of lines in this steorl goes in
	 * bits 14:12; LMT ID in the low bits.
	 */
	if (burst > 16) {
		data = cn10k_nix_tx_steor_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= (15ULL << 12);
		data |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data, pa);

		data = cn10k_nix_tx_steor_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= ((uint64_t)(burst - 17)) << 12;
		data |= (uint64_t)(lmt_id + 16);

		/* STEOR1 */
		roc_lmt_submit_steorl(data, pa);
	} else if (burst) {
		data = cn10k_nix_tx_steor_data(flags);
		pa = io_addr | (data & 0x7) << 4;
		data &= ~0x7ULL;
		data |= ((uint64_t)(burst - 1)) << 12;
		data |= lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data, pa);
	}

	left -= burst;
	rte_io_wmb();
	if (left) {
		/* Start processing another burst */
		tx_pkts += burst;
		/* Reset lmt base addr */
		lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
		lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
		goto again;
	}

	return pkts;
}
604
/* Multi-segment scalar Tx burst: like cn10k_nix_xmit_pkts() but each
 * packet's per-line size (segdw - 1) varies, so the sizes are collected
 * into 3-bit lanes of a 128-bit accumulator (starting at bit 16) and
 * split across the two STEOR data words.
 */
static __rte_always_inline uint16_t
cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
			 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
	struct cn10k_eth_txq *txq = tx_queue;
	uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
	const rte_iova_t io_addr = txq->io_addr;
	uint16_t segdw, lmt_id, burst, left, i;
	uint64_t data0, data1;
	uint64_t lso_tun_fmt;
	__uint128_t data128;
	uint16_t shft;

	NIX_XMIT_FC_OR_RETURN(txq, pkts);

	cn10k_nix_tx_skeleton(txq, cmd, flags);

	/* Reduce the cached count */
	txq->fc_cache_pkts -= pkts;

	/* Only read under TSO_F; otherwise intentionally left unset */
	if (flags & NIX_TX_OFFLOAD_TSO_F)
		lso_tun_fmt = txq->lso_tun_fmt;

	/* Get LMT base address and LMT ID as lcore id */
	ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
	left = pkts;
again:
	burst = left > 32 ? 32 : left;
	shft = 16;
	data128 = 0;
	for (i = 0; i < burst; i++) {
		/* Perform header writes for TSO, barrier at
		 * lmt steorl will suffice.
		 */
		if (flags & NIX_TX_OFFLOAD_TSO_F)
			cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);

		cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
				       lso_tun_fmt);
		/* Store sg list directly on lmt line */
		segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)lmt_addr,
					       flags);
		cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
					      tx_pkts[i]->ol_flags, segdw,
					      flags);
		lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
		/* Record this line's size-1 in its 3-bit lane */
		data128 |= (((__uint128_t)(segdw - 1)) << shft);
		shft += 3;
	}

	data0 = (uint64_t)data128;
	data1 = (uint64_t)(data128 >> 64);
	/* Make data0 similar to data1: line 0's size-1 into bits 2:0 */
	data0 >>= 16;
	/* Trigger LMTST */
	if (burst > 16) {
		pa0 = io_addr | (data0 & 0x7) << 4;
		data0 &= ~0x7ULL;
		/* Move lmtst1..15 sz to bits 63:19 */
		data0 <<= 16;
		data0 |= (15ULL << 12);
		data0 |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data0, pa0);

		pa1 = io_addr | (data1 & 0x7) << 4;
		data1 &= ~0x7ULL;
		data1 <<= 16;
		data1 |= ((uint64_t)(burst - 17)) << 12;
		data1 |= (uint64_t)(lmt_id + 16);

		/* STEOR1 */
		roc_lmt_submit_steorl(data1, pa1);
	} else if (burst) {
		pa0 = io_addr | (data0 & 0x7) << 4;
		data0 &= ~0x7ULL;
		/* Move lmtst1..15 sz to bits 63:19 */
		data0 <<= 16;
		data0 |= ((burst - 1) << 12);
		data0 |= (uint64_t)lmt_id;

		/* STEOR0 */
		roc_lmt_submit_steorl(data0, pa0);
	}

	left -= burst;
	rte_io_wmb();
	if (left) {
		/* Start processing another burst */
		tx_pkts += burst;
		/* Reset lmt base addr */
		lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
		lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
		goto again;
	}

	return pkts;
}
704
705 #if defined(RTE_ARCH_ARM64)
706
/* Vector-path TSO prepare: fill the LSO fields of ext header w0 and fix
 * the L4 checksum types in send hdr w1 for an mbuf carrying
 * PKT_TX_TCP_SEG. No-op for non-TSO packets.
 */
static __rte_always_inline void
cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
		      union nix_send_ext_w0_u *w0, uint64_t ol_flags,
		      const uint64_t flags, const uint64_t lso_tun_fmt)
{
	uint16_t lso_sb;
	uint64_t mask;

	if (!(ol_flags & PKT_TX_TCP_SEG))
		return;

	/* Pick outer or inner L4 ptr depending on tunnel presence */
	mask = -(!w1->il3type);
	lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;

	/* BIT(14): presumably the w0.lso enable bit, set on the raw
	 * word — confirm against union nix_send_ext_w0_u layout.
	 */
	w0->u |= BIT(14);
	w0->lso_sb = lso_sb;
	w0->lso_mps = m->tso_segsz;
	w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
	w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;

	/* Handle tunnel tso */
	if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
	    (ol_flags & PKT_TX_TUNNEL_MASK)) {
		const uint8_t is_udp_tun =
			(CNXK_NIX_UDP_TUN_BITMASK >>
			 ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
			0x1;
		/* Select the 8-bit lane of lso_tun_fmt holding the format
		 * index for this tunnel/IP combination.
		 */
		uint8_t shift = is_udp_tun ? 32 : 0;

		shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
		shift += (!!(ol_flags & PKT_TX_IPV6) << 3);

		w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
		w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
		/* Update format for UDP tunneled packet */

		w0->lso_format = (lso_tun_fmt >> shift);
	}
}
746
/* Vector-path multi-seg SG fill: the first segment's size goes in lane 0
 * of the staged 'sg' word, remaining segments are chained into 'cmd'
 * (iova list / further SG subdescs). Also updates hdr w0 total length.
 */
static __rte_always_inline void
cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
				union nix_send_hdr_w0_u *sh,
				union nix_send_sg_s *sg, const uint32_t flags)
{
	struct rte_mbuf *m_next;
	uint64_t *slist, sg_u;
	uint16_t nb_segs;
	/* i starts at 1: lane 0 already holds the head segment's size */
	int i = 1;

	sh->total = m->pkt_len;
	/* Clear sg->u header before use */
	sg->u &= 0xFC00000000000000;
	sg_u = sg->u;
	slist = &cmd[0];

	sg_u = sg_u | ((uint64_t)m->data_len);

	nb_segs = m->nb_segs - 1;
	m_next = m->next;

	/* Set invert df if buffer is not to be freed by H/W.
	 * NOTE(review): assumes cnxk_nix_prefree_seg() returns a 64-bit
	 * type; a 32-bit return would make the shift undefined — confirm.
	 */
	if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
		sg_u |= (cnxk_nix_prefree_seg(m) << 55);
		/* Mark mempool object as "put" since it is freed by NIX */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
	if (!(sg_u & (1ULL << 55)))
		__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
	rte_io_wmb();
#endif

	m = m_next;
	/* Fill mbuf segments */
	do {
		m_next = m->next;
		sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
		*slist = rte_mbuf_data_iova(m);
		/* Set invert df if buffer is not to be freed by H/W */
		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
			sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
			/* Mark mempool object as "put" since it is freed by NIX
			 */
#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
		if (!(sg_u & (1ULL << (i + 55))))
			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
		rte_io_wmb();
#endif
		slist++;
		i++;
		nb_segs--;
		if (i > 2 && nb_segs) {
			i = 0;
			/* Next SG subdesc */
			*(uint64_t *)slist = sg_u & 0xFC00000000000000;
			sg->u = sg_u;
			sg->segs = 3;
			sg = (union nix_send_sg_s *)slist;
			sg_u = sg->u;
			slist++;
		}
		m = m_next;
	} while (nb_segs);

	sg->u = sg_u;
	sg->segs = i;
}
813
/* Prepare multi-seg descriptor words for one mbuf in the vector Tx path.
 *
 * Single-segment fast path: only patches the "invert df" bit (bit 55) in
 * the SG word held in @cmd1 when the buffer must not be freed by H/W.
 * Multi-segment path: extracts header/SG words from the NEON lanes, hands
 * them to cn10k_nix_prepare_mseg_vec_list() to build the chain into @cmd,
 * then writes back the updated words with sizem1 = segdw - 1.
 *
 * @m:     mbuf to describe
 * @cmd:   scratch area for the SG chain (unused when nb_segs == 1)
 * @cmd0:  vector holding send header w0 in lane 0
 * @cmd1:  vector holding SG w0 in lane 0
 * @segdw: total descriptor dwords for this packet
 * @flags: compile-time Tx offload flags
 */
static __rte_always_inline void
cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
			   uint64x2_t *cmd1, const uint8_t segdw,
			   const uint32_t flags)
{
	union nix_send_hdr_w0_u sh;
	union nix_send_sg_s sg;

	if (m->nb_segs == 1) {
		if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
			sg.u = vgetq_lane_u64(cmd1[0], 0);
			sg.u |= (cnxk_nix_prefree_seg(m) << 55);
			cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
		}

#ifdef RTE_LIBRTE_MEMPOOL_DEBUG
		/* Mempool object is "put" only when NIX frees it (bit 55
		 * clear).
		 */
		sg.u = vgetq_lane_u64(cmd1[0], 0);
		if (!(sg.u & (1ULL << 55)))
			__mempool_check_cookies(m->pool, (void **)&m, 1, 0);
		rte_io_wmb();
#endif
		return;
	}

	sh.u = vgetq_lane_u64(cmd0[0], 0);
	sg.u = vgetq_lane_u64(cmd1[0], 0);

	cn10k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);

	sh.sizem1 = segdw - 1;
	cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
	cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
}
847
848 #define NIX_DESCS_PER_LOOP 4
849
850 static __rte_always_inline uint8_t
851 cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
852                                uint64x2_t *cmd1, uint64x2_t *cmd2,
853                                uint64x2_t *cmd3, uint8_t *segdw,
854                                uint64_t *lmt_addr, __uint128_t *data128,
855                                uint8_t *shift, const uint16_t flags)
856 {
857         uint8_t j, off, lmt_used;
858
859         if (!(flags & NIX_TX_NEED_EXT_HDR) &&
860             !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
861                 /* No segments in 4 consecutive packets. */
862                 if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
863                         for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
864                                 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
865                                                            &cmd0[j], &cmd1[j],
866                                                            segdw[j], flags);
867                         vst1q_u64(lmt_addr, cmd0[0]);
868                         vst1q_u64(lmt_addr + 2, cmd1[0]);
869                         vst1q_u64(lmt_addr + 4, cmd0[1]);
870                         vst1q_u64(lmt_addr + 6, cmd1[1]);
871                         vst1q_u64(lmt_addr + 8, cmd0[2]);
872                         vst1q_u64(lmt_addr + 10, cmd1[2]);
873                         vst1q_u64(lmt_addr + 12, cmd0[3]);
874                         vst1q_u64(lmt_addr + 14, cmd1[3]);
875
876                         *data128 |= ((__uint128_t)7) << *shift;
877                         shift += 3;
878
879                         return 1;
880                 }
881         }
882
883         lmt_used = 0;
884         for (j = 0; j < NIX_DESCS_PER_LOOP;) {
885                 /* Fit consecutive packets in same LMTLINE. */
886                 if ((segdw[j] + segdw[j + 1]) <= 8) {
887                         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
888                                 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
889                                                            &cmd0[j], &cmd1[j],
890                                                            segdw[j], flags);
891                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1], NULL,
892                                                            &cmd0[j + 1],
893                                                            &cmd1[j + 1],
894                                                            segdw[j + 1], flags);
895                                 /* TSTAMP takes 4 each, no segs. */
896                                 vst1q_u64(lmt_addr, cmd0[j]);
897                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
898                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
899                                 vst1q_u64(lmt_addr + 6, cmd3[j]);
900
901                                 vst1q_u64(lmt_addr + 8, cmd0[j + 1]);
902                                 vst1q_u64(lmt_addr + 10, cmd2[j + 1]);
903                                 vst1q_u64(lmt_addr + 12, cmd1[j + 1]);
904                                 vst1q_u64(lmt_addr + 14, cmd3[j + 1]);
905                         } else if (flags & NIX_TX_NEED_EXT_HDR) {
906                                 /* EXT header take 3 each, space for 2 segs.*/
907                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
908                                                            lmt_addr + 6,
909                                                            &cmd0[j], &cmd1[j],
910                                                            segdw[j], flags);
911                                 vst1q_u64(lmt_addr, cmd0[j]);
912                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
913                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
914                                 off = segdw[j] - 3;
915                                 off <<= 1;
916                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
917                                                            lmt_addr + 12 + off,
918                                                            &cmd0[j + 1],
919                                                            &cmd1[j + 1],
920                                                            segdw[j + 1], flags);
921                                 vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
922                                 vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
923                                 vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
924                         } else {
925                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
926                                                            lmt_addr + 4,
927                                                            &cmd0[j], &cmd1[j],
928                                                            segdw[j], flags);
929                                 vst1q_u64(lmt_addr, cmd0[j]);
930                                 vst1q_u64(lmt_addr + 2, cmd1[j]);
931                                 off = segdw[j] - 2;
932                                 off <<= 1;
933                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
934                                                            lmt_addr + 8 + off,
935                                                            &cmd0[j + 1],
936                                                            &cmd1[j + 1],
937                                                            segdw[j + 1], flags);
938                                 vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
939                                 vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
940                         }
941                         *data128 |= ((__uint128_t)(segdw[j] + segdw[j + 1]) - 1)
942                                     << *shift;
943                         *shift += 3;
944                         j += 2;
945                 } else {
946                         if ((flags & NIX_TX_NEED_EXT_HDR) &&
947                             (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
948                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
949                                                            lmt_addr + 6,
950                                                            &cmd0[j], &cmd1[j],
951                                                            segdw[j], flags);
952                                 vst1q_u64(lmt_addr, cmd0[j]);
953                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
954                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
955                                 off = segdw[j] - 4;
956                                 off <<= 1;
957                                 vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
958                         } else if (flags & NIX_TX_NEED_EXT_HDR) {
959                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
960                                                            lmt_addr + 6,
961                                                            &cmd0[j], &cmd1[j],
962                                                            segdw[j], flags);
963                                 vst1q_u64(lmt_addr, cmd0[j]);
964                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
965                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
966                         } else {
967                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
968                                                            lmt_addr + 4,
969                                                            &cmd0[j], &cmd1[j],
970                                                            segdw[j], flags);
971                                 vst1q_u64(lmt_addr, cmd0[j]);
972                                 vst1q_u64(lmt_addr + 2, cmd1[j]);
973                         }
974                         *data128 |= ((__uint128_t)(segdw[j]) - 1) << *shift;
975                         *shift += 3;
976                         j++;
977                 }
978                 lmt_used++;
979                 lmt_addr += 16;
980         }
981
982         return lmt_used;
983 }
984
985 static __rte_always_inline uint16_t
986 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
987                            uint16_t pkts, uint64_t *cmd, const uint16_t flags)
988 {
989         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
990         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
991         uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
992                 cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
993         uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, pa;
994         uint64x2_t senddesc01_w0, senddesc23_w0;
995         uint64x2_t senddesc01_w1, senddesc23_w1;
996         uint16_t left, scalar, burst, i, lmt_id;
997         uint64x2_t sendext01_w0, sendext23_w0;
998         uint64x2_t sendext01_w1, sendext23_w1;
999         uint64x2_t sendmem01_w0, sendmem23_w0;
1000         uint64x2_t sendmem01_w1, sendmem23_w1;
1001         uint8_t segdw[NIX_DESCS_PER_LOOP + 1];
1002         uint64x2_t sgdesc01_w0, sgdesc23_w0;
1003         uint64x2_t sgdesc01_w1, sgdesc23_w1;
1004         struct cn10k_eth_txq *txq = tx_queue;
1005         uintptr_t laddr = txq->lmt_base;
1006         rte_iova_t io_addr = txq->io_addr;
1007         uint64x2_t ltypes01, ltypes23;
1008         uint64x2_t xtmp128, ytmp128;
1009         uint64x2_t xmask01, xmask23;
1010         uint8_t lnum, shift;
1011         union wdata {
1012                 __uint128_t data128;
1013                 uint64_t data[2];
1014         } wd;
1015
1016         NIX_XMIT_FC_OR_RETURN(txq, pkts);
1017
1018         scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
1019         pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
1020
1021         /* Reduce the cached count */
1022         txq->fc_cache_pkts -= pkts;
1023         /* Perform header writes before barrier for TSO */
1024         if (flags & NIX_TX_OFFLOAD_TSO_F) {
1025                 for (i = 0; i < pkts; i++)
1026                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
1027         }
1028
1029         senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
1030         senddesc23_w0 = senddesc01_w0;
1031         senddesc01_w1 = vdupq_n_u64(0);
1032         senddesc23_w1 = senddesc01_w1;
1033         sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
1034         sgdesc23_w0 = sgdesc01_w0;
1035
1036         /* Load command defaults into vector variables. */
1037         if (flags & NIX_TX_NEED_EXT_HDR) {
1038                 sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
1039                 sendext23_w0 = sendext01_w0;
1040                 sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
1041                 sendext23_w1 = sendext01_w1;
1042                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1043                         sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
1044                         sendmem23_w0 = sendmem01_w0;
1045                         sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
1046                         sendmem23_w1 = sendmem01_w1;
1047                 }
1048         }
1049
1050         /* Get LMT base address and LMT ID as lcore id */
1051         ROC_LMT_BASE_ID_GET(laddr, lmt_id);
1052         left = pkts;
1053 again:
1054         /* Number of packets to prepare depends on offloads enabled. */
1055         burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
1056                               cn10k_nix_pkts_per_vec_brst(flags) :
1057                               left;
1058         if (flags & NIX_TX_MULTI_SEG_F) {
1059                 wd.data128 = 0;
1060                 shift = 16;
1061         }
1062         lnum = 0;
1063
1064         for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
1065                 if (flags & NIX_TX_MULTI_SEG_F) {
1066                         uint8_t j;
1067
1068                         for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1069                                 struct rte_mbuf *m = tx_pkts[j];
1070
1071                                 /* Get dwords based on nb_segs. */
1072                                 segdw[j] = NIX_NB_SEGS_TO_SEGDW(m->nb_segs);
1073                                 /* Add dwords based on offloads. */
1074                                 segdw[j] += 1 + /* SEND HDR */
1075                                             !!(flags & NIX_TX_NEED_EXT_HDR) +
1076                                             !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
1077                         }
1078
1079                         /* Check if there are enough LMTLINES for this loop */
1080                         if (lnum + 4 > 32) {
1081                                 uint8_t ldwords_con = 0, lneeded = 0;
1082                                 for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1083                                         ldwords_con += segdw[j];
1084                                         if (ldwords_con > 8) {
1085                                                 lneeded += 1;
1086                                                 ldwords_con = segdw[j];
1087                                         }
1088                                 }
1089                                 lneeded += 1;
1090                                 if (lnum + lneeded > 32) {
1091                                         burst = i;
1092                                         break;
1093                                 }
1094                         }
1095                 }
1096                 /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
1097                 senddesc01_w0 =
1098                         vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1099                 sgdesc01_w0 = vbicq_u64(sgdesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1100
1101                 senddesc23_w0 = senddesc01_w0;
1102                 sgdesc23_w0 = sgdesc01_w0;
1103
1104                 /* Clear vlan enables. */
1105                 if (flags & NIX_TX_NEED_EXT_HDR) {
1106                         sendext01_w1 = vbicq_u64(sendext01_w1,
1107                                                  vdupq_n_u64(0x3FFFF00FFFF00));
1108                         sendext23_w1 = sendext01_w1;
1109                 }
1110
1111                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1112                         /* Reset send mem alg to SETTSTMP from SUB*/
1113                         sendmem01_w0 = vbicq_u64(sendmem01_w0,
1114                                                  vdupq_n_u64(BIT_ULL(59)));
1115                         /* Reset send mem address to default. */
1116                         sendmem01_w1 =
1117                                 vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
1118                         sendmem23_w0 = sendmem01_w0;
1119                         sendmem23_w1 = sendmem01_w1;
1120                 }
1121
1122                 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1123                         /* Clear the LSO enable bit. */
1124                         sendext01_w0 = vbicq_u64(sendext01_w0,
1125                                                  vdupq_n_u64(BIT_ULL(14)));
1126                         sendext23_w0 = sendext01_w0;
1127                 }
1128
1129                 /* Move mbufs to iova */
1130                 mbuf0 = (uint64_t *)tx_pkts[0];
1131                 mbuf1 = (uint64_t *)tx_pkts[1];
1132                 mbuf2 = (uint64_t *)tx_pkts[2];
1133                 mbuf3 = (uint64_t *)tx_pkts[3];
1134
1135                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1136                                      offsetof(struct rte_mbuf, buf_iova));
1137                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1138                                      offsetof(struct rte_mbuf, buf_iova));
1139                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1140                                      offsetof(struct rte_mbuf, buf_iova));
1141                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1142                                      offsetof(struct rte_mbuf, buf_iova));
1143                 /*
1144                  * Get mbuf's, olflags, iova, pktlen, dataoff
1145                  * dataoff_iovaX.D[0] = iova,
1146                  * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
1147                  * len_olflagsX.D[0] = ol_flags,
1148                  * len_olflagsX.D[1](63:32) = mbuf->pkt_len
1149                  */
1150                 dataoff_iova0 = vld1q_u64(mbuf0);
1151                 len_olflags0 = vld1q_u64(mbuf0 + 2);
1152                 dataoff_iova1 = vld1q_u64(mbuf1);
1153                 len_olflags1 = vld1q_u64(mbuf1 + 2);
1154                 dataoff_iova2 = vld1q_u64(mbuf2);
1155                 len_olflags2 = vld1q_u64(mbuf2 + 2);
1156                 dataoff_iova3 = vld1q_u64(mbuf3);
1157                 len_olflags3 = vld1q_u64(mbuf3 + 2);
1158
1159                 /* Move mbufs to point pool */
1160                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1161                                      offsetof(struct rte_mbuf, pool) -
1162                                      offsetof(struct rte_mbuf, buf_iova));
1163                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1164                                      offsetof(struct rte_mbuf, pool) -
1165                                      offsetof(struct rte_mbuf, buf_iova));
1166                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1167                                      offsetof(struct rte_mbuf, pool) -
1168                                      offsetof(struct rte_mbuf, buf_iova));
1169                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1170                                      offsetof(struct rte_mbuf, pool) -
1171                                      offsetof(struct rte_mbuf, buf_iova));
1172
1173                 if (flags & (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
1174                              NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
1175                         /* Get tx_offload for ol2, ol3, l2, l3 lengths */
1176                         /*
1177                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1178                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1179                          */
1180
1181                         asm volatile("LD1 {%[a].D}[0],[%[in]]\n\t"
1182                                      : [a] "+w"(senddesc01_w1)
1183                                      : [in] "r"(mbuf0 + 2)
1184                                      : "memory");
1185
1186                         asm volatile("LD1 {%[a].D}[1],[%[in]]\n\t"
1187                                      : [a] "+w"(senddesc01_w1)
1188                                      : [in] "r"(mbuf1 + 2)
1189                                      : "memory");
1190
1191                         asm volatile("LD1 {%[b].D}[0],[%[in]]\n\t"
1192                                      : [b] "+w"(senddesc23_w1)
1193                                      : [in] "r"(mbuf2 + 2)
1194                                      : "memory");
1195
1196                         asm volatile("LD1 {%[b].D}[1],[%[in]]\n\t"
1197                                      : [b] "+w"(senddesc23_w1)
1198                                      : [in] "r"(mbuf3 + 2)
1199                                      : "memory");
1200
1201                         /* Get pool pointer alone */
1202                         mbuf0 = (uint64_t *)*mbuf0;
1203                         mbuf1 = (uint64_t *)*mbuf1;
1204                         mbuf2 = (uint64_t *)*mbuf2;
1205                         mbuf3 = (uint64_t *)*mbuf3;
1206                 } else {
1207                         /* Get pool pointer alone */
1208                         mbuf0 = (uint64_t *)*mbuf0;
1209                         mbuf1 = (uint64_t *)*mbuf1;
1210                         mbuf2 = (uint64_t *)*mbuf2;
1211                         mbuf3 = (uint64_t *)*mbuf3;
1212                 }
1213
1214                 const uint8x16_t shuf_mask2 = {
1215                         0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1216                         0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1217                 };
1218                 xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
1219                 ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
1220
1221                 /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
1222                 const uint64x2_t and_mask0 = {
1223                         0xFFFFFFFFFFFFFFFF,
1224                         0x000000000000FFFF,
1225                 };
1226
1227                 dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
1228                 dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
1229                 dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
1230                 dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
1231
1232                 /*
1233                  * Pick only 16 bits of pktlen preset at bits 63:32
1234                  * and place them at bits 15:0.
1235                  */
1236                 xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
1237                 ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
1238
1239                 /* Add pairwise to get dataoff + iova in sgdesc_w1 */
1240                 sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
1241                 sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
1242
1243                 /* Orr both sgdesc_w0 and senddesc_w0 with 16 bits of
1244                  * pktlen at 15:0 position.
1245                  */
1246                 sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
1247                 sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
1248                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
1249                 senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
1250
1251                 /* Move mbuf to point to pool_id. */
1252                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1253                                      offsetof(struct rte_mempool, pool_id));
1254                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1255                                      offsetof(struct rte_mempool, pool_id));
1256                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1257                                      offsetof(struct rte_mempool, pool_id));
1258                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1259                                      offsetof(struct rte_mempool, pool_id));
1260
1261                 if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1262                     !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1263                         /*
1264                          * Lookup table to translate ol_flags to
1265                          * il3/il4 types. But we still use ol3/ol4 types in
1266                          * senddesc_w1 as only one header processing is enabled.
1267                          */
1268                         const uint8x16_t tbl = {
1269                                 /* [0-15] = il4type:il3type */
1270                                 0x04, /* none (IPv6 assumed) */
1271                                 0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
1272                                 0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
1273                                 0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
1274                                 0x03, /* PKT_TX_IP_CKSUM */
1275                                 0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
1276                                 0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
1277                                 0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
1278                                 0x02, /* PKT_TX_IPV4  */
1279                                 0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
1280                                 0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
1281                                 0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
1282                                 0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
1283                                 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1284                                        * PKT_TX_TCP_CKSUM
1285                                        */
1286                                 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1287                                        * PKT_TX_SCTP_CKSUM
1288                                        */
1289                                 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1290                                        * PKT_TX_UDP_CKSUM
1291                                        */
1292                         };
1293
1294                         /* Extract olflags to translate to iltypes */
1295                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1296                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1297
1298                         /*
1299                          * E(47):L3_LEN(9):L2_LEN(7+z)
1300                          * E(47):L3_LEN(9):L2_LEN(7+z)
1301                          */
1302                         senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
1303                         senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
1304
1305                         /* Move OLFLAGS bits 55:52 to 51:48
1306                          * with zeros preprended on the byte and rest
1307                          * don't care
1308                          */
1309                         xtmp128 = vshrq_n_u8(xtmp128, 4);
1310                         ytmp128 = vshrq_n_u8(ytmp128, 4);
1311                         /*
1312                          * E(48):L3_LEN(8):L2_LEN(z+7)
1313                          * E(48):L3_LEN(8):L2_LEN(z+7)
1314                          */
1315                         const int8x16_t tshft3 = {
1316                                 -1, 0, 8, 8, 8, 8, 8, 8,
1317                                 -1, 0, 8, 8, 8, 8, 8, 8,
1318                         };
1319
1320                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1321                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1322
1323                         /* Do the lookup */
1324                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1325                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1326
1327                         /* Pick only relevant fields i.e Bit 48:55 of iltype
1328                          * and place it in ol3/ol4type of senddesc_w1
1329                          */
1330                         const uint8x16_t shuf_mask0 = {
1331                                 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
1332                                 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
1333                         };
1334
1335                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1336                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1337
1338                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1339                          * a [E(32):E(16):OL3(8):OL2(8)]
1340                          * a = a + (a << 8)
1341                          * a [E(32):E(16):(OL3+OL2):OL2]
1342                          * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
1343                          */
1344                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1345                                                  vshlq_n_u16(senddesc01_w1, 8));
1346                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1347                                                  vshlq_n_u16(senddesc23_w1, 8));
1348
1349                         /* Move ltypes to senddesc*_w1 */
1350                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1351                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1352                 } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1353                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1354                         /*
1355                          * Lookup table to translate ol_flags to
1356                          * ol3/ol4 types.
1357                          */
1358
1359                         const uint8x16_t tbl = {
1360                                 /* [0-15] = ol4type:ol3type */
1361                                 0x00, /* none */
1362                                 0x03, /* OUTER_IP_CKSUM */
1363                                 0x02, /* OUTER_IPV4 */
1364                                 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1365                                 0x04, /* OUTER_IPV6 */
1366                                 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1367                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1368                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1369                                        * OUTER_IP_CKSUM
1370                                        */
1371                                 0x00, /* OUTER_UDP_CKSUM */
1372                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
1373                                 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
1374                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
1375                                        * OUTER_IP_CKSUM
1376                                        */
1377                                 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
1378                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1379                                        * OUTER_IP_CKSUM
1380                                        */
1381                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1382                                        * OUTER_IPV4
1383                                        */
1384                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1385                                        * OUTER_IPV4 | OUTER_IP_CKSUM
1386                                        */
1387                         };
1388
1389                         /* Extract olflags to translate to iltypes */
1390                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1391                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1392
1393                         /*
1394                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1395                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1396                          */
1397                         const uint8x16_t shuf_mask5 = {
1398                                 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1399                                 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1400                         };
1401                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1402                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1403
1404                         /* Extract outer ol flags only */
1405                         const uint64x2_t o_cksum_mask = {
1406                                 0x1C00020000000000,
1407                                 0x1C00020000000000,
1408                         };
1409
1410                         xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
1411                         ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
1412
1413                         /* Extract OUTER_UDP_CKSUM bit 41 and
1414                          * move it to bit 61
1415                          */
1416
1417                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1418                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1419
1420                         /* Shift oltype by 2 to start nibble from BIT(56)
1421                          * instead of BIT(58)
1422                          */
1423                         xtmp128 = vshrq_n_u8(xtmp128, 2);
1424                         ytmp128 = vshrq_n_u8(ytmp128, 2);
1425                         /*
1426                          * E(48):L3_LEN(8):L2_LEN(z+7)
1427                          * E(48):L3_LEN(8):L2_LEN(z+7)
1428                          */
1429                         const int8x16_t tshft3 = {
1430                                 -1, 0, 8, 8, 8, 8, 8, 8,
1431                                 -1, 0, 8, 8, 8, 8, 8, 8,
1432                         };
1433
1434                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1435                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1436
1437                         /* Do the lookup */
1438                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1439                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1440
1441                         /* Pick only relevant fields i.e Bit 56:63 of oltype
1442                          * and place it in ol3/ol4type of senddesc_w1
1443                          */
1444                         const uint8x16_t shuf_mask0 = {
1445                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
1446                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
1447                         };
1448
1449                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1450                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1451
1452                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1453                          * a [E(32):E(16):OL3(8):OL2(8)]
1454                          * a = a + (a << 8)
1455                          * a [E(32):E(16):(OL3+OL2):OL2]
1456                          * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
1457                          */
1458                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1459                                                  vshlq_n_u16(senddesc01_w1, 8));
1460                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1461                                                  vshlq_n_u16(senddesc23_w1, 8));
1462
1463                         /* Move ltypes to senddesc*_w1 */
1464                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1465                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1466                 } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1467                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1468                         /* Lookup table to translate ol_flags to
1469                          * ol4type, ol3type, il4type, il3type of senddesc_w1
1470                          */
1471                         const uint8x16x2_t tbl = {{
1472                                 {
1473                                         /* [0-15] = il4type:il3type */
1474                                         0x04, /* none (IPv6) */
1475                                         0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
1476                                         0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
1477                                         0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
1478                                         0x03, /* PKT_TX_IP_CKSUM */
1479                                         0x13, /* PKT_TX_IP_CKSUM |
1480                                                * PKT_TX_TCP_CKSUM
1481                                                */
1482                                         0x23, /* PKT_TX_IP_CKSUM |
1483                                                * PKT_TX_SCTP_CKSUM
1484                                                */
1485                                         0x33, /* PKT_TX_IP_CKSUM |
1486                                                * PKT_TX_UDP_CKSUM
1487                                                */
1488                                         0x02, /* PKT_TX_IPV4 */
1489                                         0x12, /* PKT_TX_IPV4 |
1490                                                * PKT_TX_TCP_CKSUM
1491                                                */
1492                                         0x22, /* PKT_TX_IPV4 |
1493                                                * PKT_TX_SCTP_CKSUM
1494                                                */
1495                                         0x32, /* PKT_TX_IPV4 |
1496                                                * PKT_TX_UDP_CKSUM
1497                                                */
1498                                         0x03, /* PKT_TX_IPV4 |
1499                                                * PKT_TX_IP_CKSUM
1500                                                */
1501                                         0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1502                                                * PKT_TX_TCP_CKSUM
1503                                                */
1504                                         0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1505                                                * PKT_TX_SCTP_CKSUM
1506                                                */
1507                                         0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1508                                                * PKT_TX_UDP_CKSUM
1509                                                */
1510                                 },
1511
1512                                 {
1513                                         /* [16-31] = ol4type:ol3type */
1514                                         0x00, /* none */
1515                                         0x03, /* OUTER_IP_CKSUM */
1516                                         0x02, /* OUTER_IPV4 */
1517                                         0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1518                                         0x04, /* OUTER_IPV6 */
1519                                         0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1520                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1521                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1522                                                * OUTER_IP_CKSUM
1523                                                */
1524                                         0x00, /* OUTER_UDP_CKSUM */
1525                                         0x33, /* OUTER_UDP_CKSUM |
1526                                                * OUTER_IP_CKSUM
1527                                                */
1528                                         0x32, /* OUTER_UDP_CKSUM |
1529                                                * OUTER_IPV4
1530                                                */
1531                                         0x33, /* OUTER_UDP_CKSUM |
1532                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1533                                                */
1534                                         0x34, /* OUTER_UDP_CKSUM |
1535                                                * OUTER_IPV6
1536                                                */
1537                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1538                                                * OUTER_IP_CKSUM
1539                                                */
1540                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1541                                                * OUTER_IPV4
1542                                                */
1543                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1544                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1545                                                */
1546                                 },
1547                         }};
1548
1549                         /* Extract olflags to translate to oltype & iltype */
1550                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1551                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1552
1553                         /*
1554                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1555                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1556                          */
1557                         const uint32x4_t tshft_4 = {
1558                                 1,
1559                                 0,
1560                                 1,
1561                                 0,
1562                         };
1563                         senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
1564                         senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
1565
1566                         /*
1567                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1568                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1569                          */
1570                         const uint8x16_t shuf_mask5 = {
1571                                 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
1572                                 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1573                         };
1574                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1575                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1576
1577                         /* Extract outer and inner header ol_flags */
1578                         const uint64x2_t oi_cksum_mask = {
1579                                 0x1CF0020000000000,
1580                                 0x1CF0020000000000,
1581                         };
1582
1583                         xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
1584                         ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
1585
1586                         /* Extract OUTER_UDP_CKSUM bit 41 and
1587                          * move it to bit 61
1588                          */
1589
1590                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1591                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1592
1593                         /* Shift right oltype by 2 and iltype by 4
1594                          * to start oltype nibble from BIT(58)
1595                          * instead of BIT(56) and iltype nibble from BIT(48)
1596                          * instead of BIT(52).
1597                          */
1598                         const int8x16_t tshft5 = {
1599                                 8, 8, 8, 8, 8, 8, -4, -2,
1600                                 8, 8, 8, 8, 8, 8, -4, -2,
1601                         };
1602
1603                         xtmp128 = vshlq_u8(xtmp128, tshft5);
1604                         ytmp128 = vshlq_u8(ytmp128, tshft5);
1605                         /*
1606                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1607                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1608                          */
1609                         const int8x16_t tshft3 = {
1610                                 -1, 0, -1, 0, 0, 0, 0, 0,
1611                                 -1, 0, -1, 0, 0, 0, 0, 0,
1612                         };
1613
1614                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1615                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1616
1617                         /* Mark Bit(4) of oltype */
1618                         const uint64x2_t oi_cksum_mask2 = {
1619                                 0x1000000000000000,
1620                                 0x1000000000000000,
1621                         };
1622
1623                         xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
1624                         ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
1625
1626                         /* Do the lookup */
1627                         ltypes01 = vqtbl2q_u8(tbl, xtmp128);
1628                         ltypes23 = vqtbl2q_u8(tbl, ytmp128);
1629
1630                         /* Pick only relevant fields i.e Bit 48:55 of iltype and
1631                          * Bit 56:63 of oltype and place it in corresponding
1632                          * place in senddesc_w1.
1633                          */
1634                         const uint8x16_t shuf_mask0 = {
1635                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
1636                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
1637                         };
1638
1639                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1640                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1641
1642                         /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
1643                          * l3len, l2len, ol3len, ol2len.
1644                          * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
1645                          * a = a + (a << 8)
1646                          * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
1647                          * a = a + (a << 16)
1648                          * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
1649                          * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
1650                          */
1651                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1652                                                  vshlq_n_u32(senddesc01_w1, 8));
1653                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1654                                                  vshlq_n_u32(senddesc23_w1, 8));
1655
1656                         /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
1657                         senddesc01_w1 = vaddq_u8(
1658                                 senddesc01_w1, vshlq_n_u32(senddesc01_w1, 16));
1659                         senddesc23_w1 = vaddq_u8(
1660                                 senddesc23_w1, vshlq_n_u32(senddesc23_w1, 16));
1661
1662                         /* Move ltypes to senddesc*_w1 */
1663                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1664                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1665                 }
1666
1667                 xmask01 = vdupq_n_u64(0);
1668                 xmask23 = xmask01;
1669                 asm volatile("LD1 {%[a].H}[0],[%[in]]\n\t"
1670                              : [a] "+w"(xmask01)
1671                              : [in] "r"(mbuf0)
1672                              : "memory");
1673
1674                 asm volatile("LD1 {%[a].H}[4],[%[in]]\n\t"
1675                              : [a] "+w"(xmask01)
1676                              : [in] "r"(mbuf1)
1677                              : "memory");
1678
1679                 asm volatile("LD1 {%[b].H}[0],[%[in]]\n\t"
1680                              : [b] "+w"(xmask23)
1681                              : [in] "r"(mbuf2)
1682                              : "memory");
1683
1684                 asm volatile("LD1 {%[b].H}[4],[%[in]]\n\t"
1685                              : [b] "+w"(xmask23)
1686                              : [in] "r"(mbuf3)
1687                              : "memory");
1688                 xmask01 = vshlq_n_u64(xmask01, 20);
1689                 xmask23 = vshlq_n_u64(xmask23, 20);
1690
1691                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1692                 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1693
1694                 if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
1695                         /* Tx ol_flag for vlan. */
1696                         const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
1697                         /* Bit enable for VLAN1 */
1698                         const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
1699                         /* Tx ol_flag for QnQ. */
1700                         const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
1701                         /* Bit enable for VLAN0 */
1702                         const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
1703                         /* Load vlan values from packet. outer is VLAN 0 */
1704                         uint64x2_t ext01 = {
1705                                 ((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
1706                                         ((uint64_t)tx_pkts[0]->vlan_tci) << 32,
1707                                 ((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
1708                                         ((uint64_t)tx_pkts[1]->vlan_tci) << 32,
1709                         };
1710                         uint64x2_t ext23 = {
1711                                 ((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
1712                                         ((uint64_t)tx_pkts[2]->vlan_tci) << 32,
1713                                 ((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
1714                                         ((uint64_t)tx_pkts[3]->vlan_tci) << 32,
1715                         };
1716
1717                         /* Get ol_flags of the packets. */
1718                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1719                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1720
1721                         /* ORR vlan outer/inner values into cmd. */
1722                         sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
1723                         sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
1724
1725                         /* Test for offload enable bits and generate masks. */
1726                         xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
1727                                                       mlv),
1728                                             vandq_u64(vtstq_u64(xtmp128, olq),
1729                                                       mlq));
1730                         ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
1731                                                       mlv),
1732                                             vandq_u64(vtstq_u64(ytmp128, olq),
1733                                                       mlq));
1734
1735                         /* Set vlan enable bits into cmd based on mask. */
1736                         sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
1737                         sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
1738                 }
1739
1740                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1741                         /* Tx ol_flag for timestamp. */
1742                         const uint64x2_t olf = {PKT_TX_IEEE1588_TMST,
1743                                                 PKT_TX_IEEE1588_TMST};
1744                         /* Set send mem alg to SUB. */
1745                         const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
1746                         /* Increment send mem address by 8. */
1747                         const uint64x2_t addr = {0x8, 0x8};
1748
1749                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1750                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1751
1752                         /* Check if timestamp is requested and generate inverted
1753                          * mask as we need not make any changes to default cmd
1754                          * value.
1755                          */
1756                         xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
1757                         ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
1758
1759                         /* Change send mem address to an 8 byte offset when
1760                          * TSTMP is disabled.
1761                          */
1762                         sendmem01_w1 = vaddq_u64(sendmem01_w1,
1763                                                  vandq_u64(xtmp128, addr));
1764                         sendmem23_w1 = vaddq_u64(sendmem23_w1,
1765                                                  vandq_u64(ytmp128, addr));
1766                         /* Change send mem alg to SUB when TSTMP is disabled. */
1767                         sendmem01_w0 = vorrq_u64(sendmem01_w0,
1768                                                  vandq_u64(xtmp128, alg));
1769                         sendmem23_w0 = vorrq_u64(sendmem23_w0,
1770                                                  vandq_u64(ytmp128, alg));
1771
1772                         cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
1773                         cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
1774                         cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
1775                         cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
1776                 }
1777
1778                 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1779                         const uint64_t lso_fmt = txq->lso_tun_fmt;
1780                         uint64_t sx_w0[NIX_DESCS_PER_LOOP];
1781                         uint64_t sd_w1[NIX_DESCS_PER_LOOP];
1782
1783                         /* Extract SD W1 as we need to set L4 types. */
1784                         vst1q_u64(sd_w1, senddesc01_w1);
1785                         vst1q_u64(sd_w1 + 2, senddesc23_w1);
1786
1787                         /* Extract SX W0 as we need to set LSO fields. */
1788                         vst1q_u64(sx_w0, sendext01_w0);
1789                         vst1q_u64(sx_w0 + 2, sendext23_w0);
1790
1791                         /* Extract ol_flags. */
1792                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1793                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1794
1795                         /* Prepare individual mbufs. */
1796                         cn10k_nix_prepare_tso(tx_pkts[0],
1797                                 (union nix_send_hdr_w1_u *)&sd_w1[0],
1798                                 (union nix_send_ext_w0_u *)&sx_w0[0],
1799                                 vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
1800
1801                         cn10k_nix_prepare_tso(tx_pkts[1],
1802                                 (union nix_send_hdr_w1_u *)&sd_w1[1],
1803                                 (union nix_send_ext_w0_u *)&sx_w0[1],
1804                                 vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
1805
1806                         cn10k_nix_prepare_tso(tx_pkts[2],
1807                                 (union nix_send_hdr_w1_u *)&sd_w1[2],
1808                                 (union nix_send_ext_w0_u *)&sx_w0[2],
1809                                 vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
1810
1811                         cn10k_nix_prepare_tso(tx_pkts[3],
1812                                 (union nix_send_hdr_w1_u *)&sd_w1[3],
1813                                 (union nix_send_ext_w0_u *)&sx_w0[3],
1814                                 vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
1815
1816                         senddesc01_w1 = vld1q_u64(sd_w1);
1817                         senddesc23_w1 = vld1q_u64(sd_w1 + 2);
1818
1819                         sendext01_w0 = vld1q_u64(sx_w0);
1820                         sendext23_w0 = vld1q_u64(sx_w0 + 2);
1821                 }
1822
1823                 if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
1824                     !(flags & NIX_TX_MULTI_SEG_F)) {
1825                         /* Set don't free bit if reference count > 1. NOTE(review): the vsetq_lane_u64() calls below discard their result -- the intrinsic returns a new vector and does not modify xmask01/xmask23 in place, so the don't-free bit is never set; assign it back, e.g. xmask01 = vsetq_lane_u64(0x80000, xmask01, 0); */
1826                         xmask01 = vdupq_n_u64(0);
1827                         xmask23 = xmask01;
1828
1829                         /* Move mbufs to iova */
1830                         mbuf0 = (uint64_t *)tx_pkts[0];
1831                         mbuf1 = (uint64_t *)tx_pkts[1];
1832                         mbuf2 = (uint64_t *)tx_pkts[2];
1833                         mbuf3 = (uint64_t *)tx_pkts[3];
1834
1835                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
1836                                 vsetq_lane_u64(0x80000, xmask01, 0);
1837                         else
1838                                 __mempool_check_cookies(
1839                                         ((struct rte_mbuf *)mbuf0)->pool,
1840                                         (void **)&mbuf0, 1, 0);
1841
1842                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
1843                                 vsetq_lane_u64(0x80000, xmask01, 1);
1844                         else
1845                                 __mempool_check_cookies(
1846                                         ((struct rte_mbuf *)mbuf1)->pool,
1847                                         (void **)&mbuf1, 1, 0);
1848
1849                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
1850                                 vsetq_lane_u64(0x80000, xmask23, 0);
1851                         else
1852                                 __mempool_check_cookies(
1853                                         ((struct rte_mbuf *)mbuf2)->pool,
1854                                         (void **)&mbuf2, 1, 0);
1855
1856                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
1857                                 vsetq_lane_u64(0x80000, xmask23, 1);
1858                         else
1859                                 __mempool_check_cookies(
1860                                         ((struct rte_mbuf *)mbuf3)->pool,
1861                                         (void **)&mbuf3, 1, 0);
1862                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1863                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1864                 } else if (!(flags & NIX_TX_MULTI_SEG_F)) {
1865                         /* Move mbufs to iova */
1866                         mbuf0 = (uint64_t *)tx_pkts[0];
1867                         mbuf1 = (uint64_t *)tx_pkts[1];
1868                         mbuf2 = (uint64_t *)tx_pkts[2];
1869                         mbuf3 = (uint64_t *)tx_pkts[3];
1870
1871                         /* Mark mempool object as "put" since
1872                          * it is freed by NIX
1873                          */
1874                         __mempool_check_cookies(
1875                                 ((struct rte_mbuf *)mbuf0)->pool,
1876                                 (void **)&mbuf0, 1, 0);
1877
1878                         __mempool_check_cookies(
1879                                 ((struct rte_mbuf *)mbuf1)->pool,
1880                                 (void **)&mbuf1, 1, 0);
1881
1882                         __mempool_check_cookies(
1883                                 ((struct rte_mbuf *)mbuf2)->pool,
1884                                 (void **)&mbuf2, 1, 0);
1885
1886                         __mempool_check_cookies(
1887                                 ((struct rte_mbuf *)mbuf3)->pool,
1888                                 (void **)&mbuf3, 1, 0);
1889                 }
1890
1891                 /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
1892                 cmd0[0] = vzip1q_u64(senddesc01_w0, senddesc01_w1);
1893                 cmd0[1] = vzip2q_u64(senddesc01_w0, senddesc01_w1);
1894                 cmd0[2] = vzip1q_u64(senddesc23_w0, senddesc23_w1);
1895                 cmd0[3] = vzip2q_u64(senddesc23_w0, senddesc23_w1);
1896
1897                 cmd1[0] = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
1898                 cmd1[1] = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
1899                 cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
1900                 cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
1901
1902                 if (flags & NIX_TX_NEED_EXT_HDR) {
1903                         cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
1904                         cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
1905                         cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
1906                         cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
1907                 }
1908
1909                 if (flags & NIX_TX_MULTI_SEG_F) {
1910                         uint8_t j;
1911
1912                         segdw[4] = 8;
1913                         j = cn10k_nix_prep_lmt_mseg_vector(tx_pkts, cmd0, cmd1,
1914                                                           cmd2, cmd3, segdw,
1915                                                           (uint64_t *)
1916                                                           LMT_OFF(laddr, lnum,
1917                                                                   0),
1918                                                           &wd.data128, &shift,
1919                                                           flags);
1920                         lnum += j;
1921                 } else if (flags & NIX_TX_NEED_EXT_HDR) {
1922                         /* Store the prepared send desc to LMT lines */
1923                         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1924                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1925                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1926                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1927                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
1928                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
1929                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
1930                                 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
1931                                 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
1932                                 lnum += 1;
1933                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1934                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1935                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1936                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
1937                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
1938                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
1939                                 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
1940                                 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
1941                         } else {
1942                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1943                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1944                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1945                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
1946                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
1947                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
1948                                 lnum += 1;
1949                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1950                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1951                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1952                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
1953                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
1954                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
1955                         }
1956                         lnum += 1;
1957                 } else {
1958                         /* Store the prepared send desc to LMT lines */
1959                         vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1960                         vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
1961                         vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
1962                         vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
1963                         vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
1964                         vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
1965                         vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
1966                         vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
1967                         lnum += 1;
1968                 }
1969
1970                 tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
1971         }
1972
1973         if (flags & NIX_TX_MULTI_SEG_F)
1974                 wd.data[0] >>= 16;
1975
1976         /* Trigger LMTST */
1977         if (lnum > 16) {
1978                 if (!(flags & NIX_TX_MULTI_SEG_F))
1979                         wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
1980
1981                 pa = io_addr | (wd.data[0] & 0x7) << 4;
1982                 wd.data[0] &= ~0x7ULL;
1983
1984                 if (flags & NIX_TX_MULTI_SEG_F)
1985                         wd.data[0] <<= 16;
1986
1987                 wd.data[0] |= (15ULL << 12);
1988                 wd.data[0] |= (uint64_t)lmt_id;
1989
1990                 /* STEOR0 */
1991                 roc_lmt_submit_steorl(wd.data[0], pa);
1992
1993                 if (!(flags & NIX_TX_MULTI_SEG_F))
1994                         wd.data[1] = cn10k_nix_tx_steor_vec_data(flags);
1995
1996                 pa = io_addr | (wd.data[1] & 0x7) << 4;
1997                 wd.data[1] &= ~0x7ULL;
1998
1999                 if (flags & NIX_TX_MULTI_SEG_F)
2000                         wd.data[1] <<= 16;
2001
2002                 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
2003                 wd.data[1] |= (uint64_t)(lmt_id + 16);
2004
2005                 /* STEOR1 */
2006                 roc_lmt_submit_steorl(wd.data[1], pa);
2007         } else if (lnum) {
2008                 if (!(flags & NIX_TX_MULTI_SEG_F))
2009                         wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
2010
2011                 pa = io_addr | (wd.data[0] & 0x7) << 4;
2012                 wd.data[0] &= ~0x7ULL;
2013
2014                 if (flags & NIX_TX_MULTI_SEG_F)
2015                         wd.data[0] <<= 16;
2016
2017                 wd.data[0] |= ((uint64_t)(lnum - 1)) << 12;
2018                 wd.data[0] |= lmt_id;
2019
2020                 /* STEOR0 */
2021                 roc_lmt_submit_steorl(wd.data[0], pa);
2022         }
2023
2024         left -= burst;
2025         rte_io_wmb();
2026         if (left)
2027                 goto again;
2028
2029         if (unlikely(scalar)) {
2030                 if (flags & NIX_TX_MULTI_SEG_F)
2031                         pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
2032                                                          scalar, cmd, flags);
2033                 else
2034                         pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
2035                                                     cmd, flags);
2036         }
2037
2038         return pkts;
2039 }
2040
2041 #else
2042 static __rte_always_inline uint16_t
2043 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
2044                            uint16_t pkts, uint64_t *cmd, const uint16_t flags)
2045 {
2046         RTE_SET_USED(tx_queue);
2047         RTE_SET_USED(tx_pkts);
2048         RTE_SET_USED(pkts);
2049         RTE_SET_USED(cmd);
2050         RTE_SET_USED(flags);
2051         return 0;
2052 }
2053 #endif
2054
/* Short aliases for the NIX Tx offload flag bits; they exist only to
 * keep the fast-path mode table below readable and are expanded by the
 * T() entries into the per-mode 'flags' argument.
 */
#define L3L4CSUM_F   NIX_TX_OFFLOAD_L3_L4_CSUM_F
#define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
#define VLAN_F       NIX_TX_OFFLOAD_VLAN_QINQ_F
#define NOFF_F       NIX_TX_OFFLOAD_MBUF_NOFF_F
#define TSO_F        NIX_TX_OFFLOAD_TSO_F
#define TSP_F        NIX_TX_OFFLOAD_TSTAMP_F
2061
/* Tx fast-path mode table. Each row is
 *   T(name, [TSP] [TSO] [NOFF] [VLAN] [OL3OL4CSUM] [L3L4CSUM], sz, flags)
 * where the six single-bit columns select one offload combination,
 * 'flags' is the OR of the matching NIX_TX_OFFLOAD_* bits, and 'sz'
 * grows with the descriptor needs of the combination (4 for plain
 * modes, 6 once an extension header is required — VLAN/TSO — and 8
 * with timestamping; presumably the per-packet command size in 64-bit
 * words — confirm against the xmit prepare code). The table is
 * instantiated with different definitions of T() to generate
 * declarations/definitions for every combination.
 */
#define NIX_TX_FASTPATH_MODES                                           \
T(no_offload,                           0, 0, 0, 0, 0, 0,       4,      \
		NIX_TX_OFFLOAD_NONE)                                    \
T(l3l4csum,                             0, 0, 0, 0, 0, 1,       4,      \
		L3L4CSUM_F)                                             \
T(ol3ol4csum,                           0, 0, 0, 0, 1, 0,       4,      \
		OL3OL4CSUM_F)                                           \
T(ol3ol4csum_l3l4csum,                  0, 0, 0, 0, 1, 1,       4,      \
		OL3OL4CSUM_F | L3L4CSUM_F)                              \
T(vlan,                                 0, 0, 0, 1, 0, 0,       6,      \
		VLAN_F)                                                 \
T(vlan_l3l4csum,                        0, 0, 0, 1, 0, 1,       6,      \
		VLAN_F | L3L4CSUM_F)                                    \
T(vlan_ol3ol4csum,                      0, 0, 0, 1, 1, 0,       6,      \
		VLAN_F | OL3OL4CSUM_F)                                  \
T(vlan_ol3ol4csum_l3l4csum,             0, 0, 0, 1, 1, 1,       6,      \
		VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
T(noff,                                 0, 0, 1, 0, 0, 0,       4,      \
		NOFF_F)                                                 \
T(noff_l3l4csum,                        0, 0, 1, 0, 0, 1,       4,      \
		NOFF_F | L3L4CSUM_F)                                    \
T(noff_ol3ol4csum,                      0, 0, 1, 0, 1, 0,       4,      \
		NOFF_F | OL3OL4CSUM_F)                                  \
T(noff_ol3ol4csum_l3l4csum,             0, 0, 1, 0, 1, 1,       4,      \
		NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
T(noff_vlan,                            0, 0, 1, 1, 0, 0,       6,      \
		NOFF_F | VLAN_F)                                        \
T(noff_vlan_l3l4csum,                   0, 0, 1, 1, 0, 1,       6,      \
		NOFF_F | VLAN_F | L3L4CSUM_F)                           \
T(noff_vlan_ol3ol4csum,                 0, 0, 1, 1, 1, 0,       6,      \
		NOFF_F | VLAN_F | OL3OL4CSUM_F)                         \
T(noff_vlan_ol3ol4csum_l3l4csum,        0, 0, 1, 1, 1, 1,       6,      \
		NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)            \
T(tso,                                  0, 1, 0, 0, 0, 0,       6,      \
		TSO_F)                                                  \
T(tso_l3l4csum,                         0, 1, 0, 0, 0, 1,       6,      \
		TSO_F | L3L4CSUM_F)                                     \
T(tso_ol3ol4csum,                       0, 1, 0, 0, 1, 0,       6,      \
		TSO_F | OL3OL4CSUM_F)                                   \
T(tso_ol3ol4csum_l3l4csum,              0, 1, 0, 0, 1, 1,       6,      \
		TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
T(tso_vlan,                             0, 1, 0, 1, 0, 0,       6,      \
		TSO_F | VLAN_F)                                         \
T(tso_vlan_l3l4csum,                    0, 1, 0, 1, 0, 1,       6,      \
		TSO_F | VLAN_F | L3L4CSUM_F)                            \
T(tso_vlan_ol3ol4csum,                  0, 1, 0, 1, 1, 0,       6,      \
		TSO_F | VLAN_F | OL3OL4CSUM_F)                          \
T(tso_vlan_ol3ol4csum_l3l4csum,         0, 1, 0, 1, 1, 1,       6,      \
		TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
T(tso_noff,                             0, 1, 1, 0, 0, 0,       6,      \
		TSO_F | NOFF_F)                                         \
T(tso_noff_l3l4csum,                    0, 1, 1, 0, 0, 1,       6,      \
		TSO_F | NOFF_F | L3L4CSUM_F)                            \
T(tso_noff_ol3ol4csum,                  0, 1, 1, 0, 1, 0,       6,      \
		TSO_F | NOFF_F | OL3OL4CSUM_F)                          \
T(tso_noff_ol3ol4csum_l3l4csum,         0, 1, 1, 0, 1, 1,       6,      \
		TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
T(tso_noff_vlan,                        0, 1, 1, 1, 0, 0,       6,      \
		TSO_F | NOFF_F | VLAN_F)                                \
T(tso_noff_vlan_l3l4csum,               0, 1, 1, 1, 0, 1,       6,      \
		TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
T(tso_noff_vlan_ol3ol4csum,             0, 1, 1, 1, 1, 0,       6,      \
		TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
T(tso_noff_vlan_ol3ol4csum_l3l4csum,    0, 1, 1, 1, 1, 1,       6,      \
		TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
T(ts,                                   1, 0, 0, 0, 0, 0,       8,      \
		TSP_F)                                                  \
T(ts_l3l4csum,                          1, 0, 0, 0, 0, 1,       8,      \
		TSP_F | L3L4CSUM_F)                                     \
T(ts_ol3ol4csum,                        1, 0, 0, 0, 1, 0,       8,      \
		TSP_F | OL3OL4CSUM_F)                                   \
T(ts_ol3ol4csum_l3l4csum,               1, 0, 0, 0, 1, 1,       8,      \
		TSP_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
T(ts_vlan,                              1, 0, 0, 1, 0, 0,       8,      \
		TSP_F | VLAN_F)                                         \
T(ts_vlan_l3l4csum,                     1, 0, 0, 1, 0, 1,       8,      \
		TSP_F | VLAN_F | L3L4CSUM_F)                            \
T(ts_vlan_ol3ol4csum,                   1, 0, 0, 1, 1, 0,       8,      \
		TSP_F | VLAN_F | OL3OL4CSUM_F)                          \
T(ts_vlan_ol3ol4csum_l3l4csum,          1, 0, 0, 1, 1, 1,       8,      \
		TSP_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
T(ts_noff,                              1, 0, 1, 0, 0, 0,       8,      \
		TSP_F | NOFF_F)                                         \
T(ts_noff_l3l4csum,                     1, 0, 1, 0, 0, 1,       8,      \
		TSP_F | NOFF_F | L3L4CSUM_F)                            \
T(ts_noff_ol3ol4csum,                   1, 0, 1, 0, 1, 0,       8,      \
		TSP_F | NOFF_F | OL3OL4CSUM_F)                          \
T(ts_noff_ol3ol4csum_l3l4csum,          1, 0, 1, 0, 1, 1,       8,      \
		TSP_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
T(ts_noff_vlan,                         1, 0, 1, 1, 0, 0,       8,      \
		TSP_F | NOFF_F | VLAN_F)                                \
T(ts_noff_vlan_l3l4csum,                1, 0, 1, 1, 0, 1,       8,      \
		TSP_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
T(ts_noff_vlan_ol3ol4csum,              1, 0, 1, 1, 1, 0,       8,      \
		TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
T(ts_noff_vlan_ol3ol4csum_l3l4csum,     1, 0, 1, 1, 1, 1,       8,      \
		TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
T(ts_tso,                               1, 1, 0, 0, 0, 0,       8,      \
		TSP_F | TSO_F)                                          \
T(ts_tso_l3l4csum,                      1, 1, 0, 0, 0, 1,       8,      \
		TSP_F | TSO_F | L3L4CSUM_F)                             \
T(ts_tso_ol3ol4csum,                    1, 1, 0, 0, 1, 0,       8,      \
		TSP_F | TSO_F | OL3OL4CSUM_F)                           \
T(ts_tso_ol3ol4csum_l3l4csum,           1, 1, 0, 0, 1, 1,       8,      \
		TSP_F | TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)              \
T(ts_tso_vlan,                          1, 1, 0, 1, 0, 0,       8,      \
		TSP_F | TSO_F | VLAN_F)                                 \
T(ts_tso_vlan_l3l4csum,                 1, 1, 0, 1, 0, 1,       8,      \
		TSP_F | TSO_F | VLAN_F | L3L4CSUM_F)                    \
T(ts_tso_vlan_ol3ol4csum,               1, 1, 0, 1, 1, 0,       8,      \
		TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F)                  \
T(ts_tso_vlan_ol3ol4csum_l3l4csum,      1, 1, 0, 1, 1, 1,       8,      \
		TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
T(ts_tso_noff,                          1, 1, 1, 0, 0, 0,       8,      \
		TSP_F | TSO_F | NOFF_F)                                 \
T(ts_tso_noff_l3l4csum,                 1, 1, 1, 0, 0, 1,       8,      \
		TSP_F | TSO_F | NOFF_F | L3L4CSUM_F)                    \
T(ts_tso_noff_ol3ol4csum,               1, 1, 1, 0, 1, 0,       8,      \
		TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F)                  \
T(ts_tso_noff_ol3ol4csum_l3l4csum,      1, 1, 1, 0, 1, 1,       8,      \
		TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
T(ts_tso_noff_vlan,                     1, 1, 1, 1, 0, 0,       8,      \
		TSP_F | TSO_F | NOFF_F | VLAN_F)                        \
T(ts_tso_noff_vlan_l3l4csum,            1, 1, 1, 1, 0, 1,       8,      \
		TSP_F | TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)           \
T(ts_tso_noff_vlan_ol3ol4csum,          1, 1, 1, 1, 1, 0,       8,      \
		TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)         \
T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum, 1, 1, 1, 1, 1, 1,       8,      \
		TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)
2192
/* Instantiate the mode table with T() defined to declare the four Tx
 * burst variants for each offload combination: scalar, scalar
 * multi-segment, vector, and vector multi-segment. The f5..f0 and sz
 * arguments are unused here; only 'name' feeds the symbol names.
 */
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_##name(          \
		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
									       \
	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_mseg_##name(     \
		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
									       \
	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name(      \
		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
									       \
	uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
		void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \

NIX_TX_FASTPATH_MODES
#undef T
2208
2209 #endif /* __CN10K_TX_H__ */