net/cnxk: enable VLAN processing in vector Tx
[dpdk.git] / drivers / net / cnxk / cn10k_tx.h
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(C) 2021 Marvell.
3  */
4 #ifndef __CN10K_TX_H__
5 #define __CN10K_TX_H__
6
7 #include <rte_vect.h>
8
9 #define NIX_TX_OFFLOAD_NONE           (0)
10 #define NIX_TX_OFFLOAD_L3_L4_CSUM_F   BIT(0)
11 #define NIX_TX_OFFLOAD_OL3_OL4_CSUM_F BIT(1)
12 #define NIX_TX_OFFLOAD_VLAN_QINQ_F    BIT(2)
13 #define NIX_TX_OFFLOAD_MBUF_NOFF_F    BIT(3)
14 #define NIX_TX_OFFLOAD_TSO_F          BIT(4)
15 #define NIX_TX_OFFLOAD_TSTAMP_F       BIT(5)
16
17 /* Flags to control the xmit_prepare function.
18  * Defined from the MSB end to denote that they are
19  * not used as offload flags to pick the Tx function.
20  */
21 #define NIX_TX_MULTI_SEG_F BIT(15)
22
23 #define NIX_TX_NEED_SEND_HDR_W1                                                \
24         (NIX_TX_OFFLOAD_L3_L4_CSUM_F | NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |         \
25          NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)
26
27 #define NIX_TX_NEED_EXT_HDR                                                    \
28         (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |                \
29          NIX_TX_OFFLOAD_TSO_F)
30
31 #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
32         do {                                                                   \
33                 /* Cached value is low, update the fc_cache_pkts */            \
34                 if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
35                         /* Multiply with sqe_per_sqb to express in pkts */     \
36                         (txq)->fc_cache_pkts =                                 \
37                                 ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
38                                 << (txq)->sqes_per_sqb_log2;                   \
39                         /* Check it again for room */                          \
40                         if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
41                                 return 0;                                      \
42                 }                                                              \
43         } while (0)
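
/* Worked example for the flow-control math above (illustrative; the
 * values below are assumed, not taken from hardware documentation):
 * with nb_sqb_bufs_adj = 512, *fc_mem = 500 and sqes_per_sqb_log2 = 5,
 * the refreshed cache is (512 - 500) << 5 = 384 packets, so a request
 * for up to 384 packets proceeds and anything larger returns 0.
 */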
44
45 #define LMT_OFF(lmt_addr, lmt_num, offset)                                     \
46         (void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
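
/* LMT_OFF() resolves to a byte address within a core's LMT region.
 * E.g. (illustrative, assuming 128-byte LMT lines, i.e.
 * ROC_LMT_LINE_SIZE_LOG2 == 7): LMT_OFF(base, 2, 8) = base + 2 * 128 + 8,
 * the second 8-byte word of LMT line 2.
 */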
47
48 /* Function to determine the number of Tx sub-descriptors required
49  * when the extended sub-descriptor is enabled.
50  */
51 static __rte_always_inline int
52 cn10k_nix_tx_ext_subs(const uint16_t flags)
53 {
54         return (flags & NIX_TX_OFFLOAD_TSTAMP_F)
55                        ? 2
56                        : ((flags &
57                            (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F))
58                                   ? 1
59                                   : 0);
60 }
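
/* Illustration: NIX_TX_OFFLOAD_TSTAMP_F needs both SEND_EXT and
 * SEND_MEM sub-descriptors (2 extra), while VLAN/QinQ or TSO need
 * only SEND_EXT (1 extra); with none of these flags set, the command
 * is just SEND_HDR + SG and no extra sub-descriptor is required.
 */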
61
62 static __rte_always_inline uint8_t
63 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
64 {
65         return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
66                << ROC_LMT_LINES_PER_CORE_LOG2;
67 }
68
69 static __rte_always_inline uint8_t
70 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
71 {
72         return (flags & NIX_TX_NEED_EXT_HDR) ? 6 : 8;
73 }
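
/* Illustration of the two helpers above: with an extended header each
 * packet needs SEND_HDR + SEND_EXT + SG and 2 packets are packed per
 * LMT line, versus SEND_HDR + SG and 4 packets per line without it;
 * the vector burst size scales with the number of LMT lines available
 * per core.
 */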
74
75 static __rte_always_inline uint64_t
76 cn10k_nix_tx_steor_data(const uint16_t flags)
77 {
78         const uint64_t dw_m1 = cn10k_nix_tx_ext_subs(flags) + 1;
79         uint64_t data;
80
81         /* Size of LMT line 0; later folded into the I/O address bits */
82         data = dw_m1;
83         /* Sizes for LMT lines 1..15, 3 bits each */
84         data |= dw_m1 << 19;
85         data |= dw_m1 << 22;
86         data |= dw_m1 << 25;
87         data |= dw_m1 << 28;
88         data |= dw_m1 << 31;
89         data |= dw_m1 << 34;
90         data |= dw_m1 << 37;
91         data |= dw_m1 << 40;
92         data |= dw_m1 << 43;
93         data |= dw_m1 << 46;
94         data |= dw_m1 << 49;
95         data |= dw_m1 << 52;
96         data |= dw_m1 << 55;
97         data |= dw_m1 << 58;
98         data |= dw_m1 << 61;
99
100         return data;
101 }
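
/* STEOR data word layout sketch (inferred from the shifts above): the
 * size of LMT line 0 sits in bits 2:0 and is later folded into the
 * I/O address, while the sizes of lines 1..15 occupy 3-bit fields at
 * bits 21:19, 24:22, ..., 63:61. E.g. with no extended header every
 * field holds dw_m1 = 1, i.e. two 128-bit units per line.
 */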
102
103 static __rte_always_inline uint64_t
104 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
105 {
106         const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
107         uint64_t data;
108
109         /* Size of LMT line 0; later folded into the I/O address bits */
110         data = dw_m1;
111         /* Sizes for LMT lines 1..15, 3 bits each */
112         data |= dw_m1 << 19;
113         data |= dw_m1 << 22;
114         data |= dw_m1 << 25;
115         data |= dw_m1 << 28;
116         data |= dw_m1 << 31;
117         data |= dw_m1 << 34;
118         data |= dw_m1 << 37;
119         data |= dw_m1 << 40;
120         data |= dw_m1 << 43;
121         data |= dw_m1 << 46;
122         data |= dw_m1 << 49;
123         data |= dw_m1 << 52;
124         data |= dw_m1 << 55;
125         data |= dw_m1 << 58;
126         data |= dw_m1 << 61;
127
128         return data;
129 }
130
131 static __rte_always_inline void
132 cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
133                       const uint16_t flags)
134 {
135         /* Send hdr */
136         cmd[0] = txq->send_hdr_w0;
137         cmd[1] = 0;
138         cmd += 2;
139
140         /* Send ext if present */
141         if (flags & NIX_TX_NEED_EXT_HDR) {
142                 *(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
143                 cmd += 2;
144         }
145
146         /* Send sg */
147         cmd[0] = txq->sg_w0;
148         cmd[1] = 0;
149 }
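
/* Resulting command layout (directly from the stores above):
 *   cmd[0..1] = SEND_HDR_S  (w0 from txq->send_hdr_w0, w1 = 0)
 *   cmd[2..3] = SEND_EXT_S  (copied from txq->cmd, only when the
 *                            VLAN/TSO/timestamp offloads need it)
 *   next pair = SEND_SG_S   (w0 from txq->sg_w0, w1 = 0)
 * Without an extended header the SG pair starts at cmd[2] instead.
 */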
150
151 static __rte_always_inline void
152 cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
153 {
154         uint64_t mask, ol_flags = m->ol_flags;
155
156         if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
157                 uintptr_t mdata = rte_pktmbuf_mtod(m, uintptr_t);
158                 uint16_t *iplen, *oiplen, *oudplen;
159                 uint16_t lso_sb, paylen;
160
161                 mask = -!!(ol_flags & (PKT_TX_OUTER_IPV4 | PKT_TX_OUTER_IPV6));
162                 lso_sb = (mask & (m->outer_l2_len + m->outer_l3_len)) +
163                          m->l2_len + m->l3_len + m->l4_len;
164
165                 /* Reduce payload len from base headers */
166                 paylen = m->pkt_len - lso_sb;
167
168                 /* Get iplen position assuming no tunnel hdr */
169                 iplen = (uint16_t *)(mdata + m->l2_len +
170                                      (2 << !!(ol_flags & PKT_TX_IPV6)));
171                 /* Handle tunnel tso */
172                 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
173                     (ol_flags & PKT_TX_TUNNEL_MASK)) {
174                         const uint8_t is_udp_tun =
175                                 (CNXK_NIX_UDP_TUN_BITMASK >>
176                                  ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
177                                 0x1;
178
179                         oiplen = (uint16_t *)(mdata + m->outer_l2_len +
180                                               (2 << !!(ol_flags &
181                                                        PKT_TX_OUTER_IPV6)));
182                         *oiplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*oiplen) -
183                                                    paylen);
184
185                         /* Update format for UDP tunneled packet */
186                         if (is_udp_tun) {
187                                 oudplen = (uint16_t *)(mdata + m->outer_l2_len +
188                                                        m->outer_l3_len + 4);
189                                 *oudplen = rte_cpu_to_be_16(
190                                         rte_be_to_cpu_16(*oudplen) - paylen);
191                         }
192
193                         /* Update iplen position to inner ip hdr */
194                         iplen = (uint16_t *)(mdata + lso_sb - m->l3_len -
195                                              m->l4_len +
196                                              (2 << !!(ol_flags & PKT_TX_IPV6)));
197                 }
198
199                 *iplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*iplen) - paylen);
200         }
201 }
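
/* Worked example for the LSO adjustment above (illustrative values):
 * a plain TCPv4 TSO packet with l2_len = 14, l3_len = 20, l4_len = 20
 * and pkt_len = 1514 gives lso_sb = 54 and paylen = 1460; the IPv4
 * total-length field at mdata + 14 + 2 is then reduced by 1460,
 * leaving a headers-only length for hardware to update per segment.
 */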
202
203 static __rte_always_inline void
204 cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, uintptr_t lmt_addr,
205                        const uint16_t flags, const uint64_t lso_tun_fmt)
206 {
207         struct nix_send_ext_s *send_hdr_ext;
208         struct nix_send_hdr_s *send_hdr;
209         uint64_t ol_flags = 0, mask;
210         union nix_send_hdr_w1_u w1;
211         union nix_send_sg_s *sg;
212
213         send_hdr = (struct nix_send_hdr_s *)cmd;
214         if (flags & NIX_TX_NEED_EXT_HDR) {
215                 send_hdr_ext = (struct nix_send_ext_s *)(cmd + 2);
216                 sg = (union nix_send_sg_s *)(cmd + 4);
217                 /* Clear previous markings */
218                 send_hdr_ext->w0.lso = 0;
219                 send_hdr_ext->w1.u = 0;
220         } else {
221                 sg = (union nix_send_sg_s *)(cmd + 2);
222         }
223
224         if (flags & NIX_TX_NEED_SEND_HDR_W1) {
225                 ol_flags = m->ol_flags;
226                 w1.u = 0;
227         }
228
229         if (!(flags & NIX_TX_MULTI_SEG_F)) {
230                 send_hdr->w0.total = m->data_len;
231                 send_hdr->w0.aura =
232                         roc_npa_aura_handle_to_aura(m->pool->pool_id);
233         }
234
235         /*
236          * L3type:  2 => IPV4
237          *          3 => IPV4 with csum
238          *          4 => IPV6
239          * L3type and L3ptr needs to be set for either
240          * L3 csum or L4 csum or LSO
241          *
242          */
243
244         if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
245             (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
246                 const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
247                 const uint8_t ol3type =
248                         ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
249                         ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
250                         !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);
251
252                 /* Outer L3 */
253                 w1.ol3type = ol3type;
254                 mask = 0xffffull << ((!!ol3type) << 4);
255                 w1.ol3ptr = ~mask & m->outer_l2_len;
256                 w1.ol4ptr = ~mask & (w1.ol3ptr + m->outer_l3_len);
257
258                 /* Outer L4 */
259                 w1.ol4type = csum + (csum << 1);
260
261                 /* Inner L3 */
262                 w1.il3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
263                              ((!!(ol_flags & PKT_TX_IPV6)) << 2);
264                 w1.il3ptr = w1.ol4ptr + m->l2_len;
265                 w1.il4ptr = w1.il3ptr + m->l3_len;
266                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
267                 w1.il3type = w1.il3type + !!(ol_flags & PKT_TX_IP_CKSUM);
268
269                 /* Inner L4 */
270                 w1.il4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
271
272                 /* In case of no tunnel header, shift the
273                  * IL3/IL4 fields down into OL3/OL4 so that
274                  * the single header's checksum uses OL3/OL4.
275                  */
276                 mask = !ol3type;
277                 w1.u = ((w1.u & 0xFFFFFFFF00000000) >> (mask << 3)) |
278                        ((w1.u & 0X00000000FFFFFFFF) >> (mask << 4));
279
280         } else if (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) {
281                 const uint8_t csum = !!(ol_flags & PKT_TX_OUTER_UDP_CKSUM);
282                 const uint8_t outer_l2_len = m->outer_l2_len;
283
284                 /* Outer L3 */
285                 w1.ol3ptr = outer_l2_len;
286                 w1.ol4ptr = outer_l2_len + m->outer_l3_len;
287                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
288                 w1.ol3type = ((!!(ol_flags & PKT_TX_OUTER_IPV4)) << 1) +
289                              ((!!(ol_flags & PKT_TX_OUTER_IPV6)) << 2) +
290                              !!(ol_flags & PKT_TX_OUTER_IP_CKSUM);
291
292                 /* Outer L4 */
293                 w1.ol4type = csum + (csum << 1);
294
295         } else if (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) {
296                 const uint8_t l2_len = m->l2_len;
297
298                 /* Always use OLXPTR and OLXTYPE when
299                  * only one header is present.
300                  */
301
302                 /* Inner L3 */
303                 w1.ol3ptr = l2_len;
304                 w1.ol4ptr = l2_len + m->l3_len;
305                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
306                 w1.ol3type = ((!!(ol_flags & PKT_TX_IPV4)) << 1) +
307                              ((!!(ol_flags & PKT_TX_IPV6)) << 2) +
308                              !!(ol_flags & PKT_TX_IP_CKSUM);
309
310                 /* Inner L4 */
311                 w1.ol4type = (ol_flags & PKT_TX_L4_MASK) >> 52;
312         }
313
314         if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
315                 send_hdr_ext->w1.vlan1_ins_ena = !!(ol_flags & PKT_TX_VLAN);
316                 /* HW will update ptr after vlan0 update */
317                 send_hdr_ext->w1.vlan1_ins_ptr = 12;
318                 send_hdr_ext->w1.vlan1_ins_tci = m->vlan_tci;
319
320                 send_hdr_ext->w1.vlan0_ins_ena = !!(ol_flags & PKT_TX_QINQ);
321                 /* 2B before end of l2 header */
322                 send_hdr_ext->w1.vlan0_ins_ptr = 12;
323                 send_hdr_ext->w1.vlan0_ins_tci = m->vlan_tci_outer;
324         }
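
        /* Example of the VLAN/QinQ insertion set up above (illustrative):
         * for a packet with PKT_TX_VLAN and vlan_tci = 0x0064, hardware
         * inserts one tag at byte offset 12 (right after the source MAC);
         * with PKT_TX_QINQ as well, vlan0 (outer, from vlan_tci_outer)
         * goes in first at offset 12 and hardware adjusts the vlan1
         * insert point accordingly.
         */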
325
326         if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & PKT_TX_TCP_SEG)) {
327                 uint16_t lso_sb;
328                 uint64_t mask;
329
330                 mask = -(!w1.il3type);
331                 lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
332
333                 send_hdr_ext->w0.lso_sb = lso_sb;
334                 send_hdr_ext->w0.lso = 1;
335                 send_hdr_ext->w0.lso_mps = m->tso_segsz;
336                 send_hdr_ext->w0.lso_format =
337                         NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & PKT_TX_IPV6);
338                 w1.ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
339
340                 /* Handle tunnel tso */
341                 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
342                     (ol_flags & PKT_TX_TUNNEL_MASK)) {
343                         const uint8_t is_udp_tun =
344                                 (CNXK_NIX_UDP_TUN_BITMASK >>
345                                  ((ol_flags & PKT_TX_TUNNEL_MASK) >> 45)) &
346                                 0x1;
347                         uint8_t shift = is_udp_tun ? 32 : 0;
348
349                         shift += (!!(ol_flags & PKT_TX_OUTER_IPV6) << 4);
350                         shift += (!!(ol_flags & PKT_TX_IPV6) << 3);
351
352                         w1.il4type = NIX_SENDL4TYPE_TCP_CKSUM;
353                         w1.ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
354                         /* Update format for UDP tunneled packet */
355                         send_hdr_ext->w0.lso_format = (lso_tun_fmt >> shift);
356                 }
357         }
358
359         if (flags & NIX_TX_NEED_SEND_HDR_W1)
360                 send_hdr->w1.u = w1.u;
361
362         if (!(flags & NIX_TX_MULTI_SEG_F)) {
363                 sg->seg1_size = m->data_len;
364                 *(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
365
366                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
367                         /* DF bit = 1 if refcount of current mbuf or parent mbuf
368                          *              is greater than 1
369                          * DF bit = 0 otherwise
370                          */
371                         send_hdr->w0.df = cnxk_nix_prefree_seg(m);
372                 }
373                 /* Mark mempool object as "put" since it is freed by NIX */
374                 if (!send_hdr->w0.df)
375                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
376         }
377
378         /* With minimal offloads, 'cmd' being local could be optimized out to
379          * registers. In other cases, 'cmd' will be on the stack. The intent is
380          * that 'cmd' stores the content from txq->cmd, which is copied only once.
381          */
382         *((struct nix_send_hdr_s *)lmt_addr) = *send_hdr;
383         lmt_addr += 16;
384         if (flags & NIX_TX_NEED_EXT_HDR) {
385                 *((struct nix_send_ext_s *)lmt_addr) = *send_hdr_ext;
386                 lmt_addr += 16;
387         }
388         /* In case of multi-seg, sg template is stored here */
389         *((union nix_send_sg_s *)lmt_addr) = *sg;
390         *(rte_iova_t *)(lmt_addr + 8) = *(rte_iova_t *)(sg + 1);
391 }
392
393 static __rte_always_inline void
394 cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
395                               const uint64_t ol_flags, const uint16_t no_segdw,
396                               const uint16_t flags)
397 {
398         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
399                 const uint8_t is_ol_tstamp = !(ol_flags & PKT_TX_IEEE1588_TMST);
400                 struct nix_send_ext_s *send_hdr_ext =
401                                         (struct nix_send_ext_s *)lmt_addr + 16;
402                 uint64_t *lmt = (uint64_t *)lmt_addr;
403                 uint16_t off = (no_segdw - 1) << 1;
404                 struct nix_send_mem_s *send_mem;
405
406                 send_mem = (struct nix_send_mem_s *)(lmt + off);
407                 send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
408                 send_hdr_ext->w0.tstmp = 1;
409                 if (flags & NIX_TX_MULTI_SEG_F) {
410                         /* Retrieving the default desc values */
411                         lmt[off] = cmd[2];
412
413                         /* Use a compiler barrier to avoid violation of C
414                          * aliasing rules.
415                          */
416                         rte_compiler_barrier();
417                 }
418
419                 /* For packets without PKT_TX_IEEE1588_TMST, the Tx timestamp
420                  * should not be recorded; hence change the alg type to
421                  * NIX_SENDMEMALG_SET and also move the send mem addr field to
422                  * the next 8 bytes so that it does not corrupt the actual
423                  * registered Tx timestamp address.
424                  */
425                 send_mem->w0.subdc = NIX_SUBDC_MEM;
426                 send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
427                 send_mem->addr =
428                         (rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
429         }
430 }
431
432 static __rte_always_inline uint16_t
433 cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
434 {
435         struct nix_send_hdr_s *send_hdr;
436         union nix_send_sg_s *sg;
437         struct rte_mbuf *m_next;
438         uint64_t *slist, sg_u;
439         uint64_t nb_segs;
440         uint64_t segdw;
441         uint8_t off, i;
442
443         send_hdr = (struct nix_send_hdr_s *)cmd;
444         send_hdr->w0.total = m->pkt_len;
445         send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
446
447         if (flags & NIX_TX_NEED_EXT_HDR)
448                 off = 2;
449         else
450                 off = 0;
451
452         sg = (union nix_send_sg_s *)&cmd[2 + off];
453         /* Clear sg->u header before use */
454         sg->u &= 0xFC00000000000000;
455         sg_u = sg->u;
456         slist = &cmd[3 + off];
457
458         i = 0;
459         nb_segs = m->nb_segs;
460
461         /* Fill mbuf segments */
462         do {
463                 m_next = m->next;
464                 sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
465                 *slist = rte_mbuf_data_iova(m);
466                 /* Set invert df if buffer is not to be freed by H/W */
467                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
468                         sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
469                         /* Mark mempool object as "put" since it is freed by NIX
470                          */
471 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
472                 if (!(sg_u & (1ULL << (i + 55))))
473                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
474 #endif
475                 slist++;
476                 i++;
477                 nb_segs--;
478                 if (i > 2 && nb_segs) {
479                         i = 0;
480                         /* Next SG subdesc */
481                         *(uint64_t *)slist = sg_u & 0xFC00000000000000;
482                         sg->u = sg_u;
483                         sg->segs = 3;
484                         sg = (union nix_send_sg_s *)slist;
485                         sg_u = sg->u;
486                         slist++;
487                 }
488                 m = m_next;
489         } while (nb_segs);
490
491         sg->u = sg_u;
492         sg->segs = i;
493         segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
494         /* Roundup extra dwords to multiple of 2 */
495         segdw = (segdw >> 1) + (segdw & 0x1);
496         /* Default dwords */
497         segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
498         send_hdr->w0.sizem1 = segdw - 1;
499
500         return segdw;
501 }
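
/* Worked example for the dword accounting above (illustrative): a
 * 3-segment mbuf chain with no extended header fills one SG subdesc
 * plus three pointers (4 dwords), which rounds up to 2 units; adding
 * the SEND_HDR default ((off >> 1) + 1 with off = 0) and no timestamp
 * yields segdw = 3 and sizem1 = 2.
 */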
502
503 static __rte_always_inline uint16_t
504 cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
505                     uint64_t *cmd, const uint16_t flags)
506 {
507         struct cn10k_eth_txq *txq = tx_queue;
508         const rte_iova_t io_addr = txq->io_addr;
509         uintptr_t pa, lmt_addr = txq->lmt_base;
510         uint16_t lmt_id, burst, left, i;
511         uint64_t lso_tun_fmt;
512         uint64_t data;
513
514         NIX_XMIT_FC_OR_RETURN(txq, pkts);
515
516         /* Get cmd skeleton */
517         cn10k_nix_tx_skeleton(txq, cmd, flags);
518
519         /* Reduce the cached count */
520         txq->fc_cache_pkts -= pkts;
521
522         if (flags & NIX_TX_OFFLOAD_TSO_F)
523                 lso_tun_fmt = txq->lso_tun_fmt;
524
525         /* Get LMT base address and LMT ID as lcore id */
526         ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
527         left = pkts;
528 again:
529         burst = left > 32 ? 32 : left;
530         for (i = 0; i < burst; i++) {
531                 /* Perform header writes for TSO; the barrier at
532                  * the LMT STEORL will suffice.
533                  */
534                 if (flags & NIX_TX_OFFLOAD_TSO_F)
535                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
536
537                 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
538                                        lso_tun_fmt);
539                 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
540                                               tx_pkts[i]->ol_flags, 4, flags);
541                 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
542         }
543
544         /* Trigger LMTST */
545         if (burst > 16) {
546                 data = cn10k_nix_tx_steor_data(flags);
547                 pa = io_addr | (data & 0x7) << 4;
548                 data &= ~0x7ULL;
549                 data |= (15ULL << 12);
550                 data |= (uint64_t)lmt_id;
551
552                 /* STEOR0 */
553                 roc_lmt_submit_steorl(data, pa);
554
555                 data = cn10k_nix_tx_steor_data(flags);
556                 pa = io_addr | (data & 0x7) << 4;
557                 data &= ~0x7ULL;
558                 data |= ((uint64_t)(burst - 17)) << 12;
559                 data |= (uint64_t)(lmt_id + 16);
560
561                 /* STEOR1 */
562                 roc_lmt_submit_steorl(data, pa);
563         } else if (burst) {
564                 data = cn10k_nix_tx_steor_data(flags);
565                 pa = io_addr | (data & 0x7) << 4;
566                 data &= ~0x7ULL;
567                 data |= ((uint64_t)(burst - 1)) << 12;
568                 data |= lmt_id;
569
570                 /* STEOR0 */
571                 roc_lmt_submit_steorl(data, pa);
572         }
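
        /* Illustration of the LMTST split above: a single STEOR can flush
         * at most 16 LMT lines, so e.g. burst = 20 issues STEOR0 for lines
         * 0..15 (count field 15) and STEOR1 for lines 16..19 (count field
         * burst - 17 = 3) starting at lmt_id + 16.
         */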
573
574         left -= burst;
575         rte_io_wmb();
576         if (left) {
577                 /* Start processing another burst */
578                 tx_pkts += burst;
579                 /* Reset lmt base addr */
580                 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
581                 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
582                 goto again;
583         }
584
585         return pkts;
586 }
587
588 static __rte_always_inline uint16_t
589 cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
590                          uint16_t pkts, uint64_t *cmd, const uint16_t flags)
591 {
592         struct cn10k_eth_txq *txq = tx_queue;
593         uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
594         const rte_iova_t io_addr = txq->io_addr;
595         uint16_t segdw, lmt_id, burst, left, i;
596         uint64_t data0, data1;
597         uint64_t lso_tun_fmt;
598         __uint128_t data128;
599         uint16_t shft;
600
601         NIX_XMIT_FC_OR_RETURN(txq, pkts);
602
603         cn10k_nix_tx_skeleton(txq, cmd, flags);
604
605         /* Reduce the cached count */
606         txq->fc_cache_pkts -= pkts;
607
608         if (flags & NIX_TX_OFFLOAD_TSO_F)
609                 lso_tun_fmt = txq->lso_tun_fmt;
610
611         /* Get LMT base address and LMT ID as lcore id */
612         ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
613         left = pkts;
614 again:
615         burst = left > 32 ? 32 : left;
616         shft = 16;
617         data128 = 0;
618         for (i = 0; i < burst; i++) {
619                 /* Perform header writes for TSO; the barrier at
620                  * the LMT STEORL will suffice.
621                  */
622                 if (flags & NIX_TX_OFFLOAD_TSO_F)
623                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
624
625                 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
626                                        lso_tun_fmt);
627                 /* Store sg list directly on lmt line */
628                 segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)lmt_addr,
629                                                flags);
630                 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
631                                               tx_pkts[i]->ol_flags, segdw,
632                                               flags);
633                 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
634                 data128 |= (((__uint128_t)(segdw - 1)) << shft);
635                 shft += 3;
636         }
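
        /* data128 packs the per-line sizes built above: shft starts at 16
         * so each line's (segdw - 1) lands in a 3-bit slot above the bits
         * that the STEOR word reserves for the LMT ID and line count; the
         * low and high 64-bit halves then feed STEOR0 and STEOR1 below.
         */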
637
638         data0 = (uint64_t)data128;
639         data1 = (uint64_t)(data128 >> 64);
640         /* Make data0 similar to data1 */
641         data0 >>= 16;
642         /* Trigger LMTST */
643         if (burst > 16) {
644                 pa0 = io_addr | (data0 & 0x7) << 4;
645                 data0 &= ~0x7ULL;
646                 /* Move lmtst1..15 sz to bits 63:19 */
647                 data0 <<= 16;
648                 data0 |= (15ULL << 12);
649                 data0 |= (uint64_t)lmt_id;
650
651                 /* STEOR0 */
652                 roc_lmt_submit_steorl(data0, pa0);
653
654                 pa1 = io_addr | (data1 & 0x7) << 4;
655                 data1 &= ~0x7ULL;
656                 data1 <<= 16;
657                 data1 |= ((uint64_t)(burst - 17)) << 12;
658                 data1 |= (uint64_t)(lmt_id + 16);
659
660                 /* STEOR1 */
661                 roc_lmt_submit_steorl(data1, pa1);
662         } else if (burst) {
663                 pa0 = io_addr | (data0 & 0x7) << 4;
664                 data0 &= ~0x7ULL;
665                 /* Move lmtst1..15 sz to bits 63:19 */
666                 data0 <<= 16;
667                 data0 |= ((burst - 1) << 12);
668                 data0 |= (uint64_t)lmt_id;
669
670                 /* STEOR0 */
671                 roc_lmt_submit_steorl(data0, pa0);
672         }
673
674         left -= burst;
675         rte_io_wmb();
676         if (left) {
677                 /* Start processing another burst */
678                 tx_pkts += burst;
679                 /* Reset lmt base addr */
680                 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
681                 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
682                 goto again;
683         }
684
685         return pkts;
686 }
687
688 #if defined(RTE_ARCH_ARM64)
689
690 #define NIX_DESCS_PER_LOOP 4
691 static __rte_always_inline uint16_t
692 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
693                            uint16_t pkts, uint64_t *cmd, const uint16_t flags)
694 {
695         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
696         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
697         uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
698                 cmd2[NIX_DESCS_PER_LOOP];
699         uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, data, pa;
700         uint64x2_t senddesc01_w0, senddesc23_w0;
701         uint64x2_t senddesc01_w1, senddesc23_w1;
702         uint16_t left, scalar, burst, i, lmt_id;
703         uint64x2_t sendext01_w0, sendext23_w0;
704         uint64x2_t sendext01_w1, sendext23_w1;
705         uint64x2_t sgdesc01_w0, sgdesc23_w0;
706         uint64x2_t sgdesc01_w1, sgdesc23_w1;
707         struct cn10k_eth_txq *txq = tx_queue;
708         uintptr_t laddr = txq->lmt_base;
709         rte_iova_t io_addr = txq->io_addr;
710         uint64x2_t ltypes01, ltypes23;
711         uint64x2_t xtmp128, ytmp128;
712         uint64x2_t xmask01, xmask23;
713         uint8_t lnum;
714
715         NIX_XMIT_FC_OR_RETURN(txq, pkts);
716
717         scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
718         pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
719
720         /* Reduce the cached count */
721         txq->fc_cache_pkts -= pkts;
722
723         senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
724         senddesc23_w0 = senddesc01_w0;
725         senddesc01_w1 = vdupq_n_u64(0);
726         senddesc23_w1 = senddesc01_w1;
727         sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
728         sgdesc23_w0 = sgdesc01_w0;
729
730         /* Load command defaults into vector variables. */
731         if (flags & NIX_TX_NEED_EXT_HDR) {
732                 sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
733                 sendext23_w0 = sendext01_w0;
734                 sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
735                 sendext23_w1 = sendext01_w1;
736         }
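
        /* The SEND_EXT w1 default (12 | 12U << 24) pre-loads the vlan0 and
         * vlan1 insert pointers with byte offset 12, matching the scalar
         * path above; per-packet code then only has to toggle the
         * insert-enable bits and fill in the TCIs.
         */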
737
738         /* Get LMT base address and LMT ID as lcore id */
739         ROC_LMT_BASE_ID_GET(laddr, lmt_id);
740         left = pkts;
741 again:
742         /* Number of packets to prepare depends on offloads enabled. */
743         burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
744                               cn10k_nix_pkts_per_vec_brst(flags) :
745                               left;
746         lnum = 0;
747         for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
748                 /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
749                 senddesc01_w0 =
750                         vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
751                 sgdesc01_w0 = vbicq_u64(sgdesc01_w0, vdupq_n_u64(0xFFFFFFFF));
752
753                 senddesc23_w0 = senddesc01_w0;
754                 sgdesc23_w0 = sgdesc01_w0;
755
756                 /* Clear vlan enables. */
757                 if (flags & NIX_TX_NEED_EXT_HDR) {
758                         sendext01_w1 = vbicq_u64(sendext01_w1,
759                                                  vdupq_n_u64(0x3FFFF00FFFF00));
760                         sendext23_w1 = sendext01_w1;
761                 }
762
763                 /* Move mbufs to iova */
764                 mbuf0 = (uint64_t *)tx_pkts[0];
765                 mbuf1 = (uint64_t *)tx_pkts[1];
766                 mbuf2 = (uint64_t *)tx_pkts[2];
767                 mbuf3 = (uint64_t *)tx_pkts[3];
768
769                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
770                                      offsetof(struct rte_mbuf, buf_iova));
771                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
772                                      offsetof(struct rte_mbuf, buf_iova));
773                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
774                                      offsetof(struct rte_mbuf, buf_iova));
775                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
776                                      offsetof(struct rte_mbuf, buf_iova));
777                 /*
778                  * Get each mbuf's ol_flags, iova, pktlen and dataoff:
779                  * dataoff_iovaX.D[0] = iova,
780                  * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
781                  * len_olflagsX.D[0] = ol_flags,
782                  * len_olflagsX.D[1](63:32) = mbuf->pkt_len
783                  */
784                 dataoff_iova0 = vld1q_u64(mbuf0);
785                 len_olflags0 = vld1q_u64(mbuf0 + 2);
786                 dataoff_iova1 = vld1q_u64(mbuf1);
787                 len_olflags1 = vld1q_u64(mbuf1 + 2);
788                 dataoff_iova2 = vld1q_u64(mbuf2);
789                 len_olflags2 = vld1q_u64(mbuf2 + 2);
790                 dataoff_iova3 = vld1q_u64(mbuf3);
791                 len_olflags3 = vld1q_u64(mbuf3 + 2);
792
793                 /* Move mbufs to point pool */
794                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
795                                      offsetof(struct rte_mbuf, pool) -
796                                      offsetof(struct rte_mbuf, buf_iova));
797                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
798                                      offsetof(struct rte_mbuf, pool) -
799                                      offsetof(struct rte_mbuf, buf_iova));
800                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
801                                      offsetof(struct rte_mbuf, pool) -
802                                      offsetof(struct rte_mbuf, buf_iova));
803                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
804                                      offsetof(struct rte_mbuf, pool) -
805                                      offsetof(struct rte_mbuf, buf_iova));
806
807                 if (flags & (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
808                              NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
809                         /* Get tx_offload for ol2, ol3, l2, l3 lengths */
810                         /*
811                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
812                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
813                          */
814
815                         asm volatile("LD1 {%[a].D}[0],[%[in]]\n\t"
816                                      : [a] "+w"(senddesc01_w1)
817                                      : [in] "r"(mbuf0 + 2)
818                                      : "memory");
819
820                         asm volatile("LD1 {%[a].D}[1],[%[in]]\n\t"
821                                      : [a] "+w"(senddesc01_w1)
822                                      : [in] "r"(mbuf1 + 2)
823                                      : "memory");
824
825                         asm volatile("LD1 {%[b].D}[0],[%[in]]\n\t"
826                                      : [b] "+w"(senddesc23_w1)
827                                      : [in] "r"(mbuf2 + 2)
828                                      : "memory");
829
830                         asm volatile("LD1 {%[b].D}[1],[%[in]]\n\t"
831                                      : [b] "+w"(senddesc23_w1)
832                                      : [in] "r"(mbuf3 + 2)
833                                      : "memory");
834
835                         /* Get pool pointer alone */
836                         mbuf0 = (uint64_t *)*mbuf0;
837                         mbuf1 = (uint64_t *)*mbuf1;
838                         mbuf2 = (uint64_t *)*mbuf2;
839                         mbuf3 = (uint64_t *)*mbuf3;
840                 } else {
841                         /* Get pool pointer alone */
842                         mbuf0 = (uint64_t *)*mbuf0;
843                         mbuf1 = (uint64_t *)*mbuf1;
844                         mbuf2 = (uint64_t *)*mbuf2;
845                         mbuf3 = (uint64_t *)*mbuf3;
846                 }
847
848                 const uint8x16_t shuf_mask2 = {
849                         0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
850                         0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
851                 };
852                 xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
853                 ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
854
855                 /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
856                 const uint64x2_t and_mask0 = {
857                         0xFFFFFFFFFFFFFFFF,
858                         0x000000000000FFFF,
859                 };
860
861                 dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
862                 dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
863                 dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
864                 dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
865
866                 /*
867                  * Pick only 16 bits of pktlen present at bits 63:32
868                  * and place them at bits 15:0.
869                  */
870                 xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
871                 ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
872
873                 /* Add pairwise to get dataoff + iova in sgdesc_w1 */
874                 sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
875                 sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
876
877                 /* Orr both sgdesc_w0 and senddesc_w0 with 16 bits of
878                  * pktlen at 15:0 position.
879                  */
880                 sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
881                 sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
882                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
883                 senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
884
885                 /* Move mbuf to point to pool_id. */
886                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
887                                      offsetof(struct rte_mempool, pool_id));
888                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
889                                      offsetof(struct rte_mempool, pool_id));
890                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
891                                      offsetof(struct rte_mempool, pool_id));
892                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
893                                      offsetof(struct rte_mempool, pool_id));
894
895                 if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
896                     !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
897                         /*
898                          * Lookup table to translate ol_flags to
899                          * il3/il4 types. But we still use ol3/ol4 types in
900                          * senddesc_w1 as only one header processing is enabled.
901                          */
902                         const uint8x16_t tbl = {
903                                 /* [0-15] = il4type:il3type */
904                                 0x04, /* none (IPv6 assumed) */
905                                 0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
906                                 0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
907                                 0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
908                                 0x03, /* PKT_TX_IP_CKSUM */
909                                 0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
910                                 0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
911                                 0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
912                                 0x02, /* PKT_TX_IPV4  */
913                                 0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
914                                 0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
915                                 0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
916                                 0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
917                                 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
918                                        * PKT_TX_TCP_CKSUM
919                                        */
920                                 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
921                                        * PKT_TX_SCTP_CKSUM
922                                        */
923                                 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
924                                        * PKT_TX_UDP_CKSUM
925                                        */
926                         };
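
                        /* Example of how the table is indexed (illustrative):
                         * ol_flags keeps the L4 checksum type in bits 53:52,
                         * PKT_TX_IP_CKSUM at bit 54 and PKT_TX_IPV4 at bit 55,
                         * so after the 4-bit shift below the nibble for
                         * PKT_TX_IPV4 | PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM is
                         * 0xD and the lookup yields 0x13 (il4type = TCP,
                         * il3type = IPv4 with csum).
                         */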
927
928                         /* Extract olflags to translate to iltypes */
929                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
930                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
931
932                         /*
933                          * E(47):L3_LEN(9):L2_LEN(7+z)
934                          * E(47):L3_LEN(9):L2_LEN(7+z)
935                          */
936                         senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
937                         senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
938
939                         /* Move OLFLAGS bits 55:52 to 51:48
940                          * with zeros prepended on the byte; the rest
941                          * is don't-care.
942                          */
943                         xtmp128 = vshrq_n_u8(xtmp128, 4);
944                         ytmp128 = vshrq_n_u8(ytmp128, 4);
945                         /*
946                          * E(48):L3_LEN(8):L2_LEN(z+7)
947                          * E(48):L3_LEN(8):L2_LEN(z+7)
948                          */
949                         const int8x16_t tshft3 = {
950                                 -1, 0, 8, 8, 8, 8, 8, 8,
951                                 -1, 0, 8, 8, 8, 8, 8, 8,
952                         };
953
954                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
955                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
956
957                         /* Do the lookup */
958                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
959                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
960
961                         /* Pick only the relevant fields, i.e. bits 48:55 of
962                          * iltype, and place them in ol3/ol4type of senddesc_w1.
963                          */
964                         const uint8x16_t shuf_mask0 = {
965                                 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
966                                 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
967                         };
968
969                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
970                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
971
972                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
973                          * a [E(32):E(16):OL3(8):OL2(8)]
974                          * a = a + (a << 8)
975                          * a [E(32):E(16):(OL3+OL2):OL2]
976                          * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
977                          */
978                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
979                                                  vshlq_n_u16(senddesc01_w1, 8));
980                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
981                                                  vshlq_n_u16(senddesc23_w1, 8));
982
983                         /* Move ltypes to senddesc*_w1 */
984                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
985                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
986                 } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
987                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
988                         /*
989                          * Lookup table to translate ol_flags to
990                          * ol3/ol4 types.
991                          */
992
993                         const uint8x16_t tbl = {
994                                 /* [0-15] = ol4type:ol3type */
995                                 0x00, /* none */
996                                 0x03, /* OUTER_IP_CKSUM */
997                                 0x02, /* OUTER_IPV4 */
998                                 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
999                                 0x04, /* OUTER_IPV6 */
1000                                 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1001                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1002                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1003                                        * OUTER_IP_CKSUM
1004                                        */
1005                                 0x00, /* OUTER_UDP_CKSUM */
1006                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
1007                                 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
1008                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
1009                                        * OUTER_IP_CKSUM
1010                                        */
1011                                 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
1012                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1013                                        * OUTER_IP_CKSUM
1014                                        */
1015                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1016                                        * OUTER_IPV4
1017                                        */
1018                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1019                                        * OUTER_IPV4 | OUTER_IP_CKSUM
1020                                        */
1021                         };
1022
1023                         /* Extract olflags to translate to iltypes */
1024                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1025                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1026
1027                         /*
1028                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1029                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1030                          */
1031                         const uint8x16_t shuf_mask5 = {
1032                                 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1033                                 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1034                         };
1035                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1036                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1037
1038                         /* Extract outer ol flags only */
1039                         const uint64x2_t o_cksum_mask = {
1040                                 0x1C00020000000000,
1041                                 0x1C00020000000000,
1042                         };
1043
1044                         xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
1045                         ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
1046
1047                         /* Extract OUTER_UDP_CKSUM bit 41 and
1048                          * move it to bit 61
1049                          */
1050
1051                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1052                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1053
1054                         /* Shift oltype by 2 to start nibble from BIT(56)
1055                          * instead of BIT(58)
1056                          */
1057                         xtmp128 = vshrq_n_u8(xtmp128, 2);
1058                         ytmp128 = vshrq_n_u8(ytmp128, 2);
1059                         /*
1060                          * E(48):L3_LEN(8):L2_LEN(z+7)
1061                          * E(48):L3_LEN(8):L2_LEN(z+7)
1062                          */
1063                         const int8x16_t tshft3 = {
1064                                 -1, 0, 8, 8, 8, 8, 8, 8,
1065                                 -1, 0, 8, 8, 8, 8, 8, 8,
1066                         };
1067
1068                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1069                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1070
1071                         /* Do the lookup */
1072                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1073                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1074
1075                         /* Pick only the relevant fields, i.e. bits 56:63 of
1076                          * oltype, and place them in ol3/ol4type of senddesc_w1.
1077                          */
1078                         const uint8x16_t shuf_mask0 = {
1079                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
1080                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
1081                         };
1082
1083                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1084                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1085
1086                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1087                          * a [E(32):E(16):OL3(8):OL2(8)]
1088                          * a = a + (a << 8)
1089                          * a [E(32):E(16):(OL3+OL2):OL2]
1090                          * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
1091                          */
1092                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1093                                                  vshlq_n_u16(senddesc01_w1, 8));
1094                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1095                                                  vshlq_n_u16(senddesc23_w1, 8));
1096
1097                         /* Move ltypes to senddesc*_w1 */
1098                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1099                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1100                 } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1101                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1102                         /* Lookup table to translate ol_flags to
1103                          * ol4type, ol3type, il4type, il3type of senddesc_w1
1104                          */
1105                         const uint8x16x2_t tbl = {{
1106                                 {
1107                                         /* [0-15] = il4type:il3type */
1108                                         0x04, /* none (IPv6) */
1109                                         0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
1110                                         0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
1111                                         0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
1112                                         0x03, /* PKT_TX_IP_CKSUM */
1113                                         0x13, /* PKT_TX_IP_CKSUM |
1114                                                * PKT_TX_TCP_CKSUM
1115                                                */
1116                                         0x23, /* PKT_TX_IP_CKSUM |
1117                                                * PKT_TX_SCTP_CKSUM
1118                                                */
1119                                         0x33, /* PKT_TX_IP_CKSUM |
1120                                                * PKT_TX_UDP_CKSUM
1121                                                */
1122                                         0x02, /* PKT_TX_IPV4 */
1123                                         0x12, /* PKT_TX_IPV4 |
1124                                                * PKT_TX_TCP_CKSUM
1125                                                */
1126                                         0x22, /* PKT_TX_IPV4 |
1127                                                * PKT_TX_SCTP_CKSUM
1128                                                */
1129                                         0x32, /* PKT_TX_IPV4 |
1130                                                * PKT_TX_UDP_CKSUM
1131                                                */
1132                                         0x03, /* PKT_TX_IPV4 |
1133                                                * PKT_TX_IP_CKSUM
1134                                                */
1135                                         0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1136                                                * PKT_TX_TCP_CKSUM
1137                                                */
1138                                         0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1139                                                * PKT_TX_SCTP_CKSUM
1140                                                */
1141                                         0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
1142                                                * PKT_TX_UDP_CKSUM
1143                                                */
1144                                 },
1145
1146                                 {
1147                                         /* [16-31] = ol4type:ol3type */
1148                                         0x00, /* none */
1149                                         0x03, /* OUTER_IP_CKSUM */
1150                                         0x02, /* OUTER_IPV4 */
1151                                         0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1152                                         0x04, /* OUTER_IPV6 */
1153                                         0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1154                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1155                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1156                                                * OUTER_IP_CKSUM
1157                                                */
1158                                         0x00, /* OUTER_UDP_CKSUM */
1159                                         0x33, /* OUTER_UDP_CKSUM |
1160                                                * OUTER_IP_CKSUM
1161                                                */
1162                                         0x32, /* OUTER_UDP_CKSUM |
1163                                                * OUTER_IPV4
1164                                                */
1165                                         0x33, /* OUTER_UDP_CKSUM |
1166                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1167                                                */
1168                                         0x34, /* OUTER_UDP_CKSUM |
1169                                                * OUTER_IPV6
1170                                                */
1171                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1172                                                * OUTER_IP_CKSUM
1173                                                */
1174                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1175                                                * OUTER_IPV4
1176                                                */
1177                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1178                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1179                                                */
1180                                 },
1181                         }};
1182
1183                         /* Extract olflags to translate to oltype & iltype */
1184                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1185                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1186
1187                         /*
1188                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1189                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1190                          */
1191                         const uint32x4_t tshft_4 = {
1192                                 1,
1193                                 0,
1194                                 1,
1195                                 0,
1196                         };
1197                         senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
1198                         senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
1199
1200                         /*
1201                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1202                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1203                          */
1204                         const uint8x16_t shuf_mask5 = {
1205                                 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
1206                                 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1207                         };
1208                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1209                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1210
1211                         /* Extract outer and inner header ol_flags */
1212                         const uint64x2_t oi_cksum_mask = {
1213                                 0x1CF0020000000000,
1214                                 0x1CF0020000000000,
1215                         };
1216
1217                         xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
1218                         ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
1219
1220                         /* Extract OUTER_UDP_CKSUM bit 41 and
1221                          * move it to bit 61
1222                          */
1223
1224                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1225                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
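                             /* Scalar equivalent of the OR-shift above (a
                              * sketch):
                              *
                              *   w |= w << 20;  // bit 41 -> bit 61
                              *
                              * Only OUTER_UDP_CKSUM (bit 41) lands in range;
                              * the higher mask bits shift out of the 64-bit
                              * lane, and the original bit 41 is cleared by the
                              * byte shifts below.
                              */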
1226
1227                         /* Shift right oltype by 2 and iltype by 4
1228                          * to start oltype nibble from BIT(56)
1229                          * instead of BIT(58) and iltype nibble from BIT(48)
1230                          * instead of BIT(52).
1231                          */
1232                         const int8x16_t tshft5 = {
1233                                 8, 8, 8, 8, 8, 8, -4, -2,
1234                                 8, 8, 8, 8, 8, 8, -4, -2,
1235                         };
1236
1237                         xtmp128 = vshlq_u8(xtmp128, tshft5);
1238                         ytmp128 = vshlq_u8(ytmp128, tshft5);
1239                         /*
1240                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1241                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1242                          */
1243                         const int8x16_t tshft3 = {
1244                                 -1, 0, -1, 0, 0, 0, 0, 0,
1245                                 -1, 0, -1, 0, 0, 0, 0, 0,
1246                         };
1247
1248                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1249                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1250
1251                         /* Mark bit 4 of oltype (bit 60): index the outer half of the table */
1252                         const uint64x2_t oi_cksum_mask2 = {
1253                                 0x1000000000000000,
1254                                 0x1000000000000000,
1255                         };
1256
1257                         xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
1258                         ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
1259
1260                         /* Do the lookup */
1261                         ltypes01 = vqtbl2q_u8(tbl, xtmp128);
1262                         ltypes23 = vqtbl2q_u8(tbl, ytmp128);
1263
1264                         /* Pick only the relevant fields, i.e. bits 48:55 of
1265                          * iltype and bits 56:63 of oltype, and place them at
1266                          * the corresponding offsets in senddesc_w1.
1267                          */
1268                         const uint8x16_t shuf_mask0 = {
1269                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
1270                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
1271                         };
1272
1273                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1274                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1275
1276                         /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
1277                          * l3len, l2len, ol3len, ol2len.
1278                          * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
1279                          * a = a + (a << 8)
1280                          * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
1281                          * a = a + (a << 16)
1282                          * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
1283                          * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
1284                          */
1285                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1286                                                  vshlq_n_u32(senddesc01_w1, 8));
1287                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1288                                                  vshlq_n_u32(senddesc23_w1, 8));
1289
1290                         /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
1291                         senddesc01_w1 = vaddq_u8(
1292                                 senddesc01_w1, vshlq_n_u32(senddesc01_w1, 16));
1293                         senddesc23_w1 = vaddq_u8(
1294                                 senddesc23_w1, vshlq_n_u32(senddesc23_w1, 16));
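                             /* The two add+shift rounds above form a byte-wise
                              * prefix sum. Scalar sketch on one 32-bit group
                              * holding [l3:l2:ol3:ol2]:
                              *
                              *   a += a << 8;   // byte[i] += byte[i-1]
                              *   a += a << 16;  // byte[i] += byte[i-2]
                              *
                              * leaving ol3ptr = ol2, ol4ptr = ol2 + ol3,
                              * il3ptr = ol2 + ol3 + l2 and
                              * il4ptr = ol2 + ol3 + l2 + l3 in the four bytes.
                              */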
1295
1296                         /* Move ltypes to senddesc*_w1 */
1297                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1298                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1299                 }
1300
1301                 xmask01 = vdupq_n_u64(0);
1302                 xmask23 = xmask01;
1303                 asm volatile("LD1 {%[a].H}[0],[%[in]]\n\t"
1304                              : [a] "+w"(xmask01)
1305                              : [in] "r"(mbuf0)
1306                              : "memory");
1307
1308                 asm volatile("LD1 {%[a].H}[4],[%[in]]\n\t"
1309                              : [a] "+w"(xmask01)
1310                              : [in] "r"(mbuf1)
1311                              : "memory");
1312
1313                 asm volatile("LD1 {%[b].H}[0],[%[in]]\n\t"
1314                              : [b] "+w"(xmask23)
1315                              : [in] "r"(mbuf2)
1316                              : "memory");
1317
1318                 asm volatile("LD1 {%[b].H}[4],[%[in]]\n\t"
1319                              : [b] "+w"(xmask23)
1320                              : [in] "r"(mbuf3)
1321                              : "memory");
1322                 xmask01 = vshlq_n_u64(xmask01, 20);
1323                 xmask23 = vshlq_n_u64(xmask23, 20);
1324
1325                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1326                 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
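                     /* A note on the block above: the four LD1s gather one
                      * 16-bit field per mbuf (per the pointer setup earlier in
                      * the function) into halfword lanes 0/4, and the shift by
                      * 20 places it at bits 20+ of send hdr w0, the position of
                      * the aura field. Rough scalar form, assuming mbuf0 points
                      * at that 16-bit field:
                      *
                      *   w0 |= (uint64_t)*(const uint16_t *)mbuf0 << 20;
                      */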
1327
1328                 if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
1329                         /* Tx ol_flag for VLAN. */
1330                         const uint64x2_t olv = {PKT_TX_VLAN, PKT_TX_VLAN};
1331                         /* Bit enable for VLAN1 */
1332                         const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
1333                         /* Tx ol_flag for QinQ. */
1334                         const uint64x2_t olq = {PKT_TX_QINQ, PKT_TX_QINQ};
1335                         /* Bit enable for VLAN0 */
1336                         const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
1337                         /* Load VLAN values from packets; outer TCI is VLAN0 */
1338                         uint64x2_t ext01 = {
1339                                 ((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
1340                                         ((uint64_t)tx_pkts[0]->vlan_tci) << 32,
1341                                 ((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
1342                                         ((uint64_t)tx_pkts[1]->vlan_tci) << 32,
1343                         };
1344                         uint64x2_t ext23 = {
1345                                 ((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
1346                                         ((uint64_t)tx_pkts[2]->vlan_tci) << 32,
1347                                 ((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
1348                                         ((uint64_t)tx_pkts[3]->vlan_tci) << 32,
1349                         };
1350
1351                         /* Get ol_flags of the packets. */
1352                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1353                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1354
1355                         /* ORR vlan outer/inner values into cmd. */
1356                         sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
1357                         sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
1358
1359                         /* Test for offload enable bits and generate masks. */
1360                         xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
1361                                                       mlv),
1362                                             vandq_u64(vtstq_u64(xtmp128, olq),
1363                                                       mlq));
1364                         ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
1365                                                       mlv),
1366                                             vandq_u64(vtstq_u64(ytmp128, olq),
1367                                                       mlq));
1368
1369                         /* Set vlan enable bits into cmd based on mask. */
1370                         sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
1371                         sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
1372                 }
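                     /* Net effect of the VLAN block above per packet (scalar
                      * sketch): TCIs are always merged into send ext w1, but
                      * the insertion-enable bits are set only when the
                      * matching ol_flag is present:
                      *
                      *   w1 |= (uint64_t)m->vlan_tci_outer << 8;  // VLAN0
                      *   w1 |= (uint64_t)m->vlan_tci << 32;       // VLAN1
                      *   if (m->ol_flags & PKT_TX_QINQ)
                      *           w1 |= BIT_ULL(48);  // VLAN0 valid
                      *   if (m->ol_flags & PKT_TX_VLAN)
                      *           w1 |= BIT_ULL(49);  // VLAN1 valid
                      */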
1373
1374                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
1375                         /* Set don't free bit if reference count > 1 */
1376                         xmask01 = vdupq_n_u64(0);
1377                         xmask23 = xmask01;
1378
1379                         /* Restore mbuf0-3 to the mbuf pointers */
1380                         mbuf0 = (uint64_t *)tx_pkts[0];
1381                         mbuf1 = (uint64_t *)tx_pkts[1];
1382                         mbuf2 = (uint64_t *)tx_pkts[2];
1383                         mbuf3 = (uint64_t *)tx_pkts[3];
1384
1385                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
1386                                         xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
1387                         else
1388                                 __mempool_check_cookies(
1389                                         ((struct rte_mbuf *)mbuf0)->pool,
1390                                         (void **)&mbuf0, 1, 0);
1391
1392                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
1393                                         xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
1394                         else
1395                                 __mempool_check_cookies(
1396                                         ((struct rte_mbuf *)mbuf1)->pool,
1397                                         (void **)&mbuf1, 1, 0);
1398
1399                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
1400                                         xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
1401                         else
1402                                 __mempool_check_cookies(
1403                                         ((struct rte_mbuf *)mbuf2)->pool,
1404                                         (void **)&mbuf2, 1, 0);
1405
1406                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
1407                                         xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
1408                         else
1409                                 __mempool_check_cookies(
1410                                         ((struct rte_mbuf *)mbuf3)->pool,
1411                                         (void **)&mbuf3, 1, 0);
1412                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1413                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1414                 } else {
1415                         /* Restore mbuf0-3 to the mbuf pointers */
1416                         mbuf0 = (uint64_t *)tx_pkts[0];
1417                         mbuf1 = (uint64_t *)tx_pkts[1];
1418                         mbuf2 = (uint64_t *)tx_pkts[2];
1419                         mbuf3 = (uint64_t *)tx_pkts[3];
1420
1421                         /* Mark mempool object as "put" since
1422                          * it is freed by NIX
1423                          */
1424                         __mempool_check_cookies(
1425                                 ((struct rte_mbuf *)mbuf0)->pool,
1426                                 (void **)&mbuf0, 1, 0);
1427
1428                         __mempool_check_cookies(
1429                                 ((struct rte_mbuf *)mbuf1)->pool,
1430                                 (void **)&mbuf1, 1, 0);
1431
1432                         __mempool_check_cookies(
1433                                 ((struct rte_mbuf *)mbuf2)->pool,
1434                                 (void **)&mbuf2, 1, 0);
1435
1436                         __mempool_check_cookies(
1437                                 ((struct rte_mbuf *)mbuf3)->pool,
1438                                 (void **)&mbuf3, 1, 0);
1439                 }
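                 /* Summary of the branch above (scalar sketch): when a segment
                  * cannot be freed by hardware (e.g. reference count > 1),
                  * cnxk_nix_prefree_seg() returns 1 and the don't-free bit of
                  * send hdr w0 is set; otherwise the mempool debug cookie is
                  * marked "put" since NIX frees the buffer. Note that
                  * vsetq_lane_u64() returns a new vector, so its result must
                  * be assigned back:
                  *
                  *   if (cnxk_nix_prefree_seg(m))
                  *           w0 |= BIT_ULL(19);  // 0x80000: don't free
                  */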
1440
1441                 /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
1442                 cmd0[0] = vzip1q_u64(senddesc01_w0, senddesc01_w1);
1443                 cmd0[1] = vzip2q_u64(senddesc01_w0, senddesc01_w1);
1444                 cmd0[2] = vzip1q_u64(senddesc23_w0, senddesc23_w1);
1445                 cmd0[3] = vzip2q_u64(senddesc23_w0, senddesc23_w1);
1446
1447                 cmd1[0] = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
1448                 cmd1[1] = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
1449                 cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
1450                 cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
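                 /* vzip1q/vzip2q de-interleave the paired lanes back into
                  * per-packet descriptors, e.g. (a sketch):
                  *
                  *   cmd0[0] = { senddesc0_w0, senddesc0_w1 };  // pkt 0
                  *   cmd0[1] = { senddesc1_w0, senddesc1_w1 };  // pkt 1
                  */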
1451
1452                 if (flags & NIX_TX_NEED_EXT_HDR) {
1453                         cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
1454                         cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
1455                         cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
1456                         cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
1457                 }
1458
1459                 if (flags & NIX_TX_NEED_EXT_HDR) {
1460                         /* Store the prepared send desc to LMT lines */
1461                         vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1462                         vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1463                         vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1464                         vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
1465                         vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
1466                         vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
1467                         lnum += 1;
1468                         vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1469                         vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1470                         vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1471                         vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
1472                         vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
1473                         vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
1474                         lnum += 1;
1475                 } else {
1476                         /* Store the prepared send desc to LMT lines */
1477                         vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1478                         vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
1479                         vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
1480                         vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
1481                         vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
1482                         vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
1483                         vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
1484                         vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
1485                         lnum += 1;
1486                 }
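                 /* Resulting LMT line layouts, read off the store offsets
                  * above: with the ext header each packet takes 48 B, so two
                  * packets share one 128 B LMT line:
                  *
                  *   line n: [hdr0|ext0|sg0][hdr1|ext1|sg1]           (96 B)
                  *
                  * without it each packet takes 32 B and four fill a line:
                  *
                  *   line n: [hdr0|sg0][hdr1|sg1][hdr2|sg2][hdr3|sg3] (128 B)
                  */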
1487
1488                 tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
1489         }
1490
1491         /* Trigger LMTST */
1492         if (lnum > 16) {
1493                 data = cn10k_nix_tx_steor_vec_data(flags);
1494                 pa = io_addr | (data & 0x7) << 4;
1495                 data &= ~0x7ULL;
1496                 data |= (15ULL << 12);
1497                 data |= (uint64_t)lmt_id;
1498
1499                 /* STEOR0 */
1500                 roc_lmt_submit_steorl(data, pa);
1501
1502                 data = cn10k_nix_tx_steor_vec_data(flags);
1503                 pa = io_addr | (data & 0x7) << 4;
1504                 data &= ~0x7ULL;
1505                 data |= ((uint64_t)(lnum - 17)) << 12;
1506                 data |= (uint64_t)(lmt_id + 16);
1507
1508                 /* STEOR1 */
1509                 roc_lmt_submit_steorl(data, pa);
1510         } else if (lnum) {
1511                 data = cn10k_nix_tx_steor_vec_data(flags);
1512                 pa = io_addr | (data & 0x7) << 4;
1513                 data &= ~0x7ULL;
1514                 data |= ((uint64_t)(lnum - 1)) << 12;
1515                 data |= lmt_id;
1516
1517                 /* STEOR0 */
1518                 roc_lmt_submit_steorl(data, pa);
1519         }
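         /* STEOR submission encoding, read off the arithmetic above (a
          * sketch): the low bits of data carry the first LMT id, bits 12+
          * carry the line count minus one, and the low three bits of the
          * pre-shift data (presumably the per-packet command size in 16 B
          * units, minus one) travel in bits 4-6 of the I/O address:
          *
          *   pa   = io_addr | (data & 0x7) << 4;
          *   data = ((lines - 1) << 12) | lmt_id;
          *   roc_lmt_submit_steorl(data, pa);
          *
          * Bursts above 16 lines are split: STEOR0 flushes the first 16,
          * STEOR1 the remaining lnum - 16 starting at lmt_id + 16.
          */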
1520
1521         left -= burst;
1522         rte_io_wmb();
1523         if (left)
1524                 goto again;
1525
1526         if (unlikely(scalar))
1527                 pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar, cmd,
1528                                             flags);
1529
1530         return pkts;
1531 }
1532
1533 #else
1534 static __rte_always_inline uint16_t
1535 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
1536                            uint16_t pkts, uint64_t *cmd, const uint16_t flags)
1537 {
1538         RTE_SET_USED(tx_queue);
1539         RTE_SET_USED(tx_pkts);
1540         RTE_SET_USED(pkts);
1541         RTE_SET_USED(cmd);
1542         RTE_SET_USED(flags);
1543         return 0;
1544 }
1545 #endif
1546
1547 #define L3L4CSUM_F   NIX_TX_OFFLOAD_L3_L4_CSUM_F
1548 #define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
1549 #define VLAN_F       NIX_TX_OFFLOAD_VLAN_QINQ_F
1550 #define NOFF_F       NIX_TX_OFFLOAD_MBUF_NOFF_F
1551 #define TSO_F        NIX_TX_OFFLOAD_TSO_F
1552 #define TSP_F        NIX_TX_OFFLOAD_TSTAMP_F
1553
1554 /* [TSP] [TSO] [NOFF] [VLAN] [OL3OL4CSUM] [L3L4CSUM]; the numeric column after the flag bits is the Tx command size in dwords */
1555 #define NIX_TX_FASTPATH_MODES                                           \
1556 T(no_offload,                           0, 0, 0, 0, 0, 0,       4,      \
1557                 NIX_TX_OFFLOAD_NONE)                                    \
1558 T(l3l4csum,                             0, 0, 0, 0, 0, 1,       4,      \
1559                 L3L4CSUM_F)                                             \
1560 T(ol3ol4csum,                           0, 0, 0, 0, 1, 0,       4,      \
1561                 OL3OL4CSUM_F)                                           \
1562 T(ol3ol4csum_l3l4csum,                  0, 0, 0, 0, 1, 1,       4,      \
1563                 OL3OL4CSUM_F | L3L4CSUM_F)                              \
1564 T(vlan,                                 0, 0, 0, 1, 0, 0,       6,      \
1565                 VLAN_F)                                                 \
1566 T(vlan_l3l4csum,                        0, 0, 0, 1, 0, 1,       6,      \
1567                 VLAN_F | L3L4CSUM_F)                                    \
1568 T(vlan_ol3ol4csum,                      0, 0, 0, 1, 1, 0,       6,      \
1569                 VLAN_F | OL3OL4CSUM_F)                                  \
1570 T(vlan_ol3ol4csum_l3l4csum,             0, 0, 0, 1, 1, 1,       6,      \
1571                 VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
1572 T(noff,                                 0, 0, 1, 0, 0, 0,       4,      \
1573                 NOFF_F)                                                 \
1574 T(noff_l3l4csum,                        0, 0, 1, 0, 0, 1,       4,      \
1575                 NOFF_F | L3L4CSUM_F)                                    \
1576 T(noff_ol3ol4csum,                      0, 0, 1, 0, 1, 0,       4,      \
1577                 NOFF_F | OL3OL4CSUM_F)                                  \
1578 T(noff_ol3ol4csum_l3l4csum,             0, 0, 1, 0, 1, 1,       4,      \
1579                 NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
1580 T(noff_vlan,                            0, 0, 1, 1, 0, 0,       6,      \
1581                 NOFF_F | VLAN_F)                                        \
1582 T(noff_vlan_l3l4csum,                   0, 0, 1, 1, 0, 1,       6,      \
1583                 NOFF_F | VLAN_F | L3L4CSUM_F)                           \
1584 T(noff_vlan_ol3ol4csum,                 0, 0, 1, 1, 1, 0,       6,      \
1585                 NOFF_F | VLAN_F | OL3OL4CSUM_F)                         \
1586 T(noff_vlan_ol3ol4csum_l3l4csum,        0, 0, 1, 1, 1, 1,       6,      \
1587                 NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)            \
1588 T(tso,                                  0, 1, 0, 0, 0, 0,       6,      \
1589                 TSO_F)                                                  \
1590 T(tso_l3l4csum,                         0, 1, 0, 0, 0, 1,       6,      \
1591                 TSO_F | L3L4CSUM_F)                                     \
1592 T(tso_ol3ol4csum,                       0, 1, 0, 0, 1, 0,       6,      \
1593                 TSO_F | OL3OL4CSUM_F)                                   \
1594 T(tso_ol3ol4csum_l3l4csum,              0, 1, 0, 0, 1, 1,       6,      \
1595                 TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
1596 T(tso_vlan,                             0, 1, 0, 1, 0, 0,       6,      \
1597                 TSO_F | VLAN_F)                                         \
1598 T(tso_vlan_l3l4csum,                    0, 1, 0, 1, 0, 1,       6,      \
1599                 TSO_F | VLAN_F | L3L4CSUM_F)                            \
1600 T(tso_vlan_ol3ol4csum,                  0, 1, 0, 1, 1, 0,       6,      \
1601                 TSO_F | VLAN_F | OL3OL4CSUM_F)                          \
1602 T(tso_vlan_ol3ol4csum_l3l4csum,         0, 1, 0, 1, 1, 1,       6,      \
1603                 TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
1604 T(tso_noff,                             0, 1, 1, 0, 0, 0,       6,      \
1605                 TSO_F | NOFF_F)                                         \
1606 T(tso_noff_l3l4csum,                    0, 1, 1, 0, 0, 1,       6,      \
1607                 TSO_F | NOFF_F | L3L4CSUM_F)                            \
1608 T(tso_noff_ol3ol4csum,                  0, 1, 1, 0, 1, 0,       6,      \
1609                 TSO_F | NOFF_F | OL3OL4CSUM_F)                          \
1610 T(tso_noff_ol3ol4csum_l3l4csum,         0, 1, 1, 0, 1, 1,       6,      \
1611                 TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
1612 T(tso_noff_vlan,                        0, 1, 1, 1, 0, 0,       6,      \
1613                 TSO_F | NOFF_F | VLAN_F)                                \
1614 T(tso_noff_vlan_l3l4csum,               0, 1, 1, 1, 0, 1,       6,      \
1615                 TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
1616 T(tso_noff_vlan_ol3ol4csum,             0, 1, 1, 1, 1, 0,       6,      \
1617                 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
1618 T(tso_noff_vlan_ol3ol4csum_l3l4csum,    0, 1, 1, 1, 1, 1,       6,      \
1619                 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
1620 T(ts,                                   1, 0, 0, 0, 0, 0,       8,      \
1621                 TSP_F)                                                  \
1622 T(ts_l3l4csum,                          1, 0, 0, 0, 0, 1,       8,      \
1623                 TSP_F | L3L4CSUM_F)                                     \
1624 T(ts_ol3ol4csum,                        1, 0, 0, 0, 1, 0,       8,      \
1625                 TSP_F | OL3OL4CSUM_F)                                   \
1626 T(ts_ol3ol4csum_l3l4csum,               1, 0, 0, 0, 1, 1,       8,      \
1627                 TSP_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
1628 T(ts_vlan,                              1, 0, 0, 1, 0, 0,       8,      \
1629                 TSP_F | VLAN_F)                                         \
1630 T(ts_vlan_l3l4csum,                     1, 0, 0, 1, 0, 1,       8,      \
1631                 TSP_F | VLAN_F | L3L4CSUM_F)                            \
1632 T(ts_vlan_ol3ol4csum,                   1, 0, 0, 1, 1, 0,       8,      \
1633                 TSP_F | VLAN_F | OL3OL4CSUM_F)                          \
1634 T(ts_vlan_ol3ol4csum_l3l4csum,          1, 0, 0, 1, 1, 1,       8,      \
1635                 TSP_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
1636 T(ts_noff,                              1, 0, 1, 0, 0, 0,       8,      \
1637                 TSP_F | NOFF_F)                                         \
1638 T(ts_noff_l3l4csum,                     1, 0, 1, 0, 0, 1,       8,      \
1639                 TSP_F | NOFF_F | L3L4CSUM_F)                            \
1640 T(ts_noff_ol3ol4csum,                   1, 0, 1, 0, 1, 0,       8,      \
1641                 TSP_F | NOFF_F | OL3OL4CSUM_F)                          \
1642 T(ts_noff_ol3ol4csum_l3l4csum,          1, 0, 1, 0, 1, 1,       8,      \
1643                 TSP_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
1644 T(ts_noff_vlan,                         1, 0, 1, 1, 0, 0,       8,      \
1645                 TSP_F | NOFF_F | VLAN_F)                                \
1646 T(ts_noff_vlan_l3l4csum,                1, 0, 1, 1, 0, 1,       8,      \
1647                 TSP_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
1648 T(ts_noff_vlan_ol3ol4csum,              1, 0, 1, 1, 1, 0,       8,      \
1649                 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
1650 T(ts_noff_vlan_ol3ol4csum_l3l4csum,     1, 0, 1, 1, 1, 1,       8,      \
1651                 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
1652 T(ts_tso,                               1, 1, 0, 0, 0, 0,       8,      \
1653                 TSP_F | TSO_F)                                          \
1654 T(ts_tso_l3l4csum,                      1, 1, 0, 0, 0, 1,       8,      \
1655                 TSP_F | TSO_F | L3L4CSUM_F)                             \
1656 T(ts_tso_ol3ol4csum,                    1, 1, 0, 0, 1, 0,       8,      \
1657                 TSP_F | TSO_F | OL3OL4CSUM_F)                           \
1658 T(ts_tso_ol3ol4csum_l3l4csum,           1, 1, 0, 0, 1, 1,       8,      \
1659                 TSP_F | TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)              \
1660 T(ts_tso_vlan,                          1, 1, 0, 1, 0, 0,       8,      \
1661                 TSP_F | TSO_F | VLAN_F)                                 \
1662 T(ts_tso_vlan_l3l4csum,                 1, 1, 0, 1, 0, 1,       8,      \
1663                 TSP_F | TSO_F | VLAN_F | L3L4CSUM_F)                    \
1664 T(ts_tso_vlan_ol3ol4csum,               1, 1, 0, 1, 1, 0,       8,      \
1665                 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F)                  \
1666 T(ts_tso_vlan_ol3ol4csum_l3l4csum,      1, 1, 0, 1, 1, 1,       8,      \
1667                 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
1668 T(ts_tso_noff,                          1, 1, 1, 0, 0, 0,       8,      \
1669                 TSP_F | TSO_F | NOFF_F)                                 \
1670 T(ts_tso_noff_l3l4csum,                 1, 1, 1, 0, 0, 1,       8,      \
1671                 TSP_F | TSO_F | NOFF_F | L3L4CSUM_F)                    \
1672 T(ts_tso_noff_ol3ol4csum,               1, 1, 1, 0, 1, 0,       8,      \
1673                 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F)                  \
1674 T(ts_tso_noff_ol3ol4csum_l3l4csum,      1, 1, 1, 0, 1, 1,       8,      \
1675                 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
1676 T(ts_tso_noff_vlan,                     1, 1, 1, 1, 0, 0,       8,      \
1677                 TSP_F | TSO_F | NOFF_F | VLAN_F)                        \
1678 T(ts_tso_noff_vlan_l3l4csum,            1, 1, 1, 1, 0, 1,       8,      \
1679                 TSP_F | TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)           \
1680 T(ts_tso_noff_vlan_ol3ol4csum,          1, 1, 1, 1, 1, 0,       8,      \
1681                 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)         \
1682 T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum, 1, 1, 1, 1, 1, 1,       8,      \
1683                 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)
1684
1685 #define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
1686         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_##name(          \
1687                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
1688                                                                                \
1689         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_mseg_##name(     \
1690                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
1691                                                                                \
1692         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name(      \
1693                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
1694
1695 NIX_TX_FASTPATH_MODES
1696 #undef T
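/* Each T() row expands, through the declaration block above, into three
 * function declarations. For example the l3l4csum row yields:
 *
 *   uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_l3l4csum(
 *           void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
 *   uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_mseg_l3l4csum(
 *           void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
 *   uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_l3l4csum(
 *           void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
 *
 * The corresponding .c files are expected to re-define T() with a body in
 * order to emit the matching definitions.
 */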
1697
1698 #endif /* __CN10K_TX_H__ */