mbuf: add rte prefix to offload flags
drivers/net/cnxk/cn10k_tx.h
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(C) 2021 Marvell.
3  */
4 #ifndef __CN10K_TX_H__
5 #define __CN10K_TX_H__
6
7 #include <rte_vect.h>
8
9 #define NIX_TX_OFFLOAD_NONE           (0)
10 #define NIX_TX_OFFLOAD_L3_L4_CSUM_F   BIT(0)
11 #define NIX_TX_OFFLOAD_OL3_OL4_CSUM_F BIT(1)
12 #define NIX_TX_OFFLOAD_VLAN_QINQ_F    BIT(2)
13 #define NIX_TX_OFFLOAD_MBUF_NOFF_F    BIT(3)
14 #define NIX_TX_OFFLOAD_TSO_F          BIT(4)
15 #define NIX_TX_OFFLOAD_TSTAMP_F       BIT(5)
16
17 /* Flags to control the xmit_prepare function.
18  * Defined from the MSB end to denote that they are
19  * not used as offload flags to pick a function.
20  */
21 #define NIX_TX_VWQE_F      BIT(14)
22 #define NIX_TX_MULTI_SEG_F BIT(15)
23
24 #define NIX_TX_NEED_SEND_HDR_W1                                                \
25         (NIX_TX_OFFLOAD_L3_L4_CSUM_F | NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |         \
26          NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F)
27
28 #define NIX_TX_NEED_EXT_HDR                                                    \
29         (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |                \
30          NIX_TX_OFFLOAD_TSO_F)
31
32 #define NIX_XMIT_FC_OR_RETURN(txq, pkts)                                       \
33         do {                                                                   \
34                 /* Cached value is low, update the fc_cache_pkts */            \
35                 if (unlikely((txq)->fc_cache_pkts < (pkts))) {                 \
36                         /* Multiply with sqe_per_sqb to express in pkts */     \
37                         (txq)->fc_cache_pkts =                                 \
38                                 ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem)      \
39                                 << (txq)->sqes_per_sqb_log2;                   \
40                         /* Check again for room */                             \
41                         if (unlikely((txq)->fc_cache_pkts < (pkts)))           \
42                                 return 0;                                      \
43                 }                                                              \
44         } while (0)
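/* Illustrative flow-control math (the numbers are hypothetical, not from the
 * driver): with nb_sqb_bufs_adj = 512, *fc_mem = 500 SQBs consumed by HW and
 * sqes_per_sqb_log2 = 5, the refreshed cache is (512 - 500) << 5 = 384 SQEs
 * worth of room, so a burst of up to 384 packets may proceed.
 */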
45
46 /* Macro converting an encoded number of segments to a number of dwords;
47  * each value of nb_segs is encoded as 4 bits.
48  */
49 #define NIX_SEGDW_MAGIC 0x76654432210ULL
50
51 #define NIX_NB_SEGS_TO_SEGDW(x) ((NIX_SEGDW_MAGIC >> ((x) << 2)) & 0xF)
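/* Illustrative decode of the nibble table above: NIX_NB_SEGS_TO_SEGDW(4) =
 * (0x76654432210ULL >> (4 << 2)) & 0xF = 3, i.e. a 4-segment packet needs
 * three 16B dwords (one SG subdesc word per 3 segments plus 4 pointer words
 * = 6 words, rounded up to 16B granularity).
 */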
52
53 #define LMT_OFF(lmt_addr, lmt_num, offset)                                     \
54         (void *)((lmt_addr) + ((lmt_num) << ROC_LMT_LINE_SIZE_LOG2) + (offset))
55
56 /* Function to determine the number of Tx subdescriptors required when the
57  * ext subdescriptor is enabled.
58  */
59 static __rte_always_inline int
60 cn10k_nix_tx_ext_subs(const uint16_t flags)
61 {
62         return (flags & NIX_TX_OFFLOAD_TSTAMP_F)
63                        ? 2
64                        : ((flags &
65                            (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSO_F))
66                                   ? 1
67                                   : 0);
68 }
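/* The returned count mirrors the HW subdescriptor needs: timestamping
 * requires both SEND_EXT and SEND_MEM (2), VLAN/QinQ insertion or TSO
 * requires SEND_EXT only (1), and everything else fits in SEND_HDR alone (0).
 */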
69
70 static __rte_always_inline uint8_t
71 cn10k_nix_pkts_per_vec_brst(const uint16_t flags)
72 {
73         return ((flags & NIX_TX_NEED_EXT_HDR) ? 2 : 4)
74                << ROC_LMT_LINES_PER_CORE_LOG2;
75 }
76
77 static __rte_always_inline uint8_t
78 cn10k_nix_tx_dwords_per_line(const uint16_t flags)
79 {
80         return (flags & NIX_TX_NEED_EXT_HDR) ?
81                              ((flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6) :
82                              8;
83 }
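/* Note: "dwords" above are 16B units of a 128B LMT line, so a line carries
 * either four 32B commands (8 units, no ext header), two 48B commands
 * (6 units, ext header) or two 64B commands (8 units, ext + timestamp).
 */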
84
85 static __rte_always_inline uint64_t
86 cn10k_nix_tx_steor_data(const uint16_t flags)
87 {
88         const uint64_t dw_m1 = cn10k_nix_tx_ext_subs(flags) + 1;
89         uint64_t data;
90
91         /* This will be moved to addr area */
92         data = dw_m1;
93         /* 15 vector sizes for single seg */
94         data |= dw_m1 << 19;
95         data |= dw_m1 << 22;
96         data |= dw_m1 << 25;
97         data |= dw_m1 << 28;
98         data |= dw_m1 << 31;
99         data |= dw_m1 << 34;
100         data |= dw_m1 << 37;
101         data |= dw_m1 << 40;
102         data |= dw_m1 << 43;
103         data |= dw_m1 << 46;
104         data |= dw_m1 << 49;
105         data |= dw_m1 << 52;
106         data |= dw_m1 << 55;
107         data |= dw_m1 << 58;
108         data |= dw_m1 << 61;
109
110         return data;
111 }
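/* Layout of the STEOR data word as consumed by the submit paths below
 * (derived from the usage in this file): bits [11:0] hold the LMT ID,
 * bits [15:12] the number of LMT lines minus one, and the 3-bit fields from
 * bit 19 onwards the size (in 16B units, minus one) of lines 1..15. The
 * size of line 0 travels in bits [6:4] of the I/O address instead, which is
 * why the low bits are masked out of 'data' before submission.
 */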
112
113 static __rte_always_inline uint8_t
114 cn10k_nix_tx_dwords_per_line_seg(const uint16_t flags)
115 {
116         return ((flags & NIX_TX_NEED_EXT_HDR) ?
117                               (flags & NIX_TX_OFFLOAD_TSTAMP_F) ? 8 : 6 :
118                               4);
119 }
120
121 static __rte_always_inline uint64_t
122 cn10k_nix_tx_steor_vec_data(const uint16_t flags)
123 {
124         const uint64_t dw_m1 = cn10k_nix_tx_dwords_per_line(flags) - 1;
125         uint64_t data;
126
127         /* This will be moved to addr area */
128         data = dw_m1;
129         /* 15 vector sizes for single seg */
130         data |= dw_m1 << 19;
131         data |= dw_m1 << 22;
132         data |= dw_m1 << 25;
133         data |= dw_m1 << 28;
134         data |= dw_m1 << 31;
135         data |= dw_m1 << 34;
136         data |= dw_m1 << 37;
137         data |= dw_m1 << 40;
138         data |= dw_m1 << 43;
139         data |= dw_m1 << 46;
140         data |= dw_m1 << 49;
141         data |= dw_m1 << 52;
142         data |= dw_m1 << 55;
143         data |= dw_m1 << 58;
144         data |= dw_m1 << 61;
145
146         return data;
147 }
148
149 static __rte_always_inline void
150 cn10k_nix_tx_skeleton(const struct cn10k_eth_txq *txq, uint64_t *cmd,
151                       const uint16_t flags)
152 {
153         /* Send hdr */
154         cmd[0] = txq->send_hdr_w0;
155         cmd[1] = 0;
156         cmd += 2;
157
158         /* Send ext if present */
159         if (flags & NIX_TX_NEED_EXT_HDR) {
160                 *(__uint128_t *)cmd = *(const __uint128_t *)txq->cmd;
161                 cmd += 2;
162         }
163
164         /* Send sg */
165         cmd[0] = txq->sg_w0;
166         cmd[1] = 0;
167 }
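/* A sketch of what the skeleton leaves behind in 'cmd' (each word is 8B):
 *   cmd[0..1]  NIX_SEND_HDR_S (w0 preset from txq, w1 cleared)
 *   cmd[2..3]  NIX_SEND_EXT_S (only when NIX_TX_NEED_EXT_HDR is set)
 *   last two   NIX_SEND_SG_S  (w0 preset from txq, w1 cleared)
 */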
168
169 static __rte_always_inline void
170 cn10k_nix_xmit_prepare_tso(struct rte_mbuf *m, const uint64_t flags)
171 {
172         uint64_t mask, ol_flags = m->ol_flags;
173
174         if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
175                 uintptr_t mdata = rte_pktmbuf_mtod(m, uintptr_t);
176                 uint16_t *iplen, *oiplen, *oudplen;
177                 uint16_t lso_sb, paylen;
178
179                 mask = -!!(ol_flags & (RTE_MBUF_F_TX_OUTER_IPV4 | RTE_MBUF_F_TX_OUTER_IPV6));
180                 lso_sb = (mask & (m->outer_l2_len + m->outer_l3_len)) +
181                          m->l2_len + m->l3_len + m->l4_len;
182
183                 /* Reduce payload len from base headers */
184                 paylen = m->pkt_len - lso_sb;
185
186                 /* Get iplen position assuming no tunnel hdr */
187                 iplen = (uint16_t *)(mdata + m->l2_len +
188                                      (2 << !!(ol_flags & RTE_MBUF_F_TX_IPV6)));
189                 /* Handle tunnel tso */
190                 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
191                     (ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)) {
192                         const uint8_t is_udp_tun =
193                                 (CNXK_NIX_UDP_TUN_BITMASK >>
194                                  ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) >> 45)) &
195                                 0x1;
196
197                         oiplen = (uint16_t *)(mdata + m->outer_l2_len +
198                                               (2 << !!(ol_flags &
199                                                        RTE_MBUF_F_TX_OUTER_IPV6)));
200                         *oiplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*oiplen) -
201                                                    paylen);
202
203                         /* Update format for UDP tunneled packet */
204                         if (is_udp_tun) {
205                                 oudplen = (uint16_t *)(mdata + m->outer_l2_len +
206                                                        m->outer_l3_len + 4);
207                                 *oudplen = rte_cpu_to_be_16(
208                                         rte_be_to_cpu_16(*oudplen) - paylen);
209                         }
210
211                         /* Update iplen position to inner ip hdr */
212                         iplen = (uint16_t *)(mdata + lso_sb - m->l3_len -
213                                              m->l4_len +
214                                              (2 << !!(ol_flags & RTE_MBUF_F_TX_IPV6)));
215                 }
216
217                 *iplen = rte_cpu_to_be_16(rte_be_to_cpu_16(*iplen) - paylen);
218         }
219 }
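/* Worked example (hypothetical values): plain TCPv4 TSO with l2_len = 14,
 * l3_len = 20 and l4_len = 20 gives lso_sb = 54 and paylen = pkt_len - 54.
 * iplen then points at mdata + 14 + 2, the IPv4 total length field, which
 * is reduced by paylen so that HW emits segments with consistent lengths.
 */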
220
221 static __rte_always_inline void
222 cn10k_nix_xmit_prepare(struct rte_mbuf *m, uint64_t *cmd, uintptr_t lmt_addr,
223                        const uint16_t flags, const uint64_t lso_tun_fmt)
224 {
225         struct nix_send_ext_s *send_hdr_ext;
226         struct nix_send_hdr_s *send_hdr;
227         uint64_t ol_flags = 0, mask;
228         union nix_send_hdr_w1_u w1;
229         union nix_send_sg_s *sg;
230
231         send_hdr = (struct nix_send_hdr_s *)cmd;
232         if (flags & NIX_TX_NEED_EXT_HDR) {
233                 send_hdr_ext = (struct nix_send_ext_s *)(cmd + 2);
234                 sg = (union nix_send_sg_s *)(cmd + 4);
235                 /* Clear previous markings */
236                 send_hdr_ext->w0.lso = 0;
237                 send_hdr_ext->w1.u = 0;
238         } else {
239                 sg = (union nix_send_sg_s *)(cmd + 2);
240         }
241
242         if (flags & NIX_TX_NEED_SEND_HDR_W1) {
243                 ol_flags = m->ol_flags;
244                 w1.u = 0;
245         }
246
247         if (!(flags & NIX_TX_MULTI_SEG_F)) {
248                 send_hdr->w0.total = m->data_len;
249                 send_hdr->w0.aura =
250                         roc_npa_aura_handle_to_aura(m->pool->pool_id);
251         }
252
253         /*
254          * L3type:  2 => IPV4
255          *          3 => IPV4 with csum
256          *          4 => IPV6
257  * L3type and L3ptr need to be set for
258  * L3 csum, L4 csum, or LSO
259          *
260          */
261
262         if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
263             (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
264                 const uint8_t csum = !!(ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM);
265                 const uint8_t ol3type =
266                         ((!!(ol_flags & RTE_MBUF_F_TX_OUTER_IPV4)) << 1) +
267                         ((!!(ol_flags & RTE_MBUF_F_TX_OUTER_IPV6)) << 2) +
268                         !!(ol_flags & RTE_MBUF_F_TX_OUTER_IP_CKSUM);
269
270                 /* Outer L3 */
271                 w1.ol3type = ol3type;
272                 mask = 0xffffull << ((!!ol3type) << 4);
273                 w1.ol3ptr = ~mask & m->outer_l2_len;
274                 w1.ol4ptr = ~mask & (w1.ol3ptr + m->outer_l3_len);
275
276                 /* Outer L4 */
277                 w1.ol4type = csum + (csum << 1);
278
279                 /* Inner L3 */
280                 w1.il3type = ((!!(ol_flags & RTE_MBUF_F_TX_IPV4)) << 1) +
281                              ((!!(ol_flags & RTE_MBUF_F_TX_IPV6)) << 2);
282                 w1.il3ptr = w1.ol4ptr + m->l2_len;
283                 w1.il4ptr = w1.il3ptr + m->l3_len;
284                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
285                 w1.il3type = w1.il3type + !!(ol_flags & RTE_MBUF_F_TX_IP_CKSUM);
286
287                 /* Inner L4 */
288                 w1.il4type = (ol_flags & RTE_MBUF_F_TX_L4_MASK) >> 52;
289
290                 /* When there is no tunnel header, shift the
291                  * IL3/IL4 fields down so that OL3/OL4 are
292                  * used for the header checksum
293                  */
294                 mask = !ol3type;
295                 w1.u = ((w1.u & 0xFFFFFFFF00000000) >> (mask << 3)) |
296                        ((w1.u & 0X00000000FFFFFFFF) >> (mask << 4));
297
298         } else if (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) {
299                 const uint8_t csum = !!(ol_flags & RTE_MBUF_F_TX_OUTER_UDP_CKSUM);
300                 const uint8_t outer_l2_len = m->outer_l2_len;
301
302                 /* Outer L3 */
303                 w1.ol3ptr = outer_l2_len;
304                 w1.ol4ptr = outer_l2_len + m->outer_l3_len;
305                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
306                 w1.ol3type = ((!!(ol_flags & RTE_MBUF_F_TX_OUTER_IPV4)) << 1) +
307                              ((!!(ol_flags & RTE_MBUF_F_TX_OUTER_IPV6)) << 2) +
308                              !!(ol_flags & RTE_MBUF_F_TX_OUTER_IP_CKSUM);
309
310                 /* Outer L4 */
311                 w1.ol4type = csum + (csum << 1);
312
313         } else if (flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) {
314                 const uint8_t l2_len = m->l2_len;
315
316                 /* Always use OLXPTR and OLXTYPE when
317                  * only one header is present
318                  */
319
320                 /* Inner L3 */
321                 w1.ol3ptr = l2_len;
322                 w1.ol4ptr = l2_len + m->l3_len;
323                 /* Increment it by 1 if it is IPV4 as 3 is with csum */
324                 w1.ol3type = ((!!(ol_flags & RTE_MBUF_F_TX_IPV4)) << 1) +
325                              ((!!(ol_flags & RTE_MBUF_F_TX_IPV6)) << 2) +
326                              !!(ol_flags & RTE_MBUF_F_TX_IP_CKSUM);
327
328                 /* Inner L4 */
329                 w1.ol4type = (ol_flags & RTE_MBUF_F_TX_L4_MASK) >> 52;
330         }
331
332         if (flags & NIX_TX_NEED_EXT_HDR && flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
333                 send_hdr_ext->w1.vlan1_ins_ena = !!(ol_flags & RTE_MBUF_F_TX_VLAN);
334                 /* HW will update ptr after vlan0 update */
335                 send_hdr_ext->w1.vlan1_ins_ptr = 12;
336                 send_hdr_ext->w1.vlan1_ins_tci = m->vlan_tci;
337
338                 send_hdr_ext->w1.vlan0_ins_ena = !!(ol_flags & RTE_MBUF_F_TX_QINQ);
339                 /* 2B before end of l2 header */
340                 send_hdr_ext->w1.vlan0_ins_ptr = 12;
341                 send_hdr_ext->w1.vlan0_ins_tci = m->vlan_tci_outer;
342         }
343
344         if (flags & NIX_TX_OFFLOAD_TSO_F && (ol_flags & RTE_MBUF_F_TX_TCP_SEG)) {
345                 uint16_t lso_sb;
346                 uint64_t mask;
347
348                 mask = -(!w1.il3type);
349                 lso_sb = (mask & w1.ol4ptr) + (~mask & w1.il4ptr) + m->l4_len;
350
351                 send_hdr_ext->w0.lso_sb = lso_sb;
352                 send_hdr_ext->w0.lso = 1;
353                 send_hdr_ext->w0.lso_mps = m->tso_segsz;
354                 send_hdr_ext->w0.lso_format =
355                         NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & RTE_MBUF_F_TX_IPV6);
356                 w1.ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
357
358                 /* Handle tunnel tso */
359                 if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
360                     (ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)) {
361                         const uint8_t is_udp_tun =
362                                 (CNXK_NIX_UDP_TUN_BITMASK >>
363                                  ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) >> 45)) &
364                                 0x1;
365                         uint8_t shift = is_udp_tun ? 32 : 0;
366
367                         shift += (!!(ol_flags & RTE_MBUF_F_TX_OUTER_IPV6) << 4);
368                         shift += (!!(ol_flags & RTE_MBUF_F_TX_IPV6) << 3);
369
370                         w1.il4type = NIX_SENDL4TYPE_TCP_CKSUM;
371                         w1.ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
372                         /* Update format for UDP tunneled packet */
373                         send_hdr_ext->w0.lso_format = (lso_tun_fmt >> shift);
374                 }
375         }
376
377         if (flags & NIX_TX_NEED_SEND_HDR_W1)
378                 send_hdr->w1.u = w1.u;
379
380         if (!(flags & NIX_TX_MULTI_SEG_F)) {
381                 sg->seg1_size = m->data_len;
382                 *(rte_iova_t *)(sg + 1) = rte_mbuf_data_iova(m);
383
384                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
385                         /* DF bit = 1 if refcount of current mbuf or parent mbuf
386                          *              is greater than 1
387                          * DF bit = 0 otherwise
388                          */
389                         send_hdr->w0.df = cnxk_nix_prefree_seg(m);
390                 }
391                 /* Mark mempool object as "put" since it is freed by NIX */
392                 if (!send_hdr->w0.df)
393                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
394         }
395
396         /* With minimal offloads, 'cmd' being local could be optimized out to
397          * registers. In other cases, 'cmd' will be on the stack. The intent is
398          * that 'cmd' stores content from txq->cmd, which is copied only once.
399          */
400         *((struct nix_send_hdr_s *)lmt_addr) = *send_hdr;
401         lmt_addr += 16;
402         if (flags & NIX_TX_NEED_EXT_HDR) {
403                 *((struct nix_send_ext_s *)lmt_addr) = *send_hdr_ext;
404                 lmt_addr += 16;
405         }
406         /* In case of multi-seg, sg template is stored here */
407         *((union nix_send_sg_s *)lmt_addr) = *sg;
408         *(rte_iova_t *)(lmt_addr + 8) = *(rte_iova_t *)(sg + 1);
409 }
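/* On return the LMT line holds the complete fast-path command: SEND_HDR at
 * offset 0, SEND_EXT at offset 16 when NIX_TX_NEED_EXT_HDR is set, then the
 * SG subdesc with the data iova, i.e. the 32B/48B commands sized by
 * cn10k_nix_tx_ext_subs() above.
 */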
410
411 static __rte_always_inline void
412 cn10k_nix_xmit_prepare_tstamp(uintptr_t lmt_addr, const uint64_t *cmd,
413                               const uint64_t ol_flags, const uint16_t no_segdw,
414                               const uint16_t flags)
415 {
416         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
417                 const uint8_t is_ol_tstamp = !(ol_flags & RTE_MBUF_F_TX_IEEE1588_TMST);
418                 struct nix_send_ext_s *send_hdr_ext =
419                         (struct nix_send_ext_s *)(lmt_addr + 16);
420                 uint64_t *lmt = (uint64_t *)lmt_addr;
421                 uint16_t off = (no_segdw - 1) << 1;
422                 struct nix_send_mem_s *send_mem;
423
424                 send_mem = (struct nix_send_mem_s *)(lmt + off);
425                 send_hdr_ext->w0.subdc = NIX_SUBDC_EXT;
426                 send_hdr_ext->w0.tstmp = 1;
427                 if (flags & NIX_TX_MULTI_SEG_F) {
428                         /* Retrieving the default desc values */
429                         lmt[off] = cmd[2];
430
431                         /* Using a compiler barrier to avoid violation of
432                          * C aliasing rules.
433                          */
434                         rte_compiler_barrier();
435                 }
436
437                 /* For packets without RTE_MBUF_F_TX_IEEE1588_TMST, the Tx
438                  * tstamp should not be recorded; hence change the alg type to
439                  * NIX_SENDMEMALG_SET and move the send mem addr field to the
440                  * next 8 bytes so that the registered Tx tstamp address is
441                  * not corrupted.
442                  */
443                 send_mem->w0.subdc = NIX_SUBDC_MEM;
444                 send_mem->w0.alg = NIX_SENDMEMALG_SETTSTMP - (is_ol_tstamp);
445                 send_mem->addr =
446                         (rte_iova_t)(((uint64_t *)cmd[3]) + is_ol_tstamp);
447         }
448 }
449
450 static __rte_always_inline uint16_t
451 cn10k_nix_prepare_mseg(struct rte_mbuf *m, uint64_t *cmd, const uint16_t flags)
452 {
453         struct nix_send_hdr_s *send_hdr;
454         union nix_send_sg_s *sg;
455         struct rte_mbuf *m_next;
456         uint64_t *slist, sg_u;
457         uint64_t nb_segs;
458         uint64_t segdw;
459         uint8_t off, i;
460
461         send_hdr = (struct nix_send_hdr_s *)cmd;
462         send_hdr->w0.total = m->pkt_len;
463         send_hdr->w0.aura = roc_npa_aura_handle_to_aura(m->pool->pool_id);
464
465         if (flags & NIX_TX_NEED_EXT_HDR)
466                 off = 2;
467         else
468                 off = 0;
469
470         sg = (union nix_send_sg_s *)&cmd[2 + off];
471         /* Clear sg->u header before use */
472         sg->u &= 0xFC00000000000000;
473         sg_u = sg->u;
474         slist = &cmd[3 + off];
475
476         i = 0;
477         nb_segs = m->nb_segs;
478
479         /* Fill mbuf segments */
480         do {
481                 m_next = m->next;
482                 sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
483                 *slist = rte_mbuf_data_iova(m);
484                 /* Set invert df if buffer is not to be freed by H/W */
485                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
486                         sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
487                         /* Mark mempool object as "put" since it is freed by NIX
488                          */
489 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
490                 if (!(sg_u & (1ULL << (i + 55))))
491                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
492 #endif
493                 slist++;
494                 i++;
495                 nb_segs--;
496                 if (i > 2 && nb_segs) {
497                         i = 0;
498                         /* Next SG subdesc */
499                         *(uint64_t *)slist = sg_u & 0xFC00000000000000;
500                         sg->u = sg_u;
501                         sg->segs = 3;
502                         sg = (union nix_send_sg_s *)slist;
503                         sg_u = sg->u;
504                         slist++;
505                 }
506                 m = m_next;
507         } while (nb_segs);
508
509         sg->u = sg_u;
510         sg->segs = i;
511         segdw = (uint64_t *)slist - (uint64_t *)&cmd[2 + off];
512         /* Round up extra dwords to a multiple of 2 */
513         segdw = (segdw >> 1) + (segdw & 0x1);
514         /* Default dwords */
515         segdw += (off >> 1) + 1 + !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
516         send_hdr->w0.sizem1 = segdw - 1;
517
518         return segdw;
519 }
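/* Worked example (hypothetical packet): 3 segments and no ext header
 * (off = 0) produce one SG word plus three pointers = 4 words, so
 * segdw = (4 >> 1) + (4 & 1) = 2, plus one default dword for SEND_HDR,
 * i.e. segdw = 3 and sizem1 = 2 (a 48B command).
 */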
520
521 static __rte_always_inline uint16_t
522 cn10k_nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts,
523                     uint64_t *cmd, uintptr_t base, const uint16_t flags)
524 {
525         struct cn10k_eth_txq *txq = tx_queue;
526         const rte_iova_t io_addr = txq->io_addr;
527         uintptr_t pa, lmt_addr = txq->lmt_base;
528         uint16_t lmt_id, burst, left, i;
529         uint64_t lso_tun_fmt;
530         uint64_t data;
531
532         if (!(flags & NIX_TX_VWQE_F)) {
533                 NIX_XMIT_FC_OR_RETURN(txq, pkts);
534                 /* Reduce the cached count */
535                 txq->fc_cache_pkts -= pkts;
536         }
537
538         /* Get cmd skeleton */
539         cn10k_nix_tx_skeleton(txq, cmd, flags);
540
541         if (flags & NIX_TX_OFFLOAD_TSO_F)
542                 lso_tun_fmt = txq->lso_tun_fmt;
543
544         /* Get LMT base address and LMT ID as lcore id */
545         ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
546         left = pkts;
547 again:
548         burst = left > 32 ? 32 : left;
549         for (i = 0; i < burst; i++) {
550                 /* Perform header writes for TSO; the barrier at
551                  * LMT steorl will suffice.
552                  */
553                 if (flags & NIX_TX_OFFLOAD_TSO_F)
554                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
555
556                 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
557                                        lso_tun_fmt);
558                 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
559                                               tx_pkts[i]->ol_flags, 4, flags);
560                 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
561         }
562
563         if (flags & NIX_TX_VWQE_F)
564                 roc_sso_hws_head_wait(base);
565
566         /* Trigger LMTST */
567         if (burst > 16) {
568                 data = cn10k_nix_tx_steor_data(flags);
569                 pa = io_addr | (data & 0x7) << 4;
570                 data &= ~0x7ULL;
571                 data |= (15ULL << 12);
572                 data |= (uint64_t)lmt_id;
573
574                 /* STEOR0 */
575                 roc_lmt_submit_steorl(data, pa);
576
577                 data = cn10k_nix_tx_steor_data(flags);
578                 pa = io_addr | (data & 0x7) << 4;
579                 data &= ~0x7ULL;
580                 data |= ((uint64_t)(burst - 17)) << 12;
581                 data |= (uint64_t)(lmt_id + 16);
582
583                 /* STEOR1 */
584                 roc_lmt_submit_steorl(data, pa);
585         } else if (burst) {
586                 data = cn10k_nix_tx_steor_data(flags);
587                 pa = io_addr | (data & 0x7) << 4;
588                 data &= ~0x7ULL;
589                 data |= ((uint64_t)(burst - 1)) << 12;
590                 data |= lmt_id;
591
592                 /* STEOR0 */
593                 roc_lmt_submit_steorl(data, pa);
594         }
595
596         left -= burst;
597         rte_io_wmb();
598         if (left) {
599                 /* Start processing another burst */
600                 tx_pkts += burst;
601                 /* Reset lmt base addr */
602                 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
603                 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
604                 goto again;
605         }
606
607         return pkts;
608 }
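/* A hypothetical compile-time specialization in the spirit of the wrappers
 * generated from this header (the wrapper name here is illustrative only):
 *
 * static uint16_t
 * xmit_pkts_csum(void *txq, struct rte_mbuf **pkts, uint16_t nb_pkts)
 * {
 *         uint64_t cmd[8];
 *
 *         return cn10k_nix_xmit_pkts(txq, pkts, nb_pkts, cmd, 0,
 *                                    NIX_TX_OFFLOAD_L3_L4_CSUM_F);
 * }
 */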
609
610 static __rte_always_inline uint16_t
611 cn10k_nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
612                          uint16_t pkts, uint64_t *cmd, uintptr_t base,
613                          const uint16_t flags)
614 {
615         struct cn10k_eth_txq *txq = tx_queue;
616         uintptr_t pa0, pa1, lmt_addr = txq->lmt_base;
617         const rte_iova_t io_addr = txq->io_addr;
618         uint16_t segdw, lmt_id, burst, left, i;
619         uint64_t data0, data1;
620         uint64_t lso_tun_fmt;
621         __uint128_t data128;
622         uint16_t shft;
623
624         NIX_XMIT_FC_OR_RETURN(txq, pkts);
625
626         cn10k_nix_tx_skeleton(txq, cmd, flags);
627
628         /* Reduce the cached count */
629         txq->fc_cache_pkts -= pkts;
630
631         if (flags & NIX_TX_OFFLOAD_TSO_F)
632                 lso_tun_fmt = txq->lso_tun_fmt;
633
634         /* Get LMT base address and LMT ID as lcore id */
635         ROC_LMT_BASE_ID_GET(lmt_addr, lmt_id);
636         left = pkts;
637 again:
638         burst = left > 32 ? 32 : left;
639         shft = 16;
640         data128 = 0;
641         for (i = 0; i < burst; i++) {
642                 /* Perform header writes for TSO; the barrier at
643                  * LMT steorl will suffice.
644                  */
645                 if (flags & NIX_TX_OFFLOAD_TSO_F)
646                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
647
648                 cn10k_nix_xmit_prepare(tx_pkts[i], cmd, lmt_addr, flags,
649                                        lso_tun_fmt);
650                 /* Store sg list directly on lmt line */
651                 segdw = cn10k_nix_prepare_mseg(tx_pkts[i], (uint64_t *)lmt_addr,
652                                                flags);
653                 cn10k_nix_xmit_prepare_tstamp(lmt_addr, &txq->cmd[0],
654                                               tx_pkts[i]->ol_flags, segdw,
655                                               flags);
656                 lmt_addr += (1ULL << ROC_LMT_LINE_SIZE_LOG2);
657                 data128 |= (((__uint128_t)(segdw - 1)) << shft);
658                 shft += 3;
659         }
660
661         if (flags & NIX_TX_VWQE_F)
662                 roc_sso_hws_head_wait(base);
663
664         data0 = (uint64_t)data128;
665         data1 = (uint64_t)(data128 >> 64);
666         /* Make data0 similar to data1 */
667         data0 >>= 16;
668         /* Trigger LMTST */
669         if (burst > 16) {
670                 pa0 = io_addr | (data0 & 0x7) << 4;
671                 data0 &= ~0x7ULL;
672                 /* Move lmtst1..15 sz to bits 63:19 */
673                 data0 <<= 16;
674                 data0 |= (15ULL << 12);
675                 data0 |= (uint64_t)lmt_id;
676
677                 /* STEOR0 */
678                 roc_lmt_submit_steorl(data0, pa0);
679
680                 pa1 = io_addr | (data1 & 0x7) << 4;
681                 data1 &= ~0x7ULL;
682                 data1 <<= 16;
683                 data1 |= ((uint64_t)(burst - 17)) << 12;
684                 data1 |= (uint64_t)(lmt_id + 16);
685
686                 /* STEOR1 */
687                 roc_lmt_submit_steorl(data1, pa1);
688         } else if (burst) {
689                 pa0 = io_addr | (data0 & 0x7) << 4;
690                 data0 &= ~0x7ULL;
691                 /* Move lmtst1..15 sz to bits 63:19 */
692                 data0 <<= 16;
693                 data0 |= ((burst - 1) << 12);
694                 data0 |= (uint64_t)lmt_id;
695
696                 /* STEOR0 */
697                 roc_lmt_submit_steorl(data0, pa0);
698         }
699
700         left -= burst;
701         rte_io_wmb();
702         if (left) {
703                 /* Start processing another burst */
704                 tx_pkts += burst;
705                 /* Reset lmt base addr */
706                 lmt_addr -= (1ULL << ROC_LMT_LINE_SIZE_LOG2);
707                 lmt_addr &= (~(BIT_ULL(ROC_LMT_BASE_PER_CORE_LOG2) - 1));
708                 goto again;
709         }
710
711         return pkts;
712 }
713
714 #if defined(RTE_ARCH_ARM64)
715
716 static __rte_always_inline void
717 cn10k_nix_prepare_tso(struct rte_mbuf *m, union nix_send_hdr_w1_u *w1,
718                       union nix_send_ext_w0_u *w0, uint64_t ol_flags,
719                       const uint64_t flags, const uint64_t lso_tun_fmt)
720 {
721         uint16_t lso_sb;
722         uint64_t mask;
723
724         if (!(ol_flags & RTE_MBUF_F_TX_TCP_SEG))
725                 return;
726
727         mask = -(!w1->il3type);
728         lso_sb = (mask & w1->ol4ptr) + (~mask & w1->il4ptr) + m->l4_len;
729
730         w0->u |= BIT(14);
731         w0->lso_sb = lso_sb;
732         w0->lso_mps = m->tso_segsz;
733         w0->lso_format = NIX_LSO_FORMAT_IDX_TSOV4 + !!(ol_flags & RTE_MBUF_F_TX_IPV6);
734         w1->ol4type = NIX_SENDL4TYPE_TCP_CKSUM;
735
736         /* Handle tunnel tso */
737         if ((flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F) &&
738             (ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK)) {
739                 const uint8_t is_udp_tun =
740                         (CNXK_NIX_UDP_TUN_BITMASK >>
741                          ((ol_flags & RTE_MBUF_F_TX_TUNNEL_MASK) >> 45)) &
742                         0x1;
743                 uint8_t shift = is_udp_tun ? 32 : 0;
744
745                 shift += (!!(ol_flags & RTE_MBUF_F_TX_OUTER_IPV6) << 4);
746                 shift += (!!(ol_flags & RTE_MBUF_F_TX_IPV6) << 3);
747
748                 w1->il4type = NIX_SENDL4TYPE_TCP_CKSUM;
749                 w1->ol4type = is_udp_tun ? NIX_SENDL4TYPE_UDP_CKSUM : 0;
750
751                 /* Update format for UDP tunneled packet */
752                 w0->lso_format = (lso_tun_fmt >> shift);
753         }
754 }
755
756 static __rte_always_inline void
757 cn10k_nix_prepare_mseg_vec_list(struct rte_mbuf *m, uint64_t *cmd,
758                                 union nix_send_hdr_w0_u *sh,
759                                 union nix_send_sg_s *sg, const uint32_t flags)
760 {
761         struct rte_mbuf *m_next;
762         uint64_t *slist, sg_u;
763         uint16_t nb_segs;
764         int i = 1;
765
766         sh->total = m->pkt_len;
767         /* Clear sg->u header before use */
768         sg->u &= 0xFC00000000000000;
769         sg_u = sg->u;
770         slist = &cmd[0];
771
772         sg_u = sg_u | ((uint64_t)m->data_len);
773
774         nb_segs = m->nb_segs - 1;
775         m_next = m->next;
776
777         /* Set invert df if buffer is not to be freed by H/W */
778         if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
779                 sg_u |= (cnxk_nix_prefree_seg(m) << 55);
780                 /* Mark mempool object as "put" since it is freed by NIX */
781 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
782         if (!(sg_u & (1ULL << 55)))
783                 __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
784         rte_io_wmb();
785 #endif
786
787         m = m_next;
788         /* Fill mbuf segments */
789         do {
790                 m_next = m->next;
791                 sg_u = sg_u | ((uint64_t)m->data_len << (i << 4));
792                 *slist = rte_mbuf_data_iova(m);
793                 /* Set invert df if buffer is not to be freed by H/W */
794                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)
795                         sg_u |= (cnxk_nix_prefree_seg(m) << (i + 55));
796                         /* Mark mempool object as "put" since it is freed by NIX
797                          */
798 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
799                 if (!(sg_u & (1ULL << (i + 55))))
800                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
801                 rte_io_wmb();
802 #endif
803                 slist++;
804                 i++;
805                 nb_segs--;
806                 if (i > 2 && nb_segs) {
807                         i = 0;
808                         /* Next SG subdesc */
809                         *(uint64_t *)slist = sg_u & 0xFC00000000000000;
810                         sg->u = sg_u;
811                         sg->segs = 3;
812                         sg = (union nix_send_sg_s *)slist;
813                         sg_u = sg->u;
814                         slist++;
815                 }
816                 m = m_next;
817         } while (nb_segs);
818
819         sg->u = sg_u;
820         sg->segs = i;
821 }
822
823 static __rte_always_inline void
824 cn10k_nix_prepare_mseg_vec(struct rte_mbuf *m, uint64_t *cmd, uint64x2_t *cmd0,
825                            uint64x2_t *cmd1, const uint8_t segdw,
826                            const uint32_t flags)
827 {
828         union nix_send_hdr_w0_u sh;
829         union nix_send_sg_s sg;
830
831         if (m->nb_segs == 1) {
832                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
833                         sg.u = vgetq_lane_u64(cmd1[0], 0);
834                         sg.u |= (cnxk_nix_prefree_seg(m) << 55);
835                         cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
836                 }
837
838 #ifdef RTE_LIBRTE_MEMPOOL_DEBUG
839                 sg.u = vgetq_lane_u64(cmd1[0], 0);
840                 if (!(sg.u & (1ULL << 55)))
841                         __mempool_check_cookies(m->pool, (void **)&m, 1, 0);
842                 rte_io_wmb();
843 #endif
844                 return;
845         }
846
847         sh.u = vgetq_lane_u64(cmd0[0], 0);
848         sg.u = vgetq_lane_u64(cmd1[0], 0);
849
850         cn10k_nix_prepare_mseg_vec_list(m, cmd, &sh, &sg, flags);
851
852         sh.sizem1 = segdw - 1;
853         cmd0[0] = vsetq_lane_u64(sh.u, cmd0[0], 0);
854         cmd1[0] = vsetq_lane_u64(sg.u, cmd1[0], 0);
855 }
856
857 #define NIX_DESCS_PER_LOOP 4
858
859 static __rte_always_inline uint8_t
860 cn10k_nix_prep_lmt_mseg_vector(struct rte_mbuf **mbufs, uint64x2_t *cmd0,
861                                uint64x2_t *cmd1, uint64x2_t *cmd2,
862                                uint64x2_t *cmd3, uint8_t *segdw,
863                                uint64_t *lmt_addr, __uint128_t *data128,
864                                uint8_t *shift, const uint16_t flags)
865 {
866         uint8_t j, off, lmt_used;
867
868         if (!(flags & NIX_TX_NEED_EXT_HDR) &&
869             !(flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
870                 /* None of the next 4 packets are multi-segment. */
871                 if ((segdw[0] + segdw[1] + segdw[2] + segdw[3]) <= 8) {
872                         for (j = 0; j < NIX_DESCS_PER_LOOP; j++)
873                                 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
874                                                            &cmd0[j], &cmd1[j],
875                                                            segdw[j], flags);
876                         vst1q_u64(lmt_addr, cmd0[0]);
877                         vst1q_u64(lmt_addr + 2, cmd1[0]);
878                         vst1q_u64(lmt_addr + 4, cmd0[1]);
879                         vst1q_u64(lmt_addr + 6, cmd1[1]);
880                         vst1q_u64(lmt_addr + 8, cmd0[2]);
881                         vst1q_u64(lmt_addr + 10, cmd1[2]);
882                         vst1q_u64(lmt_addr + 12, cmd0[3]);
883                         vst1q_u64(lmt_addr + 14, cmd1[3]);
884
885                         *data128 |= ((__uint128_t)7) << *shift;
886                         *shift += 3;
887
888                         return 1;
889                 }
890         }
891
892         lmt_used = 0;
893         for (j = 0; j < NIX_DESCS_PER_LOOP;) {
894                 /* Fit consecutive packets in same LMTLINE. */
895                 if ((segdw[j] + segdw[j + 1]) <= 8) {
896                         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
897                                 cn10k_nix_prepare_mseg_vec(mbufs[j], NULL,
898                                                            &cmd0[j], &cmd1[j],
899                                                            segdw[j], flags);
900                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1], NULL,
901                                                            &cmd0[j + 1],
902                                                            &cmd1[j + 1],
903                                                            segdw[j + 1], flags);
904                                 /* TSTAMP takes 4 each, no segs. */
905                                 vst1q_u64(lmt_addr, cmd0[j]);
906                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
907                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
908                                 vst1q_u64(lmt_addr + 6, cmd3[j]);
909
910                                 vst1q_u64(lmt_addr + 8, cmd0[j + 1]);
911                                 vst1q_u64(lmt_addr + 10, cmd2[j + 1]);
912                                 vst1q_u64(lmt_addr + 12, cmd1[j + 1]);
913                                 vst1q_u64(lmt_addr + 14, cmd3[j + 1]);
914                         } else if (flags & NIX_TX_NEED_EXT_HDR) {
915                                 /* EXT header takes 3 each, space for 2 segs. */
916                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
917                                                            lmt_addr + 6,
918                                                            &cmd0[j], &cmd1[j],
919                                                            segdw[j], flags);
920                                 vst1q_u64(lmt_addr, cmd0[j]);
921                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
922                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
923                                 off = segdw[j] - 3;
924                                 off <<= 1;
925                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
926                                                            lmt_addr + 12 + off,
927                                                            &cmd0[j + 1],
928                                                            &cmd1[j + 1],
929                                                            segdw[j + 1], flags);
930                                 vst1q_u64(lmt_addr + 6 + off, cmd0[j + 1]);
931                                 vst1q_u64(lmt_addr + 8 + off, cmd2[j + 1]);
932                                 vst1q_u64(lmt_addr + 10 + off, cmd1[j + 1]);
933                         } else {
934                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
935                                                            lmt_addr + 4,
936                                                            &cmd0[j], &cmd1[j],
937                                                            segdw[j], flags);
938                                 vst1q_u64(lmt_addr, cmd0[j]);
939                                 vst1q_u64(lmt_addr + 2, cmd1[j]);
940                                 off = segdw[j] - 2;
941                                 off <<= 1;
942                                 cn10k_nix_prepare_mseg_vec(mbufs[j + 1],
943                                                            lmt_addr + 8 + off,
944                                                            &cmd0[j + 1],
945                                                            &cmd1[j + 1],
946                                                            segdw[j + 1], flags);
947                                 vst1q_u64(lmt_addr + 4 + off, cmd0[j + 1]);
948                                 vst1q_u64(lmt_addr + 6 + off, cmd1[j + 1]);
949                         }
950                         *data128 |= ((__uint128_t)(segdw[j] + segdw[j + 1]) - 1)
951                                     << *shift;
952                         *shift += 3;
953                         j += 2;
954                 } else {
955                         if ((flags & NIX_TX_NEED_EXT_HDR) &&
956                             (flags & NIX_TX_OFFLOAD_TSTAMP_F)) {
957                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
958                                                            lmt_addr + 6,
959                                                            &cmd0[j], &cmd1[j],
960                                                            segdw[j], flags);
961                                 vst1q_u64(lmt_addr, cmd0[j]);
962                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
963                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
964                                 off = segdw[j] - 4;
965                                 off <<= 1;
966                                 vst1q_u64(lmt_addr + 6 + off, cmd3[j]);
967                         } else if (flags & NIX_TX_NEED_EXT_HDR) {
968                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
969                                                            lmt_addr + 6,
970                                                            &cmd0[j], &cmd1[j],
971                                                            segdw[j], flags);
972                                 vst1q_u64(lmt_addr, cmd0[j]);
973                                 vst1q_u64(lmt_addr + 2, cmd2[j]);
974                                 vst1q_u64(lmt_addr + 4, cmd1[j]);
975                         } else {
976                                 cn10k_nix_prepare_mseg_vec(mbufs[j],
977                                                            lmt_addr + 4,
978                                                            &cmd0[j], &cmd1[j],
979                                                            segdw[j], flags);
980                                 vst1q_u64(lmt_addr, cmd0[j]);
981                                 vst1q_u64(lmt_addr + 2, cmd1[j]);
982                         }
983                         *data128 |= ((__uint128_t)(segdw[j]) - 1) << *shift;
984                         *shift += 3;
985                         j++;
986                 }
987                 lmt_used++;
988                 lmt_addr += 16;
989         }
990
991         return lmt_used;
992 }
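/* Packing rule used above, given 8 16B dwords per 128B LMT line: two
 * consecutive packets share one line whenever segdw[j] + segdw[j + 1] <= 8;
 * otherwise the packet takes a line of its own. In both cases data128
 * accumulates the per-line 3-bit sizes exactly as the scalar mseg path does.
 */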
993
994 static __rte_always_inline uint16_t
995 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
996                            uint16_t pkts, uint64_t *cmd, uintptr_t base,
997                            const uint16_t flags)
998 {
999         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
1000         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
1001         uint64x2_t cmd0[NIX_DESCS_PER_LOOP], cmd1[NIX_DESCS_PER_LOOP],
1002                 cmd2[NIX_DESCS_PER_LOOP], cmd3[NIX_DESCS_PER_LOOP];
1003         uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3, pa;
1004         uint64x2_t senddesc01_w0, senddesc23_w0;
1005         uint64x2_t senddesc01_w1, senddesc23_w1;
1006         uint16_t left, scalar, burst, i, lmt_id;
1007         uint64x2_t sendext01_w0, sendext23_w0;
1008         uint64x2_t sendext01_w1, sendext23_w1;
1009         uint64x2_t sendmem01_w0, sendmem23_w0;
1010         uint64x2_t sendmem01_w1, sendmem23_w1;
1011         uint8_t segdw[NIX_DESCS_PER_LOOP + 1];
1012         uint64x2_t sgdesc01_w0, sgdesc23_w0;
1013         uint64x2_t sgdesc01_w1, sgdesc23_w1;
1014         struct cn10k_eth_txq *txq = tx_queue;
1015         uintptr_t laddr = txq->lmt_base;
1016         rte_iova_t io_addr = txq->io_addr;
1017         uint64x2_t ltypes01, ltypes23;
1018         uint64x2_t xtmp128, ytmp128;
1019         uint64x2_t xmask01, xmask23;
1020         uint8_t lnum, shift;
1021         union wdata {
1022                 __uint128_t data128;
1023                 uint64_t data[2];
1024         } wd;
1025
1026         if (!(flags & NIX_TX_VWQE_F)) {
1027                 NIX_XMIT_FC_OR_RETURN(txq, pkts);
1028                 scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
1029                 pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
1030                 /* Reduce the cached count */
1031                 txq->fc_cache_pkts -= pkts;
1032         } else {
1033                 scalar = pkts & (NIX_DESCS_PER_LOOP - 1);
1034                 pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
1035         }
1036
1037         /* Perform header writes before barrier for TSO */
1038         if (flags & NIX_TX_OFFLOAD_TSO_F) {
1039                 for (i = 0; i < pkts; i++)
1040                         cn10k_nix_xmit_prepare_tso(tx_pkts[i], flags);
1041         }
1042
1043         senddesc01_w0 = vld1q_dup_u64(&txq->send_hdr_w0);
1044         senddesc23_w0 = senddesc01_w0;
1045         senddesc01_w1 = vdupq_n_u64(0);
1046         senddesc23_w1 = senddesc01_w1;
1047         sgdesc01_w0 = vld1q_dup_u64(&txq->sg_w0);
1048         sgdesc23_w0 = sgdesc01_w0;
1049
1050         /* Load command defaults into vector variables. */
1051         if (flags & NIX_TX_NEED_EXT_HDR) {
1052                 sendext01_w0 = vld1q_dup_u64(&txq->cmd[0]);
1053                 sendext23_w0 = sendext01_w0;
1054                 sendext01_w1 = vdupq_n_u64(12 | 12U << 24);
1055                 sendext23_w1 = sendext01_w1;
1056                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1057                         sendmem01_w0 = vld1q_dup_u64(&txq->cmd[2]);
1058                         sendmem23_w0 = sendmem01_w0;
1059                         sendmem01_w1 = vld1q_dup_u64(&txq->cmd[3]);
1060                         sendmem23_w1 = sendmem01_w1;
1061                 }
1062         }
1063
1064         /* Get LMT base address and LMT ID as lcore id */
1065         ROC_LMT_BASE_ID_GET(laddr, lmt_id);
1066         left = pkts;
1067 again:
1068         /* Number of packets to prepare depends on offloads enabled. */
1069         burst = left > cn10k_nix_pkts_per_vec_brst(flags) ?
1070                               cn10k_nix_pkts_per_vec_brst(flags) :
1071                               left;
1072         if (flags & NIX_TX_MULTI_SEG_F) {
1073                 wd.data128 = 0;
1074                 shift = 16;
1075         }
1076         lnum = 0;
1077
1078         for (i = 0; i < burst; i += NIX_DESCS_PER_LOOP) {
1079                 if (flags & NIX_TX_MULTI_SEG_F) {
1080                         uint8_t j;
1081
1082                         for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1083                                 struct rte_mbuf *m = tx_pkts[j];
1084
1085                                 /* Get dwords based on nb_segs. */
1086                                 segdw[j] = NIX_NB_SEGS_TO_SEGDW(m->nb_segs);
1087                                 /* Add dwords based on offloads. */
1088                                 segdw[j] += 1 + /* SEND HDR */
1089                                             !!(flags & NIX_TX_NEED_EXT_HDR) +
1090                                             !!(flags & NIX_TX_OFFLOAD_TSTAMP_F);
1091                         }
1092
1093                         /* Check if there are enough LMTLINES for this loop */
1094                         if (lnum + 4 > 32) {
1095                                 uint8_t ldwords_con = 0, lneeded = 0;
1096                                 for (j = 0; j < NIX_DESCS_PER_LOOP; j++) {
1097                                         ldwords_con += segdw[j];
1098                                         if (ldwords_con > 8) {
1099                                                 lneeded += 1;
1100                                                 ldwords_con = segdw[j];
1101                                         }
1102                                 }
1103                                 lneeded += 1;
1104                                 if (lnum + lneeded > 32) {
1105                                         burst = i;
1106                                         break;
1107                                 }
1108                         }
1109                 }
1110                 /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
1111                 senddesc01_w0 =
1112                         vbicq_u64(senddesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1113                 sgdesc01_w0 = vbicq_u64(sgdesc01_w0, vdupq_n_u64(0xFFFFFFFF));
1114
1115                 senddesc23_w0 = senddesc01_w0;
1116                 sgdesc23_w0 = sgdesc01_w0;
1117
1118                 /* Clear vlan enables. */
1119                 if (flags & NIX_TX_NEED_EXT_HDR) {
1120                         sendext01_w1 = vbicq_u64(sendext01_w1,
1121                                                  vdupq_n_u64(0x3FFFF00FFFF00));
1122                         sendext23_w1 = sendext01_w1;
1123                 }
1124
1125                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1126                         /* Reset send mem alg to SETTSTMP from SUB */
1127                         sendmem01_w0 = vbicq_u64(sendmem01_w0,
1128                                                  vdupq_n_u64(BIT_ULL(59)));
1129                         /* Reset send mem address to default. */
1130                         sendmem01_w1 =
1131                                 vbicq_u64(sendmem01_w1, vdupq_n_u64(0xF));
1132                         sendmem23_w0 = sendmem01_w0;
1133                         sendmem23_w1 = sendmem01_w1;
1134                 }
1135
1136                 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1137                         /* Clear the LSO enable bit. */
1138                         sendext01_w0 = vbicq_u64(sendext01_w0,
1139                                                  vdupq_n_u64(BIT_ULL(14)));
1140                         sendext23_w0 = sendext01_w0;
1141                 }
1142
1143                 /* Move mbufs to iova */
1144                 mbuf0 = (uint64_t *)tx_pkts[0];
1145                 mbuf1 = (uint64_t *)tx_pkts[1];
1146                 mbuf2 = (uint64_t *)tx_pkts[2];
1147                 mbuf3 = (uint64_t *)tx_pkts[3];
1148
1149                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1150                                      offsetof(struct rte_mbuf, buf_iova));
1151                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1152                                      offsetof(struct rte_mbuf, buf_iova));
1153                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1154                                      offsetof(struct rte_mbuf, buf_iova));
1155                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1156                                      offsetof(struct rte_mbuf, buf_iova));
1157                 /*
1158                  * Get the mbufs' ol_flags, iova, pktlen, dataoff:
1159                  * dataoff_iovaX.D[0] = iova,
1160                  * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
1161                  * len_olflagsX.D[0] = ol_flags,
1162                  * len_olflagsX.D[1](63:32) = mbuf->pkt_len
1163                  */
1164                 dataoff_iova0 = vld1q_u64(mbuf0);
1165                 len_olflags0 = vld1q_u64(mbuf0 + 2);
1166                 dataoff_iova1 = vld1q_u64(mbuf1);
1167                 len_olflags1 = vld1q_u64(mbuf1 + 2);
1168                 dataoff_iova2 = vld1q_u64(mbuf2);
1169                 len_olflags2 = vld1q_u64(mbuf2 + 2);
1170                 dataoff_iova3 = vld1q_u64(mbuf3);
1171                 len_olflags3 = vld1q_u64(mbuf3 + 2);
1172
1173                 /* Move mbufs to point pool */
1174                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1175                                      offsetof(struct rte_mbuf, pool) -
1176                                      offsetof(struct rte_mbuf, buf_iova));
1177                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1178                                      offsetof(struct rte_mbuf, pool) -
1179                                      offsetof(struct rte_mbuf, buf_iova));
1180                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1181                                      offsetof(struct rte_mbuf, pool) -
1182                                      offsetof(struct rte_mbuf, buf_iova));
1183                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1184                                      offsetof(struct rte_mbuf, pool) -
1185                                      offsetof(struct rte_mbuf, buf_iova));
1186
1187                 if (flags & (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
1188                              NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
1189                         /* Get tx_offload for ol2, ol3, l2, l3 lengths */
1190                         /*
1191                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1192                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
1193                          */
1194
1195                         asm volatile("LD1 {%[a].D}[0],[%[in]]\n\t"
1196                                      : [a] "+w"(senddesc01_w1)
1197                                      : [in] "r"(mbuf0 + 2)
1198                                      : "memory");
1199
1200                         asm volatile("LD1 {%[a].D}[1],[%[in]]\n\t"
1201                                      : [a] "+w"(senddesc01_w1)
1202                                      : [in] "r"(mbuf1 + 2)
1203                                      : "memory");
1204
1205                         asm volatile("LD1 {%[b].D}[0],[%[in]]\n\t"
1206                                      : [b] "+w"(senddesc23_w1)
1207                                      : [in] "r"(mbuf2 + 2)
1208                                      : "memory");
1209
1210                         asm volatile("LD1 {%[b].D}[1],[%[in]]\n\t"
1211                                      : [b] "+w"(senddesc23_w1)
1212                                      : [in] "r"(mbuf3 + 2)
1213                                      : "memory");
1214
1215                         /* Get pool pointer alone */
1216                         mbuf0 = (uint64_t *)*mbuf0;
1217                         mbuf1 = (uint64_t *)*mbuf1;
1218                         mbuf2 = (uint64_t *)*mbuf2;
1219                         mbuf3 = (uint64_t *)*mbuf3;
1220                 } else {
1221                         /* Get pool pointer alone */
1222                         mbuf0 = (uint64_t *)*mbuf0;
1223                         mbuf1 = (uint64_t *)*mbuf1;
1224                         mbuf2 = (uint64_t *)*mbuf2;
1225                         mbuf3 = (uint64_t *)*mbuf3;
1226                 }
1227
1228                 const uint8x16_t shuf_mask2 = {
1229                         0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1230                         0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1231                 };
1232                 xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
1233                 ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
1234
1235                 /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
1236                 const uint64x2_t and_mask0 = {
1237                         0xFFFFFFFFFFFFFFFF,
1238                         0x000000000000FFFF,
1239                 };
1240
1241                 dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
1242                 dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
1243                 dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
1244                 dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
1245
1246                 /*
1247                  * Pick only 16 bits of pktlen preset at bits 63:32
1248                  * and place them at bits 15:0.
1249                  */
1250                 xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
1251                 ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
1252
1253                 /* Add pairwise to get dataoff + iova in sgdesc_w1 */
1254                 sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
1255                 sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
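                /*
                 * Editor's note: vpaddq_u64(a, b) yields
                 * { a[0] + a[1], b[0] + b[1] }, so with lane 0 holding the
                 * full buf_iova and lane 1 masked down to data_off above,
                 * each output lane becomes buf_iova + data_off, i.e. the
                 * IOVA of the first byte of packet data for one mbuf.
                 */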
1256
1257                 /* ORR both sgdesc_w0 and senddesc_w0 with the 16 bits of
1258                  * pktlen at bit position 15:0.
1259                  */
1260                 sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
1261                 sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
1262                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
1263                 senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
1264
1265                 /* Move mbuf to point to pool_id. */
1266                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
1267                                      offsetof(struct rte_mempool, pool_id));
1268                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
1269                                      offsetof(struct rte_mempool, pool_id));
1270                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
1271                                      offsetof(struct rte_mempool, pool_id));
1272                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
1273                                      offsetof(struct rte_mempool, pool_id));
1274
1275                 if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1276                     !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1277                         /*
1278                          * Lookup table to translate ol_flags to
1279                          * il3/il4 types. But we still use ol3/ol4 types in
1280                          * senddesc_w1 as only one header processing is enabled.
1281                          */
1282                         const uint8x16_t tbl = {
1283                                 /* [0-15] = il4type:il3type */
1284                                 0x04, /* none (IPv6 assumed) */
1285                                 0x14, /* RTE_MBUF_F_TX_TCP_CKSUM (IPv6 assumed) */
1286                                 0x24, /* RTE_MBUF_F_TX_SCTP_CKSUM (IPv6 assumed) */
1287                                 0x34, /* RTE_MBUF_F_TX_UDP_CKSUM (IPv6 assumed) */
1288                                 0x03, /* RTE_MBUF_F_TX_IP_CKSUM */
1289                                 0x13, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_TCP_CKSUM */
1290                                 0x23, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_SCTP_CKSUM */
1291                                 0x33, /* RTE_MBUF_F_TX_IP_CKSUM | RTE_MBUF_F_TX_UDP_CKSUM */
1292                                 0x02, /* RTE_MBUF_F_TX_IPV4  */
1293                                 0x12, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_TCP_CKSUM */
1294                                 0x22, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_SCTP_CKSUM */
1295                                 0x32, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_UDP_CKSUM */
1296                                 0x03, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM */
1297                                 0x13, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
1298                                        * RTE_MBUF_F_TX_TCP_CKSUM
1299                                        */
1300                                 0x23, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
1301                                        * RTE_MBUF_F_TX_SCTP_CKSUM
1302                                        */
1303                                 0x33, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
1304                                        * RTE_MBUF_F_TX_UDP_CKSUM
1305                                        */
1306                         };
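                        /*
                         * Editor's note: the table index is the ol_flags
                         * nibble at bits 55:52, i.e. TX_IPV4(55),
                         * TX_IP_CKSUM(54) and the 2-bit L4 checksum field
                         * (53:52); each entry packs il4type in the high
                         * nibble and il3type in the low one.  Illustrative
                         * example:
                         *
                         *   RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
                         *   RTE_MBUF_F_TX_TCP_CKSUM
                         *       -> index 0xD -> 0x13
                         *       (il4type 1 = TCP, il3type 3 = IPv4 + csum)
                         */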
1307
1308                         /* Extract olflags to translate to iltypes */
1309                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1310                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1311
1312                         /*
1313                          * E(47):L3_LEN(9):L2_LEN(7+z)
1314                          * E(47):L3_LEN(9):L2_LEN(7+z)
1315                          */
1316                         senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
1317                         senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
1318
1319                         /* Move OLFLAGS bits 55:52 to 51:48
1320                          * with zeros prepended on the byte; the rest
1321                          * are don't-care.
1322                          */
1323                         xtmp128 = vshrq_n_u8(xtmp128, 4);
1324                         ytmp128 = vshrq_n_u8(ytmp128, 4);
1325                         /*
1326                          * E(48):L3_LEN(8):L2_LEN(z+7)
1327                          * E(48):L3_LEN(8):L2_LEN(z+7)
1328                          */
1329                         const int8x16_t tshft3 = {
1330                                 -1, 0, 8, 8, 8, 8, 8, 8,
1331                                 -1, 0, 8, 8, 8, 8, 8, 8,
1332                         };
1333
1334                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1335                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
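                        /*
                         * Editor's note: vshlq_u8 takes per-byte signed
                         * shift counts; negative values shift right and a
                         * count of 8 zeroes the byte.  tshft3 thus undoes
                         * the earlier << 1 on byte 0 (restoring L2_LEN),
                         * keeps byte 1 (L3_LEN) intact, and clears the six
                         * don't-care bytes of each 64-bit half.
                         */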
1336
1337                         /* Do the lookup */
1338                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1339                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1340
1341                         /* Pick only the relevant fields, i.e. bits 48:55 of
1342                          * iltype, and place them in ol3/ol4type of senddesc_w1.
1343                          */
1344                         const uint8x16_t shuf_mask0 = {
1345                                 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
1346                                 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
1347                         };
1348
1349                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1350                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1351
1352                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1353                          * a [E(32):E(16):OL3(8):OL2(8)]
1354                          * a = a + (a << 8)
1355                          * a [E(32):E(16):(OL3+OL2):OL2]
1356                          * => E(32):E(16):OL4PTR(8):OL3PTR(8)
1357                          */
1358                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1359                                                  vshlq_n_u16(senddesc01_w1, 8));
1360                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1361                                                  vshlq_n_u16(senddesc23_w1, 8));
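                        /*
                         * Editor's note, worked example with illustrative
                         * values: for OL2 = 14 and OL3 = 20 a 16-bit lane
                         * holds a = 0x140E; a + (a << 8) = 0x220E, i.e.
                         * OL3PTR = 14 (L3 header offset) in the low byte
                         * and OL4PTR = 34 = 14 + 20 (L4 header offset) in
                         * the high byte.
                         */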
1362
1363                         /* Move ltypes to senddesc*_w1 */
1364                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1365                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1366                 } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1367                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1368                         /*
1369                          * Lookup table to translate ol_flags to
1370                          * ol3/ol4 types.
1371                          */
1372
1373                         const uint8x16_t tbl = {
1374                                 /* [0-15] = ol4type:ol3type */
1375                                 0x00, /* none */
1376                                 0x03, /* OUTER_IP_CKSUM */
1377                                 0x02, /* OUTER_IPV4 */
1378                                 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1379                                 0x04, /* OUTER_IPV6 */
1380                                 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1381                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1382                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1383                                        * OUTER_IP_CKSUM
1384                                        */
1385                                 0x00, /* OUTER_UDP_CKSUM */
1386                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
1387                                 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
1388                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
1389                                        * OUTER_IP_CKSUM
1390                                        */
1391                                 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
1392                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1393                                        * OUTER_IP_CKSUM
1394                                        */
1395                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1396                                        * OUTER_IPV4
1397                                        */
1398                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1399                                        * OUTER_IPV4 | OUTER_IP_CKSUM
1400                                        */
1401                         };
1402
1403                         /* Extract olflags to translate to oltypes */
1404                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1405                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1406
1407                         /*
1408                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1409                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
1410                          */
1411                         const uint8x16_t shuf_mask5 = {
1412                                 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1413                                 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1414                         };
1415                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1416                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1417
1418                         /* Extract outer ol flags only */
1419                         const uint64x2_t o_cksum_mask = {
1420                                 0x1C00020000000000,
1421                                 0x1C00020000000000,
1422                         };
1423
1424                         xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
1425                         ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
1426
1427                         /* Extract OUTER_UDP_CKSUM bit 41 and
1428                          * move it to bit 61
1429                          */
1430
1431                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1432                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1433
1434                         /* Shift oltype by 2 to start nibble from BIT(56)
1435                          * instead of BIT(58)
1436                          */
1437                         xtmp128 = vshrq_n_u8(xtmp128, 2);
1438                         ytmp128 = vshrq_n_u8(ytmp128, 2);
1439                         /*
1440                          * E(48):OL3_LEN(8):OL2_LEN(z+7)
1441                          * E(48):OL3_LEN(8):OL2_LEN(z+7)
1442                          */
1443                         const int8x16_t tshft3 = {
1444                                 -1, 0, 8, 8, 8, 8, 8, 8,
1445                                 -1, 0, 8, 8, 8, 8, 8, 8,
1446                         };
1447
1448                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1449                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1450
1451                         /* Do the lookup */
1452                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
1453                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
1454
1455                         /* Pick only the relevant fields, i.e. bits 56:63 of
1456                          * oltype, and place them in ol3/ol4type of senddesc_w1.
1457                          */
1458                         const uint8x16_t shuf_mask0 = {
1459                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
1460                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
1461                         };
1462
1463                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1464                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1465
1466                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
1467                          * a [E(32):E(16):OL3(8):OL2(8)]
1468                          * a = a + (a << 8)
1469                          * a [E(32):E(16):(OL3+OL2):OL2]
1470                          * => E(32):E(16):OL4PTR(8):OL3PTR(8)
1471                          */
1472                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1473                                                  vshlq_n_u16(senddesc01_w1, 8));
1474                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1475                                                  vshlq_n_u16(senddesc23_w1, 8));
1476
1477                         /* Move ltypes to senddesc*_w1 */
1478                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1479                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1480                 } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
1481                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
1482                         /* Lookup table to translate ol_flags to
1483                          * ol4type, ol3type, il4type, il3type of senddesc_w1
1484                          */
1485                         const uint8x16x2_t tbl = {{
1486                                 {
1487                                         /* [0-15] = il4type:il3type */
1488                                         0x04, /* none (IPv6) */
1489                                         0x14, /* RTE_MBUF_F_TX_TCP_CKSUM (IPv6) */
1490                                         0x24, /* RTE_MBUF_F_TX_SCTP_CKSUM (IPv6) */
1491                                         0x34, /* RTE_MBUF_F_TX_UDP_CKSUM (IPv6) */
1492                                         0x03, /* RTE_MBUF_F_TX_IP_CKSUM */
1493                                         0x13, /* RTE_MBUF_F_TX_IP_CKSUM |
1494                                                * RTE_MBUF_F_TX_TCP_CKSUM
1495                                                */
1496                                         0x23, /* RTE_MBUF_F_TX_IP_CKSUM |
1497                                                * RTE_MBUF_F_TX_SCTP_CKSUM
1498                                                */
1499                                         0x33, /* RTE_MBUF_F_TX_IP_CKSUM |
1500                                                * RTE_MBUF_F_TX_UDP_CKSUM
1501                                                */
1502                                         0x02, /* RTE_MBUF_F_TX_IPV4 */
1503                                         0x12, /* RTE_MBUF_F_TX_IPV4 |
1504                                                * RTE_MBUF_F_TX_TCP_CKSUM
1505                                                */
1506                                         0x22, /* RTE_MBUF_F_TX_IPV4 |
1507                                                * RTE_MBUF_F_TX_SCTP_CKSUM
1508                                                */
1509                                         0x32, /* RTE_MBUF_F_TX_IPV4 |
1510                                                * RTE_MBUF_F_TX_UDP_CKSUM
1511                                                */
1512                                         0x03, /* RTE_MBUF_F_TX_IPV4 |
1513                                                * RTE_MBUF_F_TX_IP_CKSUM
1514                                                */
1515                                         0x13, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
1516                                                * RTE_MBUF_F_TX_TCP_CKSUM
1517                                                */
1518                                         0x23, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
1519                                                * RTE_MBUF_F_TX_SCTP_CKSUM
1520                                                */
1521                                         0x33, /* RTE_MBUF_F_TX_IPV4 | RTE_MBUF_F_TX_IP_CKSUM |
1522                                                * RTE_MBUF_F_TX_UDP_CKSUM
1523                                                */
1524                                 },
1525
1526                                 {
1527                                         /* [16-31] = ol4type:ol3type */
1528                                         0x00, /* none */
1529                                         0x03, /* OUTER_IP_CKSUM */
1530                                         0x02, /* OUTER_IPV4 */
1531                                         0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
1532                                         0x04, /* OUTER_IPV6 */
1533                                         0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
1534                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 */
1535                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 |
1536                                                * OUTER_IP_CKSUM
1537                                                */
1538                                         0x00, /* OUTER_UDP_CKSUM */
1539                                         0x33, /* OUTER_UDP_CKSUM |
1540                                                * OUTER_IP_CKSUM
1541                                                */
1542                                         0x32, /* OUTER_UDP_CKSUM |
1543                                                * OUTER_IPV4
1544                                                */
1545                                         0x33, /* OUTER_UDP_CKSUM |
1546                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1547                                                */
1548                                         0x34, /* OUTER_UDP_CKSUM |
1549                                                * OUTER_IPV6
1550                                                */
1551                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1552                                                * OUTER_IP_CKSUM
1553                                                */
1554                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1555                                                * OUTER_IPV4
1556                                                */
1557                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
1558                                                * OUTER_IPV4 | OUTER_IP_CKSUM
1559                                                */
1560                                 },
1561                         }};
1562
1563                         /* Extract olflags to translate to oltype & iltype */
1564                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1565                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1566
1567                         /*
1568                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1569                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
1570                          */
1571                         const uint32x4_t tshft_4 = {
1572                                 1,
1573                                 0,
1574                                 1,
1575                                 0,
1576                         };
1577                         senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
1578                         senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
1579
1580                         /*
1581                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1582                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
1583                          */
1584                         const uint8x16_t shuf_mask5 = {
1585                                 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
1586                                 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
1587                         };
1588                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
1589                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
1590
1591                         /* Extract outer and inner header ol_flags */
1592                         const uint64x2_t oi_cksum_mask = {
1593                                 0x1CF0020000000000,
1594                                 0x1CF0020000000000,
1595                         };
1596
1597                         xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
1598                         ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
1599
1600                         /* Extract OUTER_UDP_CKSUM bit 41 and
1601                          * move it to bit 61
1602                          */
1603
1604                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
1605                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
1606
1607                         /* Shift right oltype by 2 and iltype by 4
1608                          * to start the oltype nibble from BIT(56)
1609                          * instead of BIT(58) and the iltype nibble from
1610                          * BIT(48) instead of BIT(52).
1611                          */
1612                         const int8x16_t tshft5 = {
1613                                 8, 8, 8, 8, 8, 8, -4, -2,
1614                                 8, 8, 8, 8, 8, 8, -4, -2,
1615                         };
1616
1617                         xtmp128 = vshlq_u8(xtmp128, tshft5);
1618                         ytmp128 = vshlq_u8(ytmp128, tshft5);
1619                         /*
1620                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1621                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
1622                          */
1623                         const int8x16_t tshft3 = {
1624                                 -1, 0, -1, 0, 0, 0, 0, 0,
1625                                 -1, 0, -1, 0, 0, 0, 0, 0,
1626                         };
1627
1628                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
1629                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
1630
1631                         /* Mark Bit(4) of oltype */
1632                         const uint64x2_t oi_cksum_mask2 = {
1633                                 0x1000000000000000,
1634                                 0x1000000000000000,
1635                         };
1636
1637                         xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
1638                         ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
1639
1640                         /* Do the lookup */
1641                         ltypes01 = vqtbl2q_u8(tbl, xtmp128);
1642                         ltypes23 = vqtbl2q_u8(tbl, ytmp128);
1643
1644                         /* Pick only the relevant fields, i.e. bits 48:55 of
1645                          * iltype and bits 56:63 of oltype, and place them at
1646                          * the corresponding positions in senddesc_w1.
1647                          */
1648                         const uint8x16_t shuf_mask0 = {
1649                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
1650                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
1651                         };
1652
1653                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
1654                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
1655
1656                         /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
1657                          * l3len, l2len, ol3len, ol2len.
1658                          * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
1659                          * a = a + (a << 8)
1660                          * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
1661                          * a = a + (a << 16)
1662                          * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
1663                          * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
1664                          */
1665                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
1666                                                  vshlq_n_u32(senddesc01_w1, 8));
1667                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
1668                                                  vshlq_n_u32(senddesc23_w1, 8));
1669
1670                         /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
1671                         senddesc01_w1 = vaddq_u8(
1672                                 senddesc01_w1, vshlq_n_u32(senddesc01_w1, 16));
1673                         senddesc23_w1 = vaddq_u8(
1674                                 senddesc23_w1, vshlq_n_u32(senddesc23_w1, 16));
1675
1676                         /* Move ltypes to senddesc*_w1 */
1677                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
1678                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
1679                 }
1680
1681                 xmask01 = vdupq_n_u64(0);
1682                 xmask23 = xmask01;
1683                 asm volatile("LD1 {%[a].H}[0],[%[in]]\n\t"
1684                              : [a] "+w"(xmask01)
1685                              : [in] "r"(mbuf0)
1686                              : "memory");
1687
1688                 asm volatile("LD1 {%[a].H}[4],[%[in]]\n\t"
1689                              : [a] "+w"(xmask01)
1690                              : [in] "r"(mbuf1)
1691                              : "memory");
1692
1693                 asm volatile("LD1 {%[b].H}[0],[%[in]]\n\t"
1694                              : [b] "+w"(xmask23)
1695                              : [in] "r"(mbuf2)
1696                              : "memory");
1697
1698                 asm volatile("LD1 {%[b].H}[4],[%[in]]\n\t"
1699                              : [b] "+w"(xmask23)
1700                              : [in] "r"(mbuf3)
1701                              : "memory");
1702                 xmask01 = vshlq_n_u64(xmask01, 20);
1703                 xmask23 = vshlq_n_u64(xmask23, 20);
1704
1705                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1706                 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1707
1708                 if (flags & NIX_TX_OFFLOAD_VLAN_QINQ_F) {
1709                         /* Tx ol_flag for vlan. */
1710                         const uint64x2_t olv = {RTE_MBUF_F_TX_VLAN,
1711                                                 RTE_MBUF_F_TX_VLAN};
1712                         /* Bit enable for VLAN1 */
1713                         const uint64x2_t mlv = {BIT_ULL(49), BIT_ULL(49)};
1714                         /* Tx ol_flag for QnQ. */
1715                         const uint64x2_t olq = {RTE_MBUF_F_TX_QINQ,
1716                                                 RTE_MBUF_F_TX_QINQ};
1717                         /* Bit enable for VLAN0 */
1718                         const uint64x2_t mlq = {BIT_ULL(48), BIT_ULL(48)};
1719                         /* Load VLAN values from packet; outer is VLAN 0. */
1720                         uint64x2_t ext01 = {
1721                                 ((uint32_t)tx_pkts[0]->vlan_tci_outer) << 8 |
1722                                         ((uint64_t)tx_pkts[0]->vlan_tci) << 32,
1723                                 ((uint32_t)tx_pkts[1]->vlan_tci_outer) << 8 |
1724                                         ((uint64_t)tx_pkts[1]->vlan_tci) << 32,
1725                         };
1726                         uint64x2_t ext23 = {
1727                                 ((uint32_t)tx_pkts[2]->vlan_tci_outer) << 8 |
1728                                         ((uint64_t)tx_pkts[2]->vlan_tci) << 32,
1729                                 ((uint32_t)tx_pkts[3]->vlan_tci_outer) << 8 |
1730                                         ((uint64_t)tx_pkts[3]->vlan_tci) << 32,
1731                         };
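                        /*
                         * Editor's note: the shifts above imply the send
                         * ext w1 layout used here: VLAN0 (outer) TCI at
                         * bits 23:8, VLAN1 (inner) TCI at bits 47:32, with
                         * the insert-enable bits at 48 (VLAN0, via mlq) and
                         * 49 (VLAN1, via mlv).
                         */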
1732
1733                         /* Get ol_flags of the packets. */
1734                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1735                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1736
1737                         /* ORR vlan outer/inner values into cmd. */
1738                         sendext01_w1 = vorrq_u64(sendext01_w1, ext01);
1739                         sendext23_w1 = vorrq_u64(sendext23_w1, ext23);
1740
1741                         /* Test for offload enable bits and generate masks. */
1742                         xtmp128 = vorrq_u64(vandq_u64(vtstq_u64(xtmp128, olv),
1743                                                       mlv),
1744                                             vandq_u64(vtstq_u64(xtmp128, olq),
1745                                                       mlq));
1746                         ytmp128 = vorrq_u64(vandq_u64(vtstq_u64(ytmp128, olv),
1747                                                       mlv),
1748                                             vandq_u64(vtstq_u64(ytmp128, olq),
1749                                                       mlq));
1750
1751                         /* Set vlan enable bits into cmd based on mask. */
1752                         sendext01_w1 = vorrq_u64(sendext01_w1, xtmp128);
1753                         sendext23_w1 = vorrq_u64(sendext23_w1, ytmp128);
1754                 }
1755
1756                 if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1757                         /* Tx ol_flag for timestamp. */
1758                         const uint64x2_t olf = {RTE_MBUF_F_TX_IEEE1588_TMST,
1759                                                 RTE_MBUF_F_TX_IEEE1588_TMST};
1760                         /* Set send mem alg to SUB. */
1761                         const uint64x2_t alg = {BIT_ULL(59), BIT_ULL(59)};
1762                         /* Increment send mem address by 8. */
1763                         const uint64x2_t addr = {0x8, 0x8};
1764
1765                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1766                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1767
1768                         /* Check if a timestamp is requested and generate an
1769                          * inverted mask, as the default cmd value needs no
1770                          * changes when timestamping is enabled.
1771                          */
1772                         xtmp128 = vmvnq_u32(vtstq_u64(olf, xtmp128));
1773                         ytmp128 = vmvnq_u32(vtstq_u64(olf, ytmp128));
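                        /*
                         * Editor's note: vtstq_u64 returns an all-ones lane
                         * when (ol_flags & TMST) != 0 and vmvnq_u32 inverts
                         * it, so lanes of packets that do want a timestamp
                         * become zero and leave the default
                         * (timestamp-enabled) cmd untouched below; only the
                         * non-timestamp lanes pick up the +8 address and
                         * the SUB alg.
                         */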
1774
1775                         /* Change send mem address to an 8 byte offset when
1776                          * TSTMP is disabled.
1777                          */
1778                         sendmem01_w1 = vaddq_u64(sendmem01_w1,
1779                                                  vandq_u64(xtmp128, addr));
1780                         sendmem23_w1 = vaddq_u64(sendmem23_w1,
1781                                                  vandq_u64(ytmp128, addr));
1782                         /* Change send mem alg to SUB when TSTMP is disabled. */
1783                         sendmem01_w0 = vorrq_u64(sendmem01_w0,
1784                                                  vandq_u64(xtmp128, alg));
1785                         sendmem23_w0 = vorrq_u64(sendmem23_w0,
1786                                                  vandq_u64(ytmp128, alg));
1787
1788                         cmd3[0] = vzip1q_u64(sendmem01_w0, sendmem01_w1);
1789                         cmd3[1] = vzip2q_u64(sendmem01_w0, sendmem01_w1);
1790                         cmd3[2] = vzip1q_u64(sendmem23_w0, sendmem23_w1);
1791                         cmd3[3] = vzip2q_u64(sendmem23_w0, sendmem23_w1);
1792                 }
1793
1794                 if (flags & NIX_TX_OFFLOAD_TSO_F) {
1795                         const uint64_t lso_fmt = txq->lso_tun_fmt;
1796                         uint64_t sx_w0[NIX_DESCS_PER_LOOP];
1797                         uint64_t sd_w1[NIX_DESCS_PER_LOOP];
1798
1799                         /* Extract SD W1 as we need to set L4 types. */
1800                         vst1q_u64(sd_w1, senddesc01_w1);
1801                         vst1q_u64(sd_w1 + 2, senddesc23_w1);
1802
1803                         /* Extract SX W0 as we need to set LSO fields. */
1804                         vst1q_u64(sx_w0, sendext01_w0);
1805                         vst1q_u64(sx_w0 + 2, sendext23_w0);
1806
1807                         /* Extract ol_flags. */
1808                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
1809                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
1810
1811                         /* Prepare individual mbufs. */
1812                         cn10k_nix_prepare_tso(tx_pkts[0],
1813                                 (union nix_send_hdr_w1_u *)&sd_w1[0],
1814                                 (union nix_send_ext_w0_u *)&sx_w0[0],
1815                                 vgetq_lane_u64(xtmp128, 0), flags, lso_fmt);
1816
1817                         cn10k_nix_prepare_tso(tx_pkts[1],
1818                                 (union nix_send_hdr_w1_u *)&sd_w1[1],
1819                                 (union nix_send_ext_w0_u *)&sx_w0[1],
1820                                 vgetq_lane_u64(xtmp128, 1), flags, lso_fmt);
1821
1822                         cn10k_nix_prepare_tso(tx_pkts[2],
1823                                 (union nix_send_hdr_w1_u *)&sd_w1[2],
1824                                 (union nix_send_ext_w0_u *)&sx_w0[2],
1825                                 vgetq_lane_u64(ytmp128, 0), flags, lso_fmt);
1826
1827                         cn10k_nix_prepare_tso(tx_pkts[3],
1828                                 (union nix_send_hdr_w1_u *)&sd_w1[3],
1829                                 (union nix_send_ext_w0_u *)&sx_w0[3],
1830                                 vgetq_lane_u64(ytmp128, 1), flags, lso_fmt);
1831
1832                         senddesc01_w1 = vld1q_u64(sd_w1);
1833                         senddesc23_w1 = vld1q_u64(sd_w1 + 2);
1834
1835                         sendext01_w0 = vld1q_u64(sx_w0);
1836                         sendext23_w0 = vld1q_u64(sx_w0 + 2);
1837                 }
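                /*
                 * Editor's note: the TSO path spills the vectors to stack
                 * arrays because cn10k_nix_prepare_tso() is a scalar helper
                 * that patches one descriptor at a time; the vectors are
                 * reloaded with vld1q_u64 once all four packets are fixed
                 * up.
                 */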
1838
1839                 if ((flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) &&
1840                     !(flags & NIX_TX_MULTI_SEG_F)) {
1841                         /* Set don't free bit if reference count > 1 */
1842                         xmask01 = vdupq_n_u64(0);
1843                         xmask23 = xmask01;
1844
1845                         /* Reset mbufX to point back at the mbufs */
1846                         mbuf0 = (uint64_t *)tx_pkts[0];
1847                         mbuf1 = (uint64_t *)tx_pkts[1];
1848                         mbuf2 = (uint64_t *)tx_pkts[2];
1849                         mbuf3 = (uint64_t *)tx_pkts[3];
1850
1851                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf0))
1852                                 xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
1853                         else
1854                                 __mempool_check_cookies(
1855                                         ((struct rte_mbuf *)mbuf0)->pool,
1856                                         (void **)&mbuf0, 1, 0);
1857
1858                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf1))
1859                                 xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
1860                         else
1861                                 __mempool_check_cookies(
1862                                         ((struct rte_mbuf *)mbuf1)->pool,
1863                                         (void **)&mbuf1, 1, 0);
1864
1865                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf2))
1866                                 xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
1867                         else
1868                                 __mempool_check_cookies(
1869                                         ((struct rte_mbuf *)mbuf2)->pool,
1870                                         (void **)&mbuf2, 1, 0);
1871
1872                         if (cnxk_nix_prefree_seg((struct rte_mbuf *)mbuf3))
1873                                 xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
1874                         else
1875                                 __mempool_check_cookies(
1876                                         ((struct rte_mbuf *)mbuf3)->pool,
1877                                         (void **)&mbuf3, 1, 0);
1878                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
1879                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
1880                 } else if (!(flags & NIX_TX_MULTI_SEG_F)) {
1881                         /* Reset mbufX to point back at the mbufs */
1882                         mbuf0 = (uint64_t *)tx_pkts[0];
1883                         mbuf1 = (uint64_t *)tx_pkts[1];
1884                         mbuf2 = (uint64_t *)tx_pkts[2];
1885                         mbuf3 = (uint64_t *)tx_pkts[3];
1886
1887                         /* Mark mempool object as "put" since
1888                          * it is freed by NIX
1889                          */
1890                         __mempool_check_cookies(
1891                                 ((struct rte_mbuf *)mbuf0)->pool,
1892                                 (void **)&mbuf0, 1, 0);
1893
1894                         __mempool_check_cookies(
1895                                 ((struct rte_mbuf *)mbuf1)->pool,
1896                                 (void **)&mbuf1, 1, 0);
1897
1898                         __mempool_check_cookies(
1899                                 ((struct rte_mbuf *)mbuf2)->pool,
1900                                 (void **)&mbuf2, 1, 0);
1901
1902                         __mempool_check_cookies(
1903                                 ((struct rte_mbuf *)mbuf3)->pool,
1904                                 (void **)&mbuf3, 1, 0);
1905                 }
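                /*
                 * Editor's note: __mempool_check_cookies() compiles to a
                 * no-op unless RTE_LIBRTE_MEMPOOL_DEBUG is defined, so the
                 * "mark as put" calls above are free in release builds; the
                 * mbufs themselves are freed by NIX hardware after
                 * transmission.
                 */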
1906
1907                 /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
1908                 cmd0[0] = vzip1q_u64(senddesc01_w0, senddesc01_w1);
1909                 cmd0[1] = vzip2q_u64(senddesc01_w0, senddesc01_w1);
1910                 cmd0[2] = vzip1q_u64(senddesc23_w0, senddesc23_w1);
1911                 cmd0[3] = vzip2q_u64(senddesc23_w0, senddesc23_w1);
1912
1913                 cmd1[0] = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
1914                 cmd1[1] = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
1915                 cmd1[2] = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
1916                 cmd1[3] = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
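                /*
                 * Editor's note: vzip1q_u64(a, b) = { a[0], b[0] } and
                 * vzip2q_u64(a, b) = { a[1], b[1] }, so for example
                 * cmd0[0] = { pkt0 sendhdr w0, pkt0 sendhdr w1 } and
                 * cmd0[1] = { pkt1 sendhdr w0, pkt1 sendhdr w1 }: the
                 * per-word vectors are transposed back into per-packet
                 * descriptor pairs.
                 */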
1917
1918                 if (flags & NIX_TX_NEED_EXT_HDR) {
1919                         cmd2[0] = vzip1q_u64(sendext01_w0, sendext01_w1);
1920                         cmd2[1] = vzip2q_u64(sendext01_w0, sendext01_w1);
1921                         cmd2[2] = vzip1q_u64(sendext23_w0, sendext23_w1);
1922                         cmd2[3] = vzip2q_u64(sendext23_w0, sendext23_w1);
1923                 }
1924
1925                 if (flags & NIX_TX_MULTI_SEG_F) {
1926                         uint8_t j;
1927
1928                         segdw[4] = 8;
1929                         j = cn10k_nix_prep_lmt_mseg_vector(tx_pkts, cmd0, cmd1,
1930                                                           cmd2, cmd3, segdw,
1931                                                           (uint64_t *)
1932                                                           LMT_OFF(laddr, lnum,
1933                                                                   0),
1934                                                           &wd.data128, &shift,
1935                                                           flags);
1936                         lnum += j;
1937                 } else if (flags & NIX_TX_NEED_EXT_HDR) {
1938                         /* Store the prepared send desc to LMT lines */
1939                         if (flags & NIX_TX_OFFLOAD_TSTAMP_F) {
1940                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1941                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1942                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1943                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[0]);
1944                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[1]);
1945                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[1]);
1946                                 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[1]);
1947                                 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[1]);
1948                                 lnum += 1;
1949                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1950                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1951                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1952                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd3[2]);
1953                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[3]);
1954                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd2[3]);
1955                                 vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd1[3]);
1956                                 vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd3[3]);
1957                         } else {
1958                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1959                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[0]);
1960                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[0]);
1961                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[1]);
1962                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[1]);
1963                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[1]);
1964                                 lnum += 1;
1965                                 vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[2]);
1966                                 vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd2[2]);
1967                                 vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd1[2]);
1968                                 vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd0[3]);
1969                                 vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd2[3]);
1970                                 vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[3]);
1971                         }
1972                         lnum += 1;
1973                 } else {
1974                         /* Store the prepared send desc to LMT lines */
1975                         vst1q_u64(LMT_OFF(laddr, lnum, 0), cmd0[0]);
1976                         vst1q_u64(LMT_OFF(laddr, lnum, 16), cmd1[0]);
1977                         vst1q_u64(LMT_OFF(laddr, lnum, 32), cmd0[1]);
1978                         vst1q_u64(LMT_OFF(laddr, lnum, 48), cmd1[1]);
1979                         vst1q_u64(LMT_OFF(laddr, lnum, 64), cmd0[2]);
1980                         vst1q_u64(LMT_OFF(laddr, lnum, 80), cmd1[2]);
1981                         vst1q_u64(LMT_OFF(laddr, lnum, 96), cmd0[3]);
1982                         vst1q_u64(LMT_OFF(laddr, lnum, 112), cmd1[3]);
1983                         lnum += 1;
1984                 }
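                /*
                 * Editor's note on the stores above: an LMT line is 128 B
                 * (8 dwords).  Without the ext header each packet needs two
                 * 16 B vectors (32 B), so four packets fill one line; with
                 * the ext header it needs three (48 B) and with a timestamp
                 * four (64 B), so only two packets fit per line and lnum
                 * advances twice per group of four.
                 */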
1985
1986                 tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
1987         }
1988
1989         if (flags & NIX_TX_MULTI_SEG_F)
1990                 wd.data[0] >>= 16;
1991
1992         if (flags & NIX_TX_VWQE_F)
1993                 roc_sso_hws_head_wait(base);
1994
1995         /* Trigger LMTST */
1996         if (lnum > 16) {
1997                 if (!(flags & NIX_TX_MULTI_SEG_F))
1998                         wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
1999
2000                 pa = io_addr | (wd.data[0] & 0x7) << 4;
2001                 wd.data[0] &= ~0x7ULL;
2002
2003                 if (flags & NIX_TX_MULTI_SEG_F)
2004                         wd.data[0] <<= 16;
2005
2006                 wd.data[0] |= (15ULL << 12);
2007                 wd.data[0] |= (uint64_t)lmt_id;
2008
2009                 /* STEOR0 */
2010                 roc_lmt_submit_steorl(wd.data[0], pa);
2011
2012                 if (!(flags & NIX_TX_MULTI_SEG_F))
2013                         wd.data[1] = cn10k_nix_tx_steor_vec_data(flags);
2014
2015                 pa = io_addr | (wd.data[1] & 0x7) << 4;
2016                 wd.data[1] &= ~0x7ULL;
2017
2018                 if (flags & NIX_TX_MULTI_SEG_F)
2019                         wd.data[1] <<= 16;
2020
2021                 wd.data[1] |= ((uint64_t)(lnum - 17)) << 12;
2022                 wd.data[1] |= (uint64_t)(lmt_id + 16);
2023
2024                 /* STEOR1 */
2025                 roc_lmt_submit_steorl(wd.data[1], pa);
2026         } else if (lnum) {
2027                 if (!(flags & NIX_TX_MULTI_SEG_F))
2028                         wd.data[0] = cn10k_nix_tx_steor_vec_data(flags);
2029
2030                 pa = io_addr | (wd.data[0] & 0x7) << 4;
2031                 wd.data[0] &= ~0x7ULL;
2032
2033                 if (flags & NIX_TX_MULTI_SEG_F)
2034                         wd.data[0] <<= 16;
2035
2036                 wd.data[0] |= ((uint64_t)(lnum - 1)) << 12;
2037                 wd.data[0] |= lmt_id;
2038
2039                 /* STEOR0 */
2040                 roc_lmt_submit_steorl(wd.data[0], pa);
2041         }
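        /*
         * Editor's note: a single STEOR submits at most 16 LMT lines, which
         * is why bursts above 16 lines are split into STEOR0 and STEOR1.
         * In both halves the code folds the low three bits of wd.data into
         * bits 6:4 of the I/O address before clearing them, then encodes
         * the line count minus one at bits 15:12 alongside the LMT ID in
         * the low bits.
         */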
2042
2043         left -= burst;
2044         rte_io_wmb();
2045         if (left)
2046                 goto again;
2047
2048         if (unlikely(scalar)) {
2049                 if (flags & NIX_TX_MULTI_SEG_F)
2050                         pkts += cn10k_nix_xmit_pkts_mseg(tx_queue, tx_pkts,
2051                                                          scalar, cmd, base,
2052                                                          flags);
2053                 else
2054                         pkts += cn10k_nix_xmit_pkts(tx_queue, tx_pkts, scalar,
2055                                                     cmd, base, flags);
2056         }
2057
2058         return pkts;
2059 }
2060
2061 #else
2062 static __rte_always_inline uint16_t
2063 cn10k_nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
2064                            uint16_t pkts, uint64_t *cmd, uintptr_t base,
2065                            const uint16_t flags)
2066 {
2067         RTE_SET_USED(tx_queue);
2068         RTE_SET_USED(tx_pkts);
2069         RTE_SET_USED(pkts);
2070         RTE_SET_USED(cmd);
2071         RTE_SET_USED(flags);
2072         RTE_SET_USED(base);
2073         return 0;
2074 }
2075 #endif
2076
2077 #define L3L4CSUM_F   NIX_TX_OFFLOAD_L3_L4_CSUM_F
2078 #define OL3OL4CSUM_F NIX_TX_OFFLOAD_OL3_OL4_CSUM_F
2079 #define VLAN_F       NIX_TX_OFFLOAD_VLAN_QINQ_F
2080 #define NOFF_F       NIX_TX_OFFLOAD_MBUF_NOFF_F
2081 #define TSO_F        NIX_TX_OFFLOAD_TSO_F
2082 #define TSP_F        NIX_TX_OFFLOAD_TSTAMP_F
2083
2084 /* [TSP] [TSO] [NOFF] [VLAN] [OL3OL4CSUM] [L3L4CSUM] */
2085 #define NIX_TX_FASTPATH_MODES                                           \
2086 T(no_offload,                           0, 0, 0, 0, 0, 0,       4,      \
2087                 NIX_TX_OFFLOAD_NONE)                                    \
2088 T(l3l4csum,                             0, 0, 0, 0, 0, 1,       4,      \
2089                 L3L4CSUM_F)                                             \
2090 T(ol3ol4csum,                           0, 0, 0, 0, 1, 0,       4,      \
2091                 OL3OL4CSUM_F)                                           \
2092 T(ol3ol4csum_l3l4csum,                  0, 0, 0, 0, 1, 1,       4,      \
2093                 OL3OL4CSUM_F | L3L4CSUM_F)                              \
2094 T(vlan,                                 0, 0, 0, 1, 0, 0,       6,      \
2095                 VLAN_F)                                                 \
2096 T(vlan_l3l4csum,                        0, 0, 0, 1, 0, 1,       6,      \
2097                 VLAN_F | L3L4CSUM_F)                                    \
2098 T(vlan_ol3ol4csum,                      0, 0, 0, 1, 1, 0,       6,      \
2099                 VLAN_F | OL3OL4CSUM_F)                                  \
2100 T(vlan_ol3ol4csum_l3l4csum,             0, 0, 0, 1, 1, 1,       6,      \
2101                 VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
2102 T(noff,                                 0, 0, 1, 0, 0, 0,       4,      \
2103                 NOFF_F)                                                 \
2104 T(noff_l3l4csum,                        0, 0, 1, 0, 0, 1,       4,      \
2105                 NOFF_F | L3L4CSUM_F)                                    \
2106 T(noff_ol3ol4csum,                      0, 0, 1, 0, 1, 0,       4,      \
2107                 NOFF_F | OL3OL4CSUM_F)                                  \
2108 T(noff_ol3ol4csum_l3l4csum,             0, 0, 1, 0, 1, 1,       4,      \
2109                 NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)                     \
2110 T(noff_vlan,                            0, 0, 1, 1, 0, 0,       6,      \
2111                 NOFF_F | VLAN_F)                                        \
2112 T(noff_vlan_l3l4csum,                   0, 0, 1, 1, 0, 1,       6,      \
2113                 NOFF_F | VLAN_F | L3L4CSUM_F)                           \
2114 T(noff_vlan_ol3ol4csum,                 0, 0, 1, 1, 1, 0,       6,      \
2115                 NOFF_F | VLAN_F | OL3OL4CSUM_F)                         \
2116 T(noff_vlan_ol3ol4csum_l3l4csum,        0, 0, 1, 1, 1, 1,       6,      \
2117                 NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)            \
2118 T(tso,                                  0, 1, 0, 0, 0, 0,       6,      \
2119                 TSO_F)                                                  \
2120 T(tso_l3l4csum,                         0, 1, 0, 0, 0, 1,       6,      \
2121                 TSO_F | L3L4CSUM_F)                                     \
2122 T(tso_ol3ol4csum,                       0, 1, 0, 0, 1, 0,       6,      \
2123                 TSO_F | OL3OL4CSUM_F)                                   \
2124 T(tso_ol3ol4csum_l3l4csum,              0, 1, 0, 0, 1, 1,       6,      \
2125                 TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
2126 T(tso_vlan,                             0, 1, 0, 1, 0, 0,       6,      \
2127                 TSO_F | VLAN_F)                                         \
2128 T(tso_vlan_l3l4csum,                    0, 1, 0, 1, 0, 1,       6,      \
2129                 TSO_F | VLAN_F | L3L4CSUM_F)                            \
2130 T(tso_vlan_ol3ol4csum,                  0, 1, 0, 1, 1, 0,       6,      \
2131                 TSO_F | VLAN_F | OL3OL4CSUM_F)                          \
2132 T(tso_vlan_ol3ol4csum_l3l4csum,         0, 1, 0, 1, 1, 1,       6,      \
2133                 TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2134 T(tso_noff,                             0, 1, 1, 0, 0, 0,       6,      \
2135                 TSO_F | NOFF_F)                                         \
2136 T(tso_noff_l3l4csum,                    0, 1, 1, 0, 0, 1,       6,      \
2137                 TSO_F | NOFF_F | L3L4CSUM_F)                            \
2138 T(tso_noff_ol3ol4csum,                  0, 1, 1, 0, 1, 0,       6,      \
2139                 TSO_F | NOFF_F | OL3OL4CSUM_F)                          \
2140 T(tso_noff_ol3ol4csum_l3l4csum,         0, 1, 1, 0, 1, 1,       6,      \
2141                 TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2142 T(tso_noff_vlan,                        0, 1, 1, 1, 0, 0,       6,      \
2143                 TSO_F | NOFF_F | VLAN_F)                                \
2144 T(tso_noff_vlan_l3l4csum,               0, 1, 1, 1, 0, 1,       6,      \
2145                 TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
2146 T(tso_noff_vlan_ol3ol4csum,             0, 1, 1, 1, 1, 0,       6,      \
2147                 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
2148 T(tso_noff_vlan_ol3ol4csum_l3l4csum,    0, 1, 1, 1, 1, 1,       6,      \
2149                 TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
2150 T(ts,                                   1, 0, 0, 0, 0, 0,       8,      \
2151                 TSP_F)                                                  \
2152 T(ts_l3l4csum,                          1, 0, 0, 0, 0, 1,       8,      \
2153                 TSP_F | L3L4CSUM_F)                                     \
2154 T(ts_ol3ol4csum,                        1, 0, 0, 0, 1, 0,       8,      \
2155                 TSP_F | OL3OL4CSUM_F)                                   \
2156 T(ts_ol3ol4csum_l3l4csum,               1, 0, 0, 0, 1, 1,       8,      \
2157                 TSP_F | OL3OL4CSUM_F | L3L4CSUM_F)                      \
2158 T(ts_vlan,                              1, 0, 0, 1, 0, 0,       8,      \
2159                 TSP_F | VLAN_F)                                         \
2160 T(ts_vlan_l3l4csum,                     1, 0, 0, 1, 0, 1,       8,      \
2161                 TSP_F | VLAN_F | L3L4CSUM_F)                            \
2162 T(ts_vlan_ol3ol4csum,                   1, 0, 0, 1, 1, 0,       8,      \
2163                 TSP_F | VLAN_F | OL3OL4CSUM_F)                          \
2164 T(ts_vlan_ol3ol4csum_l3l4csum,          1, 0, 0, 1, 1, 1,       8,      \
2165                 TSP_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2166 T(ts_noff,                              1, 0, 1, 0, 0, 0,       8,      \
2167                 TSP_F | NOFF_F)                                         \
2168 T(ts_noff_l3l4csum,                     1, 0, 1, 0, 0, 1,       8,      \
2169                 TSP_F | NOFF_F | L3L4CSUM_F)                            \
2170 T(ts_noff_ol3ol4csum,                   1, 0, 1, 0, 1, 0,       8,      \
2171                 TSP_F | NOFF_F | OL3OL4CSUM_F)                          \
2172 T(ts_noff_ol3ol4csum_l3l4csum,          1, 0, 1, 0, 1, 1,       8,      \
2173                 TSP_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)             \
2174 T(ts_noff_vlan,                         1, 0, 1, 1, 0, 0,       8,      \
2175                 TSP_F | NOFF_F | VLAN_F)                                \
2176 T(ts_noff_vlan_l3l4csum,                1, 0, 1, 1, 0, 1,       8,      \
2177                 TSP_F | NOFF_F | VLAN_F | L3L4CSUM_F)                   \
2178 T(ts_noff_vlan_ol3ol4csum,              1, 0, 1, 1, 1, 0,       8,      \
2179                 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)                 \
2180 T(ts_noff_vlan_ol3ol4csum_l3l4csum,     1, 0, 1, 1, 1, 1,       8,      \
2181                 TSP_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)    \
2182 T(ts_tso,                               1, 1, 0, 0, 0, 0,       8,      \
2183                 TSP_F | TSO_F)                                          \
2184 T(ts_tso_l3l4csum,                      1, 1, 0, 0, 0, 1,       8,      \
2185                 TSP_F | TSO_F | L3L4CSUM_F)                             \
2186 T(ts_tso_ol3ol4csum,                    1, 1, 0, 0, 1, 0,       8,      \
2187                 TSP_F | TSO_F | OL3OL4CSUM_F)                           \
2188 T(ts_tso_ol3ol4csum_l3l4csum,           1, 1, 0, 0, 1, 1,       8,      \
2189                 TSP_F | TSO_F | OL3OL4CSUM_F | L3L4CSUM_F)              \
2190 T(ts_tso_vlan,                          1, 1, 0, 1, 0, 0,       8,      \
2191                 TSP_F | TSO_F | VLAN_F)                                 \
2192 T(ts_tso_vlan_l3l4csum,                 1, 1, 0, 1, 0, 1,       8,      \
2193                 TSP_F | TSO_F | VLAN_F | L3L4CSUM_F)                    \
2194 T(ts_tso_vlan_ol3ol4csum,               1, 1, 0, 1, 1, 0,       8,      \
2195                 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F)                  \
2196 T(ts_tso_vlan_ol3ol4csum_l3l4csum,      1, 1, 0, 1, 1, 1,       8,      \
2197                 TSP_F | TSO_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
2198 T(ts_tso_noff,                          1, 1, 1, 0, 0, 0,       8,      \
2199                 TSP_F | TSO_F | NOFF_F)                                 \
2200 T(ts_tso_noff_l3l4csum,                 1, 1, 1, 0, 0, 1,       8,      \
2201                 TSP_F | TSO_F | NOFF_F | L3L4CSUM_F)                    \
2202 T(ts_tso_noff_ol3ol4csum,               1, 1, 1, 0, 1, 0,       8,      \
2203                 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F)                  \
2204 T(ts_tso_noff_ol3ol4csum_l3l4csum,      1, 1, 1, 0, 1, 1,       8,      \
2205                 TSP_F | TSO_F | NOFF_F | OL3OL4CSUM_F | L3L4CSUM_F)     \
2206 T(ts_tso_noff_vlan,                     1, 1, 1, 1, 0, 0,       8,      \
2207                 TSP_F | TSO_F | NOFF_F | VLAN_F)                        \
2208 T(ts_tso_noff_vlan_l3l4csum,            1, 1, 1, 1, 0, 1,       8,      \
2209                 TSP_F | TSO_F | NOFF_F | VLAN_F | L3L4CSUM_F)           \
2210 T(ts_tso_noff_vlan_ol3ol4csum,          1, 1, 1, 1, 1, 0,       8,      \
2211                 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F)         \
2212 T(ts_tso_noff_vlan_ol3ol4csum_l3l4csum, 1, 1, 1, 1, 1, 1,       8,      \
2213                 TSP_F | TSO_F | NOFF_F | VLAN_F | OL3OL4CSUM_F | L3L4CSUM_F)
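
/* Editor's note: in each T() row above, the six 0/1 columns mirror the
 * [TSP] [TSO] [NOFF] [VLAN] [OL3OL4CSUM] [L3L4CSUM] order documented before
 * the table, and the numeric column (4, 6 or 8) evidently tracks the
 * per-packet command size in dwords: 4 with no ext header, 6 once VLAN or
 * TSO requires the ext header, 8 when a timestamp adds the send mem
 * descriptor.
 */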
2214
2215 #define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                             \
2216         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_##name(          \
2217                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
2218                                                                                \
2219         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_mseg_##name(     \
2220                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
2221                                                                                \
2222         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_##name(      \
2223                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
2224                                                                                \
2225         uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_vec_mseg_##name( \
2226                 void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);     \
2227
2228 NIX_TX_FASTPATH_MODES
2229 #undef T
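
/* Editor's note: for reference, each T() row expands through the macro
 * above into four declarations; e.g. T(no_offload, ...) yields,
 * illustratively:
 *
 *   uint16_t __rte_noinline __rte_hot cn10k_nix_xmit_pkts_no_offload(
 *           void *tx_queue, struct rte_mbuf **tx_pkts, uint16_t pkts);
 *
 * plus the matching _mseg_, _vec_ and _vec_mseg_ variants, whose
 * definitions live elsewhere in the driver.
 */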
2230
2231 #endif /* __CN10K_TX_H__ */