/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_vect.h>

#include "otx2_ethdev.h"

#define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
        /* Cached value is low, update the fc_cache_pkts */             \
        if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
                /* Multiply with sqe_per_sqb to express in pkts */      \
                (txq)->fc_cache_pkts =                                  \
                        ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
                                (txq)->sqes_per_sqb_log2;               \
                /* Check it again for the room */                       \
                if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
                        return 0;                                       \
        }                                                               \
} while (0)
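/*
 * Worked example of the refresh arithmetic above (values chosen purely for
 * illustration, assuming *fc_mem reports the number of SQBs currently
 * consumed by hardware): with nb_sqb_bufs_adj = 100, *fc_mem = 90 and
 * sqes_per_sqb_log2 = 5, the refreshed cache becomes (100 - 90) << 5 = 320,
 * i.e. roughly 320 more packets can be queued before the cached count has
 * to be refreshed again.
 */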


static __rte_always_inline uint16_t
nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
              uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint16_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet here, as no further
         * changes to the packet will be made unless no-fast-free
         * (NIX_TX_OFFLOAD_MBUF_NOFF_F) is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                /* Passing number of segdw as 4: HDR + EXT + SG + SMEM */
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, 4, flags);
                otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}
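/*
 * Multi-segment variant of the scalar path above: same flow, except that
 * otx2_nix_prepare_mseg() builds the SG sub-descriptors for the whole mbuf
 * chain and returns the resulting command size in dwords (segdw), which is
 * then passed to the timestamp preparation and the LMTST helper.
 */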

static __rte_always_inline uint16_t
nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint64_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t segdw;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet here, as no further
         * changes to the packet will be made unless no-fast-free
         * (NIX_TX_OFFLOAD_MBUF_NOFF_F) is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, segdw,
                                             flags);
                otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

#if defined(RTE_ARCH_ARM64)

#define NIX_DESCS_PER_LOOP      4
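/*
 * NEON vector transmit: handles four packets per iteration, building the
 * SEND_HDR and SEND_SG words for all four in 128-bit registers and flushing
 * them to hardware with a single LMTST. Offloads that need per-packet scalar
 * work (VLAN insertion, timestamp, TSO) are not handled here; any trailing
 * packets that do not fill a group of four are handed to the scalar
 * nix_xmit_pkts() at the end.
 */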
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
        uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
        uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
        uint64x2_t senddesc01_w0, senddesc23_w0;
        uint64x2_t senddesc01_w1, senddesc23_w1;
        uint64x2_t sgdesc01_w0, sgdesc23_w0;
        uint64x2_t sgdesc01_w1, sgdesc23_w1;
        struct otx2_eth_txq *txq = tx_queue;
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64x2_t cmd00, cmd01;
        uint64x2_t cmd10, cmd11;
        uint64x2_t cmd20, cmd21;
        uint64x2_t cmd30, cmd31;
        uint64_t lmt_status, i;
        uint16_t pkts_left;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
        pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        /* Let's commit any changes in the packet here, as no further
         * changes to the packet will be made unless no-fast-free
         * (NIX_TX_OFFLOAD_MBUF_NOFF_F) is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

        senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
        senddesc23_w0 = senddesc01_w0;
        senddesc01_w1 = vdupq_n_u64(0);
        senddesc23_w1 = senddesc01_w1;
        sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
        sgdesc23_w0 = sgdesc01_w0;
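        /*
         * The W0 templates above are seeded once per burst from the queue's
         * precomputed command words (txq->cmd[0] for SEND_HDR, txq->cmd[2]
         * for SEND_SG) and duplicated into both 64-bit lanes; the loop below
         * only patches the per-packet fields (length, aura, DF bit, checksum
         * types, iova) into these registers, four packets at a time.
         */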

        for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
                /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
                senddesc01_w0 = vbicq_u64(senddesc01_w0,
                                          vdupq_n_u64(0xFFFFFFFF));
                sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
                                        vdupq_n_u64(0xFFFFFFFF));

                senddesc23_w0 = senddesc01_w0;
                sgdesc23_w0 = sgdesc01_w0;

                /* Move mbufs to iova */
                mbuf0 = (uint64_t *)tx_pkts[0];
                mbuf1 = (uint64_t *)tx_pkts[1];
                mbuf2 = (uint64_t *)tx_pkts[2];
                mbuf3 = (uint64_t *)tx_pkts[3];

                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                     offsetof(struct rte_mbuf, buf_iova));
                /*
                 * Get mbuf ol_flags, iova, pktlen, dataoff
                 * dataoff_iovaX.D[0] = iova,
                 * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
                 * len_olflagsX.D[0] = ol_flags,
                 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
                 */
                dataoff_iova0  = vld1q_u64(mbuf0);
                len_olflags0 = vld1q_u64(mbuf0 + 2);
                dataoff_iova1  = vld1q_u64(mbuf1);
                len_olflags1 = vld1q_u64(mbuf1 + 2);
                dataoff_iova2  = vld1q_u64(mbuf2);
                len_olflags2 = vld1q_u64(mbuf2 + 2);
                dataoff_iova3  = vld1q_u64(mbuf3);
                len_olflags3 = vld1q_u64(mbuf3 + 2);

                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        struct rte_mbuf *mbuf;
                        /* Set don't free bit if reference count > 1 */
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));

                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);
                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Ensuring mbuf fields which got updated in
                         * otx2_nix_prefree_seg are written before LMTST.
                         */
                        rte_io_wmb();
                } else {
                        struct rte_mbuf *mbuf;
                        /* Mark mempool object as "put" since
                         * it is freed by NIX
                         */
                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);
                        RTE_SET_USED(mbuf);
                }
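                /*
                 * Note: __mempool_check_cookies() is a debug aid; unless the
                 * mempool library is built with cookie checking enabled it
                 * expands to nothing, so the calls above cost nothing in the
                 * common case.
                 */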

                /* Move mbuf pointers to point to the pool field */
                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));

                if (flags &
                    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
                     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
                        /* Get tx_offload for ol2, ol3, l2, l3 lengths */
                        /*
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         */

                        asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf0 + 2) : "memory");

                        asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf1 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf2 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf3 + 2) : "memory");

                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                } else {
                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                }

                const uint8x16_t shuf_mask2 = {
                        0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                };
                xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
                ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

                /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
                const uint64x2_t and_mask0 = {
                        0xFFFFFFFFFFFFFFFF,
                        0x000000000000FFFF,
                };

                dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
                dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
                dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
                dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

                /*
                 * Pick only 16 bits of pktlen present at bits 63:32
                 * and place them at bits 15:0.
                 */
                xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
                ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

                /* Add pairwise to get dataoff + iova in sgdesc_w1 */
                sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
                sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);

                /* Orr both sgdesc_w0 and senddesc_w0 with 16 bits of
                 * pktlen at 15:0 position.
                 */
                sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
                sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
                senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
                senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);

                if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * il3/il4 types. But we still use ol3/ol4 types in
                         * senddesc_w1 as only one header processing is enabled.
                         */
                        const uint8x16_t tbl = {
                                /* [0-15] = il4type:il3type */
                                0x04, /* none (IPv6 assumed) */
                                0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
                                0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
                                0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
                                0x03, /* PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
                                0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
                                0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
                                0x02, /* PKT_TX_IPV4  */
                                0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
                                0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
                                0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
                                0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_TCP_CKSUM
                                       */
                                0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_SCTP_CKSUM
                                       */
                                0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_UDP_CKSUM
                                       */
                        };
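                        /*
                         * Index note (assuming the standard mbuf flag layout):
                         * the 4-bit table index is ol_flags bits 55:52, i.e.
                         * bit 3 = PKT_TX_IPV4, bit 2 = PKT_TX_IP_CKSUM and
                         * bits 1:0 = the PKT_TX_L4_MASK value (1 = TCP,
                         * 2 = SCTP, 3 = UDP), which is the order the entries
                         * above follow.
                         */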

                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         */
                        senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
                        senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

                        /* Move OLFLAGS bits 55:52 to 51:48
                         * with zeros prepended on the byte and rest
                         * don't care
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 4);
                        ytmp128 = vshrq_n_u8(ytmp128, 4);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e Bit 48:55 of iltype
                         * and place it in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create first half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * ol3/ol4 types.
                         */

                        const uint8x16_t tbl = {
                                /* [0-15] = ol4type:ol3type */
                                0x00, /* none */
                                0x03, /* OUTER_IP_CKSUM */
                                0x02, /* OUTER_IPV4 */
                                0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                0x04, /* OUTER_IPV6 */
                                0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
                                0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4 | OUTER_IP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer ol flags only */
                        const uint64x2_t o_cksum_mask = {
                                0x1C00020000000000,
                                0x1C00020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift oltype by 2 to start nibble from BIT(56)
                         * instead of BIT(58)
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 2);
                        ytmp128 = vshrq_n_u8(ytmp128, 2);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e Bit 56:63 of oltype
                         * and place it in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /* Lookup table to translate ol_flags to
                         * ol4type, ol3type, il4type, il3type of senddesc_w1
                         */
                        const uint8x16x2_t tbl = {
                        {
                                {
                                        /* [0-15] = il4type:il3type */
                                        0x04, /* none (IPv6) */
                                        0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
                                        0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
                                        0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
                                        0x03, /* PKT_TX_IP_CKSUM */
                                        0x13, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x02, /* PKT_TX_IPV4 */
                                        0x12, /* PKT_TX_IPV4 |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x22, /* PKT_TX_IPV4 |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x32, /* PKT_TX_IPV4 |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x03, /* PKT_TX_IPV4 |
                                               * PKT_TX_IP_CKSUM
                                               */
                                        0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                },

                                {
                                        /* [16-31] = ol4type:ol3type */
                                        0x00, /* none */
                                        0x03, /* OUTER_IP_CKSUM */
                                        0x02, /* OUTER_IPV4 */
                                        0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                        0x04, /* OUTER_IPV6 */
                                        0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x32, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4
                                               */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                        0x34, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV6
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                },
                        }
                        };

                        /* Extract olflags to translate to oltype & iltype */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         */
                        const uint32x4_t tshft_4 = {
                                1, 0,
                                1, 0,
                        };
                        senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
                        senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

                        /*
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer and inner header ol_flags */
                        const uint64x2_t oi_cksum_mask = {
                                0x1CF0020000000000,
                                0x1CF0020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift right oltype by 2 and iltype by 4
                         * to start oltype nibble from BIT(58)
                         * instead of BIT(56) and iltype nibble from BIT(48)
                         * instead of BIT(52).
                         */
                        const int8x16_t tshft5 = {
                                8, 8, 8, 8, 8, 8, -4, -2,
                                8, 8, 8, 8, 8, 8, -4, -2,
                        };

                        xtmp128 = vshlq_u8(xtmp128, tshft5);
                        ytmp128 = vshlq_u8(ytmp128, tshft5);
                        /*
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, -1, 0, 0, 0, 0, 0,
                                -1, 0, -1, 0, 0, 0, 0, 0,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Mark Bit(4) of oltype */
                        const uint64x2_t oi_cksum_mask2 = {
                                0x1000000000000000,
                                0x1000000000000000,
                        };

                        xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
                        ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

                        /* Do the lookup */
                        ltypes01 = vqtbl2q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl2q_u8(tbl, ytmp128);

                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e Bit 48:55 of iltype and
                         * Bit 56:63 of oltype and place it in corresponding
                         * place in senddesc_w1.
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
                         * l3len, l2len, ol3len, ol2len.
                         * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
                         * a = a + (a << 16)
                         * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
                         * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u32(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u32(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                vshlq_n_u32(senddesc01_w1, 16));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                vshlq_n_u32(senddesc23_w1, 16));

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                } else {
                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

                        /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
                }
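                /*
                 * Flush the four 4-dword commands built above (16 dwords in
                 * total) to the LMT line and kick them off with a single
                 * LMTST to the queue's I/O address; otx2_lmt_submit() is
                 * retried until it reports a non-zero status.
                 */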

                do {
                        vst1q_u64(lmt_addr, cmd00);
                        vst1q_u64(lmt_addr + 2, cmd01);
                        vst1q_u64(lmt_addr + 4, cmd10);
                        vst1q_u64(lmt_addr + 6, cmd11);
                        vst1q_u64(lmt_addr + 8, cmd20);
                        vst1q_u64(lmt_addr + 10, cmd21);
                        vst1q_u64(lmt_addr + 12, cmd30);
                        vst1q_u64(lmt_addr + 14, cmd31);
                        lmt_status = otx2_lmt_submit(io_addr);

                } while (lmt_status == 0);
                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
        }

        if (unlikely(pkts_left))
                pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);

        return pkts;
}

#else
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        RTE_SET_USED(tx_queue);
        RTE_SET_USED(tx_pkts);
        RTE_SET_USED(pkts);
        RTE_SET_USED(cmd);
        RTE_SET_USED(flags);
        return 0;
}
#endif

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                               \
otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
}
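/*
 * NIX_TX_FASTPATH_MODES expands T() once per supported combination of the
 * Tx offload flags, so each of the blocks below generates one specialized
 * burst function per combination; the same list is used again in
 * otx2_eth_set_tx_function() to build the per-flag lookup tables.
 */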

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                               \
otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
                                  (flags) | NIX_TX_MULTI_SEG_F);        \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                               \
otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* VLAN, TSTMP, TSO are not supported by vec */                 \
        if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
            (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
            (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
                return 0;                                               \
        return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
}

NIX_TX_FASTPATH_MODES
#undef T

static inline void
pick_tx_func(struct rte_eth_dev *eth_dev,
             const eth_tx_burst_t tx_burst[2][2][2][2][2][2][2])
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        /* [SEC] [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
        eth_dev->tx_pkt_burst = tx_burst
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_SECURITY_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
}
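/*
 * otx2_eth_set_tx_function() picks the burst function for the current
 * offload configuration: the vector path is used unless scalar mode is
 * forced (dev->scalar_ena) or an offload the vector path cannot handle
 * (VLAN insertion, timestamping, TSO) is enabled, and the multi-segment
 * table overrides both whenever DEV_TX_OFFLOAD_MULTI_SEGS is requested.
 */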

void
otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_mseg_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_vec_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        if (dev->scalar_ena ||
            (dev->tx_offload_flags &
             (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
              NIX_TX_OFFLOAD_TSO_F)))
                pick_tx_func(eth_dev, nix_eth_tx_burst);
        else
                pick_tx_func(eth_dev, nix_eth_tx_vec_burst);

        if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
                pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);

        rte_mb();
}