/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_vect.h>

#include "otx2_ethdev.h"

#define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
        /* Cached value is low, update fc_cache_pkts */                 \
        if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
                /* Shift by sqes_per_sqb_log2 to express in pkts */     \
                (txq)->fc_cache_pkts =                                  \
                        ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
                                (txq)->sqes_per_sqb_log2;               \
                /* Check again for room */                              \
                if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
                        return 0;                                       \
        }                                                               \
} while (0)
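
/*
 * Flow-control cache: *fc_mem is updated by hardware with the count of
 * in-use SQBs, so reading it on every burst would be costly. The macro
 * above refreshes the cached estimate only when it runs low: free SQBs
 * (nb_sqb_bufs_adj - *fc_mem) times SQEs per SQB (the left shift) gives
 * the number of packets that can still be queued.
 */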


static __rte_always_inline uint16_t
nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
              uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t i;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                /* Pass the number of segdw as 4: HDR + EXT + SG + SMEM */
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, 4, flags);
                otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

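/* Multi-segment variant of the scalar path: same flow, except that
 * otx2_nix_prepare_mseg() emits one SG entry per mbuf segment and
 * returns the actual command size (segdw) used for the submit.
 */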
static __rte_always_inline uint16_t
nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t segdw;
        uint64_t i;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, segdw,
                                             flags);
                otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

#if defined(RTE_ARCH_ARM64)

#define NIX_DESCS_PER_LOOP      4
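/*
 * Four packets per loop iteration: each packet uses a 4-word (32-byte)
 * command (SEND_HDR + SG) here, so one iteration produces exactly the
 * 128 bytes transferred by a single LMT store/submit sequence below.
 */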
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
        uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
        uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
        uint64x2_t senddesc01_w0, senddesc23_w0;
        uint64x2_t senddesc01_w1, senddesc23_w1;
        uint64x2_t sgdesc01_w0, sgdesc23_w0;
        uint64x2_t sgdesc01_w1, sgdesc23_w1;
        struct otx2_eth_txq *txq = tx_queue;
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64x2_t cmd00, cmd01;
        uint64x2_t cmd10, cmd11;
        uint64x2_t cmd20, cmd21;
        uint64x2_t cmd30, cmd31;
        uint64_t lmt_status, i;
        uint16_t pkts_left;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
        pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
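        /* The tail (pkts_left) is sent through the scalar routine after
         * the vector loop, so the loop below only handles multiples of 4.
         */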

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

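        /* Load the per-queue command templates: txq->cmd[0] holds the
         * SEND_HDR_S W0 template and txq->cmd[2] the SG descriptor W0
         * template; the W1 words are rebuilt for every packet.
         */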
        senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
        senddesc23_w0 = senddesc01_w0;
        senddesc01_w1 = vdupq_n_u64(0);
        senddesc23_w1 = senddesc01_w1;
        sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
        sgdesc23_w0 = sgdesc01_w0;

        for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
                /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
                senddesc01_w0 = vbicq_u64(senddesc01_w0,
                                          vdupq_n_u64(0xFFFFFFFF));
                sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
                                        vdupq_n_u64(0xFFFFFFFF));

                senddesc23_w0 = senddesc01_w0;
                sgdesc23_w0 = sgdesc01_w0;

                /* Advance mbuf pointers to the buf_iova field */
                mbuf0 = (uint64_t *)tx_pkts[0];
                mbuf1 = (uint64_t *)tx_pkts[1];
                mbuf2 = (uint64_t *)tx_pkts[2];
                mbuf3 = (uint64_t *)tx_pkts[3];

                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                     offsetof(struct rte_mbuf, buf_iova));
                /*
                 * Get mbufs' ol_flags, iova, pktlen, data_off:
                 * dataoff_iovaX.D[0] = iova,
                 * dataoff_iovaX.D[1](15:0) = mbuf->data_off
                 * len_olflagsX.D[0] = ol_flags,
                 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
                 */
                dataoff_iova0 = vld1q_u64(mbuf0);
                len_olflags0 = vld1q_u64(mbuf0 + 2);
                dataoff_iova1 = vld1q_u64(mbuf1);
                len_olflags1 = vld1q_u64(mbuf1 + 2);
                dataoff_iova2 = vld1q_u64(mbuf2);
                len_olflags2 = vld1q_u64(mbuf2 + 2);
                dataoff_iova3 = vld1q_u64(mbuf3);
                len_olflags3 = vld1q_u64(mbuf3 + 2);

                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        struct rte_mbuf *mbuf;
                        /* Set don't free bit if reference count > 1 */
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
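                        /* 0x80000 is the SEND_HDR_S DF (don't-free) bit,
                         * bit 19 of W0: when a segment is still referenced,
                         * NIX must transmit it but not return the buffer
                         * to its aura.
                         */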

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));

                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);
                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                } else {
                        struct rte_mbuf *mbuf;
                        /* Mark mempool object as "put" since
                         * it is freed by NIX
                         */
                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);
                        RTE_SET_USED(mbuf);
                }

                /* Advance mbuf pointers from buf_iova to the pool field */
                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));

                if (flags &
                    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
                     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
                        /* Get tx_offload for ol2, ol3, l2, l3 lengths */
                        /*
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         */

                        asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf0 + 2) : "memory");

                        asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf1 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf2 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf3 + 2) : "memory");

                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                } else {
                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                }
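                /* mbuf0..mbuf3 now hold struct rte_mempool pointers; the
                 * checksum branches below advance them to the pool_id
                 * member to fetch the aura handle for SEND_HDR_S W0.
                 */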

                const uint8x16_t shuf_mask2 = {
                        0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                };
                xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
                ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

                /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
                const uint64x2_t and_mask0 = {
                        0xFFFFFFFFFFFFFFFF,
                        0x000000000000FFFF,
                };

                dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
                dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
                dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
                dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

                /*
                 * Pick only the 16 bits of pktlen present at bits 63:32
                 * and place them at bits 15:0.
                 */
                xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
                ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

                /* Add pairwise to get dataoff + iova in sgdesc_w1 */
                sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
                sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);

                /* OR both sgdesc_w0 and senddesc_w0 with the 16 bits of
                 * pktlen at position 15:0.
                 */
                sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
                sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
                senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
                senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
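                /* Both descriptors carry the length in the low bits of W0
                 * (SEND_HDR_S total length, SG segment size), so the same
                 * shuffled pkt_len vector feeds all four ORs above.
                 */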

                if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * il3/il4 types. But we still use ol3/ol4 types in
                         * senddesc_w1 as only one level of header processing
                         * is enabled.
                         */
                        const uint8x16_t tbl = {
                                /* [0-15] = il4type:il3type */
                                0x04, /* none (IPv6 assumed) */
                                0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
                                0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
                                0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
                                0x03, /* PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
                                0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
                                0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
                                0x02, /* PKT_TX_IPV4 */
                                0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
                                0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
                                0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
                                0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_TCP_CKSUM
                                       */
                                0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_SCTP_CKSUM
                                       */
                                0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_UDP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         */
                        senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
                        senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

                        /* Move OLFLAGS bits 55:52 to 51:48
                         * with zeros prepended on the byte and rest
                         * don't care
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 4);
                        ytmp128 = vshrq_n_u8(ytmp128, 4);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Point at the aura handle (pool_id) in the
                         * mempool
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 48:55 of iltype
                         * and place them in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * ol3/ol4 types.
                         */

                        const uint8x16_t tbl = {
                                /* [0-15] = ol4type:ol3type */
                                0x00, /* none */
                                0x03, /* OUTER_IP_CKSUM */
                                0x02, /* OUTER_IPV4 */
                                0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                0x04, /* OUTER_IPV6 */
                                0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
                                0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4 | OUTER_IP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to oltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer ol flags only */
                        const uint64x2_t o_cksum_mask = {
                                0x1C00020000000000,
                                0x1C00020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift oltype by 2 to start nibble from BIT(56)
                         * instead of BIT(58)
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 2);
                        ytmp128 = vshrq_n_u8(ytmp128, 2);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Point at the aura handle (pool_id) in the
                         * mempool
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 56:63 of oltype
                         * and place them in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /* Lookup table to translate ol_flags to
                         * ol4type, ol3type, il4type, il3type of senddesc_w1
                         */
                        const uint8x16x2_t tbl = {
                        {
                                {
                                        /* [0-15] = il4type:il3type */
                                        0x04, /* none (IPv6) */
                                        0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
                                        0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
                                        0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
                                        0x03, /* PKT_TX_IP_CKSUM */
                                        0x13, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x02, /* PKT_TX_IPV4 */
                                        0x12, /* PKT_TX_IPV4 |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x22, /* PKT_TX_IPV4 |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x32, /* PKT_TX_IPV4 |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x03, /* PKT_TX_IPV4 |
                                               * PKT_TX_IP_CKSUM
                                               */
                                        0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                },

                                {
                                        /* [16-31] = ol4type:ol3type */
                                        0x00, /* none */
                                        0x03, /* OUTER_IP_CKSUM */
                                        0x02, /* OUTER_IPV4 */
                                        0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                        0x04, /* OUTER_IPV6 */
                                        0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x32, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4
                                               */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                        0x34, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV6
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                },
                        }
                        };

                        /* Extract olflags to translate to oltype & iltype */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         */
                        const uint32x4_t tshft_4 = {
                                1, 0,
                                1, 0,
                        };
                        senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
                        senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

                        /*
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer and inner header ol_flags */
                        const uint64x2_t oi_cksum_mask = {
                                0x1CF0020000000000,
                                0x1CF0020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift right oltype by 2 and iltype by 4
                         * to start oltype nibble from BIT(56)
                         * instead of BIT(58) and iltype nibble from BIT(48)
                         * instead of BIT(52).
                         */
                        const int8x16_t tshft5 = {
                                8, 8, 8, 8, 8, 8, -4, -2,
                                8, 8, 8, 8, 8, 8, -4, -2,
                        };

                        xtmp128 = vshlq_u8(xtmp128, tshft5);
                        ytmp128 = vshlq_u8(ytmp128, tshft5);
                        /*
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, -1, 0, 0, 0, 0, 0,
                                -1, 0, -1, 0, 0, 0, 0, 0,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Mark Bit(4) of oltype */
                        const uint64x2_t oi_cksum_mask2 = {
                                0x1000000000000000,
                                0x1000000000000000,
                        };

                        xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
                        ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

                        /* Do the lookup */
                        ltypes01 = vqtbl2q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl2q_u8(tbl, ytmp128);

                        /* Point at the aura handle (pool_id) in the
                         * mempool
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 48:55 of iltype
                         * and bits 56:63 of oltype and place them in the
                         * corresponding positions in senddesc_w1.
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
                         * l3len, l2len, ol3len, ol2len.
                         * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
                         * a = a + (a << 16)
                         * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
                         * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u32(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u32(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                vshlq_n_u32(senddesc01_w1, 16));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                vshlq_n_u32(senddesc23_w1, 16));

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                } else {
                        /* No tx_offload to fetch; just point at the aura
                         * handle (pool_id) in the mempool
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

                        /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
                }

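                /* LMTST submit: the eight stores below fill the 128-byte
                 * LMT line with four 4-word commands (32 B each) and
                 * otx2_lmt_submit() issues the atomic LDEOR doorbell on
                 * io_addr. A status of zero means the LMT line was
                 * invalidated before the submit (e.g. by a context
                 * switch), so the store+submit sequence must be replayed.
                 */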
                do {
                        vst1q_u64(lmt_addr, cmd00);
                        vst1q_u64(lmt_addr + 2, cmd01);
                        vst1q_u64(lmt_addr + 4, cmd10);
                        vst1q_u64(lmt_addr + 6, cmd11);
                        vst1q_u64(lmt_addr + 8, cmd20);
                        vst1q_u64(lmt_addr + 10, cmd21);
                        vst1q_u64(lmt_addr + 12, cmd30);
                        vst1q_u64(lmt_addr + 14, cmd31);
                        lmt_status = otx2_lmt_submit(io_addr);

                } while (lmt_status == 0);
                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
        }

        if (unlikely(pkts_left))
                pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);

        return pkts;
}

#else
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        RTE_SET_USED(tx_queue);
        RTE_SET_USED(tx_pkts);
        RTE_SET_USED(pkts);
        RTE_SET_USED(cmd);
        RTE_SET_USED(flags);
        return 0;
}
#endif

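/*
 * Each T() expansion below emits a dedicated burst function for one
 * combination of offload flags; since `flags` is a compile-time constant
 * there, the compiler drops every unused offload branch.
 */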
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline __hot                                    \
otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline __hot                                    \
otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
                                  (flags) | NIX_TX_MULTI_SEG_F);        \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline __hot                                    \
otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* VLAN, TSTMP and TSO are not supported by vec */              \
        if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
            (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
            (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
                return 0;                                               \
        return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
}

NIX_TX_FASTPATH_MODES
#undef T

static inline void
pick_tx_func(struct rte_eth_dev *eth_dev,
             const eth_tx_burst_t tx_burst[2][2][2][2][2][2][2])
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        /* [SEC] [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
        eth_dev->tx_pkt_burst = tx_burst
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_SECURITY_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
}

void
otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_mseg_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_vec_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        if (dev->scalar_ena ||
            (dev->tx_offload_flags &
             (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
              NIX_TX_OFFLOAD_TSO_F)))
                pick_tx_func(eth_dev, nix_eth_tx_burst);
        else
                pick_tx_func(eth_dev, nix_eth_tx_vec_burst);

        if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
                pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);

        rte_mb();
}