/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_vect.h>

#include "otx2_ethdev.h"

#define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
        /* Cached value is low, update fc_cache_pkts */                 \
        if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
                /* Multiply by sqes_per_sqb to express in pkts */       \
                (txq)->fc_cache_pkts =                                  \
                        ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
                                (txq)->sqes_per_sqb_log2;               \
                /* Check again whether there is room */                 \
                if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
                        return 0;                                       \
        }                                                               \
} while (0)
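
/*
 * Worked example (values assumed, for illustration only): with
 * nb_sqb_bufs_adj = 512, *fc_mem = 500 and sqes_per_sqb_log2 = 5, the
 * refreshed cache is (512 - 500) << 5 = 384 packets, so bursts of up to
 * 384 packets can go out before fc_mem has to be read again.
 */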


static __rte_always_inline uint16_t
nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
              uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint16_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet here, as no further changes
         * to the packet will be done unless the no-fast-free offload is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                /* Passing number of segdw as 4: HDR + EXT + SG + SMEM */
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, 4, flags);
                otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

static __rte_always_inline uint16_t
nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint64_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t segdw;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
                /* Let's commit any changes in the packet */
                rte_io_wmb();
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, segdw,
                                             flags);
                otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

#if defined(RTE_ARCH_ARM64)

#define NIX_DESCS_PER_LOOP      4
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
        uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
        uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
        uint64x2_t senddesc01_w0, senddesc23_w0;
        uint64x2_t senddesc01_w1, senddesc23_w1;
        uint64x2_t sgdesc01_w0, sgdesc23_w0;
        uint64x2_t sgdesc01_w1, sgdesc23_w1;
        struct otx2_eth_txq *txq = tx_queue;
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64x2_t cmd00, cmd01;
        uint64x2_t cmd10, cmd11;
        uint64x2_t cmd20, cmd21;
        uint64x2_t cmd30, cmd31;
        uint64_t lmt_status, i;
        uint16_t pkts_left;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
        pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
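
        /*
         * Example (illustrative): for pkts = 23 the vector loop below handles
         * 20 packets (five iterations of four) and the remaining pkts_left = 3
         * are sent through the scalar nix_xmit_pkts() at the end.
         */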

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        /* Let's commit any changes in the packet here, as no further changes
         * to the packet will be done unless the no-fast-free offload is enabled.
         */
        if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
                rte_io_wmb();

        senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
        senddesc23_w0 = senddesc01_w0;
        senddesc01_w1 = vdupq_n_u64(0);
        senddesc23_w1 = senddesc01_w1;
        sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
        sgdesc23_w0 = sgdesc01_w0;

        for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
                /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
                senddesc01_w0 = vbicq_u64(senddesc01_w0,
                                          vdupq_n_u64(0xFFFFFFFF));
                sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
                                        vdupq_n_u64(0xFFFFFFFF));

                senddesc23_w0 = senddesc01_w0;
                sgdesc23_w0 = sgdesc01_w0;

                /* Move mbuf pointers to the buf_iova field */
                mbuf0 = (uint64_t *)tx_pkts[0];
                mbuf1 = (uint64_t *)tx_pkts[1];
                mbuf2 = (uint64_t *)tx_pkts[2];
                mbuf3 = (uint64_t *)tx_pkts[3];

                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                     offsetof(struct rte_mbuf, buf_iova));
                /*
                 * Get the mbufs' ol_flags, iova, pkt_len and data_off:
                 * dataoff_iovaX.D[0] = iova,
                 * dataoff_iovaX.D[1](15:0) = mbuf->data_off
                 * len_olflagsX.D[0] = ol_flags,
                 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
                 */
                dataoff_iova0  = vld1q_u64(mbuf0);
                len_olflags0 = vld1q_u64(mbuf0 + 2);
                dataoff_iova1  = vld1q_u64(mbuf1);
                len_olflags1 = vld1q_u64(mbuf1 + 2);
                dataoff_iova2  = vld1q_u64(mbuf2);
                len_olflags2 = vld1q_u64(mbuf2 + 2);
                dataoff_iova3  = vld1q_u64(mbuf3);
                len_olflags3 = vld1q_u64(mbuf3 + 2);

                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        struct rte_mbuf *mbuf;
                        /* Set don't free bit if reference count > 1 */
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));

                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);
                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Ensure mbuf fields updated by otx2_nix_prefree_seg
                         * are written before the LMTST.
                         */
                        rte_io_wmb();
                } else {
                        struct rte_mbuf *mbuf;
                        /* Mark mempool object as "put" since
                         * it is freed by NIX
                         */
                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);
                        RTE_SET_USED(mbuf);
                }

                /* Move mbuf pointers to the pool field */
                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));

                if (flags &
                    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
                     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
                        /* Get tx_offload for ol2, ol3, l2, l3 lengths */
                        /*
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         */

                        asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf0 + 2) : "memory");

                        asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf1 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf2 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf3 + 2) : "memory");

                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                } else {
                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                }

                const uint8x16_t shuf_mask2 = {
                        0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                };
                xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
                ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

                /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
                const uint64x2_t and_mask0 = {
                        0xFFFFFFFFFFFFFFFF,
                        0x000000000000FFFF,
                };

                dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
                dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
                dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
                dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

                /*
                 * Pick only the 16 bits of pktlen present at bits 63:32
                 * and place them at bits 15:0.
                 */
                xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
                ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

                /* Add pairwise to get dataoff + iova in sgdesc_w1 */
                sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
                sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);

                /* OR both sgdesc_w0 and senddesc_w0 with the 16 bits of
                 * pktlen at bit position 15:0.
                 */
                sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
                sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
                senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
                senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
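
                /*
                 * Illustration (length assumed): for a 60-byte packet,
                 * pkt_len = 60 sits at bits 63:32 of len_olflagsX.D[1];
                 * the table shuffle above moves it to bits 15:0, where the
                 * ORs place it as the SEND_HDR total length and the SG
                 * segment length.
                 */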

                if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * il3/il4 types. But we still use ol3/ol4 types in
                         * senddesc_w1 as only one header processing is enabled.
                         */
                        const uint8x16_t tbl = {
                                /* [0-15] = il4type:il3type */
                                0x04, /* none (IPv6 assumed) */
                                0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
                                0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
                                0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
                                0x03, /* PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
                                0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
                                0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
                                0x02, /* PKT_TX_IPV4  */
                                0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
                                0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
                                0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
                                0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_TCP_CKSUM
                                       */
                                0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_SCTP_CKSUM
                                       */
                                0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_UDP_CKSUM
                                       */
                        };
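
                        /*
                         * Example (derived from the table comments above):
                         * ol_flags PKT_TX_IPV4 | PKT_TX_TCP_CKSUM select
                         * entry 9, i.e. 0x12: il4type = 1 (TCP checksum)
                         * and il3type = 2 (IPv4 without IP checksum).
                         */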

                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         */
                        senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
                        senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

                        /* Move OLFLAGS bits 55:52 to 51:48
                         * with zeros prepended on the byte and rest
                         * don't care
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 4);
                        ytmp128 = vshrq_n_u8(ytmp128, 4);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields, i.e. bits 48:55 of iltype,
                         * and place them in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));
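
                        /*
                         * Worked example (lengths assumed): with ol2_len = 14
                         * and ol3_len = 20, the byte pair [20][14] becomes
                         * [34][14] after a += a << 8, i.e. OL3PTR = 14 and
                         * OL4PTR = 34 as the send header expects.
                         */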

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * ol3/ol4 types.
                         */

                        const uint8x16_t tbl = {
                                /* [0-15] = ol4type:ol3type */
                                0x00, /* none */
                                0x03, /* OUTER_IP_CKSUM */
                                0x02, /* OUTER_IPV4 */
                                0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                0x04, /* OUTER_IPV6 */
                                0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
                                0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4 | OUTER_IP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to oltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer ol flags only */
                        const uint64x2_t o_cksum_mask = {
                                0x1C00020000000000,
                                0x1C00020000000000,
                        };
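
                        /*
                         * Assuming the standard DPDK ol_flags bit layout, the
                         * mask above keeps PKT_TX_OUTER_IPV6/IPV4/IP_CKSUM
                         * (bits 60:58) and PKT_TX_OUTER_UDP_CKSUM (bit 41);
                         * everything else is cleared before the type lookup.
                         */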

                        xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift oltype by 2 to start nibble from BIT(56)
                         * instead of BIT(58)
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 2);
                        ytmp128 = vshrq_n_u8(ytmp128, 2);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e Bit 56:63 of oltype
                         * and place it in ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /* Lookup table to translate ol_flags to
                         * ol4type, ol3type, il4type, il3type of senddesc_w1
                         */
                        const uint8x16x2_t tbl = {
                        {
                                {
                                        /* [0-15] = il4type:il3type */
                                        0x04, /* none (IPv6) */
                                        0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
                                        0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
                                        0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
                                        0x03, /* PKT_TX_IP_CKSUM */
                                        0x13, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x02, /* PKT_TX_IPV4 */
                                        0x12, /* PKT_TX_IPV4 |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x22, /* PKT_TX_IPV4 |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x32, /* PKT_TX_IPV4 |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x03, /* PKT_TX_IPV4 |
                                               * PKT_TX_IP_CKSUM
                                               */
                                        0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                },

                                {
                                        /* [16-31] = ol4type:ol3type */
                                        0x00, /* none */
                                        0x03, /* OUTER_IP_CKSUM */
                                        0x02, /* OUTER_IPV4 */
                                        0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                        0x04, /* OUTER_IPV6 */
                                        0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x32, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4
                                               */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                        0x34, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV6
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                },
                        }
                        };

                        /* Extract olflags to translate to oltype & iltype */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         */
                        const uint32x4_t tshft_4 = {
                                1, 0,
                                1, 0,
                        };
                        senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
                        senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

                        /*
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer and inner header ol_flags */
                        const uint64x2_t oi_cksum_mask = {
                                0x1CF0020000000000,
                                0x1CF0020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift right oltype by 2 and iltype by 4
                         * to start oltype nibble from BIT(58)
                         * instead of BIT(56) and iltype nibble from BIT(48)
                         * instead of BIT(52).
                         */
                        const int8x16_t tshft5 = {
                                8, 8, 8, 8, 8, 8, -4, -2,
                                8, 8, 8, 8, 8, 8, -4, -2,
                        };

                        xtmp128 = vshlq_u8(xtmp128, tshft5);
                        ytmp128 = vshlq_u8(ytmp128, tshft5);
                        /*
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, -1, 0, 0, 0, 0, 0,
                                -1, 0, -1, 0, 0, 0, 0, 0,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Mark Bit(4) of oltype */
                        const uint64x2_t oi_cksum_mask2 = {
                                0x1000000000000000,
                                0x1000000000000000,
                        };

                        xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
                        ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

                        /* Do the lookup */
                        ltypes01 = vqtbl2q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl2q_u8(tbl, ytmp128);

                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e Bit 48:55 of iltype and
                         * Bit 56:63 of oltype and place it in corresponding
                         * place in senddesc_w1.
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
                         * l3len, l2len, ol3len, ol2len.
                         * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
                         * a = a + (a << 16)
                         * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
                         * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u32(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u32(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                vshlq_n_u32(senddesc01_w1, 16));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                vshlq_n_u32(senddesc23_w1, 16));

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                } else {
                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

                        /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
                }

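                /*
                 * Descriptive note: the stores below lay out four 32B NIX
                 * commands (SEND_HDR W0/W1 followed by SEND_SG W0/W1 per
                 * packet) in the LMT region, and the do/while retries
                 * otx2_lmt_submit() until the LMTST is accepted (non-zero
                 * status).
                 */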
                do {
                        vst1q_u64(lmt_addr, cmd00);
                        vst1q_u64(lmt_addr + 2, cmd01);
                        vst1q_u64(lmt_addr + 4, cmd10);
                        vst1q_u64(lmt_addr + 6, cmd11);
                        vst1q_u64(lmt_addr + 8, cmd20);
                        vst1q_u64(lmt_addr + 10, cmd21);
                        vst1q_u64(lmt_addr + 12, cmd30);
                        vst1q_u64(lmt_addr + 14, cmd31);
                        lmt_status = otx2_lmt_submit(io_addr);

                } while (lmt_status == 0);
                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
        }

        if (unlikely(pkts_left))
                pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);

        return pkts;
}

#else
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        RTE_SET_USED(tx_queue);
        RTE_SET_USED(tx_pkts);
        RTE_SET_USED(pkts);
        RTE_SET_USED(cmd);
        RTE_SET_USED(flags);
        return 0;
}
#endif

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                               \
otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* For TSO inner checksum is a must */                          \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
}

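/*
 * Illustration (mode name hypothetical): NIX_TX_FASTPATH_MODES, defined
 * elsewhere in the driver, invokes T() once per supported offload
 * combination, so a mode named "foo" would expand to an
 * otx2_nix_xmit_pkts_foo() burst function with its offload flags folded in
 * as compile-time constants.
 */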
NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                               \
otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
                                                                        \
        /* For TSO inner checksum is a must */                          \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
                                  (flags) | NIX_TX_MULTI_SEG_F);        \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
static uint16_t __rte_noinline  __rte_hot                               \
otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* VLAN, TSTAMP and TSO are not supported by vec */             \
        if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
            (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
            (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
                return 0;                                               \
        return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
}

NIX_TX_FASTPATH_MODES
#undef T

static inline void
pick_tx_func(struct rte_eth_dev *eth_dev,
             const eth_tx_burst_t tx_burst[2][2][2][2][2][2][2])
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        /* [SEC] [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
        eth_dev->tx_pkt_burst = tx_burst
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_SECURITY_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
}
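
/*
 * Illustration (configuration assumed): with only the inner and outer
 * checksum offloads enabled, the lookup above resolves to
 * tx_burst[0][0][0][0][0][1][1], i.e. the variant built with
 * NIX_TX_OFFLOAD_OL3_OL4_CSUM_F and NIX_TX_OFFLOAD_L3_L4_CSUM_F set.
 */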

void
otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_mseg_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2][2] = {
#define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
        [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_vec_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        if (dev->scalar_ena ||
            (dev->tx_offload_flags &
             (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
              NIX_TX_OFFLOAD_TSO_F)))
                pick_tx_func(eth_dev, nix_eth_tx_burst);
        else
                pick_tx_func(eth_dev, nix_eth_tx_vec_burst);

        if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
                pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);

        rte_mb();
}