/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_vect.h>

#include "otx2_ethdev.h"

#define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
        /* Cached value is low, update fc_cache_pkts */                 \
        if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
                /* Multiply with sqes_per_sqb to express in pkts */     \
                (txq)->fc_cache_pkts =                                  \
                        ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
                                (txq)->sqes_per_sqb_log2;               \
                /* Check again for room */                              \
                if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
                        return 0;                                       \
        }                                                               \
} while (0)

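/*
 * Scalar Tx for single-segment packets: reserve room in the SQ via the
 * cached flow-control count, then build and submit one LMTST command per
 * packet. As a worked example of the refill above, assuming hypothetical
 * values nb_sqb_bufs_adj = 512, *fc_mem = 500 and sqes_per_sqb_log2 = 5,
 * the cache is refilled to (512 - 500) << 5 = 384 packets worth of room.
 */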
static __rte_always_inline uint16_t
nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
              uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t i;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                /* Passing number of segdw as 4: HDR + EXT + SG + SMEM */
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, 4, flags);
                otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

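/*
 * Multi-segment variant: same flow as nix_xmit_pkts(), except that
 * otx2_nix_prepare_mseg() emits a scatter/gather list and returns the
 * per-packet command size (segdw), which is passed through to submit.
 */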
static __rte_always_inline uint16_t
nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t segdw;
        uint64_t i;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, segdw,
                                             flags);
                otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

#if defined(RTE_ARCH_ARM64)

#define NIX_DESCS_PER_LOOP      4
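
/*
 * Vector Tx (ARM64 NEON): processes NIX_DESCS_PER_LOOP (4) packets per
 * iteration, building the 4-word command (send header + SG descriptor)
 * for all four packets in 128-bit registers and flushing them with a
 * single LMTST burst. The vector path handles only single-segment
 * packets without VLAN/TSTAMP/TSO (see otx2_eth_set_tx_function()).
 */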
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, const uint16_t flags)
{
        uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
        uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
        uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
        uint64x2_t senddesc01_w0, senddesc23_w0;
        uint64x2_t senddesc01_w1, senddesc23_w1;
        uint64x2_t sgdesc01_w0, sgdesc23_w0;
        uint64x2_t sgdesc01_w1, sgdesc23_w1;
        struct otx2_eth_txq *txq = tx_queue;
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64x2_t mbuf01, mbuf23;
        uint64x2_t cmd00, cmd01;
        uint64x2_t cmd10, cmd11;
        uint64x2_t cmd20, cmd21;
        uint64x2_t cmd30, cmd31;
        uint64_t lmt_status, i;

        pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
        senddesc23_w0 = senddesc01_w0;
        senddesc01_w1 = vdupq_n_u64(0);
        senddesc23_w1 = senddesc01_w1;
        sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
        sgdesc23_w0 = sgdesc01_w0;

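        /*
         * Register naming: senddescXY_wN holds word N of the send header
         * for packets X and Y of the group of four; sgdescXY_wN likewise
         * holds the scatter/gather descriptor words.
         */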
        for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
                mbuf01 = vld1q_u64((uint64_t *)tx_pkts);
                mbuf23 = vld1q_u64((uint64_t *)(tx_pkts + 2));

                /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
                senddesc01_w0 = vbicq_u64(senddesc01_w0,
                                          vdupq_n_u64(0xFFFFFFFF));
                sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
                                        vdupq_n_u64(0xFFFFFFFF));

                senddesc23_w0 = senddesc01_w0;
                sgdesc23_w0 = sgdesc01_w0;

                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;

                /* Move mbuf pointers to their buf_iova field */
                mbuf0 = (uint64_t *)vgetq_lane_u64(mbuf01, 0);
                mbuf1 = (uint64_t *)vgetq_lane_u64(mbuf01, 1);
                mbuf2 = (uint64_t *)vgetq_lane_u64(mbuf23, 0);
                mbuf3 = (uint64_t *)vgetq_lane_u64(mbuf23, 1);

                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                     offsetof(struct rte_mbuf, buf_iova));
                /*
                 * Get each mbuf's ol_flags, iova, pkt_len and data_off:
                 * dataoff_iovaX.D[0] = iova,
                 * dataoff_iovaX.D[1](15:0) = mbuf->data_off,
                 * len_olflagsX.D[0] = ol_flags,
                 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
                 */
                dataoff_iova0 = vld1q_u64(mbuf0);
                len_olflags0 = vld1q_u64(mbuf0 + 2);
                dataoff_iova1 = vld1q_u64(mbuf1);
                len_olflags1 = vld1q_u64(mbuf1 + 2);
                dataoff_iova2 = vld1q_u64(mbuf2);
                len_olflags2 = vld1q_u64(mbuf2 + 2);
                dataoff_iova3 = vld1q_u64(mbuf3);
                len_olflags3 = vld1q_u64(mbuf3 + 2);

                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        struct rte_mbuf *mbuf;
                        /* Set don't-free bit if reference count > 1 */
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));

                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);
                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                } else {
                        struct rte_mbuf *mbuf;
                        /* Mark mempool object as "put" since
                         * it is freed by NIX
                         */
                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);
                        RTE_SET_USED(mbuf);
                }

                /* Move mbuf pointers to their pool field */
                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));

                if (flags &
                    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
                     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
                        /* Get tx_offload for ol2, ol3, l2, l3 lengths */
                        /*
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         */

                        asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf0 + 2) : "memory");

                        asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf1 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf2 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf3 + 2) : "memory");
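                        /*
                         * Each LD1 above loads an mbuf's 64-bit
                         * tx_offload field straight into one lane of a
                         * descriptor register, avoiding a separate
                         * scalar load plus lane insert.
                         */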

                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                } else {
                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                }

                const uint8x16_t shuf_mask2 = {
                        0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                };
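                /*
                 * shuf_mask2 selects the two low pkt_len bytes of each
                 * mbuf (byte indices 4:5 and 12:13); 0xFF indices make
                 * vqtbl1q_u8() write zero into the destination byte.
                 */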
                xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
                ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

                /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
                const uint64x2_t and_mask0 = {
                        0xFFFFFFFFFFFFFFFF,
                        0x000000000000FFFF,
                };

                dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
                dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
                dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
                dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

                /*
                 * Pick only the 16 bits of pkt_len present at bits 63:32
                 * and place them at bits 15:0.
                 */
                xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
                ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

                /* Add pairwise to get dataoff + iova in sgdesc_w1 */
                sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
                sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
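                /*
                 * Since iova sits in D[0] and data_off in D[1], the
                 * pairwise add yields buf_iova + data_off, i.e. the
                 * start of packet data, for each mbuf.
                 */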
335
                /* OR both sgdesc_w0 and senddesc_w0 with the 16 bits of
                 * pkt_len at position 15:0.
                 */
                sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
                sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
                senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
                senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);

                if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * il3/il4 types. We still use the ol3/ol4 type
                         * fields in senddesc_w1 as only one level of
                         * header processing is enabled.
                         */
                        const uint8x16_t tbl = {
                                /* [0-15] = il4type:il3type */
                                0x04, /* none (IPv6 assumed) */
                                0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
                                0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
                                0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
                                0x03, /* PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
                                0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
                                0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
                                0x02, /* PKT_TX_IPV4 */
                                0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
                                0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
                                0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
                                0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_TCP_CKSUM
                                       */
                                0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_SCTP_CKSUM
                                       */
                                0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_UDP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         */
                        senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
                        senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

                        /* Move OLFLAGS bits 55:52 to 51:48
                         * with zeros prepended on the byte and rest
                         * don't care
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 4);
                        ytmp128 = vshrq_n_u8(ytmp128, 4);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };
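                        /*
                         * Note: vshlq_u8() with a negative per-lane
                         * count shifts right, and a count of 8 clears
                         * the lane, so tshft3 right-shifts byte 0 by 1
                         * and zeroes bytes 2-7 of each half.
                         */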
402
                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Advance mempool pointers to pool_id to fetch
                         * the aura
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields, i.e. bits 48:55 of
                         * iltype, and place them in ol3/ol4type of
                         * senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
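                        /*
                         * Gather the low 16 bits of each pool_id into
                         * halfword lanes 0 and 4, then shift left by 20
                         * to place the aura in SEND_HDR_W0.
                         */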
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * ol3/ol4 types.
                         */

                        const uint8x16_t tbl = {
                                /* [0-15] = ol4type:ol3type */
                                0x00, /* none */
                                0x03, /* OUTER_IP_CKSUM */
                                0x02, /* OUTER_IPV4 */
                                0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                0x04, /* OUTER_IPV6 */
                                0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
                                0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4 | OUTER_IP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to oltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer ol flags only */
                        const uint64x2_t o_cksum_mask = {
                                0x1C00020000000000,
                                0x1C00020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift oltype by 2 to start nibble from BIT(56)
                         * instead of BIT(58)
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 2);
                        ytmp128 = vshrq_n_u8(ytmp128, 2);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Advance mempool pointers to pool_id to fetch
                         * the aura
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields, i.e. bits 56:63 of
                         * oltype, and place them in ol3/ol4type of
                         * senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /* Lookup table to translate ol_flags to
                         * ol4type, ol3type, il4type, il3type of senddesc_w1
                         */
                        const uint8x16x2_t tbl = {
                        {
                                {
                                        /* [0-15] = il4type:il3type */
                                        0x04, /* none (IPv6) */
                                        0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
                                        0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
                                        0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
                                        0x03, /* PKT_TX_IP_CKSUM */
                                        0x13, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x02, /* PKT_TX_IPV4 */
                                        0x12, /* PKT_TX_IPV4 |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x22, /* PKT_TX_IPV4 |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x32, /* PKT_TX_IPV4 |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x03, /* PKT_TX_IPV4 |
                                               * PKT_TX_IP_CKSUM
                                               */
                                        0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                },

                                {
                                        /* [16-31] = ol4type:ol3type */
                                        0x00, /* none */
                                        0x03, /* OUTER_IP_CKSUM */
                                        0x02, /* OUTER_IPV4 */
                                        0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                        0x04, /* OUTER_IPV6 */
                                        0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x32, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4
                                               */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                        0x34, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV6
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                },
                        }
                        };

                        /* Extract olflags to translate to oltype & iltype */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         */
                        const uint32x4_t tshft_4 = {
                                1, 0,
                                1, 0,
                        };
                        senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
                        senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

                        /*
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer and inner header ol_flags */
                        const uint64x2_t oi_cksum_mask = {
                                0x1CF0020000000000,
                                0x1CF0020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift right oltype by 2 and iltype by 4 so that
                         * the oltype nibble starts from BIT(56) instead of
                         * BIT(58) and the iltype nibble from BIT(48)
                         * instead of BIT(52).
                         */
                        const int8x16_t tshft5 = {
                                8, 8, 8, 8, 8, 8, -4, -2,
                                8, 8, 8, 8, 8, 8, -4, -2,
                        };

                        xtmp128 = vshlq_u8(xtmp128, tshft5);
                        ytmp128 = vshlq_u8(ytmp128, tshft5);
                        /*
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, -1, 0, 0, 0, 0, 0,
                                -1, 0, -1, 0, 0, 0, 0, 0,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Set BIT(4) of the oltype byte so the lookup
                         * indexes the ol half [16-31] of tbl
                         */
                        const uint64x2_t oi_cksum_mask2 = {
                                0x1000000000000000,
                                0x1000000000000000,
                        };

                        xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
                        ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

                        /* Do the lookup */
                        ltypes01 = vqtbl2q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl2q_u8(tbl, ytmp128);

                        /* Advance mempool pointers to pool_id to fetch
                         * the aura
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields, i.e. bits 48:55 of
                         * iltype and bits 56:63 of oltype, and place them
                         * in the corresponding fields of senddesc_w1.
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
                         * l3len, l2len, ol3len, ol2len.
                         * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
                         * a = a + (a << 16)
                         * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
                         * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u32(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u32(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                vshlq_n_u32(senddesc01_w1, 16));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                vshlq_n_u32(senddesc23_w1, 16));

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                } else {
                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

                        /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
                }

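                /*
                 * Burst all four 4W commands through the LMT region;
                 * otx2_lmt_submit() returns 0 when the LMTST did not
                 * complete, in which case the stores are replayed.
                 */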
                do {
                        vst1q_u64(lmt_addr, cmd00);
                        vst1q_u64(lmt_addr + 2, cmd01);
                        vst1q_u64(lmt_addr + 4, cmd10);
                        vst1q_u64(lmt_addr + 6, cmd11);
                        vst1q_u64(lmt_addr + 8, cmd20);
                        vst1q_u64(lmt_addr + 10, cmd21);
                        vst1q_u64(lmt_addr + 12, cmd30);
                        vst1q_u64(lmt_addr + 14, cmd31);
                        lmt_status = otx2_lmt_submit(io_addr);
                } while (lmt_status == 0);
        }

        return pkts;
}

#else
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, const uint16_t flags)
{
        RTE_SET_USED(tx_queue);
        RTE_SET_USED(tx_pkts);
        RTE_SET_USED(pkts);
        RTE_SET_USED(flags);
        return 0;
}
#endif

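/*
 * Each T() expansion below emits one specialized burst function per
 * combination of offload flags; 'flags' is a compile-time constant in
 * the always-inlined nix_xmit_pkts*() helpers, so branches for unused
 * offloads compile out of each variant.
 */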
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
static uint16_t __rte_noinline  __hot                                   \
otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
static uint16_t __rte_noinline  __hot                                   \
otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
                                  (flags) | NIX_TX_MULTI_SEG_F);        \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
static uint16_t __rte_noinline  __hot                                   \
otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        /* VLAN, TSTAMP and TSO are not supported by the vector path */ \
        if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
            (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
            (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
                return 0;                                               \
        return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, (flags));  \
}

NIX_TX_FASTPATH_MODES
#undef T

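/*
 * Select the specialized burst function by indexing the
 * [2][2][2][2][2][2] lookup table with the device's enabled offload
 * flag bits, one dimension per flag.
 */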
static inline void
pick_tx_func(struct rte_eth_dev *eth_dev,
             const eth_tx_burst_t tx_burst[2][2][2][2][2][2])
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        /* [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
        eth_dev->tx_pkt_burst = tx_burst
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
}

void
otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2] = {
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
        [f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2] = {
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
        [f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_mseg_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2] = {
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
        [f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_vec_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        if (dev->scalar_ena ||
            (dev->tx_offload_flags &
             (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
              NIX_TX_OFFLOAD_TSO_F)))
                pick_tx_func(eth_dev, nix_eth_tx_burst);
        else
                pick_tx_func(eth_dev, nix_eth_tx_vec_burst);

        if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
                pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);

        rte_mb();
}