/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_vect.h>

#include "otx2_ethdev.h"

#define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
        /* Cached value is low, update fc_cache_pkts */                 \
        if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
                /* Multiply by sqes_per_sqb to express in pkts */       \
                (txq)->fc_cache_pkts =                                  \
                        ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
                                (txq)->sqes_per_sqb_log2;               \
                /* Check again for room */                              \
                if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
                        return 0;                                       \
        }                                                               \
} while (0)
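
/*
 * Flow-control note (a reading of the scheme, not from the datasheet):
 * *fc_mem appears to mirror the hardware's count of consumed SQBs, so
 * (nb_sqb_bufs_adj - *fc_mem) is the number of free SQBs, and the left
 * shift by sqes_per_sqb_log2 converts that to a packet count, assuming
 * one SQE per packet.
 */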

static __rte_always_inline uint16_t
nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
              uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t i;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                /* Pass number of segdw as 4: HDR + EXT + SG + SMEM */
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, 4, flags);
                otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

static __rte_always_inline uint16_t
nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t segdw;
        uint64_t i;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Perform header writes before barrier for TSO */
        if (flags & NIX_TX_OFFLOAD_TSO_F) {
                for (i = 0; i < pkts; i++)
                        otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
        }

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, segdw,
                                             flags);
                otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

#if defined(RTE_ARCH_ARM64)

#define NIX_DESCS_PER_LOOP      4
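
/*
 * Vector Tx path: packets are handled four at a time. Each packet takes
 * a 2-dword send header plus a 2-dword SG descriptor, so every iteration
 * assembles 16 dwords (128B) and pushes them out with one LMTST.
 */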
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
        uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
        uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
        uint64x2_t senddesc01_w0, senddesc23_w0;
        uint64x2_t senddesc01_w1, senddesc23_w1;
        uint64x2_t sgdesc01_w0, sgdesc23_w0;
        uint64x2_t sgdesc01_w1, sgdesc23_w1;
        struct otx2_eth_txq *txq = tx_queue;
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64x2_t mbuf01, mbuf23;
        uint64x2_t cmd00, cmd01;
        uint64x2_t cmd10, cmd11;
        uint64x2_t cmd20, cmd21;
        uint64x2_t cmd30, cmd31;
        uint64_t lmt_status, i;
        uint16_t pkts_left;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
        pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
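
        /* Leftover packets (pkts_left) are sent through the scalar
         * nix_xmit_pkts() after the vector loop completes.
         */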

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
        senddesc23_w0 = senddesc01_w0;
        senddesc01_w1 = vdupq_n_u64(0);
        senddesc23_w1 = senddesc01_w1;
        sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
        sgdesc23_w0 = sgdesc01_w0;

        for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
                mbuf01 = vld1q_u64((uint64_t *)tx_pkts);
                mbuf23 = vld1q_u64((uint64_t *)(tx_pkts + 2));

                /* Clear lower 32 bits of SEND_HDR_W0 and SEND_SG_W0 */
                senddesc01_w0 = vbicq_u64(senddesc01_w0,
                                          vdupq_n_u64(0xFFFFFFFF));
                sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
                                        vdupq_n_u64(0xFFFFFFFF));

                senddesc23_w0 = senddesc01_w0;
                sgdesc23_w0 = sgdesc01_w0;

                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;

                /* Move mbuf pointers to the buf_iova field */
                mbuf0 = (uint64_t *)vgetq_lane_u64(mbuf01, 0);
                mbuf1 = (uint64_t *)vgetq_lane_u64(mbuf01, 1);
                mbuf2 = (uint64_t *)vgetq_lane_u64(mbuf23, 0);
                mbuf3 = (uint64_t *)vgetq_lane_u64(mbuf23, 1);

                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                     offsetof(struct rte_mbuf, buf_iova));
                /*
                 * Get each mbuf's ol_flags, iova, pktlen, dataoff:
                 * dataoff_iovaX.D[0] = iova,
                 * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
                 * len_olflagsX.D[0] = ol_flags,
                 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
                 */
                dataoff_iova0 = vld1q_u64(mbuf0);
                len_olflags0 = vld1q_u64(mbuf0 + 2);
                dataoff_iova1 = vld1q_u64(mbuf1);
                len_olflags1 = vld1q_u64(mbuf1 + 2);
                dataoff_iova2 = vld1q_u64(mbuf2);
                len_olflags2 = vld1q_u64(mbuf2 + 2);
                dataoff_iova3 = vld1q_u64(mbuf3);
                len_olflags3 = vld1q_u64(mbuf3 + 2);
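
                /* The paired loads above lean on the rte_mbuf layout:
                 * buf_iova and data_off sit in one 16B chunk and
                 * ol_flags and pkt_len in the next, so each pair lands
                 * in a single 128-bit register.
                 */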

                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        struct rte_mbuf *mbuf;
                        /* Set don't free bit if reference count > 1 */
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));

                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (otx2_nix_prefree_seg(mbuf))
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);
                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                } else {
                        struct rte_mbuf *mbuf;
                        /* Mark mempool object as "put" since
                         * it is freed by NIX
                         */
                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);
                        RTE_SET_USED(mbuf);
                }
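
                /* __mempool_check_cookies() compiles away unless the
                 * mempool library is built with debug cookies enabled,
                 * so the calls above are free in release builds.
                 */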

                /* Move mbuf pointers to the pool field */
                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));

                if (flags &
                    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
                     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
                        /* Get tx_offload for ol2, ol3, l2, l3 lengths:
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         */

                        asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf0 + 2) : "memory");

                        asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf1 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf2 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf3 + 2) : "memory");

                        /* Get the pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                } else {
                        /* Get the pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                }

                const uint8x16_t shuf_mask2 = {
                        0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                };
                xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
                ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

                /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
                const uint64x2_t and_mask0 = {
                        0xFFFFFFFFFFFFFFFF,
                        0x000000000000FFFF,
                };

                dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
                dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
                dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
                dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

                /*
                 * Pick only 16 bits of pktlen present at bits 63:32
                 * and place them at bits 15:0.
                 */
                xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
                ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

                /* Add pairwise to get dataoff + iova in sgdesc_w1 */
                sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
                sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);

                /* OR both sgdesc_w0 and senddesc_w0 with 16 bits of
                 * pktlen at position 15:0.
                 */
                sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
                sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
                senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
                senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);

                if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * il3/il4 types. We still use the ol3/ol4 type
                         * fields in senddesc_w1, as only one level of
                         * header processing is enabled.
                         */
                        const uint8x16_t tbl = {
                                /* [0-15] = il4type:il3type */
                                0x04, /* none (IPv6 assumed) */
                                0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
                                0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
                                0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
                                0x03, /* PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
                                0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
                                0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
                                0x02, /* PKT_TX_IPV4 */
                                0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
                                0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
                                0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
                                0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_TCP_CKSUM
                                       */
                                0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_SCTP_CKSUM
                                       */
                                0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_UDP_CKSUM
                                       */
                        };
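
                        /* The 4-bit table index is ol_flags bits 55:52
                         * (PKT_TX_IPV4, PKT_TX_IP_CKSUM and the 2-bit L4
                         * checksum type), moved down by the byte shift
                         * below.
                         */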

                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         */
                        senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
                        senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

                        /* Move OLFLAGS bits 55:52 to 51:48
                         * with zeros prepended on the byte; the rest
                         * are don't-care
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 4);
                        ytmp128 = vshrq_n_u8(ytmp128, 4);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };
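
                        /* vshlq_u8 with per-byte signed counts: -1 shifts
                         * the L2_LEN byte back right by one, 0 keeps the
                         * L3_LEN byte, and 8 zeroes a byte outright (an
                         * 8-bit lane shifted left by 8 becomes 0).
                         */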

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Pick the aura (pool_id) from each mempool */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields, i.e. bits 48:55 of
                         * iltype, and place them in ol3/ol4type of
                         * senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);
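
                        /* The LD1 halfword loads above pull each mempool's
                         * pool_id (the NPA aura) into xmask lanes 0 and 4;
                         * the shift by 20 is understood to line it up with
                         * the aura field of SEND_HDR_W0, alongside the
                         * don't-free bit (0x80000, bit 19) set earlier.
                         */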

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * ol3/ol4 types.
                         */

                        const uint8x16_t tbl = {
                                /* [0-15] = ol4type:ol3type */
                                0x00, /* none */
                                0x03, /* OUTER_IP_CKSUM */
                                0x02, /* OUTER_IPV4 */
                                0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                0x04, /* OUTER_IPV6 */
                                0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
                                0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4 | OUTER_IP_CKSUM
                                       */
                        };

                        /* Extract olflags to translate to oltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract the outer ol flags only */
                        const uint64x2_t o_cksum_mask = {
                                0x1C00020000000000,
                                0x1C00020000000000,
                        };
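
                        /* Going by the mbuf flag layout, this keeps
                         * OUTER_IP_CKSUM/OUTER_IPV4/OUTER_IPV6 (bits 58:60)
                         * and OUTER_UDP_CKSUM (bit 41), clearing every
                         * other ol_flags bit before the translation.
                         */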

                        xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift oltype by 2 to start nibble from BIT(56)
                         * instead of BIT(58)
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 2);
                        ytmp128 = vshrq_n_u8(ytmp128, 2);
                        /*
                         * E(48):OL3_LEN(8):OL2_LEN(z+7)
                         * E(48):OL3_LEN(8):OL2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Pick the aura (pool_id) from each mempool */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields, i.e. bits 56:63 of
                         * oltype, and place them in ol3/ol4type of
                         * senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /* Lookup table to translate ol_flags to
                         * ol4type, ol3type, il4type, il3type of senddesc_w1
                         */
                        const uint8x16x2_t tbl = {
                        {
                                {
                                        /* [0-15] = il4type:il3type */
                                        0x04, /* none (IPv6) */
                                        0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
                                        0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
                                        0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
                                        0x03, /* PKT_TX_IP_CKSUM */
                                        0x13, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x02, /* PKT_TX_IPV4 */
                                        0x12, /* PKT_TX_IPV4 |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x22, /* PKT_TX_IPV4 |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x32, /* PKT_TX_IPV4 |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x03, /* PKT_TX_IPV4 |
                                               * PKT_TX_IP_CKSUM
                                               */
                                        0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                },

                                {
                                        /* [16-31] = ol4type:ol3type */
                                        0x00, /* none */
                                        0x03, /* OUTER_IP_CKSUM */
                                        0x02, /* OUTER_IPV4 */
                                        0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                        0x04, /* OUTER_IPV6 */
                                        0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x32, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4
                                               */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                        0x34, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV6
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                },
                        }
                        };

                        /* Extract olflags to translate to oltype & iltype */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         */
                        const uint32x4_t tshft_4 = {
                                1, 0,
                                1, 0,
                        };
                        senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
                        senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

                        /*
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer and inner header ol_flags */
                        const uint64x2_t oi_cksum_mask = {
                                0x1CF0020000000000,
                                0x1CF0020000000000,
                        };
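
                        /* The wider mask also keeps the inner checksum
                         * flags (bits 55:52) next to the outer ones
                         * (bits 60:58) and OUTER_UDP_CKSUM (bit 41),
                         * since both levels are translated here.
                         */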

                        xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift right oltype by 2 and iltype by 4
                         * to start the oltype nibble from BIT(56)
                         * instead of BIT(58) and the iltype nibble from
                         * BIT(48) instead of BIT(52).
                         */
                        const int8x16_t tshft5 = {
                                8, 8, 8, 8, 8, 8, -4, -2,
                                8, 8, 8, 8, 8, 8, -4, -2,
                        };

                        xtmp128 = vshlq_u8(xtmp128, tshft5);
                        ytmp128 = vshlq_u8(ytmp128, tshft5);
                        /*
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, -1, 0, 0, 0, 0, 0,
                                -1, 0, -1, 0, 0, 0, 0, 0,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Mark Bit(4) of oltype to select the ol half
                         * [16-31] of the lookup table
                         */
                        const uint64x2_t oi_cksum_mask2 = {
                                0x1000000000000000,
                                0x1000000000000000,
                        };

                        xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
                        ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

                        /* Do the lookup */
                        ltypes01 = vqtbl2q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl2q_u8(tbl, ytmp128);

                        /* Pick the aura (pool_id) from each mempool */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields, i.e. bits 48:55 of
                         * iltype and bits 56:63 of oltype, and place them
                         * in the corresponding places in senddesc_w1.
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
                         * l3len, l2len, ol3len, ol2len.
                         * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
                         * a = a + (a << 16)
                         * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
                         * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u32(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u32(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                vshlq_n_u32(senddesc01_w1, 16));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                vshlq_n_u32(senddesc23_w1, 16));

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                } else {
                        /* No offload fields to fill; just use LD1 to
                         * retrieve the aura (pool_id) for each mbuf
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

                        /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
                }
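
                /* LMTST submit below: the 16 dwords written to the LMT
                 * region are flushed to the NIX SQ by otx2_lmt_submit();
                 * a zero status is taken to mean the LMT transfer was
                 * aborted (e.g. by a context switch) and the stores must
                 * be replayed, hence the retry loop.
                 */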

                do {
                        vst1q_u64(lmt_addr, cmd00);
                        vst1q_u64(lmt_addr + 2, cmd01);
                        vst1q_u64(lmt_addr + 4, cmd10);
                        vst1q_u64(lmt_addr + 6, cmd11);
                        vst1q_u64(lmt_addr + 8, cmd20);
                        vst1q_u64(lmt_addr + 10, cmd21);
                        vst1q_u64(lmt_addr + 12, cmd30);
                        vst1q_u64(lmt_addr + 14, cmd31);
                        lmt_status = otx2_lmt_submit(io_addr);

                } while (lmt_status == 0);
        }

        if (unlikely(pkts_left))
                pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);

        return pkts;
}

#else
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        RTE_SET_USED(tx_queue);
        RTE_SET_USED(tx_pkts);
        RTE_SET_USED(pkts);
        RTE_SET_USED(cmd);
        RTE_SET_USED(flags);
        return 0;
}
#endif

#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
static uint16_t __rte_noinline __hot                                    \
otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
static uint16_t __rte_noinline __hot                                    \
otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
                                                                        \
        /* For TSO, inner checksum is a must */                         \
        if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
            !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
                return 0;                                               \
        return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
                                  (flags) | NIX_TX_MULTI_SEG_F);        \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
static uint16_t __rte_noinline __hot                                    \
otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        /* VLAN, TSTAMP and TSO are not supported by vec */             \
        if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
            (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
            (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
                return 0;                                               \
        return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
}

NIX_TX_FASTPATH_MODES
#undef T

static inline void
pick_tx_func(struct rte_eth_dev *eth_dev,
             const eth_tx_burst_t tx_burst[2][2][2][2][2][2])
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        /* [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
        eth_dev->tx_pkt_burst = tx_burst
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
}

void
otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2] = {
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
        [f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2] = {
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
        [f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_mseg_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2] = {
#define T(name, f5, f4, f3, f2, f1, f0, sz, flags)                      \
        [f5][f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_vec_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        if (dev->scalar_ena ||
            (dev->tx_offload_flags &
             (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
              NIX_TX_OFFLOAD_TSO_F)))
                pick_tx_func(eth_dev, nix_eth_tx_burst);
        else
                pick_tx_func(eth_dev, nix_eth_tx_vec_burst);

        if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
                pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);

        rte_mb();
}