net/octeontx2: add Tx vector version
drivers/net/octeontx2/otx2_tx.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(C) 2019 Marvell International Ltd.
 */

#include <rte_vect.h>

#include "otx2_ethdev.h"

#define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
        /* Cached value is low, update the fc_cache_pkts */             \
        if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
                /* Multiply with sqes_per_sqb to express in pkts */     \
                (txq)->fc_cache_pkts =                                  \
                        ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
                                (txq)->sqes_per_sqb_log2;               \
                /* Check again whether there is room */                 \
                if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
                        return 0;                                       \
        }                                                               \
} while (0)
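
/*
 * *fc_mem tracks the number of SQBs in use; (nb_sqb_bufs_adj - *fc_mem)
 * is therefore the count of free SQBs, and each SQB holds
 * 2^sqes_per_sqb_log2 SQEs, so the shift above converts free SQBs into
 * the number of packets that can still be queued.
 */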

static __rte_always_inline uint16_t
nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
              uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint16_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                /* Pass the number of segdw as 4: HDR + EXT + SG + SMEM */
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, 4, flags);
                otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}

static __rte_always_inline uint16_t
nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
                   uint16_t pkts, uint64_t *cmd, const uint16_t flags)
{
        struct otx2_eth_txq *txq = tx_queue; uint64_t i;
        const rte_iova_t io_addr = txq->io_addr;
        void *lmt_addr = txq->lmt_addr;
        uint16_t segdw;

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        for (i = 0; i < pkts; i++) {
                otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
                segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
                otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
                                             tx_pkts[i]->ol_flags, segdw,
                                             flags);
                otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
        }

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        return pkts;
}
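
/*
 * Vector Tx (ARM64 only): four packets are processed per loop iteration
 * using 128-bit NEON registers; the senddesc and sgdesc vectors hold the
 * SEND_HDR and SEND_SG words for packet pairs 0-1 and 2-3 respectively.
 */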
#if defined(RTE_ARCH_ARM64)

#define NIX_DESCS_PER_LOOP      4
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, const uint16_t flags)
{
        uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
        uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
        uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
        uint64x2_t senddesc01_w0, senddesc23_w0;
        uint64x2_t senddesc01_w1, senddesc23_w1;
        uint64x2_t sgdesc01_w0, sgdesc23_w0;
        uint64x2_t sgdesc01_w1, sgdesc23_w1;
        struct otx2_eth_txq *txq = tx_queue;
        uint64_t *lmt_addr = txq->lmt_addr;
        rte_iova_t io_addr = txq->io_addr;
        uint64x2_t ltypes01, ltypes23;
        uint64x2_t xtmp128, ytmp128;
        uint64x2_t xmask01, xmask23;
        uint64x2_t mbuf01, mbuf23;
        uint64x2_t cmd00, cmd01;
        uint64x2_t cmd10, cmd11;
        uint64x2_t cmd20, cmd21;
        uint64x2_t cmd30, cmd31;
        uint64_t lmt_status, i;

        pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);

        NIX_XMIT_FC_OR_RETURN(txq, pkts);

        /* Reduce the cached count */
        txq->fc_cache_pkts -= pkts;

        /* Let's commit any changes in the packet */
        rte_cio_wmb();

        senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
        senddesc23_w0 = senddesc01_w0;
        senddesc01_w1 = vdupq_n_u64(0);
        senddesc23_w1 = senddesc01_w1;
        sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
        sgdesc23_w0 = sgdesc01_w0;
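
        /*
         * Each loop iteration builds four 4-dword commands (SEND_HDR W0/W1
         * followed by SEND_SG W0/W1) from the templates loaded above and
         * the per-mbuf fields, then submits all four in one LMTST burst.
         */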
        for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
                mbuf01 = vld1q_u64((uint64_t *)tx_pkts);
                mbuf23 = vld1q_u64((uint64_t *)(tx_pkts + 2));

                /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
                senddesc01_w0 = vbicq_u64(senddesc01_w0,
                                          vdupq_n_u64(0xFFFFFFFF));
                sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
                                        vdupq_n_u64(0xFFFFFFFF));

                senddesc23_w0 = senddesc01_w0;
                sgdesc23_w0 = sgdesc01_w0;

                tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;

                /* Move mbuf pointers to the buf_iova field */
                mbuf0 = (uint64_t *)vgetq_lane_u64(mbuf01, 0);
                mbuf1 = (uint64_t *)vgetq_lane_u64(mbuf01, 1);
                mbuf2 = (uint64_t *)vgetq_lane_u64(mbuf23, 0);
                mbuf3 = (uint64_t *)vgetq_lane_u64(mbuf23, 1);

                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                     offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                     offsetof(struct rte_mbuf, buf_iova));
                /*
                 * Fetch each mbuf's ol_flags, iova, pktlen and data_off:
                 * dataoff_iovaX.D[0] = iova,
                 * dataoff_iovaX.D[1](15:0) = mbuf->data_off
                 * len_olflagsX.D[0] = ol_flags,
                 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
                 */
                dataoff_iova0  = vld1q_u64(mbuf0);
                len_olflags0 = vld1q_u64(mbuf0 + 2);
                dataoff_iova1  = vld1q_u64(mbuf1);
                len_olflags1 = vld1q_u64(mbuf1 + 2);
                dataoff_iova2  = vld1q_u64(mbuf2);
                len_olflags2 = vld1q_u64(mbuf2 + 2);
                dataoff_iova3  = vld1q_u64(mbuf3);
                len_olflags3 = vld1q_u64(mbuf3 + 2);
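
                /*
                 * rte_pktmbuf_prefree_seg() returns NULL when the segment
                 * cannot be freed yet (its reference count was above one);
                 * in that case the DF ("don't free") bit of SEND_HDR_W0
                 * (bit 19, the 0x80000 below) is set so that NIX does not
                 * return the buffer to its aura after transmission.
                 */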
                if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
                        struct rte_mbuf *mbuf;
                        /* Set don't free bit if reference count > 1 */
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));

                        if (rte_pktmbuf_prefree_seg(mbuf) == NULL)
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (rte_pktmbuf_prefree_seg(mbuf) == NULL)
                                xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (rte_pktmbuf_prefree_seg(mbuf) == NULL)
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        if (rte_pktmbuf_prefree_seg(mbuf) == NULL)
                                xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
                        else
                                __mempool_check_cookies(mbuf->pool,
                                                        (void **)&mbuf,
                                                        1, 0);
                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                } else {
                        struct rte_mbuf *mbuf;
                        /* Mark mempool object as "put" since
                         * it is freed by NIX
                         */
                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);

                        mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
                                offsetof(struct rte_mbuf, buf_iova));
                        __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
                                                1, 0);
                        RTE_SET_USED(mbuf);
                }

                /* Move mbuf pointers to the pool field */
                mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));
                mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                         offsetof(struct rte_mbuf, pool) -
                         offsetof(struct rte_mbuf, buf_iova));

                if (flags &
                    (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
                     NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
                        /* Get tx_offload for ol2, ol3, l2, l3 lengths */
                        /*
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
                         */
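
                        /*
                         * Note: the LD1 {Vt.D}[lane] inline asm loads a
                         * single 64-bit lane of the vector directly from
                         * memory (here the mbuf's tx_offload word) while
                         * preserving the other lane, standing in for a
                         * vld1q_lane_u64() intrinsic.
                         */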
                        asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf0 + 2) : "memory");

                        asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
                                      [a]"+w"(senddesc01_w1) :
                                      [in]"r"(mbuf1 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf2 + 2) : "memory");

                        asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
                                      [b]"+w"(senddesc23_w1) :
                                      [in]"r"(mbuf3 + 2) : "memory");

                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                } else {
                        /* Get pool pointer alone */
                        mbuf0 = (uint64_t *)*mbuf0;
                        mbuf1 = (uint64_t *)*mbuf1;
                        mbuf2 = (uint64_t *)*mbuf2;
                        mbuf3 = (uint64_t *)*mbuf3;
                }

                const uint8x16_t shuf_mask2 = {
                        0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                };
                xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
                ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);

                /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
                const uint64x2_t and_mask0 = {
                        0xFFFFFFFFFFFFFFFF,
                        0x000000000000FFFF,
                };

                dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
                dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
                dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
                dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);

                /*
                 * Pick only 16 bits of pktlen present at bits 63:32
                 * and place them at bits 15:0.
                 */
                xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
                ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);

                /* Add pairwise to get dataoff + iova in sgdesc_w1 */
                sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
                sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);

                /* OR both sgdesc_w0 and senddesc_w0 with the 16-bit
                 * pktlen at bit positions 15:0.
                 */
                sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
                sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
                senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
                senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
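
                /*
                 * At this point senddesc W0 holds the total packet length,
                 * sgdesc W0 the segment length, and sgdesc W1 the buffer
                 * address (iova + data_off); the DF bit was OR-ed in above
                 * when MBUF_NOFF is enabled, and the aura is OR-ed in
                 * further below.
                 */
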
                if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                    !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * il3/il4 types. The result still goes into the
                         * ol3/ol4 type fields of senddesc_w1, since only
                         * one level of header processing is enabled.
                         */
                        const uint8x16_t tbl = {
                                /* [0-15] = il4type:il3type */
                                0x04, /* none (IPv6 assumed) */
                                0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
                                0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
                                0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
                                0x03, /* PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
                                0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
                                0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
                                0x02, /* PKT_TX_IPV4  */
                                0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
                                0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
                                0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
                                0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
                                0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_TCP_CKSUM
                                       */
                                0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_SCTP_CKSUM
                                       */
                                0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                       * PKT_TX_UDP_CKSUM
                                       */
                        };
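
                        /*
                         * The table index is ol_flags bits 55:52:
                         * PKT_TX_IPV4 (bit 55), PKT_TX_IP_CKSUM (bit 54)
                         * and the two PKT_TX_L4_MASK bits (53:52) that
                         * encode the L4 checksum type.
                         */
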
                        /* Extract olflags to translate to iltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         * E(47):L3_LEN(9):L2_LEN(7+z)
                         */
                        senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
                        senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);

                        /* Move OLFLAGS bits 55:52 to 51:48
                         * with zeros prepended on the byte and the rest
                         * as don't care
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 4);
                        ytmp128 = vshrq_n_u8(ytmp128, 4);
                        /*
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         * E(48):L3_LEN(8):L2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Advance mbuf pointers to mempool pool_id to
                         * fetch the aura
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 48:55 of iltype
                         * and place them in the ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /*
                         * Lookup table to translate ol_flags to
                         * ol3/ol4 types.
                         */

                        const uint8x16_t tbl = {
                                /* [0-15] = ol4type:ol3type */
                                0x00, /* none */
                                0x03, /* OUTER_IP_CKSUM */
                                0x02, /* OUTER_IPV4 */
                                0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                0x04, /* OUTER_IPV6 */
                                0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
                                0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
                                0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IP_CKSUM
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4
                                       */
                                0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                       * OUTER_IPV4 | OUTER_IP_CKSUM
                                       */
                        };
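
                        /*
                         * The table index is built from the outer ol_flags:
                         * OUTER_IP_CKSUM (bit 58), OUTER_IPV4 (bit 59),
                         * OUTER_IPV6 (bit 60) and OUTER_UDP_CKSUM (bit 41,
                         * copied up to bit 61 below).
                         */
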
                        /* Extract olflags to translate to oltypes */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         * E(47):OL3_LEN(9):OL2_LEN(7+z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer ol flags only */
                        const uint64x2_t o_cksum_mask = {
                                0x1C00020000000000,
                                0x1C00020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, o_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift oltype by 2 to start nibble from BIT(56)
                         * instead of BIT(58)
                         */
                        xtmp128 = vshrq_n_u8(xtmp128, 2);
                        ytmp128 = vshrq_n_u8(ytmp128, 2);
                        /*
                         * E(48):OL3_LEN(8):OL2_LEN(z+7)
                         * E(48):OL3_LEN(8):OL2_LEN(z+7)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, 8, 8, 8, 8, 8, 8,
                                -1, 0, 8, 8, 8, 8, 8, 8,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Do the lookup */
                        ltypes01 = vqtbl1q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl1q_u8(tbl, ytmp128);

                        /* Advance mbuf pointers to mempool pool_id to
                         * fetch the aura
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 56:63 of oltype
                         * and place them in the ol3/ol4type of senddesc_w1
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
                         * a [E(32):E(16):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E(32):E(16):(OL3+OL2):OL2]
                         * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u16(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u16(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);

                } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
                           (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
                        /* Lookup table to translate ol_flags to
                         * ol4type, ol3type, il4type, il3type of senddesc_w1
                         */
                        const uint8x16x2_t tbl = {
                        {
                                {
                                        /* [0-15] = il4type:il3type */
                                        0x04, /* none (IPv6) */
                                        0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
                                        0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
                                        0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
                                        0x03, /* PKT_TX_IP_CKSUM */
                                        0x13, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x02, /* PKT_TX_IPV4 */
                                        0x12, /* PKT_TX_IPV4 |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x22, /* PKT_TX_IPV4 |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x32, /* PKT_TX_IPV4 |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                        0x03, /* PKT_TX_IPV4 |
                                               * PKT_TX_IP_CKSUM
                                               */
                                        0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_TCP_CKSUM
                                               */
                                        0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_SCTP_CKSUM
                                               */
                                        0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
                                               * PKT_TX_UDP_CKSUM
                                               */
                                },

                                {
                                        /* [16-31] = ol4type:ol3type */
                                        0x00, /* none */
                                        0x03, /* OUTER_IP_CKSUM */
                                        0x02, /* OUTER_IPV4 */
                                        0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
                                        0x04, /* OUTER_IPV6 */
                                        0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 */
                                        0x00, /* OUTER_IPV6 | OUTER_IPV4 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x32, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4
                                               */
                                        0x33, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                        0x34, /* OUTER_UDP_CKSUM |
                                               * OUTER_IPV6
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IP_CKSUM
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4
                                               */
                                        0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
                                               * OUTER_IPV4 | OUTER_IP_CKSUM
                                               */
                                },
                        }
                        };

                        /* Extract olflags to translate to oltype & iltype */
                        xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
                        ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);

                        /*
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
                         */
                        const uint32x4_t tshft_4 = {
                                1, 0,
                                1, 0,
                        };
                        senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
                        senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);

                        /*
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
                         */
                        const uint8x16_t shuf_mask5 = {
                                0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
                                0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
                        };
                        senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
                        senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);

                        /* Extract outer and inner header ol_flags */
                        const uint64x2_t oi_cksum_mask = {
                                0x1CF0020000000000,
                                0x1CF0020000000000,
                        };

                        xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
                        ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);

                        /* Extract OUTER_UDP_CKSUM bit 41 and
                         * move it to bit 61
                         */

                        xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
                        ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);

                        /* Shift right oltype by 2 and iltype by 4
                         * to start the oltype nibble at BIT(56) instead of
                         * BIT(58) and the iltype nibble at BIT(48) instead
                         * of BIT(52).
                         */
                        const int8x16_t tshft5 = {
                                8, 8, 8, 8, 8, 8, -4, -2,
                                8, 8, 8, 8, 8, 8, -4, -2,
                        };

                        xtmp128 = vshlq_u8(xtmp128, tshft5);
                        ytmp128 = vshlq_u8(ytmp128, tshft5);
                        /*
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
                         */
                        const int8x16_t tshft3 = {
                                -1, 0, -1, 0, 0, 0, 0, 0,
                                -1, 0, -1, 0, 0, 0, 0, 0,
                        };

                        senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
                        senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);

                        /* Mark Bit(4) of oltype so the lookup below picks
                         * the ol half ([16-31]) of the two-table tbl
                         */
                        const uint64x2_t oi_cksum_mask2 = {
                                0x1000000000000000,
                                0x1000000000000000,
                        };

                        xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
                        ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);

                        /* Do the lookup */
                        ltypes01 = vqtbl2q_u8(tbl, xtmp128);
                        ltypes23 = vqtbl2q_u8(tbl, ytmp128);

                        /* Advance mbuf pointers to mempool pool_id to
                         * fetch the aura
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));

                        /* Pick only relevant fields i.e. bits 48:55 of iltype
                         * and bits 56:63 of oltype and place them in the
                         * corresponding place in senddesc_w1.
                         */
                        const uint8x16_t shuf_mask0 = {
                                0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
                                0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
                        };

                        ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
                        ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);

                        /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
                         * l3len, l2len, ol3len, ol2len.
                         * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
                         * a = a + (a << 8)
                         * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
                         * a = a + (a << 16)
                         * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
                         * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
                         */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                 vshlq_n_u32(senddesc01_w1, 8));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                 vshlq_n_u32(senddesc23_w1, 8));

                        /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);

                        /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
                        senddesc01_w1 = vaddq_u8(senddesc01_w1,
                                                vshlq_n_u32(senddesc01_w1, 16));
                        senddesc23_w1 = vaddq_u8(senddesc23_w1,
                                                vshlq_n_u32(senddesc23_w1, 16));

                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
                        /* Move ltypes to senddesc*_w1 */
                        senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
                        senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);

                        /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                } else {
                        /* Just use ld1q to retrieve aura
                         * when we don't need tx_offload
                         */
                        mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
                                        offsetof(struct rte_mempool, pool_id));
                        mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
                                        offsetof(struct rte_mempool, pool_id));
                        xmask01 = vdupq_n_u64(0);
                        xmask23 = xmask01;
                        asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");

                        asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
                                 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");

                        asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");

                        asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
                                 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
                        xmask01 = vshlq_n_u64(xmask01, 20);
                        xmask23 = vshlq_n_u64(xmask23, 20);

                        senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
                        senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);

                        /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
                        cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
                        cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
                        cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
                        cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
                        cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
                        cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
                        cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
                }
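
                /*
                 * Store the four commands (16 dwords) to the LMT region and
                 * issue a single LMTST to the I/O address; otx2_lmt_submit()
                 * returns 0 when the LMTST did not complete, in which case
                 * the stores and the submit are replayed.
                 */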
                do {
                        vst1q_u64(lmt_addr, cmd00);
                        vst1q_u64(lmt_addr + 2, cmd01);
                        vst1q_u64(lmt_addr + 4, cmd10);
                        vst1q_u64(lmt_addr + 6, cmd11);
                        vst1q_u64(lmt_addr + 8, cmd20);
                        vst1q_u64(lmt_addr + 10, cmd21);
                        vst1q_u64(lmt_addr + 12, cmd30);
                        vst1q_u64(lmt_addr + 14, cmd31);
                        lmt_status = otx2_lmt_submit(io_addr);
                } while (lmt_status == 0);
        }

        return pkts;
}

#else
static __rte_always_inline uint16_t
nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
                     uint16_t pkts, const uint16_t flags)
{
        RTE_SET_USED(tx_queue);
        RTE_SET_USED(tx_pkts);
        RTE_SET_USED(pkts);
        RTE_SET_USED(flags);
        return 0;
}
#endif
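
/*
 * NIX_TX_FASTPATH_MODES expands T() once for every supported combination
 * of the five Tx offload flags, generating one specialized burst function
 * per combination so that `flags' is a compile-time constant inside the
 * always-inlined workers above.
 */
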
#define T(name, f4, f3, f2, f1, f0, sz, flags)                          \
static uint16_t __rte_noinline  __hot                                   \
otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[sz];                                               \
                                                                        \
        return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f4, f3, f2, f1, f0, sz, flags)                          \
static uint16_t __rte_noinline  __hot                                   \
otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
                                                                        \
        return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
                                  (flags) | NIX_TX_MULTI_SEG_F);        \
}

NIX_TX_FASTPATH_MODES
#undef T

#define T(name, f4, f3, f2, f1, f0, sz, flags)                          \
static uint16_t __rte_noinline  __hot                                   \
otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
                        struct rte_mbuf **tx_pkts, uint16_t pkts)       \
{                                                                       \
        /* VLAN and TSTAMP are not supported by the vector path */      \
        if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
            (flags) & NIX_TX_OFFLOAD_TSTAMP_F)                          \
                return 0;                                               \
        return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, (flags));  \
}

NIX_TX_FASTPATH_MODES
#undef T

static inline void
pick_tx_func(struct rte_eth_dev *eth_dev,
             const eth_tx_burst_t tx_burst[2][2][2][2][2])
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        /* [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
        eth_dev->tx_pkt_burst = tx_burst
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
                [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
}

void
otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
{
        struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);

        const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2] = {
#define T(name, f4, f3, f2, f1, f0, sz, flags)                          \
        [f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2] = {
#define T(name, f4, f3, f2, f1, f0, sz, flags)                          \
        [f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_mseg_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2] = {
#define T(name, f4, f3, f2, f1, f0, sz, flags)                          \
        [f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_vec_ ## name,

NIX_TX_FASTPATH_MODES
#undef T
        };

        if (dev->scalar_ena ||
            (dev->tx_offload_flags &
             (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F)))
                pick_tx_func(eth_dev, nix_eth_tx_burst);
        else
                pick_tx_func(eth_dev, nix_eth_tx_vec_burst);

        if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
                pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);

        rte_mb();
}