1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(C) 2019 Marvell International Ltd.
3  */
4
5 #include <rte_vect.h>
6
7 #include "otx2_ethdev.h"
8
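/* Tx flow-control gate: before building any command, make sure the SQ has
 * room for 'pkts' SQEs. The free SQB count (nb_sqb_bufs_adj - *fc_mem) is
 * converted to packets with sqes_per_sqb_log2; if there is still not enough
 * room, the burst function returns 0 without transmitting anything.
 */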
9 #define NIX_XMIT_FC_OR_RETURN(txq, pkts) do {                           \
10         /* Cached value is low, update fc_cache_pkts */                  \
11         if (unlikely((txq)->fc_cache_pkts < (pkts))) {                  \
12         /* Multiply by sqes_per_sqb to express in pkts */               \
13                 (txq)->fc_cache_pkts =                                  \
14                         ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) <<    \
15                                 (txq)->sqes_per_sqb_log2;               \
16                 /* Check again whether there is room */                 \
17                 if (unlikely((txq)->fc_cache_pkts < (pkts)))            \
18                         return 0;                                       \
19         }                                                               \
20 } while (0)
21
22
23 static __rte_always_inline uint16_t
24 nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
25               uint16_t pkts, uint64_t *cmd, const uint16_t flags)
26 {
27         struct otx2_eth_txq *txq = tx_queue; uint16_t i;
28         const rte_iova_t io_addr = txq->io_addr;
29         void *lmt_addr = txq->lmt_addr;
30         uint64_t lso_tun_fmt;
31
32         NIX_XMIT_FC_OR_RETURN(txq, pkts);
33
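        /* Copy the queue's static command template (send header plus the
         * extension sub-descriptors implied by the offload flags) into the
         * local command buffer.
         */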
34         otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));
35
36         /* Perform header writes before barrier for TSO */
37         if (flags & NIX_TX_OFFLOAD_TSO_F) {
38                 lso_tun_fmt = txq->lso_tun_fmt;
39                 for (i = 0; i < pkts; i++)
40                         otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
41         }
42
43         /* Let's commit any changes to the packets here, as no further
44          * changes will be made to them unless fast free is disabled.
45          */
46         if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
47                 rte_io_wmb();
48
49         for (i = 0; i < pkts; i++) {
50                 otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
51                 /* Passing number of segdw as 4: HDR + EXT + SG + SMEM */
52                 otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
53                                              tx_pkts[i]->ol_flags, 4, flags);
54                 otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
55         }
56
57         /* Reduce the cached count */
58         txq->fc_cache_pkts -= pkts;
59
60         return pkts;
61 }
62
63 static __rte_always_inline uint16_t
64 nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
65                    uint16_t pkts, uint64_t *cmd, const uint16_t flags)
66 {
67         struct otx2_eth_txq *txq = tx_queue; uint64_t i;
68         const rte_iova_t io_addr = txq->io_addr;
69         void *lmt_addr = txq->lmt_addr;
70         uint64_t lso_tun_fmt;
71         uint16_t segdw;
72
73         NIX_XMIT_FC_OR_RETURN(txq, pkts);
74
75         otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));
76
77         /* Perform header writes before barrier for TSO */
78         if (flags & NIX_TX_OFFLOAD_TSO_F) {
79                 lso_tun_fmt = txq->lso_tun_fmt;
80                 for (i = 0; i < pkts; i++)
81                         otx2_nix_xmit_prepare_tso(tx_pkts[i], flags);
82         }
83
84         /* Let's commit any changes to the packets here, as no further
85          * changes will be made to them unless fast free is disabled.
86          */
87         if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
88                 rte_io_wmb();
89
90         for (i = 0; i < pkts; i++) {
91                 otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags, lso_tun_fmt);
92                 segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
93                 otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
94                                              tx_pkts[i]->ol_flags, segdw,
95                                              flags);
96                 otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
97         }
98
99         /* Reduce the cached count */
100         txq->fc_cache_pkts -= pkts;
101
102         return pkts;
103 }
104
105 #if defined(RTE_ARCH_ARM64)
106
107 #define NIX_DESCS_PER_LOOP      4
108 static __rte_always_inline uint16_t
109 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
110                      uint16_t pkts, uint64_t *cmd, const uint16_t flags)
111 {
112         uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
113         uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
114         uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
115         uint64x2_t senddesc01_w0, senddesc23_w0;
116         uint64x2_t senddesc01_w1, senddesc23_w1;
117         uint64x2_t sgdesc01_w0, sgdesc23_w0;
118         uint64x2_t sgdesc01_w1, sgdesc23_w1;
119         struct otx2_eth_txq *txq = tx_queue;
120         uint64_t *lmt_addr = txq->lmt_addr;
121         rte_iova_t io_addr = txq->io_addr;
122         uint64x2_t ltypes01, ltypes23;
123         uint64x2_t xtmp128, ytmp128;
124         uint64x2_t xmask01, xmask23;
125         uint64x2_t cmd00, cmd01;
126         uint64x2_t cmd10, cmd11;
127         uint64x2_t cmd20, cmd21;
128         uint64x2_t cmd30, cmd31;
129         uint64_t lmt_status, i;
130         uint16_t pkts_left;
131
132         NIX_XMIT_FC_OR_RETURN(txq, pkts);
133
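        /* The vector loop handles NIX_DESCS_PER_LOOP (4) packets per
         * iteration; any remainder is sent through the scalar path after
         * the loop.
         */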
134         pkts_left = pkts & (NIX_DESCS_PER_LOOP - 1);
135         pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
136
137         /* Reduce the cached count */
138         txq->fc_cache_pkts -= pkts;
139
140         /* Let's commit any changes to the packets here, as no further
141          * changes will be made to them unless fast free is disabled.
142          */
143         if (!(flags & NIX_TX_OFFLOAD_MBUF_NOFF_F))
144                 rte_io_wmb();
145
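        /* Load the per-queue command templates: txq->cmd[0] holds SEND_HDR_W0
         * and txq->cmd[2] holds SEND_SG_W0. Each 128-bit register carries the
         * word for a pair of packets (0-1 and 2-3).
         */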
146         senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
147         senddesc23_w0 = senddesc01_w0;
148         senddesc01_w1 = vdupq_n_u64(0);
149         senddesc23_w1 = senddesc01_w1;
150         sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
151         sgdesc23_w0 = sgdesc01_w0;
152
153         for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
154                 /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
155                 senddesc01_w0 = vbicq_u64(senddesc01_w0,
156                                           vdupq_n_u64(0xFFFFFFFF));
157                 sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
158                                         vdupq_n_u64(0xFFFFFFFF));
159
160                 senddesc23_w0 = senddesc01_w0;
161                 sgdesc23_w0 = sgdesc01_w0;
162
163                 /* Move mbuf pointers to the buf_iova field */
164                 mbuf0 = (uint64_t *)tx_pkts[0];
165                 mbuf1 = (uint64_t *)tx_pkts[1];
166                 mbuf2 = (uint64_t *)tx_pkts[2];
167                 mbuf3 = (uint64_t *)tx_pkts[3];
168
169                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
170                                      offsetof(struct rte_mbuf, buf_iova));
171                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
172                                      offsetof(struct rte_mbuf, buf_iova));
173                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
174                                      offsetof(struct rte_mbuf, buf_iova));
175                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
176                                      offsetof(struct rte_mbuf, buf_iova));
177                 /*
178                  * Get the mbufs' ol_flags, iova, pkt_len and data_off:
179                  * dataoff_iovaX.D[0] = iova,
180                  * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
181                  * len_olflagsX.D[0] = ol_flags,
182                  * len_olflagsX.D[1](63:32) = mbuf->pkt_len
183                  */
184                 dataoff_iova0  = vld1q_u64(mbuf0);
185                 len_olflags0 = vld1q_u64(mbuf0 + 2);
186                 dataoff_iova1  = vld1q_u64(mbuf1);
187                 len_olflags1 = vld1q_u64(mbuf1 + 2);
188                 dataoff_iova2  = vld1q_u64(mbuf2);
189                 len_olflags2 = vld1q_u64(mbuf2 + 2);
190                 dataoff_iova3  = vld1q_u64(mbuf3);
191                 len_olflags3 = vld1q_u64(mbuf3 + 2);
192
193                 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
194                         struct rte_mbuf *mbuf;
195                         /* Set don't free bit if reference count > 1 */
196                         xmask01 = vdupq_n_u64(0);
197                         xmask23 = xmask01;
198
199                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
200                                 offsetof(struct rte_mbuf, buf_iova));
201
202                         if (otx2_nix_prefree_seg(mbuf))
203                                 xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
204                         else
205                                 RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
206                                                         (void **)&mbuf,
207                                                         1, 0);
208
209                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
210                                 offsetof(struct rte_mbuf, buf_iova));
211                         if (otx2_nix_prefree_seg(mbuf))
212                                 xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
213                         else
214                                 RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
215                                                         (void **)&mbuf,
216                                                         1, 0);
217
218                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
219                                 offsetof(struct rte_mbuf, buf_iova));
220                         if (otx2_nix_prefree_seg(mbuf))
221                                 xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
222                         else
223                                 RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
224                                                         (void **)&mbuf,
225                                                         1, 0);
226
227                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
228                                 offsetof(struct rte_mbuf, buf_iova));
229                         if (otx2_nix_prefree_seg(mbuf))
230                                 xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
231                         else
232                                 RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool,
233                                                         (void **)&mbuf,
234                                                         1, 0);
235                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
236                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
237                         /* Ensuring mbuf fields which got updated in
238                          * otx2_nix_prefree_seg are written before LMTST.
239                          */
240                         rte_io_wmb();
241                 } else {
242                         struct rte_mbuf *mbuf;
243                         /* Mark mempool object as "put" since
244                          * it is freed by NIX
245                          */
246                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
247                                 offsetof(struct rte_mbuf, buf_iova));
248                         RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
249                                                 1, 0);
250
251                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
252                                 offsetof(struct rte_mbuf, buf_iova));
253                         RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
254                                                 1, 0);
255
256                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
257                                 offsetof(struct rte_mbuf, buf_iova));
258                         RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
259                                                 1, 0);
260
261                         mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
262                                 offsetof(struct rte_mbuf, buf_iova));
263                         RTE_MEMPOOL_CHECK_COOKIES(mbuf->pool, (void **)&mbuf,
264                                                 1, 0);
265                         RTE_SET_USED(mbuf);
266                 }
267
268                 /* Move mbuf pointers to the pool field */
269                 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
270                          offsetof(struct rte_mbuf, pool) -
271                          offsetof(struct rte_mbuf, buf_iova));
272                 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
273                          offsetof(struct rte_mbuf, pool) -
274                          offsetof(struct rte_mbuf, buf_iova));
275                 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
276                          offsetof(struct rte_mbuf, pool) -
277                          offsetof(struct rte_mbuf, buf_iova));
278                 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
279                          offsetof(struct rte_mbuf, pool) -
280                          offsetof(struct rte_mbuf, buf_iova));
281
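                /* With any checksum offload enabled, load the tx_offload word
                 * (l2/l3 and outer l2/l3 lengths) into senddesc_w1 while
                 * dereferencing the pool pointers; otherwise only the pool
                 * pointer is needed.
                 */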
282                 if (flags &
283                     (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
284                      NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
285                         /* Get tx_offload for ol2, ol3, l2, l3 lengths */
286                         /*
287                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
288                          * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
289                          */
290
291                         asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
292                                       [a]"+w"(senddesc01_w1) :
293                                       [in]"r"(mbuf0 + 2) : "memory");
294
295                         asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
296                                       [a]"+w"(senddesc01_w1) :
297                                       [in]"r"(mbuf1 + 2) : "memory");
298
299                         asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
300                                       [b]"+w"(senddesc23_w1) :
301                                       [in]"r"(mbuf2 + 2) : "memory");
302
303                         asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
304                                       [b]"+w"(senddesc23_w1) :
305                                       [in]"r"(mbuf3 + 2) : "memory");
306
307                         /* Get pool pointer alone */
308                         mbuf0 = (uint64_t *)*mbuf0;
309                         mbuf1 = (uint64_t *)*mbuf1;
310                         mbuf2 = (uint64_t *)*mbuf2;
311                         mbuf3 = (uint64_t *)*mbuf3;
312                 } else {
313                         /* Get pool pointer alone */
314                         mbuf0 = (uint64_t *)*mbuf0;
315                         mbuf1 = (uint64_t *)*mbuf1;
316                         mbuf2 = (uint64_t *)*mbuf2;
317                         mbuf3 = (uint64_t *)*mbuf3;
318                 }
319
320                 const uint8x16_t shuf_mask2 = {
321                         0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
322                         0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
323                 };
324                 xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
325                 ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
326
327                 /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
328                 const uint64x2_t and_mask0 = {
329                         0xFFFFFFFFFFFFFFFF,
330                         0x000000000000FFFF,
331                 };
332
333                 dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
334                 dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
335                 dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
336                 dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
337
338                 /*
339                  * Pick only 16 bits of pktlen present at bits 63:32
340                  * and place them at bits 15:0.
341                  */
342                 xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
343                 ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
344
345                 /* Add pairwise to get dataoff + iova in sgdesc_w1 */
346                 sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
347                 sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
348
349                 /* OR both sgdesc_w0 and senddesc_w0 with the 16 bits of
350                  * pktlen at 15:0 position.
351                  */
352                 sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
353                 sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
354                 senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
355                 senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
356
357                 if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
358                     !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
359                         /*
360                          * Lookup table to translate ol_flags to
361                          * il3/il4 types. But we still use ol3/ol4 types in
362                          * senddesc_w1 as only one header processing is enabled.
363                          */
364                         const uint8x16_t tbl = {
365                                 /* [0-15] = il4type:il3type */
366                                 0x04, /* none (IPv6 assumed) */
367                                 0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
368                                 0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
369                                 0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
370                                 0x03, /* PKT_TX_IP_CKSUM */
371                                 0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
372                                 0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
373                                 0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
374                                 0x02, /* PKT_TX_IPV4  */
375                                 0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
376                                 0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
377                                 0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
378                                 0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
379                                 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
380                                        * PKT_TX_TCP_CKSUM
381                                        */
382                                 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
383                                        * PKT_TX_SCTP_CKSUM
384                                        */
385                                 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
386                                        * PKT_TX_UDP_CKSUM
387                                        */
388                         };
389
390                         /* Extract olflags to translate to iltypes */
391                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
392                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
393
394                         /*
395                          * E(47):L3_LEN(9):L2_LEN(7+z)
396                          * E(47):L3_LEN(9):L2_LEN(7+z)
397                          */
398                         senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
399                         senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
400
401                         /* Move OLFLAGS bits 55:52 to 51:48
402                          * with zeros prepended on the byte and rest
403                          * don't care
404                          */
405                         xtmp128 = vshrq_n_u8(xtmp128, 4);
406                         ytmp128 = vshrq_n_u8(ytmp128, 4);
407                         /*
408                          * E(48):L3_LEN(8):L2_LEN(z+7)
409                          * E(48):L3_LEN(8):L2_LEN(z+7)
410                          */
411                         const int8x16_t tshft3 = {
412                                 -1, 0, 8, 8, 8, 8, 8, 8,
413                                 -1, 0, 8, 8, 8, 8, 8, 8,
414                         };
415
416                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
417                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
418
419                         /* Do the lookup */
420                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
421                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
422
423                         /* Adjust the mbuf pointers to pool_id so the
424                          * aura can be retrieved with ld1
425                          */
426                         mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
427                                         offsetof(struct rte_mempool, pool_id));
428                         mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
429                                         offsetof(struct rte_mempool, pool_id));
430                         mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
431                                         offsetof(struct rte_mempool, pool_id));
432                         mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
433                                         offsetof(struct rte_mempool, pool_id));
434
435                         /* Pick only the relevant field, i.e. bits 48:55 of iltype,
436                          * and place it in ol3/ol4type of senddesc_w1
437                          */
438                         const uint8x16_t shuf_mask0 = {
439                                 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
440                                 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
441                         };
442
443                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
444                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
445
446                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
447                          * a [E(32):E(16):OL3(8):OL2(8)]
448                          * a = a + (a << 8)
449                          * a [E(32):E(16):(OL3+OL2):OL2]
450                          * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
451                          */
452                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
453                                                  vshlq_n_u16(senddesc01_w1, 8));
454                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
455                                                  vshlq_n_u16(senddesc23_w1, 8));
456
457                         /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
458                         cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
459                         cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
460                         cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
461                         cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
462
463                         xmask01 = vdupq_n_u64(0);
464                         xmask23 = xmask01;
465                         asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
466                                 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
467
468                         asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
469                                  [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
470
471                         asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
472                                  [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
473
474                         asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
475                                  [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
476                         xmask01 = vshlq_n_u64(xmask01, 20);
477                         xmask23 = vshlq_n_u64(xmask23, 20);
478
479                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
480                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
481                         /* Move ltypes to senddesc*_w1 */
482                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
483                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
484
485                         /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
486                         cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
487                         cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
488                         cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
489                         cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
490
491                 } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
492                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
493                         /*
494                          * Lookup table to translate ol_flags to
495                          * ol3/ol4 types.
496                          */
497
498                         const uint8x16_t tbl = {
499                                 /* [0-15] = ol4type:ol3type */
500                                 0x00, /* none */
501                                 0x03, /* OUTER_IP_CKSUM */
502                                 0x02, /* OUTER_IPV4 */
503                                 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
504                                 0x04, /* OUTER_IPV6 */
505                                 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
506                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
507                                 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
508                                        * OUTER_IP_CKSUM
509                                        */
510                                 0x00, /* OUTER_UDP_CKSUM */
511                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
512                                 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
513                                 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
514                                        * OUTER_IP_CKSUM
515                                        */
516                                 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
517                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
518                                        * OUTER_IP_CKSUM
519                                        */
520                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
521                                        * OUTER_IPV4
522                                        */
523                                 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
524                                        * OUTER_IPV4 | OUTER_IP_CKSUM
525                                        */
526                         };
527
528                         /* Extract olflags to translate to iltypes */
529                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
530                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
531
532                         /*
533                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
534                          * E(47):OL3_LEN(9):OL2_LEN(7+z)
535                          */
536                         const uint8x16_t shuf_mask5 = {
537                                 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
538                                 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
539                         };
540                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
541                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
542
543                         /* Extract outer ol flags only */
544                         const uint64x2_t o_cksum_mask = {
545                                 0x1C00020000000000,
546                                 0x1C00020000000000,
547                         };
548
549                         xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
550                         ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
551
552                         /* Extract OUTER_UDP_CKSUM bit 41 and
553                          * move it to bit 61
554                          */
555
556                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
557                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
558
559                         /* Shift oltype by 2 to start nibble from BIT(56)
560                          * instead of BIT(58)
561                          */
562                         xtmp128 = vshrq_n_u8(xtmp128, 2);
563                         ytmp128 = vshrq_n_u8(ytmp128, 2);
564                         /*
565                          * E(48):L3_LEN(8):L2_LEN(z+7)
566                          * E(48):L3_LEN(8):L2_LEN(z+7)
567                          */
568                         const int8x16_t tshft3 = {
569                                 -1, 0, 8, 8, 8, 8, 8, 8,
570                                 -1, 0, 8, 8, 8, 8, 8, 8,
571                         };
572
573                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
574                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
575
576                         /* Do the lookup */
577                         ltypes01 = vqtbl1q_u8(tbl, xtmp128);
578                         ltypes23 = vqtbl1q_u8(tbl, ytmp128);
579
580                         /* Adjust the mbuf pointers to pool_id so the
581                          * aura can be retrieved with ld1
582                          */
583                         mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
584                                         offsetof(struct rte_mempool, pool_id));
585                         mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
586                                         offsetof(struct rte_mempool, pool_id));
587                         mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
588                                         offsetof(struct rte_mempool, pool_id));
589                         mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
590                                         offsetof(struct rte_mempool, pool_id));
591
592                         /* Pick only the relevant field, i.e. bits 56:63 of oltype,
593                          * and place it in ol3/ol4type of senddesc_w1
594                          */
595                         const uint8x16_t shuf_mask0 = {
596                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
597                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
598                         };
599
600                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
601                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
602
603                         /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
604                          * a [E(32):E(16):OL3(8):OL2(8)]
605                          * a = a + (a << 8)
606                          * a [E(32):E(16):(OL3+OL2):OL2]
607                          * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
608                          */
609                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
610                                                  vshlq_n_u16(senddesc01_w1, 8));
611                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
612                                                  vshlq_n_u16(senddesc23_w1, 8));
613
614                         /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
615                         cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
616                         cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
617                         cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
618                         cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
619
620                         xmask01 = vdupq_n_u64(0);
621                         xmask23 = xmask01;
622                         asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
623                                  [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
624
625                         asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
626                                  [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
627
628                         asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
629                                  [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
630
631                         asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
632                                  [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
633                         xmask01 = vshlq_n_u64(xmask01, 20);
634                         xmask23 = vshlq_n_u64(xmask23, 20);
635
636                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
637                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
638                         /* Move ltypes to senddesc*_w1 */
639                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
640                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
641
642                         /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
643                         cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
644                         cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
645                         cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
646                         cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
647
648                 } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
649                            (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
650                         /* Lookup table to translate ol_flags to
651                          * ol4type, ol3type, il4type, il3type of senddesc_w1
652                          */
653                         const uint8x16x2_t tbl = {
654                         {
655                                 {
656                                         /* [0-15] = il4type:il3type */
657                                         0x04, /* none (IPv6) */
658                                         0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
659                                         0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
660                                         0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
661                                         0x03, /* PKT_TX_IP_CKSUM */
662                                         0x13, /* PKT_TX_IP_CKSUM |
663                                                * PKT_TX_TCP_CKSUM
664                                                */
665                                         0x23, /* PKT_TX_IP_CKSUM |
666                                                * PKT_TX_SCTP_CKSUM
667                                                */
668                                         0x33, /* PKT_TX_IP_CKSUM |
669                                                * PKT_TX_UDP_CKSUM
670                                                */
671                                         0x02, /* PKT_TX_IPV4 */
672                                         0x12, /* PKT_TX_IPV4 |
673                                                * PKT_TX_TCP_CKSUM
674                                                */
675                                         0x22, /* PKT_TX_IPV4 |
676                                                * PKT_TX_SCTP_CKSUM
677                                                */
678                                         0x32, /* PKT_TX_IPV4 |
679                                                * PKT_TX_UDP_CKSUM
680                                                */
681                                         0x03, /* PKT_TX_IPV4 |
682                                                * PKT_TX_IP_CKSUM
683                                                */
684                                         0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
685                                                * PKT_TX_TCP_CKSUM
686                                                */
687                                         0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
688                                                * PKT_TX_SCTP_CKSUM
689                                                */
690                                         0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
691                                                * PKT_TX_UDP_CKSUM
692                                                */
693                                 },
694
695                                 {
696                                         /* [16-31] = ol4type:ol3type */
697                                         0x00, /* none */
698                                         0x03, /* OUTER_IP_CKSUM */
699                                         0x02, /* OUTER_IPV4 */
700                                         0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
701                                         0x04, /* OUTER_IPV6 */
702                                         0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
703                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 */
704                                         0x00, /* OUTER_IPV6 | OUTER_IPV4 |
705                                                * OUTER_IP_CKSUM
706                                                */
707                                         0x00, /* OUTER_UDP_CKSUM */
708                                         0x33, /* OUTER_UDP_CKSUM |
709                                                * OUTER_IP_CKSUM
710                                                */
711                                         0x32, /* OUTER_UDP_CKSUM |
712                                                * OUTER_IPV4
713                                                */
714                                         0x33, /* OUTER_UDP_CKSUM |
715                                                * OUTER_IPV4 | OUTER_IP_CKSUM
716                                                */
717                                         0x34, /* OUTER_UDP_CKSUM |
718                                                * OUTER_IPV6
719                                                */
720                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
721                                                * OUTER_IP_CKSUM
722                                                */
723                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
724                                                * OUTER_IPV4
725                                                */
726                                         0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
727                                                * OUTER_IPV4 | OUTER_IP_CKSUM
728                                                */
729                                 },
730                         }
731                         };
732
733                         /* Extract olflags to translate to oltype & iltype */
734                         xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
735                         ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
736
737                         /*
738                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
739                          * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
740                          */
741                         const uint32x4_t tshft_4 = {
742                                 1, 0,
743                                 1, 0,
744                         };
745                         senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
746                         senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
747
748                         /*
749                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
750                          * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
751                          */
752                         const uint8x16_t shuf_mask5 = {
753                                 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
754                                 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
755                         };
756                         senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
757                         senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
758
759                         /* Extract outer and inner header ol_flags */
760                         const uint64x2_t oi_cksum_mask = {
761                                 0x1CF0020000000000,
762                                 0x1CF0020000000000,
763                         };
764
765                         xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
766                         ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
767
768                         /* Extract OUTER_UDP_CKSUM bit 41 and
769                          * move it to bit 61
770                          */
771
772                         xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
773                         ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
774
775                         /* Shift right oltype by 2 and iltype by 4
776                          * to start oltype nibble from BIT(56)
777                          * instead of BIT(58) and iltype nibble from BIT(48)
778                          * instead of BIT(52).
779                          */
780                         const int8x16_t tshft5 = {
781                                 8, 8, 8, 8, 8, 8, -4, -2,
782                                 8, 8, 8, 8, 8, 8, -4, -2,
783                         };
784
785                         xtmp128 = vshlq_u8(xtmp128, tshft5);
786                         ytmp128 = vshlq_u8(ytmp128, tshft5);
787                         /*
788                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
789                          * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
790                          */
791                         const int8x16_t tshft3 = {
792                                 -1, 0, -1, 0, 0, 0, 0, 0,
793                                 -1, 0, -1, 0, 0, 0, 0, 0,
794                         };
795
796                         senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
797                         senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
798
799                         /* Mark Bit(4) of oltype */
800                         const uint64x2_t oi_cksum_mask2 = {
801                                 0x1000000000000000,
802                                 0x1000000000000000,
803                         };
804
805                         xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
806                         ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
807
808                         /* Do the lookup */
809                         ltypes01 = vqtbl2q_u8(tbl, xtmp128);
810                         ltypes23 = vqtbl2q_u8(tbl, ytmp128);
811
812                         /* Adjust the mbuf pointers to pool_id so the
813                          * aura can be retrieved with ld1
814                          */
815                         mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
816                                         offsetof(struct rte_mempool, pool_id));
817                         mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
818                                         offsetof(struct rte_mempool, pool_id));
819                         mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
820                                         offsetof(struct rte_mempool, pool_id));
821                         mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
822                                         offsetof(struct rte_mempool, pool_id));
823
824                         /* Pick only the relevant fields, i.e. bits 48:55 of iltype and
825                          * Bit 56:63 of oltype and place it in corresponding
826                          * place in senddesc_w1.
827                          */
828                         const uint8x16_t shuf_mask0 = {
829                                 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
830                                 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
831                         };
832
833                         ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
834                         ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
835
836                         /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
837                          * l3len, l2len, ol3len, ol2len.
838                          * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
839                          * a = a + (a << 8)
840                          * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
841                          * a = a + (a << 16)
842                          * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
843                          * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
844                          */
845                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
846                                                  vshlq_n_u32(senddesc01_w1, 8));
847                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
848                                                  vshlq_n_u32(senddesc23_w1, 8));
849
850                         /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
851                         cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
852                         cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
853                         cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
854                         cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
855
856                         /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
857                         senddesc01_w1 = vaddq_u8(senddesc01_w1,
858                                                 vshlq_n_u32(senddesc01_w1, 16));
859                         senddesc23_w1 = vaddq_u8(senddesc23_w1,
860                                                 vshlq_n_u32(senddesc23_w1, 16));
861
862                         xmask01 = vdupq_n_u64(0);
863                         xmask23 = xmask01;
864                         asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
865                                  [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
866
867                         asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
868                                  [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
869
870                         asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
871                                  [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
872
873                         asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
874                                  [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
875                         xmask01 = vshlq_n_u64(xmask01, 20);
876                         xmask23 = vshlq_n_u64(xmask23, 20);
877
878                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
879                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
880                         /* Move ltypes to senddesc*_w1 */
881                         senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
882                         senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
883
884                         /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
885                         cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
886                         cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
887                         cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
888                         cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
889                 } else {
890                         /* Just use ld1q to retrieve aura
891                          * when we don't need tx_offload
892                          */
893                         mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
894                                         offsetof(struct rte_mempool, pool_id));
895                         mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
896                                         offsetof(struct rte_mempool, pool_id));
897                         mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
898                                         offsetof(struct rte_mempool, pool_id));
899                         mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
900                                         offsetof(struct rte_mempool, pool_id));
901                         xmask01 = vdupq_n_u64(0);
902                         xmask23 = xmask01;
903                         asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
904                                  [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
905
906                         asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
907                                  [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
908
909                         asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
910                                  [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
911
912                         asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
913                                  [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
914                         xmask01 = vshlq_n_u64(xmask01, 20);
915                         xmask23 = vshlq_n_u64(xmask23, 20);
916
917                         senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
918                         senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
919
920                         /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
921                         cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
922                         cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
923                         cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
924                         cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
925                         cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
926                         cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
927                         cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
928                         cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
929                 }
930
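                /* Store the 4W commands for all four packets to the LMT line
                 * and issue the LMTST; a zero status indicates the LMTST was
                 * not successful and the submission is retried.
                 */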
931                 do {
932                         vst1q_u64(lmt_addr, cmd00);
933                         vst1q_u64(lmt_addr + 2, cmd01);
934                         vst1q_u64(lmt_addr + 4, cmd10);
935                         vst1q_u64(lmt_addr + 6, cmd11);
936                         vst1q_u64(lmt_addr + 8, cmd20);
937                         vst1q_u64(lmt_addr + 10, cmd21);
938                         vst1q_u64(lmt_addr + 12, cmd30);
939                         vst1q_u64(lmt_addr + 14, cmd31);
940                         lmt_status = otx2_lmt_submit(io_addr);
941
942                 } while (lmt_status == 0);
943                 tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
944         }
945
946         if (unlikely(pkts_left))
947                 pkts += nix_xmit_pkts(tx_queue, tx_pkts, pkts_left, cmd, flags);
948
949         return pkts;
950 }
951
952 #else
953 static __rte_always_inline uint16_t
954 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
955                      uint16_t pkts, uint64_t *cmd, const uint16_t flags)
956 {
957         RTE_SET_USED(tx_queue);
958         RTE_SET_USED(tx_pkts);
959         RTE_SET_USED(pkts);
960         RTE_SET_USED(cmd);
961         RTE_SET_USED(flags);
962         return 0;
963 }
964 #endif
965
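/* NIX_TX_FASTPATH_MODES expands T() once per supported combination of Tx
 * offload flags, generating a dedicated burst function for each combination.
 */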
966 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
967 static uint16_t __rte_noinline  __rte_hot                                       \
968 otx2_nix_xmit_pkts_ ## name(void *tx_queue,                             \
969                         struct rte_mbuf **tx_pkts, uint16_t pkts)       \
970 {                                                                       \
971         uint64_t cmd[sz];                                               \
972                                                                         \
973         /* For TSO inner checksum is a must */                          \
974         if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
975             !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
976                 return 0;                                               \
977         return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags);      \
978 }
979
980 NIX_TX_FASTPATH_MODES
981 #undef T
982
983 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
984 static uint16_t __rte_noinline  __rte_hot                                       \
985 otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue,                        \
986                         struct rte_mbuf **tx_pkts, uint16_t pkts)       \
987 {                                                                       \
988         uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2];                 \
989                                                                         \
990         /* For TSO inner checksum is a must */                          \
991         if (((flags) & NIX_TX_OFFLOAD_TSO_F) &&                         \
992             !((flags) & NIX_TX_OFFLOAD_L3_L4_CSUM_F))                   \
993                 return 0;                                               \
994         return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd,         \
995                                   (flags) | NIX_TX_MULTI_SEG_F);        \
996 }
997
998 NIX_TX_FASTPATH_MODES
999 #undef T
1000
1001 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
1002 static uint16_t __rte_noinline  __rte_hot                                       \
1003 otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue,                         \
1004                         struct rte_mbuf **tx_pkts, uint16_t pkts)       \
1005 {                                                                       \
1006         uint64_t cmd[sz];                                               \
1007                                                                         \
1008         /* VLAN, TSTMP, TSO are not supported by the vector Tx */       \
1009         if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F ||                     \
1010             (flags) & NIX_TX_OFFLOAD_TSTAMP_F ||                        \
1011             (flags) & NIX_TX_OFFLOAD_TSO_F)                             \
1012                 return 0;                                               \
1013         return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, cmd, (flags)); \
1014 }
1015
1016 NIX_TX_FASTPATH_MODES
1017 #undef T
1018
1019 static inline void
1020 pick_tx_func(struct rte_eth_dev *eth_dev,
1021              const eth_tx_burst_t tx_burst[2][2][2][2][2][2][2])
1022 {
1023         struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);
1024
1025         /* [SEC] [TSO] [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
1026         eth_dev->tx_pkt_burst = tx_burst
1027                 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_SECURITY_F)]
1028                 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSO_F)]
1029                 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
1030                 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
1031                 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
1032                 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
1033                 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
1034 }
1035
1036 void
1037 otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
1038 {
1039         struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);
1040
1041         const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2][2][2] = {
1042 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
1043         [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_ ## name,
1044
1045 NIX_TX_FASTPATH_MODES
1046 #undef T
1047         };
1048
1049         const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2][2][2] = {
1050 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
1051         [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_mseg_ ## name,
1052
1053 NIX_TX_FASTPATH_MODES
1054 #undef T
1055         };
1056
1057         const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2][2][2] = {
1058 #define T(name, f6, f5, f4, f3, f2, f1, f0, sz, flags)                  \
1059         [f6][f5][f4][f3][f2][f1][f0] =  otx2_nix_xmit_pkts_vec_ ## name,
1060
1061 NIX_TX_FASTPATH_MODES
1062 #undef T
1063         };
1064
1065         if (dev->scalar_ena ||
1066             (dev->tx_offload_flags &
1067              (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F |
1068               NIX_TX_OFFLOAD_TSO_F)))
1069                 pick_tx_func(eth_dev, nix_eth_tx_burst);
1070         else
1071                 pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
1072
1073         if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
1074                 pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
1075
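        /* Make sure the tx_pkt_burst function pointer updates above are
         * visible before the datapath starts using them.
         */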
1076         rte_mb();
1077 }