1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(C) 2019 Marvell International Ltd.
7 #include "otx2_ethdev.h"
9 #define NIX_XMIT_FC_OR_RETURN(txq, pkts) do { \
10 /* Cached value is low, so update fc_cache_pkts */ \
11 if (unlikely((txq)->fc_cache_pkts < (pkts))) { \
12 /* Multiply by sqes_per_sqb to express it in packets */ \
13 (txq)->fc_cache_pkts = \
14 ((txq)->nb_sqb_bufs_adj - *(txq)->fc_mem) << \
15 (txq)->sqes_per_sqb_log2; \
16 /* Check again whether there is enough room */ \
17 if (unlikely((txq)->fc_cache_pkts < (pkts))) \
23 static __rte_always_inline uint16_t
24 nix_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
25 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
27 struct otx2_eth_txq *txq = tx_queue; uint16_t i;
28 const rte_iova_t io_addr = txq->io_addr;
29 void *lmt_addr = txq->lmt_addr;
31 NIX_XMIT_FC_OR_RETURN(txq, pkts);
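/*
 * NIX_XMIT_FC_OR_RETURN() above refreshes the cached flow-control credit
 * as (nb_sqb_bufs_adj - *fc_mem) << sqes_per_sqb_log2. Illustrative
 * numbers only: with nb_sqb_bufs_adj = 512, *fc_mem = 500 and
 * sqes_per_sqb_log2 = 5, the cache becomes (512 - 500) << 5 = 384
 * packets; if that is still below the requested burst size, the macro
 * makes this function return without queueing anything.
 */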
33 otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));
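/*
 * otx2_lmt_mov() copies the per-queue constant command words (send
 * header and, when enabled, the extension sub-descriptor) from
 * txq->cmd[] into the local cmd[] buffer once per burst, so the loop
 * below only needs to patch the per-packet fields.
 */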
35 /* Let's commit any changes in the packet */
38 for (i = 0; i < pkts; i++) {
39 otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
40 /* Passing number of segdw as 4: HDR + EXT + SG + SMEM */
41 otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
42 tx_pkts[i]->ol_flags, 4, flags);
43 otx2_nix_xmit_one(cmd, lmt_addr, io_addr, flags);
46 /* Reduce the cached count */
47 txq->fc_cache_pkts -= pkts;
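/*
 * Note: in this scalar path each packet issues its own LMTST to io_addr
 * inside otx2_nix_xmit_one(), whereas the vector path below batches four
 * packets per submit.
 */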
52 static __rte_always_inline uint16_t
53 nix_xmit_pkts_mseg(void *tx_queue, struct rte_mbuf **tx_pkts,
54 uint16_t pkts, uint64_t *cmd, const uint16_t flags)
56 struct otx2_eth_txq *txq = tx_queue; uint64_t i;
57 const rte_iova_t io_addr = txq->io_addr;
58 void *lmt_addr = txq->lmt_addr;
61 NIX_XMIT_FC_OR_RETURN(txq, pkts);
63 otx2_lmt_mov(cmd, &txq->cmd[0], otx2_nix_tx_ext_subs(flags));
65 /* Let's commit any changes in the packet */
68 for (i = 0; i < pkts; i++) {
69 otx2_nix_xmit_prepare(tx_pkts[i], cmd, flags);
70 segdw = otx2_nix_prepare_mseg(tx_pkts[i], cmd, flags);
71 otx2_nix_xmit_prepare_tstamp(cmd, &txq->cmd[0],
72 tx_pkts[i]->ol_flags, segdw,
74 otx2_nix_xmit_mseg_one(cmd, lmt_addr, io_addr, segdw);
77 /* Reduce the cached count */
78 txq->fc_cache_pkts -= pkts;
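/*
 * Multi-segment path: otx2_nix_prepare_mseg() emits one SG entry per
 * mbuf segment and returns the resulting descriptor size (segdw), which
 * then sizes both the timestamp prepare and the LMTST, instead of the
 * fixed value of 4 used by the single-segment path above.
 */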
83 #if defined(RTE_ARCH_ARM64)
85 #define NIX_DESCS_PER_LOOP 4
86 static __rte_always_inline uint16_t
87 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
88 uint16_t pkts, const uint16_t flags)
90 uint64x2_t dataoff_iova0, dataoff_iova1, dataoff_iova2, dataoff_iova3;
91 uint64x2_t len_olflags0, len_olflags1, len_olflags2, len_olflags3;
92 uint64_t *mbuf0, *mbuf1, *mbuf2, *mbuf3;
93 uint64x2_t senddesc01_w0, senddesc23_w0;
94 uint64x2_t senddesc01_w1, senddesc23_w1;
95 uint64x2_t sgdesc01_w0, sgdesc23_w0;
96 uint64x2_t sgdesc01_w1, sgdesc23_w1;
97 struct otx2_eth_txq *txq = tx_queue;
98 uint64_t *lmt_addr = txq->lmt_addr;
99 rte_iova_t io_addr = txq->io_addr;
100 uint64x2_t ltypes01, ltypes23;
101 uint64x2_t xtmp128, ytmp128;
102 uint64x2_t xmask01, xmask23;
103 uint64x2_t mbuf01, mbuf23;
104 uint64x2_t cmd00, cmd01;
105 uint64x2_t cmd10, cmd11;
106 uint64x2_t cmd20, cmd21;
107 uint64x2_t cmd30, cmd31;
108 uint64_t lmt_status, i;
110 pkts = RTE_ALIGN_FLOOR(pkts, NIX_DESCS_PER_LOOP);
112 NIX_XMIT_FC_OR_RETURN(txq, pkts);
114 /* Reduce the cached count */
115 txq->fc_cache_pkts -= pkts;
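/*
 * Unlike the scalar paths, the flow-control credit is consumed up front
 * for the whole (already floored) burst, since every group of four
 * packets below is retried until its LMTST succeeds.
 */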
117 /* Let's commit any changes in the packet */
120 senddesc01_w0 = vld1q_dup_u64(&txq->cmd[0]);
121 senddesc23_w0 = senddesc01_w0;
122 senddesc01_w1 = vdupq_n_u64(0);
123 senddesc23_w1 = senddesc01_w1;
124 sgdesc01_w0 = vld1q_dup_u64(&txq->cmd[2]);
125 sgdesc23_w0 = sgdesc01_w0;
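/*
 * The send header and SG word0 templates are loaded once per burst from
 * txq->cmd[]; inside the loop only their low 32 bits (the per-packet
 * length fields) are cleared and refilled for each group of four mbufs.
 */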
127 for (i = 0; i < pkts; i += NIX_DESCS_PER_LOOP) {
128 mbuf01 = vld1q_u64((uint64_t *)tx_pkts);
129 mbuf23 = vld1q_u64((uint64_t *)(tx_pkts + 2));
131 /* Clear lower 32bit of SEND_HDR_W0 and SEND_SG_W0 */
132 senddesc01_w0 = vbicq_u64(senddesc01_w0,
133 vdupq_n_u64(0xFFFFFFFF));
134 sgdesc01_w0 = vbicq_u64(sgdesc01_w0,
135 vdupq_n_u64(0xFFFFFFFF));
137 senddesc23_w0 = senddesc01_w0;
138 sgdesc23_w0 = sgdesc01_w0;
140 tx_pkts = tx_pkts + NIX_DESCS_PER_LOOP;
142 /* Extract mbuf pointers and advance them to the buf_iova field */
143 mbuf0 = (uint64_t *)vgetq_lane_u64(mbuf01, 0);
144 mbuf1 = (uint64_t *)vgetq_lane_u64(mbuf01, 1);
145 mbuf2 = (uint64_t *)vgetq_lane_u64(mbuf23, 0);
146 mbuf3 = (uint64_t *)vgetq_lane_u64(mbuf23, 1);
148 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
149 offsetof(struct rte_mbuf, buf_iova));
150 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
151 offsetof(struct rte_mbuf, buf_iova));
152 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
153 offsetof(struct rte_mbuf, buf_iova));
154 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
155 offsetof(struct rte_mbuf, buf_iova));
157 * Get each mbuf's ol_flags, iova, pktlen and dataoff:
158 * dataoff_iovaX.D[0] = iova,
159 * dataoff_iovaX.D[1](15:0) = mbuf->dataoff
160 * len_olflagsX.D[0] = ol_flags,
161 * len_olflagsX.D[1](63:32) = mbuf->pkt_len
163 dataoff_iova0 = vld1q_u64(mbuf0);
164 len_olflags0 = vld1q_u64(mbuf0 + 2);
165 dataoff_iova1 = vld1q_u64(mbuf1);
166 len_olflags1 = vld1q_u64(mbuf1 + 2);
167 dataoff_iova2 = vld1q_u64(mbuf2);
168 len_olflags2 = vld1q_u64(mbuf2 + 2);
169 dataoff_iova3 = vld1q_u64(mbuf3);
170 len_olflags3 = vld1q_u64(mbuf3 + 2);
172 if (flags & NIX_TX_OFFLOAD_MBUF_NOFF_F) {
173 struct rte_mbuf *mbuf;
174 /* Set don't free bit if reference count > 1 */
175 xmask01 = vdupq_n_u64(0);
178 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
179 offsetof(struct rte_mbuf, buf_iova));
181 if (otx2_nix_prefree_seg(mbuf))
182 xmask01 = vsetq_lane_u64(0x80000, xmask01, 0);
184 __mempool_check_cookies(mbuf->pool,
188 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
189 offsetof(struct rte_mbuf, buf_iova));
190 if (otx2_nix_prefree_seg(mbuf))
191 xmask01 = vsetq_lane_u64(0x80000, xmask01, 1);
193 __mempool_check_cookies(mbuf->pool,
197 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
198 offsetof(struct rte_mbuf, buf_iova));
199 if (otx2_nix_prefree_seg(mbuf))
200 xmask23 = vsetq_lane_u64(0x80000, xmask23, 0);
202 __mempool_check_cookies(mbuf->pool,
206 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
207 offsetof(struct rte_mbuf, buf_iova));
208 if (otx2_nix_prefree_seg(mbuf))
209 xmask23 = vsetq_lane_u64(0x80000, xmask23, 1);
211 __mempool_check_cookies(mbuf->pool,
214 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
215 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
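/*
 * The 0x80000 lanes ORed in above set the don't-free bit of the send
 * header word0 (assumed here to be bit 19 of SEND_HDR_W0), so the
 * hardware does not return those buffers to the aura;
 * otx2_nix_prefree_seg() decides this per mbuf, e.g. when the mbuf
 * reference count is greater than one.
 */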
217 struct rte_mbuf *mbuf;
218 /* Mark mempool object as "put" since
221 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf0 -
222 offsetof(struct rte_mbuf, buf_iova));
223 __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
226 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf1 -
227 offsetof(struct rte_mbuf, buf_iova));
228 __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
231 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf2 -
232 offsetof(struct rte_mbuf, buf_iova));
233 __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
236 mbuf = (struct rte_mbuf *)((uintptr_t)mbuf3 -
237 offsetof(struct rte_mbuf, buf_iova));
238 __mempool_check_cookies(mbuf->pool, (void **)&mbuf,
243 /* Advance mbuf pointers to point at the pool field */
244 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
245 offsetof(struct rte_mbuf, pool) -
246 offsetof(struct rte_mbuf, buf_iova));
247 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
248 offsetof(struct rte_mbuf, pool) -
249 offsetof(struct rte_mbuf, buf_iova));
250 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
251 offsetof(struct rte_mbuf, pool) -
252 offsetof(struct rte_mbuf, buf_iova));
253 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
254 offsetof(struct rte_mbuf, pool) -
255 offsetof(struct rte_mbuf, buf_iova));
258 (NIX_TX_OFFLOAD_OL3_OL4_CSUM_F |
259 NIX_TX_OFFLOAD_L3_L4_CSUM_F)) {
260 /* Get tx_offload for ol2, ol3, l2, l3 lengths */
262 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
263 * E(8):OL2_LEN(7):OL3_LEN(9):E(24):L3_LEN(9):L2_LEN(7)
266 asm volatile ("LD1 {%[a].D}[0],[%[in]]\n\t" :
267 [a]"+w"(senddesc01_w1) :
268 [in]"r"(mbuf0 + 2) : "memory");
270 asm volatile ("LD1 {%[a].D}[1],[%[in]]\n\t" :
271 [a]"+w"(senddesc01_w1) :
272 [in]"r"(mbuf1 + 2) : "memory");
274 asm volatile ("LD1 {%[b].D}[0],[%[in]]\n\t" :
275 [b]"+w"(senddesc23_w1) :
276 [in]"r"(mbuf2 + 2) : "memory");
278 asm volatile ("LD1 {%[b].D}[1],[%[in]]\n\t" :
279 [b]"+w"(senddesc23_w1) :
280 [in]"r"(mbuf3 + 2) : "memory");
282 /* Get pool pointer alone */
283 mbuf0 = (uint64_t *)*mbuf0;
284 mbuf1 = (uint64_t *)*mbuf1;
285 mbuf2 = (uint64_t *)*mbuf2;
286 mbuf3 = (uint64_t *)*mbuf3;
288 /* Get pool pointer alone */
289 mbuf0 = (uint64_t *)*mbuf0;
290 mbuf1 = (uint64_t *)*mbuf1;
291 mbuf2 = (uint64_t *)*mbuf2;
292 mbuf3 = (uint64_t *)*mbuf3;
295 const uint8x16_t shuf_mask2 = {
296 0x4, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
297 0xc, 0xd, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
299 xtmp128 = vzip2q_u64(len_olflags0, len_olflags1);
300 ytmp128 = vzip2q_u64(len_olflags2, len_olflags3);
302 /* Clear dataoff_iovaX.D[1] bits other than dataoff(15:0) */
303 const uint64x2_t and_mask0 = {
308 dataoff_iova0 = vandq_u64(dataoff_iova0, and_mask0);
309 dataoff_iova1 = vandq_u64(dataoff_iova1, and_mask0);
310 dataoff_iova2 = vandq_u64(dataoff_iova2, and_mask0);
311 dataoff_iova3 = vandq_u64(dataoff_iova3, and_mask0);
314 * Pick only 16 bits of pktlen present at bits 63:32
315 * and place them at bits 15:0.
317 xtmp128 = vqtbl1q_u8(xtmp128, shuf_mask2);
318 ytmp128 = vqtbl1q_u8(ytmp128, shuf_mask2);
320 /* Add pairwise to get dataoff + iova in sgdesc_w1 */
321 sgdesc01_w1 = vpaddq_u64(dataoff_iova0, dataoff_iova1);
322 sgdesc23_w1 = vpaddq_u64(dataoff_iova2, dataoff_iova3);
324 /* Orr both sgdesc_w0 and senddesc_w0 with 16 bits of
325 * pktlen at 15:0 position.
327 sgdesc01_w0 = vorrq_u64(sgdesc01_w0, xtmp128);
328 sgdesc23_w0 = vorrq_u64(sgdesc23_w0, ytmp128);
329 senddesc01_w0 = vorrq_u64(senddesc01_w0, xtmp128);
330 senddesc23_w0 = vorrq_u64(senddesc23_w0, ytmp128);
332 if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
333 !(flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
335 * Lookup table to translate ol_flags to
336 * il3/il4 types, but we still fill the ol3/ol4 type fields of
337 * senddesc_w1 since only one level of header processing is enabled.
339 const uint8x16_t tbl = {
340 /* [0-15] = il4type:il3type */
341 0x04, /* none (IPv6 assumed) */
342 0x14, /* PKT_TX_TCP_CKSUM (IPv6 assumed) */
343 0x24, /* PKT_TX_SCTP_CKSUM (IPv6 assumed) */
344 0x34, /* PKT_TX_UDP_CKSUM (IPv6 assumed) */
345 0x03, /* PKT_TX_IP_CKSUM */
346 0x13, /* PKT_TX_IP_CKSUM | PKT_TX_TCP_CKSUM */
347 0x23, /* PKT_TX_IP_CKSUM | PKT_TX_SCTP_CKSUM */
348 0x33, /* PKT_TX_IP_CKSUM | PKT_TX_UDP_CKSUM */
349 0x02, /* PKT_TX_IPV4 */
350 0x12, /* PKT_TX_IPV4 | PKT_TX_TCP_CKSUM */
351 0x22, /* PKT_TX_IPV4 | PKT_TX_SCTP_CKSUM */
352 0x32, /* PKT_TX_IPV4 | PKT_TX_UDP_CKSUM */
353 0x03, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM */
354 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
357 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
360 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
365 /* Extract olflags to translate to iltypes */
366 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
367 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
370 * E(47):L3_LEN(9):L2_LEN(7+z)
371 * E(47):L3_LEN(9):L2_LEN(7+z)
373 senddesc01_w1 = vshlq_n_u64(senddesc01_w1, 1);
374 senddesc23_w1 = vshlq_n_u64(senddesc23_w1, 1);
376 /* Move OLFLAGS bits 55:52 to 51:48
377 * with zeros prepended on the byte and rest
380 xtmp128 = vshrq_n_u8(xtmp128, 4);
381 ytmp128 = vshrq_n_u8(ytmp128, 4);
383 * E(48):L3_LEN(8):L2_LEN(z+7)
384 * E(48):L3_LEN(8):L2_LEN(z+7)
386 const int8x16_t tshft3 = {
387 -1, 0, 8, 8, 8, 8, 8, 8,
388 -1, 0, 8, 8, 8, 8, 8, 8,
391 senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
392 senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
395 ltypes01 = vqtbl1q_u8(tbl, xtmp128);
396 ltypes23 = vqtbl1q_u8(tbl, ytmp128);
398 /* Just use ld1q to retrieve aura
399 * when we don't need tx_offload
401 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
402 offsetof(struct rte_mempool, pool_id));
403 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
404 offsetof(struct rte_mempool, pool_id));
405 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
406 offsetof(struct rte_mempool, pool_id));
407 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
408 offsetof(struct rte_mempool, pool_id));
410 /* Pick only the relevant fields, i.e. bits 48:55 of iltype,
411 * and place them in ol3/ol4type of senddesc_w1
413 const uint8x16_t shuf_mask0 = {
414 0xFF, 0xFF, 0xFF, 0xFF, 0x6, 0xFF, 0xFF, 0xFF,
415 0xFF, 0xFF, 0xFF, 0xFF, 0xE, 0xFF, 0xFF, 0xFF,
418 ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
419 ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
421 /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
422 * a [E(32):E(16):OL3(8):OL2(8)]
424 * a [E(32):E(16):(OL3+OL2):OL2]
425 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
427 senddesc01_w1 = vaddq_u8(senddesc01_w1,
428 vshlq_n_u16(senddesc01_w1, 8));
429 senddesc23_w1 = vaddq_u8(senddesc23_w1,
430 vshlq_n_u16(senddesc23_w1, 8));
432 /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
433 cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
434 cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
435 cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
436 cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
438 xmask01 = vdupq_n_u64(0);
440 asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
441 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
443 asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
444 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
446 asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
447 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
449 asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
450 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
451 xmask01 = vshlq_n_u64(xmask01, 20);
452 xmask23 = vshlq_n_u64(xmask23, 20);
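/*
 * The aura handle loaded from each mempool's pool_id above is shifted
 * left by 20 into what is assumed to be the aura field of SEND_HDR_W0
 * (bits 39:20) before being ORed into the send descriptors below, so
 * the hardware knows which aura to free transmitted buffers back to.
 */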
454 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
455 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
456 /* Move ltypes to senddesc*_w1 */
457 senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
458 senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
460 /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
461 cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
462 cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
463 cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
464 cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
466 } else if (!(flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
467 (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
469 * Lookup table to translate ol_flags to
473 const uint8x16_t tbl = {
474 /* [0-15] = ol4type:ol3type */
476 0x03, /* OUTER_IP_CKSUM */
477 0x02, /* OUTER_IPV4 */
478 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
479 0x04, /* OUTER_IPV6 */
480 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
481 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
482 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
485 0x00, /* OUTER_UDP_CKSUM */
486 0x33, /* OUTER_UDP_CKSUM | OUTER_IP_CKSUM */
487 0x32, /* OUTER_UDP_CKSUM | OUTER_IPV4 */
488 0x33, /* OUTER_UDP_CKSUM | OUTER_IPV4 |
491 0x34, /* OUTER_UDP_CKSUM | OUTER_IPV6 */
492 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
495 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
498 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
499 * OUTER_IPV4 | OUTER_IP_CKSUM
503 /* Extract olflags to translate to iltypes */
504 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
505 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
508 * E(47):OL3_LEN(9):OL2_LEN(7+z)
509 * E(47):OL3_LEN(9):OL2_LEN(7+z)
511 const uint8x16_t shuf_mask5 = {
512 0x6, 0x5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
513 0xE, 0xD, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
515 senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
516 senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
518 /* Extract outer ol flags only */
519 const uint64x2_t o_cksum_mask = {
524 xtmp128 = vandq_u64(xtmp128, o_cksum_mask);
525 ytmp128 = vandq_u64(ytmp128, o_cksum_mask);
527 /* Extract OUTER_UDP_CKSUM bit 41 and
531 xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
532 ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
534 /* Shift oltype by 2 to start nibble from BIT(56)
537 xtmp128 = vshrq_n_u8(xtmp128, 2);
538 ytmp128 = vshrq_n_u8(ytmp128, 2);
540 * E(48):L3_LEN(8):L2_LEN(z+7)
541 * E(48):L3_LEN(8):L2_LEN(z+7)
543 const int8x16_t tshft3 = {
544 -1, 0, 8, 8, 8, 8, 8, 8,
545 -1, 0, 8, 8, 8, 8, 8, 8,
548 senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
549 senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
552 ltypes01 = vqtbl1q_u8(tbl, xtmp128);
553 ltypes23 = vqtbl1q_u8(tbl, ytmp128);
555 /* Just use ld1q to retrieve aura
556 * when we don't need tx_offload
558 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
559 offsetof(struct rte_mempool, pool_id));
560 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
561 offsetof(struct rte_mempool, pool_id));
562 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
563 offsetof(struct rte_mempool, pool_id));
564 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
565 offsetof(struct rte_mempool, pool_id));
567 /* Pick only the relevant fields, i.e. bits 56:63 of oltype,
568 * and place them in ol3/ol4type of senddesc_w1
570 const uint8x16_t shuf_mask0 = {
571 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0xFF, 0xFF, 0xFF,
572 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xFF, 0xFF, 0xFF,
575 ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
576 ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
578 /* Prepare ol4ptr, ol3ptr from ol3len, ol2len.
579 * a [E(32):E(16):OL3(8):OL2(8)]
581 * a [E(32):E(16):(OL3+OL2):OL2]
582 * => E(32):E(16)::OL4PTR(8):OL3PTR(8)
584 senddesc01_w1 = vaddq_u8(senddesc01_w1,
585 vshlq_n_u16(senddesc01_w1, 8));
586 senddesc23_w1 = vaddq_u8(senddesc23_w1,
587 vshlq_n_u16(senddesc23_w1, 8));
589 /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
590 cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
591 cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
592 cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
593 cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
595 xmask01 = vdupq_n_u64(0);
597 asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
598 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
600 asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
601 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
603 asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
604 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
606 asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
607 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
608 xmask01 = vshlq_n_u64(xmask01, 20);
609 xmask23 = vshlq_n_u64(xmask23, 20);
611 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
612 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
613 /* Move ltypes to senddesc*_w1 */
614 senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
615 senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
617 /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
618 cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
619 cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
620 cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
621 cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
623 } else if ((flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F) &&
624 (flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)) {
625 /* Lookup table to translate ol_flags to
626 * ol4type, ol3type, il4type, il3type of senddesc_w1
628 const uint8x16x2_t tbl = {
631 /* [0-15] = il4type:il3type */
632 0x04, /* none (IPv6) */
633 0x14, /* PKT_TX_TCP_CKSUM (IPv6) */
634 0x24, /* PKT_TX_SCTP_CKSUM (IPv6) */
635 0x34, /* PKT_TX_UDP_CKSUM (IPv6) */
636 0x03, /* PKT_TX_IP_CKSUM */
637 0x13, /* PKT_TX_IP_CKSUM |
640 0x23, /* PKT_TX_IP_CKSUM |
643 0x33, /* PKT_TX_IP_CKSUM |
646 0x02, /* PKT_TX_IPV4 */
647 0x12, /* PKT_TX_IPV4 |
650 0x22, /* PKT_TX_IPV4 |
653 0x32, /* PKT_TX_IPV4 |
656 0x03, /* PKT_TX_IPV4 |
659 0x13, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
662 0x23, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
665 0x33, /* PKT_TX_IPV4 | PKT_TX_IP_CKSUM |
671 /* [16-31] = ol4type:ol3type */
673 0x03, /* OUTER_IP_CKSUM */
674 0x02, /* OUTER_IPV4 */
675 0x03, /* OUTER_IPV4 | OUTER_IP_CKSUM */
676 0x04, /* OUTER_IPV6 */
677 0x00, /* OUTER_IPV6 | OUTER_IP_CKSUM */
678 0x00, /* OUTER_IPV6 | OUTER_IPV4 */
679 0x00, /* OUTER_IPV6 | OUTER_IPV4 |
682 0x00, /* OUTER_UDP_CKSUM */
683 0x33, /* OUTER_UDP_CKSUM |
686 0x32, /* OUTER_UDP_CKSUM |
689 0x33, /* OUTER_UDP_CKSUM |
690 * OUTER_IPV4 | OUTER_IP_CKSUM
692 0x34, /* OUTER_UDP_CKSUM |
695 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
698 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
701 0x00, /* OUTER_UDP_CKSUM | OUTER_IPV6 |
702 * OUTER_IPV4 | OUTER_IP_CKSUM
708 /* Extract olflags to translate to oltype & iltype */
709 xtmp128 = vzip1q_u64(len_olflags0, len_olflags1);
710 ytmp128 = vzip1q_u64(len_olflags2, len_olflags3);
713 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
714 * E(8):OL2_LN(7):OL3_LN(9):E(23):L3_LN(9):L2_LN(7+z)
716 const uint32x4_t tshft_4 = {
720 senddesc01_w1 = vshlq_u32(senddesc01_w1, tshft_4);
721 senddesc23_w1 = vshlq_u32(senddesc23_w1, tshft_4);
724 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
725 * E(32):L3_LEN(8):L2_LEN(7+Z):OL3_LEN(8):OL2_LEN(7+Z)
727 const uint8x16_t shuf_mask5 = {
728 0x6, 0x5, 0x0, 0x1, 0xFF, 0xFF, 0xFF, 0xFF,
729 0xE, 0xD, 0x8, 0x9, 0xFF, 0xFF, 0xFF, 0xFF,
731 senddesc01_w1 = vqtbl1q_u8(senddesc01_w1, shuf_mask5);
732 senddesc23_w1 = vqtbl1q_u8(senddesc23_w1, shuf_mask5);
734 /* Extract outer and inner header ol_flags */
735 const uint64x2_t oi_cksum_mask = {
740 xtmp128 = vandq_u64(xtmp128, oi_cksum_mask);
741 ytmp128 = vandq_u64(ytmp128, oi_cksum_mask);
743 /* Extract OUTER_UDP_CKSUM bit 41 and
747 xtmp128 = xtmp128 | vshlq_n_u64(xtmp128, 20);
748 ytmp128 = ytmp128 | vshlq_n_u64(ytmp128, 20);
750 /* Shift right oltype by 2 and iltype by 4
751 * to start the oltype nibble from BIT(56)
752 * instead of BIT(58) and the iltype nibble from BIT(48)
753 * instead of BIT(52).
755 const int8x16_t tshft5 = {
756 8, 8, 8, 8, 8, 8, -4, -2,
757 8, 8, 8, 8, 8, 8, -4, -2,
760 xtmp128 = vshlq_u8(xtmp128, tshft5);
761 ytmp128 = vshlq_u8(ytmp128, tshft5);
763 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
764 * E(32):L3_LEN(8):L2_LEN(8):OL3_LEN(8):OL2_LEN(8)
766 const int8x16_t tshft3 = {
767 -1, 0, -1, 0, 0, 0, 0, 0,
768 -1, 0, -1, 0, 0, 0, 0, 0,
771 senddesc01_w1 = vshlq_u8(senddesc01_w1, tshft3);
772 senddesc23_w1 = vshlq_u8(senddesc23_w1, tshft3);
774 /* Mark Bit(4) of oltype */
775 const uint64x2_t oi_cksum_mask2 = {
780 xtmp128 = vorrq_u64(xtmp128, oi_cksum_mask2);
781 ytmp128 = vorrq_u64(ytmp128, oi_cksum_mask2);
784 ltypes01 = vqtbl2q_u8(tbl, xtmp128);
785 ltypes23 = vqtbl2q_u8(tbl, ytmp128);
787 /* Just use ld1q to retrieve aura
788 * when we don't need tx_offload
790 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
791 offsetof(struct rte_mempool, pool_id));
792 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
793 offsetof(struct rte_mempool, pool_id));
794 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
795 offsetof(struct rte_mempool, pool_id));
796 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
797 offsetof(struct rte_mempool, pool_id));
799 /* Pick only the relevant fields, i.e. bits 48:55 of iltype and
800 * bits 56:63 of oltype, and place them in the corresponding
801 * fields of senddesc_w1.
803 const uint8x16_t shuf_mask0 = {
804 0xFF, 0xFF, 0xFF, 0xFF, 0x7, 0x6, 0xFF, 0xFF,
805 0xFF, 0xFF, 0xFF, 0xFF, 0xF, 0xE, 0xFF, 0xFF,
808 ltypes01 = vqtbl1q_u8(ltypes01, shuf_mask0);
809 ltypes23 = vqtbl1q_u8(ltypes23, shuf_mask0);
811 /* Prepare l4ptr, l3ptr, ol4ptr, ol3ptr from
812 * l3len, l2len, ol3len, ol2len.
813 * a [E(32):L3(8):L2(8):OL3(8):OL2(8)]
815 * a [E:(L3+L2):(L2+OL3):(OL3+OL2):OL2]
817 * a [E:(L3+L2+OL3+OL2):(L2+OL3+OL2):(OL3+OL2):OL2]
818 * => E(32):IL4PTR(8):IL3PTR(8):OL4PTR(8):OL3PTR(8)
820 senddesc01_w1 = vaddq_u8(senddesc01_w1,
821 vshlq_n_u32(senddesc01_w1, 8));
822 senddesc23_w1 = vaddq_u8(senddesc23_w1,
823 vshlq_n_u32(senddesc23_w1, 8));
825 /* Create second half of 4W cmd for 4 mbufs (sgdesc) */
826 cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
827 cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
828 cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
829 cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
831 /* Continue preparing l4ptr, l3ptr, ol4ptr, ol3ptr */
832 senddesc01_w1 = vaddq_u8(senddesc01_w1,
833 vshlq_n_u32(senddesc01_w1, 16));
834 senddesc23_w1 = vaddq_u8(senddesc23_w1,
835 vshlq_n_u32(senddesc23_w1, 16));
837 xmask01 = vdupq_n_u64(0);
839 asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
840 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
842 asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
843 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
845 asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
846 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
848 asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
849 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
850 xmask01 = vshlq_n_u64(xmask01, 20);
851 xmask23 = vshlq_n_u64(xmask23, 20);
853 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
854 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
855 /* Move ltypes to senddesc*_w1 */
856 senddesc01_w1 = vorrq_u64(senddesc01_w1, ltypes01);
857 senddesc23_w1 = vorrq_u64(senddesc23_w1, ltypes23);
859 /* Create first half of 4W cmd for 4 mbufs (sendhdr) */
860 cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
861 cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
862 cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
863 cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
865 /* Just use ld1q to retrieve aura
866 * when we don't need tx_offload
868 mbuf0 = (uint64_t *)((uintptr_t)mbuf0 +
869 offsetof(struct rte_mempool, pool_id));
870 mbuf1 = (uint64_t *)((uintptr_t)mbuf1 +
871 offsetof(struct rte_mempool, pool_id));
872 mbuf2 = (uint64_t *)((uintptr_t)mbuf2 +
873 offsetof(struct rte_mempool, pool_id));
874 mbuf3 = (uint64_t *)((uintptr_t)mbuf3 +
875 offsetof(struct rte_mempool, pool_id));
876 xmask01 = vdupq_n_u64(0);
878 asm volatile ("LD1 {%[a].H}[0],[%[in]]\n\t" :
879 [a]"+w"(xmask01) : [in]"r"(mbuf0) : "memory");
881 asm volatile ("LD1 {%[a].H}[4],[%[in]]\n\t" :
882 [a]"+w"(xmask01) : [in]"r"(mbuf1) : "memory");
884 asm volatile ("LD1 {%[b].H}[0],[%[in]]\n\t" :
885 [b]"+w"(xmask23) : [in]"r"(mbuf2) : "memory");
887 asm volatile ("LD1 {%[b].H}[4],[%[in]]\n\t" :
888 [b]"+w"(xmask23) : [in]"r"(mbuf3) : "memory");
889 xmask01 = vshlq_n_u64(xmask01, 20);
890 xmask23 = vshlq_n_u64(xmask23, 20);
892 senddesc01_w0 = vorrq_u64(senddesc01_w0, xmask01);
893 senddesc23_w0 = vorrq_u64(senddesc23_w0, xmask23);
895 /* Create 4W cmd for 4 mbufs (sendhdr, sgdesc) */
896 cmd00 = vzip1q_u64(senddesc01_w0, senddesc01_w1);
897 cmd01 = vzip1q_u64(sgdesc01_w0, sgdesc01_w1);
898 cmd10 = vzip2q_u64(senddesc01_w0, senddesc01_w1);
899 cmd11 = vzip2q_u64(sgdesc01_w0, sgdesc01_w1);
900 cmd20 = vzip1q_u64(senddesc23_w0, senddesc23_w1);
901 cmd21 = vzip1q_u64(sgdesc23_w0, sgdesc23_w1);
902 cmd30 = vzip2q_u64(senddesc23_w0, senddesc23_w1);
903 cmd31 = vzip2q_u64(sgdesc23_w0, sgdesc23_w1);
907 vst1q_u64(lmt_addr, cmd00);
908 vst1q_u64(lmt_addr + 2, cmd01);
909 vst1q_u64(lmt_addr + 4, cmd10);
910 vst1q_u64(lmt_addr + 6, cmd11);
911 vst1q_u64(lmt_addr + 8, cmd20);
912 vst1q_u64(lmt_addr + 10, cmd21);
913 vst1q_u64(lmt_addr + 12, cmd30);
914 vst1q_u64(lmt_addr + 14, cmd31);
915 lmt_status = otx2_lmt_submit(io_addr);
917 } while (lmt_status == 0);
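/*
 * A zero status from otx2_lmt_submit() means the LMTST did not complete
 * (the LMT line can be invalidated, e.g. by an interrupt between the
 * stores and the submit), so the do/while replays the eight 16-byte
 * stores (128 bytes covering four packets) and submits again.
 */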
924 static __rte_always_inline uint16_t
925 nix_xmit_pkts_vector(void *tx_queue, struct rte_mbuf **tx_pkts,
926 uint16_t pkts, const uint16_t flags)
928 RTE_SET_USED(tx_queue);
929 RTE_SET_USED(tx_pkts);
936 #define T(name, f4, f3, f2, f1, f0, sz, flags) \
937 static uint16_t __rte_noinline __hot \
938 otx2_nix_xmit_pkts_ ## name(void *tx_queue, \
939 struct rte_mbuf **tx_pkts, uint16_t pkts) \
943 return nix_xmit_pkts(tx_queue, tx_pkts, pkts, cmd, flags); \
946 NIX_TX_FASTPATH_MODES
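/*
 * NIX_TX_FASTPATH_MODES expands T() once per combination of the five
 * Tx offload flags, so each otx2_nix_xmit_pkts_<name> above is a
 * specialized instance of nix_xmit_pkts() with flags known at compile
 * time and the unused offload branches optimized away.
 */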
949 #define T(name, f4, f3, f2, f1, f0, sz, flags) \
950 static uint16_t __rte_noinline __hot \
951 otx2_nix_xmit_pkts_mseg_ ## name(void *tx_queue, \
952 struct rte_mbuf **tx_pkts, uint16_t pkts) \
954 uint64_t cmd[(sz) + NIX_TX_MSEG_SG_DWORDS - 2]; \
956 return nix_xmit_pkts_mseg(tx_queue, tx_pkts, pkts, cmd, \
957 (flags) | NIX_TX_MULTI_SEG_F); \
960 NIX_TX_FASTPATH_MODES
963 #define T(name, f4, f3, f2, f1, f0, sz, flags) \
964 static uint16_t __rte_noinline __hot \
965 otx2_nix_xmit_pkts_vec_ ## name(void *tx_queue, \
966 struct rte_mbuf **tx_pkts, uint16_t pkts) \
968 /* VLAN and TSTMP are not supported by vec */ \
969 if ((flags) & NIX_TX_OFFLOAD_VLAN_QINQ_F || \
970 (flags) & NIX_TX_OFFLOAD_TSTAMP_F) \
972 return nix_xmit_pkts_vector(tx_queue, tx_pkts, pkts, (flags)); \
975 NIX_TX_FASTPATH_MODES
979 pick_tx_func(struct rte_eth_dev *eth_dev,
980 const eth_tx_burst_t tx_burst[2][2][2][2][2])
982 struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);
984 /* [TSTMP] [NOFF] [VLAN] [OL3_OL4_CSUM] [IL3_IL4_CSUM] */
985 eth_dev->tx_pkt_burst = tx_burst
986 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_TSTAMP_F)]
987 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_MBUF_NOFF_F)]
988 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_VLAN_QINQ_F)]
989 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_OL3_OL4_CSUM_F)]
990 [!!(dev->tx_offload_flags & NIX_TX_OFFLOAD_L3_L4_CSUM_F)];
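/*
 * Illustrative example of the lookup above: a device with only
 * NIX_TX_OFFLOAD_MBUF_NOFF_F and NIX_TX_OFFLOAD_L3_L4_CSUM_F enabled
 * resolves to tx_burst[0][1][0][0][1].
 */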
994 otx2_eth_set_tx_function(struct rte_eth_dev *eth_dev)
996 struct otx2_eth_dev *dev = otx2_eth_pmd_priv(eth_dev);
998 const eth_tx_burst_t nix_eth_tx_burst[2][2][2][2][2] = {
999 #define T(name, f4, f3, f2, f1, f0, sz, flags) \
1000 [f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_ ## name,
1002 NIX_TX_FASTPATH_MODES
1006 const eth_tx_burst_t nix_eth_tx_burst_mseg[2][2][2][2][2] = {
1007 #define T(name, f4, f3, f2, f1, f0, sz, flags) \
1008 [f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_mseg_ ## name,
1010 NIX_TX_FASTPATH_MODES
1014 const eth_tx_burst_t nix_eth_tx_vec_burst[2][2][2][2][2] = {
1015 #define T(name, f4, f3, f2, f1, f0, sz, flags) \
1016 [f4][f3][f2][f1][f0] = otx2_nix_xmit_pkts_vec_ ## name,
1018 NIX_TX_FASTPATH_MODES
1022 if (dev->scalar_ena ||
1023 (dev->tx_offload_flags &
1024 (NIX_TX_OFFLOAD_VLAN_QINQ_F | NIX_TX_OFFLOAD_TSTAMP_F)))
1025 pick_tx_func(eth_dev, nix_eth_tx_burst);
1027 pick_tx_func(eth_dev, nix_eth_tx_vec_burst);
1029 if (dev->tx_offloads & DEV_TX_OFFLOAD_MULTI_SEGS)
1030 pick_tx_func(eth_dev, nix_eth_tx_burst_mseg);
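/*
 * When DEV_TX_OFFLOAD_MULTI_SEGS is enabled, the multi-segment table
 * overrides whichever scalar or vector function was picked above, since
 * those variants only handle single-mbuf packets.
 */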