1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox.
4  */
5
6 #include <assert.h>
7 #include <stdint.h>
8 #include <string.h>
9 #include <stdlib.h>
10
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic error "-Wpedantic"
20 #endif
21
22 #include <rte_mbuf.h>
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28
29 #include "mlx5.h"
30 #include "mlx5_utils.h"
31 #include "mlx5_rxtx.h"
32 #include "mlx5_autoconf.h"
33 #include "mlx5_defs.h"
34 #include "mlx5_prm.h"
35
36 static __rte_always_inline uint32_t
37 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe);
38
39 static __rte_always_inline int
40 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
41                  uint16_t cqe_cnt, uint32_t *rss_hash);
42
43 static __rte_always_inline uint32_t
44 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
45
46 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
47         [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
48 };
49
50 /**
51  * Build a table to translate Rx completion flags to packet type.
52  *
53  * @note: update mlx5_dev_supported_ptypes_get() if anything changes here.
54  */
55 void
56 mlx5_set_ptype_table(void)
57 {
58         unsigned int i;
59         uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
60
61         /* Last entry must not be overwritten, reserved for errored packet. */
62         for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
63                 (*p)[i] = RTE_PTYPE_UNKNOWN;
64         /*
65          * The index to the array should have:
66          * bit[1:0] = l3_hdr_type
67          * bit[4:2] = l4_hdr_type
68          * bit[5] = ip_frag
69          * bit[6] = tunneled
70          * bit[7] = outer_l3_type
71          */
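	/*
	 * Worked example (illustrative): reading the entries below, a
	 * non-tunneled, non-fragmented IPv4/UDP completion has
	 * l3_hdr_type = 2 and l4_hdr_type = 2, i.e. index
	 * (2 << 2) | 2 = 0x0a, which maps to
	 * RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_UDP.
	 */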
72         /* L2 */
73         (*p)[0x00] = RTE_PTYPE_L2_ETHER;
74         /* L3 */
75         (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
76                      RTE_PTYPE_L4_NONFRAG;
77         (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
78                      RTE_PTYPE_L4_NONFRAG;
79         /* Fragmented */
80         (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
81                      RTE_PTYPE_L4_FRAG;
82         (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
83                      RTE_PTYPE_L4_FRAG;
84         /* TCP */
85         (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
86                      RTE_PTYPE_L4_TCP;
87         (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
88                      RTE_PTYPE_L4_TCP;
89         /* UDP */
90         (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
91                      RTE_PTYPE_L4_UDP;
92         (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
93                      RTE_PTYPE_L4_UDP;
94         /* Repeat with outer_l3_type being set. Just in case. */
95         (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
96                      RTE_PTYPE_L4_NONFRAG;
97         (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
98                      RTE_PTYPE_L4_NONFRAG;
99         (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
100                      RTE_PTYPE_L4_FRAG;
101         (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
102                      RTE_PTYPE_L4_FRAG;
103         (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
104                      RTE_PTYPE_L4_TCP;
105         (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
106                      RTE_PTYPE_L4_TCP;
107         (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
108                      RTE_PTYPE_L4_UDP;
109         (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
110                      RTE_PTYPE_L4_UDP;
111         /* Tunneled - L3 */
112         (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
113                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
114                      RTE_PTYPE_INNER_L4_NONFRAG;
115         (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
116                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
117                      RTE_PTYPE_INNER_L4_NONFRAG;
118         (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
119                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
120                      RTE_PTYPE_INNER_L4_NONFRAG;
121         (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
122                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
123                      RTE_PTYPE_INNER_L4_NONFRAG;
124         /* Tunneled - Fragmented */
125         (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
126                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
127                      RTE_PTYPE_INNER_L4_FRAG;
128         (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
129                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
130                      RTE_PTYPE_INNER_L4_FRAG;
131         (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
132                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
133                      RTE_PTYPE_INNER_L4_FRAG;
134         (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
135                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
136                      RTE_PTYPE_INNER_L4_FRAG;
137         /* Tunneled - TCP */
138         (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
139                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
140                      RTE_PTYPE_INNER_L4_TCP;
141         (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
142                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
143                      RTE_PTYPE_INNER_L4_TCP;
144         (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
145                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
146                      RTE_PTYPE_INNER_L4_TCP;
147         (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
148                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
149                      RTE_PTYPE_INNER_L4_TCP;
150         /* Tunneled - UDP */
151         (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
152                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
153                      RTE_PTYPE_INNER_L4_UDP;
154         (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
155                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
156                      RTE_PTYPE_INNER_L4_UDP;
157         (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
158                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
159                      RTE_PTYPE_INNER_L4_UDP;
160         (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
161                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
162                      RTE_PTYPE_INNER_L4_UDP;
163 }
164
165 /**
166  * Return the size of the tailroom of the WQ.
167  *
168  * @param txq
169  *   Pointer to TX queue structure.
170  * @param addr
171  *   Pointer to tail of WQ.
172  *
173  * @return
174  *   Size of tailroom.
175  */
176 static inline size_t
177 tx_mlx5_wq_tailroom(struct mlx5_txq_data *txq, void *addr)
178 {
179         size_t tailroom;
180         tailroom = (uintptr_t)(txq->wqes) +
181                    (1 << txq->wqe_n) * MLX5_WQE_SIZE -
182                    (uintptr_t)addr;
183         return tailroom;
184 }
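/*
 * Worked example (illustrative figures): with (1 << txq->wqe_n) == 256 WQEBBs
 * of MLX5_WQE_SIZE (64) bytes each, the ring spans 16384 bytes; an address
 * located 16000 bytes past txq->wqes therefore has 384 bytes of tailroom left
 * before wrapping to the head of the queue.
 */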
185
186 /**
187  * Copy data to tailroom of circular queue.
188  *
189  * @param dst
190  *   Pointer to destination.
191  * @param src
192  *   Pointer to source.
193  * @param n
194  *   Number of bytes to copy.
195  * @param base
196  *   Pointer to head of queue.
197  * @param tailroom
198  *   Size of tailroom from dst.
199  *
200  * @return
201  *   Pointer after copied data.
202  */
203 static inline void *
204 mlx5_copy_to_wq(void *dst, const void *src, size_t n,
205                 void *base, size_t tailroom)
206 {
207         void *ret;
208
209         if (n > tailroom) {
210                 rte_memcpy(dst, src, tailroom);
211                 rte_memcpy(base, (void *)((uintptr_t)src + tailroom),
212                            n - tailroom);
213                 ret = (uint8_t *)base + n - tailroom;
214         } else {
215                 rte_memcpy(dst, src, n);
216                 ret = (n == tailroom) ? base : (uint8_t *)dst + n;
217         }
218         return ret;
219 }
220
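/*
 * Illustrative sketch only, not used by the data path: a hypothetical helper
 * showing how the two functions above are meant to be combined. "dst" is
 * assumed to point somewhere inside the WQ ring, and "src"/"n" are an
 * arbitrary caller-provided buffer and length.
 */
static inline void *
mlx5_copy_to_wq_example(struct mlx5_txq_data *txq, void *dst,
			const void *src, size_t n)
{
	/* Bytes available from dst up to the end of the ring. */
	size_t tailroom = tx_mlx5_wq_tailroom(txq, dst);

	/* Copy, wrapping to the head of the ring when n exceeds tailroom. */
	return mlx5_copy_to_wq(dst, src, n, (void *)(uintptr_t)txq->wqes,
			       tailroom);
}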
221 /**
222  * DPDK callback to check the status of a tx descriptor.
223  *
224  * @param tx_queue
225  *   The tx queue.
226  * @param[in] offset
227  *   The index of the descriptor in the ring.
228  *
229  * @return
230  *   The status of the tx descriptor.
231  */
232 int
233 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
234 {
235         struct mlx5_txq_data *txq = tx_queue;
236         uint16_t used;
237
238         mlx5_tx_complete(txq);
239         used = txq->elts_head - txq->elts_tail;
240         if (offset < used)
241                 return RTE_ETH_TX_DESC_FULL;
242         return RTE_ETH_TX_DESC_DONE;
243 }
244
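/*
 * Illustrative sketch only: a hypothetical helper showing how a caller could
 * interpret the status returned above. A real application would normally go
 * through rte_eth_tx_descriptor_status() rather than call the PMD callback
 * directly; the direct call here merely shows the possible return values.
 */
static inline int
mlx5_tx_descriptor_done_example(void *tx_queue, uint16_t offset)
{
	/* RTE_ETH_TX_DESC_DONE means the slot has been freed by the NIC. */
	return mlx5_tx_descriptor_status(tx_queue, offset) ==
	       RTE_ETH_TX_DESC_DONE;
}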
245 /**
246  * DPDK callback to check the status of a rx descriptor.
247  *
248  * @param rx_queue
249  *   The rx queue.
250  * @param[in] offset
251  *   The index of the descriptor in the ring.
252  *
253  * @return
254  *   The status of the rx descriptor.
255  */
256 int
257 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
258 {
259         struct mlx5_rxq_data *rxq = rx_queue;
260         struct rxq_zip *zip = &rxq->zip;
261         volatile struct mlx5_cqe *cqe;
262         const unsigned int cqe_n = (1 << rxq->cqe_n);
263         const unsigned int cqe_cnt = cqe_n - 1;
264         unsigned int cq_ci;
265         unsigned int used;
266
267         /* If we are processing a compressed CQE. */
268         if (zip->ai) {
269                 used = zip->cqe_cnt - zip->ca;
270                 cq_ci = zip->cq_ci;
271         } else {
272                 used = 0;
273                 cq_ci = rxq->cq_ci;
274         }
275         cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
276         while (check_cqe(cqe, cqe_n, cq_ci) == 0) {
277                 int8_t op_own;
278                 unsigned int n;
279
280                 op_own = cqe->op_own;
281                 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
282                         n = rte_be_to_cpu_32(cqe->byte_cnt);
283                 else
284                         n = 1;
285                 cq_ci += n;
286                 used += n;
287                 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
288         }
289         used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
290         if (offset < used)
291                 return RTE_ETH_RX_DESC_DONE;
292         return RTE_ETH_RX_DESC_AVAIL;
293 }
294
295 /**
296  * DPDK callback for TX.
297  *
298  * @param dpdk_txq
299  *   Generic pointer to TX queue structure.
300  * @param[in] pkts
301  *   Packets to transmit.
302  * @param pkts_n
303  *   Number of packets in array.
304  *
305  * @return
306  *   Number of packets successfully transmitted (<= pkts_n).
307  */
308 uint16_t
309 mlx5_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
310 {
311         struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
312         uint16_t elts_head = txq->elts_head;
313         const uint16_t elts_n = 1 << txq->elts_n;
314         const uint16_t elts_m = elts_n - 1;
315         unsigned int i = 0;
316         unsigned int j = 0;
317         unsigned int k = 0;
318         uint16_t max_elts;
319         uint16_t max_wqe;
320         unsigned int comp;
321         volatile struct mlx5_wqe_ctrl *last_wqe = NULL;
322         unsigned int segs_n = 0;
323         const unsigned int max_inline = txq->max_inline;
324
325         if (unlikely(!pkts_n))
326                 return 0;
327         /* Prefetch first packet cacheline. */
328         rte_prefetch0(*pkts);
329         /* Start processing. */
330         mlx5_tx_complete(txq);
331         max_elts = (elts_n - (elts_head - txq->elts_tail));
332         /* A CQE slot must always be available. */
333         assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
334         max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
335         if (unlikely(!max_wqe))
336                 return 0;
337         do {
338                 struct rte_mbuf *buf = NULL;
339                 uint8_t *raw;
340                 volatile struct mlx5_wqe_v *wqe = NULL;
341                 volatile rte_v128u32_t *dseg = NULL;
342                 uint32_t length;
343                 unsigned int ds = 0;
344                 unsigned int sg = 0; /* counter of additional segs attached. */
345                 uintptr_t addr;
346                 uint16_t pkt_inline_sz = MLX5_WQE_DWORD_SIZE + 2;
347                 uint16_t tso_header_sz = 0;
348                 uint16_t ehdr;
349                 uint8_t cs_flags;
350                 uint64_t tso = 0;
351                 uint16_t tso_segsz = 0;
352 #ifdef MLX5_PMD_SOFT_COUNTERS
353                 uint32_t total_length = 0;
354 #endif
355
356                 /* first_seg */
357                 buf = *pkts;
358                 segs_n = buf->nb_segs;
359                 /*
360                  * Make sure there is enough room to store this packet and
361                  * that one ring entry remains unused.
362                  */
363                 assert(segs_n);
364                 if (max_elts < segs_n)
365                         break;
366                 max_elts -= segs_n;
367                 sg = --segs_n;
368                 if (unlikely(--max_wqe == 0))
369                         break;
370                 wqe = (volatile struct mlx5_wqe_v *)
371                         tx_mlx5_wqe(txq, txq->wqe_ci);
372                 rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
373                 if (pkts_n - i > 1)
374                         rte_prefetch0(*(pkts + 1));
375                 addr = rte_pktmbuf_mtod(buf, uintptr_t);
376                 length = DATA_LEN(buf);
377                 ehdr = (((uint8_t *)addr)[1] << 8) |
378                        ((uint8_t *)addr)[0];
379 #ifdef MLX5_PMD_SOFT_COUNTERS
380                 total_length = length;
381 #endif
382                 if (length < (MLX5_WQE_DWORD_SIZE + 2)) {
383                         txq->stats.oerrors++;
384                         break;
385                 }
386                 /* Update element. */
387                 (*txq->elts)[elts_head & elts_m] = buf;
388                 /* Prefetch next buffer data. */
389                 if (pkts_n - i > 1)
390                         rte_prefetch0(
391                             rte_pktmbuf_mtod(*(pkts + 1), volatile void *));
392                 cs_flags = txq_ol_cksum_to_cs(txq, buf);
393                 raw = ((uint8_t *)(uintptr_t)wqe) + 2 * MLX5_WQE_DWORD_SIZE;
394                 /* Replace the Ethernet type by the VLAN if necessary. */
395                 if (buf->ol_flags & PKT_TX_VLAN_PKT) {
396                         uint32_t vlan = rte_cpu_to_be_32(0x81000000 |
397                                                          buf->vlan_tci);
398                         unsigned int len = 2 * ETHER_ADDR_LEN - 2;
399
400                         addr += 2;
401                         length -= 2;
402                         /* Copy destination and source MAC addresses. */
403                         memcpy((uint8_t *)raw, ((uint8_t *)addr), len);
404                         /* Copy VLAN. */
405                         memcpy((uint8_t *)raw + len, &vlan, sizeof(vlan));
406                         /* Copy missing two bytes to end the DSeg. */
407                         memcpy((uint8_t *)raw + len + sizeof(vlan),
408                                ((uint8_t *)addr) + len, 2);
409                         addr += len + 2;
410                         length -= (len + 2);
411                 } else {
412                         memcpy((uint8_t *)raw, ((uint8_t *)addr) + 2,
413                                MLX5_WQE_DWORD_SIZE);
414                         length -= pkt_inline_sz;
415                         addr += pkt_inline_sz;
416                 }
417                 raw += MLX5_WQE_DWORD_SIZE;
418                 tso = txq->tso_en && (buf->ol_flags & PKT_TX_TCP_SEG);
419                 if (tso) {
420                         uintptr_t end =
421                                 (uintptr_t)(((uintptr_t)txq->wqes) +
422                                             (1 << txq->wqe_n) * MLX5_WQE_SIZE);
423                         unsigned int copy_b;
424                         uint8_t vlan_sz =
425                                 (buf->ol_flags & PKT_TX_VLAN_PKT) ? 4 : 0;
426                         const uint64_t is_tunneled =
427                                 buf->ol_flags & (PKT_TX_TUNNEL_GRE |
428                                                  PKT_TX_TUNNEL_VXLAN);
429
430                         tso_header_sz = buf->l2_len + vlan_sz +
431                                         buf->l3_len + buf->l4_len;
432                         tso_segsz = buf->tso_segsz;
433                         if (unlikely(tso_segsz == 0)) {
434                                 txq->stats.oerrors++;
435                                 break;
436                         }
437                         if (is_tunneled && txq->tunnel_en) {
438                                 tso_header_sz += buf->outer_l2_len +
439                                                  buf->outer_l3_len;
440                                 cs_flags |= MLX5_ETH_WQE_L4_INNER_CSUM;
441                         } else {
442                                 cs_flags |= MLX5_ETH_WQE_L4_CSUM;
443                         }
444                         if (unlikely(tso_header_sz > MLX5_MAX_TSO_HEADER)) {
445                                 txq->stats.oerrors++;
446                                 break;
447                         }
448                         copy_b = tso_header_sz - pkt_inline_sz;
449                         /* First seg must contain all headers. */
450                         assert(copy_b <= length);
451                         if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
452                                 uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
453
454                                 if (unlikely(max_wqe < n))
455                                         break;
456                                 max_wqe -= n;
457                                 rte_memcpy((void *)raw, (void *)addr, copy_b);
458                                 addr += copy_b;
459                                 length -= copy_b;
460                                 /* Include padding for TSO header. */
461                                 copy_b = MLX5_WQE_DS(copy_b) *
462                                          MLX5_WQE_DWORD_SIZE;
463                                 pkt_inline_sz += copy_b;
464                                 raw += copy_b;
465                         } else {
466                                 /* NOP WQE. */
467                                 wqe->ctrl = (rte_v128u32_t){
468                                         rte_cpu_to_be_32(txq->wqe_ci << 8),
469                                         rte_cpu_to_be_32(txq->qp_num_8s | 1),
470                                         0,
471                                         0,
472                                 };
473                                 ds = 1;
474 #ifdef MLX5_PMD_SOFT_COUNTERS
475                                 total_length = 0;
476 #endif
477                                 k++;
478                                 goto next_wqe;
479                         }
480                 }
481                 /* Inline if enough room. */
482                 if (max_inline || tso) {
483                         uint32_t inl = 0;
484                         uintptr_t end = (uintptr_t)
485                                 (((uintptr_t)txq->wqes) +
486                                  (1 << txq->wqe_n) * MLX5_WQE_SIZE);
487                         unsigned int inline_room = max_inline *
488                                                    RTE_CACHE_LINE_SIZE -
489                                                    (pkt_inline_sz - 2) -
490                                                    !!tso * sizeof(inl);
491                         uintptr_t addr_end;
492                         unsigned int copy_b;
493
494 pkt_inline:
495                         addr_end = RTE_ALIGN_FLOOR(addr + inline_room,
496                                                    RTE_CACHE_LINE_SIZE);
497                         copy_b = (addr_end > addr) ?
498                                  RTE_MIN((addr_end - addr), length) : 0;
499                         if (copy_b && ((end - (uintptr_t)raw) > copy_b)) {
500                                 /*
501                                  * One Dseg remains in the current WQE.  To
502                                  * keep the computation positive, it is
503                                  * removed after the bytes to Dseg conversion.
504                                  */
505                                 uint16_t n = (MLX5_WQE_DS(copy_b) - 1 + 3) / 4;
506
507                                 if (unlikely(max_wqe < n))
508                                         break;
509                                 max_wqe -= n;
510                                 if (tso && !inl) {
511                                         inl = rte_cpu_to_be_32(copy_b |
512                                                                MLX5_INLINE_SEG);
513                                         rte_memcpy((void *)raw,
514                                                    (void *)&inl, sizeof(inl));
515                                         raw += sizeof(inl);
516                                         pkt_inline_sz += sizeof(inl);
517                                 }
518                                 rte_memcpy((void *)raw, (void *)addr, copy_b);
519                                 addr += copy_b;
520                                 length -= copy_b;
521                                 pkt_inline_sz += copy_b;
522                         }
523                         /*
524                          * 2 DWORDs consumed by the WQE header + ETH segment +
525                          * the size of the inline part of the packet.
526                          */
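			/*
			 * Worked example (illustrative): when nothing beyond
			 * the Ethernet header has been inlined, pkt_inline_sz
			 * is still MLX5_WQE_DWORD_SIZE + 2 = 18, so
			 * ds = 2 + MLX5_WQE_DS(16) = 3, which matches the
			 * non-inline branch below.
			 */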
527                         ds = 2 + MLX5_WQE_DS(pkt_inline_sz - 2);
528                         if (length > 0) {
529                                 if (ds % (MLX5_WQE_SIZE /
530                                           MLX5_WQE_DWORD_SIZE) == 0) {
531                                         if (unlikely(--max_wqe == 0))
532                                                 break;
533                                         dseg = (volatile rte_v128u32_t *)
534                                                tx_mlx5_wqe(txq, txq->wqe_ci +
535                                                            ds / 4);
536                                 } else {
537                                         dseg = (volatile rte_v128u32_t *)
538                                                 ((uintptr_t)wqe +
539                                                  (ds * MLX5_WQE_DWORD_SIZE));
540                                 }
541                                 goto use_dseg;
542                         } else if (!segs_n) {
543                                 goto next_pkt;
544                         } else {
545                                 raw += copy_b;
546                                 inline_room -= copy_b;
547                                 --segs_n;
548                                 buf = buf->next;
549                                 assert(buf);
550                                 addr = rte_pktmbuf_mtod(buf, uintptr_t);
551                                 length = DATA_LEN(buf);
552 #ifdef MLX5_PMD_SOFT_COUNTERS
553                                 total_length += length;
554 #endif
555                                 (*txq->elts)[++elts_head & elts_m] = buf;
556                                 goto pkt_inline;
557                         }
558                 } else {
559                         /*
560                          * Nothing has been inlined in the packet, only the
561                          * Ethernet header has been stored.
562                          */
563                         dseg = (volatile rte_v128u32_t *)
564                                 ((uintptr_t)wqe + (3 * MLX5_WQE_DWORD_SIZE));
565                         ds = 3;
566 use_dseg:
567                         /* Add the remaining packet as a simple ds. */
568                         addr = rte_cpu_to_be_64(addr);
569                         *dseg = (rte_v128u32_t){
570                                 rte_cpu_to_be_32(length),
571                                 mlx5_tx_mb2mr(txq, buf),
572                                 addr,
573                                 addr >> 32,
574                         };
575                         ++ds;
576                         if (!segs_n)
577                                 goto next_pkt;
578                 }
579 next_seg:
580                 assert(buf);
581                 assert(ds);
582                 assert(wqe);
583                 /*
584                  * Spill on next WQE when the current one does not have
585                  * enough room left. The WQE size must be a multiple
586                  * of the data segment size.
587                  */
588                 assert(!(MLX5_WQE_SIZE % MLX5_WQE_DWORD_SIZE));
589                 if (!(ds % (MLX5_WQE_SIZE / MLX5_WQE_DWORD_SIZE))) {
590                         if (unlikely(--max_wqe == 0))
591                                 break;
592                         dseg = (volatile rte_v128u32_t *)
593                                tx_mlx5_wqe(txq, txq->wqe_ci + ds / 4);
594                         rte_prefetch0(tx_mlx5_wqe(txq,
595                                                   txq->wqe_ci + ds / 4 + 1));
596                 } else {
597                         ++dseg;
598                 }
599                 ++ds;
600                 buf = buf->next;
601                 assert(buf);
602                 length = DATA_LEN(buf);
603 #ifdef MLX5_PMD_SOFT_COUNTERS
604                 total_length += length;
605 #endif
606                 /* Store segment information. */
607                 addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf, uintptr_t));
608                 *dseg = (rte_v128u32_t){
609                         rte_cpu_to_be_32(length),
610                         mlx5_tx_mb2mr(txq, buf),
611                         addr,
612                         addr >> 32,
613                 };
614                 (*txq->elts)[++elts_head & elts_m] = buf;
615                 if (--segs_n)
616                         goto next_seg;
617 next_pkt:
618                 if (ds > MLX5_DSEG_MAX) {
619                         txq->stats.oerrors++;
620                         break;
621                 }
622                 ++elts_head;
623                 ++pkts;
624                 ++i;
625                 j += sg;
626                 /* Initialize known and common part of the WQE structure. */
627                 if (tso) {
628                         wqe->ctrl = (rte_v128u32_t){
629                                 rte_cpu_to_be_32((txq->wqe_ci << 8) |
630                                                  MLX5_OPCODE_TSO),
631                                 rte_cpu_to_be_32(txq->qp_num_8s | ds),
632                                 0,
633                                 0,
634                         };
635                         wqe->eseg = (rte_v128u32_t){
636                                 0,
637                                 cs_flags | (rte_cpu_to_be_16(tso_segsz) << 16),
638                                 0,
639                                 (ehdr << 16) | rte_cpu_to_be_16(tso_header_sz),
640                         };
641                 } else {
642                         wqe->ctrl = (rte_v128u32_t){
643                                 rte_cpu_to_be_32((txq->wqe_ci << 8) |
644                                                  MLX5_OPCODE_SEND),
645                                 rte_cpu_to_be_32(txq->qp_num_8s | ds),
646                                 0,
647                                 0,
648                         };
649                         wqe->eseg = (rte_v128u32_t){
650                                 0,
651                                 cs_flags,
652                                 0,
653                                 (ehdr << 16) | rte_cpu_to_be_16(pkt_inline_sz),
654                         };
655                 }
656 next_wqe:
657                 txq->wqe_ci += (ds + 3) / 4;
658                 /* Save the last successful WQE for completion request. */
659                 last_wqe = (volatile struct mlx5_wqe_ctrl *)wqe;
660 #ifdef MLX5_PMD_SOFT_COUNTERS
661                 /* Increment sent bytes counter. */
662                 txq->stats.obytes += total_length;
663 #endif
664         } while (i < pkts_n);
665         /* Take a shortcut if nothing must be sent. */
666         if (unlikely((i + k) == 0))
667                 return 0;
668         txq->elts_head += (i + j);
669         /* Check whether completion threshold has been reached. */
670         comp = txq->elts_comp + i + j + k;
671         if (comp >= MLX5_TX_COMP_THRESH) {
672                 /* Request completion on last WQE. */
673                 last_wqe->ctrl2 = rte_cpu_to_be_32(8);
674                 /* Save elts_head in unused "immediate" field of WQE. */
675                 last_wqe->ctrl3 = txq->elts_head;
676                 txq->elts_comp = 0;
677 #ifndef NDEBUG
678                 ++txq->cq_pi;
679 #endif
680         } else {
681                 txq->elts_comp = comp;
682         }
683 #ifdef MLX5_PMD_SOFT_COUNTERS
684         /* Increment sent packets counter. */
685         txq->stats.opackets += i;
686 #endif
687         /* Ring QP doorbell. */
688         mlx5_tx_dbrec(txq, (volatile struct mlx5_wqe *)last_wqe);
689         return i;
690 }
691
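/*
 * Illustrative sketch only, not part of the PMD: a typical caller-side retry
 * loop around the burst function above. The helper name is hypothetical;
 * "dpdk_txq", "pkts" and "pkts_n" stand for whatever queue and packet array
 * the application owns, and a real caller would bound the retries instead of
 * spinning forever on a stuck queue.
 */
static inline void
mlx5_tx_burst_all_example(void *dpdk_txq, struct rte_mbuf **pkts,
			  uint16_t pkts_n)
{
	uint16_t sent = 0;

	while (sent < pkts_n)
		sent += mlx5_tx_burst(dpdk_txq, pkts + sent, pkts_n - sent);
}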
692 /**
693  * Open a MPW session.
694  *
695  * @param txq
696  *   Pointer to TX queue structure.
697  * @param mpw
698  *   Pointer to MPW session structure.
699  * @param length
700  *   Packet length.
701  */
702 static inline void
703 mlx5_mpw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, uint32_t length)
704 {
705         uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
706         volatile struct mlx5_wqe_data_seg (*dseg)[MLX5_MPW_DSEG_MAX] =
707                 (volatile struct mlx5_wqe_data_seg (*)[])
708                 tx_mlx5_wqe(txq, idx + 1);
709
710         mpw->state = MLX5_MPW_STATE_OPENED;
711         mpw->pkts_n = 0;
712         mpw->len = length;
713         mpw->total_len = 0;
714         mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
715         mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
716         mpw->wqe->eseg.inline_hdr_sz = 0;
717         mpw->wqe->eseg.rsvd0 = 0;
718         mpw->wqe->eseg.rsvd1 = 0;
719         mpw->wqe->eseg.rsvd2 = 0;
720         mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
721                                              (txq->wqe_ci << 8) |
722                                              MLX5_OPCODE_TSO);
723         mpw->wqe->ctrl[2] = 0;
724         mpw->wqe->ctrl[3] = 0;
725         mpw->data.dseg[0] = (volatile struct mlx5_wqe_data_seg *)
726                 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
727         mpw->data.dseg[1] = (volatile struct mlx5_wqe_data_seg *)
728                 (((uintptr_t)mpw->wqe) + (3 * MLX5_WQE_DWORD_SIZE));
729         mpw->data.dseg[2] = &(*dseg)[0];
730         mpw->data.dseg[3] = &(*dseg)[1];
731         mpw->data.dseg[4] = &(*dseg)[2];
732 }
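/*
 * Layout note derived from the initialization above: the first WQEBB of an
 * MPW holds the control segment, the Ethernet segment and two data segments
 * (dseg[0] and dseg[1] at offsets 2 and 3 times MLX5_WQE_DWORD_SIZE);
 * dseg[2..4] spill into the following WQEBB, which is why a session may
 * consume up to two WQEBBs.
 */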
733
734 /**
735  * Close a MPW session.
736  *
737  * @param txq
738  *   Pointer to TX queue structure.
739  * @param mpw
740  *   Pointer to MPW session structure.
741  */
742 static inline void
743 mlx5_mpw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
744 {
745         unsigned int num = mpw->pkts_n;
746
747         /*
748          * Store size in multiple of 16 bytes. Control and Ethernet segments
749          * count as 2.
750          */
751         mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s | (2 + num));
752         mpw->state = MLX5_MPW_STATE_CLOSED;
753         if (num < 3)
754                 ++txq->wqe_ci;
755         else
756                 txq->wqe_ci += 2;
757         rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
758         rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
759 }
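/*
 * Worked example of the WQEBB accounting above: with num == 2 packets the WQE
 * holds 2 + 2 = 4 DS of 16 bytes (one 64-byte WQEBB), so wqe_ci advances by
 * one; with num == MLX5_MPW_DSEG_MAX (5) it holds 7 DS spanning two WQEBBs
 * and wqe_ci advances by two.
 */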
760
761 /**
762  * DPDK callback for TX with MPW support.
763  *
764  * @param dpdk_txq
765  *   Generic pointer to TX queue structure.
766  * @param[in] pkts
767  *   Packets to transmit.
768  * @param pkts_n
769  *   Number of packets in array.
770  *
771  * @return
772  *   Number of packets successfully transmitted (<= pkts_n).
773  */
774 uint16_t
775 mlx5_tx_burst_mpw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
776 {
777         struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
778         uint16_t elts_head = txq->elts_head;
779         const uint16_t elts_n = 1 << txq->elts_n;
780         const uint16_t elts_m = elts_n - 1;
781         unsigned int i = 0;
782         unsigned int j = 0;
783         uint16_t max_elts;
784         uint16_t max_wqe;
785         unsigned int comp;
786         struct mlx5_mpw mpw = {
787                 .state = MLX5_MPW_STATE_CLOSED,
788         };
789
790         if (unlikely(!pkts_n))
791                 return 0;
792         /* Prefetch first packet cacheline. */
793         rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
794         rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
795         /* Start processing. */
796         mlx5_tx_complete(txq);
797         max_elts = (elts_n - (elts_head - txq->elts_tail));
798         /* A CQE slot must always be available. */
799         assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
800         max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
801         if (unlikely(!max_wqe))
802                 return 0;
803         do {
804                 struct rte_mbuf *buf = *(pkts++);
805                 uint32_t length;
806                 unsigned int segs_n = buf->nb_segs;
807                 uint32_t cs_flags;
808
809                 /*
810                  * Make sure there is enough room to store this packet and
811                  * that one ring entry remains unused.
812                  */
813                 assert(segs_n);
814                 if (max_elts < segs_n)
815                         break;
816                 /* Do not bother with large packets MPW cannot handle. */
817                 if (segs_n > MLX5_MPW_DSEG_MAX) {
818                         txq->stats.oerrors++;
819                         break;
820                 }
821                 max_elts -= segs_n;
822                 --pkts_n;
823                 cs_flags = txq_ol_cksum_to_cs(txq, buf);
824                 /* Retrieve packet information. */
825                 length = PKT_LEN(buf);
826                 assert(length);
827                 /* Start new session if packet differs. */
828                 if ((mpw.state == MLX5_MPW_STATE_OPENED) &&
829                     ((mpw.len != length) ||
830                      (segs_n != 1) ||
831                      (mpw.wqe->eseg.cs_flags != cs_flags)))
832                         mlx5_mpw_close(txq, &mpw);
833                 if (mpw.state == MLX5_MPW_STATE_CLOSED) {
834                         /*
835                          * A Multi-Packet WQE consumes at most two WQEs.
836                          * mlx5_mpw_new() expects to be able to use such
837                          * resources.
838                          */
839                         if (unlikely(max_wqe < 2))
840                                 break;
841                         max_wqe -= 2;
842                         mlx5_mpw_new(txq, &mpw, length);
843                         mpw.wqe->eseg.cs_flags = cs_flags;
844                 }
845                 /* Multi-segment packets must be alone in their MPW. */
846                 assert((segs_n == 1) || (mpw.pkts_n == 0));
847 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
848                 length = 0;
849 #endif
850                 do {
851                         volatile struct mlx5_wqe_data_seg *dseg;
852                         uintptr_t addr;
853
854                         assert(buf);
855                         (*txq->elts)[elts_head++ & elts_m] = buf;
856                         dseg = mpw.data.dseg[mpw.pkts_n];
857                         addr = rte_pktmbuf_mtod(buf, uintptr_t);
858                         *dseg = (struct mlx5_wqe_data_seg){
859                                 .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
860                                 .lkey = mlx5_tx_mb2mr(txq, buf),
861                                 .addr = rte_cpu_to_be_64(addr),
862                         };
863 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
864                         length += DATA_LEN(buf);
865 #endif
866                         buf = buf->next;
867                         ++mpw.pkts_n;
868                         ++j;
869                 } while (--segs_n);
870                 assert(length == mpw.len);
871                 if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
872                         mlx5_mpw_close(txq, &mpw);
873 #ifdef MLX5_PMD_SOFT_COUNTERS
874                 /* Increment sent bytes counter. */
875                 txq->stats.obytes += length;
876 #endif
877                 ++i;
878         } while (pkts_n);
879         /* Take a shortcut if nothing must be sent. */
880         if (unlikely(i == 0))
881                 return 0;
882         /* Check whether completion threshold has been reached. */
883         /* "j" includes both packets and segments. */
884         comp = txq->elts_comp + j;
885         if (comp >= MLX5_TX_COMP_THRESH) {
886                 volatile struct mlx5_wqe *wqe = mpw.wqe;
887
888                 /* Request completion on last WQE. */
889                 wqe->ctrl[2] = rte_cpu_to_be_32(8);
890                 /* Save elts_head in unused "immediate" field of WQE. */
891                 wqe->ctrl[3] = elts_head;
892                 txq->elts_comp = 0;
893 #ifndef NDEBUG
894                 ++txq->cq_pi;
895 #endif
896         } else {
897                 txq->elts_comp = comp;
898         }
899 #ifdef MLX5_PMD_SOFT_COUNTERS
900         /* Increment sent packets counter. */
901         txq->stats.opackets += i;
902 #endif
903         /* Ring QP doorbell. */
904         if (mpw.state == MLX5_MPW_STATE_OPENED)
905                 mlx5_mpw_close(txq, &mpw);
906         mlx5_tx_dbrec(txq, mpw.wqe);
907         txq->elts_head = elts_head;
908         return i;
909 }
910
911 /**
912  * Open a MPW inline session.
913  *
914  * @param txq
915  *   Pointer to TX queue structure.
916  * @param mpw
917  *   Pointer to MPW session structure.
918  * @param length
919  *   Packet length.
920  */
921 static inline void
922 mlx5_mpw_inline_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw,
923                     uint32_t length)
924 {
925         uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
926         struct mlx5_wqe_inl_small *inl;
927
928         mpw->state = MLX5_MPW_INL_STATE_OPENED;
929         mpw->pkts_n = 0;
930         mpw->len = length;
931         mpw->total_len = 0;
932         mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
933         mpw->wqe->ctrl[0] = rte_cpu_to_be_32((MLX5_OPC_MOD_MPW << 24) |
934                                              (txq->wqe_ci << 8) |
935                                              MLX5_OPCODE_TSO);
936         mpw->wqe->ctrl[2] = 0;
937         mpw->wqe->ctrl[3] = 0;
938         mpw->wqe->eseg.mss = rte_cpu_to_be_16(length);
939         mpw->wqe->eseg.inline_hdr_sz = 0;
940         mpw->wqe->eseg.cs_flags = 0;
941         mpw->wqe->eseg.rsvd0 = 0;
942         mpw->wqe->eseg.rsvd1 = 0;
943         mpw->wqe->eseg.rsvd2 = 0;
944         inl = (struct mlx5_wqe_inl_small *)
945                 (((uintptr_t)mpw->wqe) + 2 * MLX5_WQE_DWORD_SIZE);
946         mpw->data.raw = (uint8_t *)&inl->raw;
947 }
948
949 /**
950  * Close a MPW inline session.
951  *
952  * @param txq
953  *   Pointer to TX queue structure.
954  * @param mpw
955  *   Pointer to MPW session structure.
956  */
957 static inline void
958 mlx5_mpw_inline_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
959 {
960         unsigned int size;
961         struct mlx5_wqe_inl_small *inl = (struct mlx5_wqe_inl_small *)
962                 (((uintptr_t)mpw->wqe) + (2 * MLX5_WQE_DWORD_SIZE));
963
964         size = MLX5_WQE_SIZE - MLX5_MWQE64_INL_DATA + mpw->total_len;
965         /*
966          * Store size in multiple of 16 bytes. Control and Ethernet segments
967          * count as 2.
968          */
969         mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
970                                              MLX5_WQE_DS(size));
971         mpw->state = MLX5_MPW_STATE_CLOSED;
972         inl->byte_cnt = rte_cpu_to_be_32(mpw->total_len | MLX5_INLINE_SEG);
973         txq->wqe_ci += (size + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
974 }
975
976 /**
977  * DPDK callback for TX with MPW inline support.
978  *
979  * @param dpdk_txq
980  *   Generic pointer to TX queue structure.
981  * @param[in] pkts
982  *   Packets to transmit.
983  * @param pkts_n
984  *   Number of packets in array.
985  *
986  * @return
987  *   Number of packets successfully transmitted (<= pkts_n).
988  */
989 uint16_t
990 mlx5_tx_burst_mpw_inline(void *dpdk_txq, struct rte_mbuf **pkts,
991                          uint16_t pkts_n)
992 {
993         struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
994         uint16_t elts_head = txq->elts_head;
995         const uint16_t elts_n = 1 << txq->elts_n;
996         const uint16_t elts_m = elts_n - 1;
997         unsigned int i = 0;
998         unsigned int j = 0;
999         uint16_t max_elts;
1000         uint16_t max_wqe;
1001         unsigned int comp;
1002         unsigned int inline_room = txq->max_inline * RTE_CACHE_LINE_SIZE;
1003         struct mlx5_mpw mpw = {
1004                 .state = MLX5_MPW_STATE_CLOSED,
1005         };
1006         /*
1007          * Compute the maximum number of WQEs which can be consumed by the
1008          * inline code:
1009          * - 2 DSEGs for:
1010          *   - 1 control segment,
1011          *   - 1 Ethernet segment,
1012          * - N DSEGs for the inlined data.
1013          */
1014         const unsigned int wqe_inl_n =
1015                 ((2 * MLX5_WQE_DWORD_SIZE +
1016                   txq->max_inline * RTE_CACHE_LINE_SIZE) +
1017                  RTE_CACHE_LINE_SIZE - 1) / RTE_CACHE_LINE_SIZE;
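	/*
	 * Worked example (assuming 64-byte cache lines and the 16-byte
	 * MLX5_WQE_DWORD_SIZE): with max_inline == 4 the numerator is
	 * 2 * 16 + 4 * 64 = 288 bytes, hence wqe_inl_n = (288 + 63) / 64 = 5
	 * WQEBBs.
	 */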
1018
1019         if (unlikely(!pkts_n))
1020                 return 0;
1021         /* Prefetch first packet cacheline. */
1022         rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci));
1023         rte_prefetch0(tx_mlx5_wqe(txq, txq->wqe_ci + 1));
1024         /* Start processing. */
1025         mlx5_tx_complete(txq);
1026         max_elts = (elts_n - (elts_head - txq->elts_tail));
1027         /* A CQE slot must always be available. */
1028         assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1029         do {
1030                 struct rte_mbuf *buf = *(pkts++);
1031                 uintptr_t addr;
1032                 uint32_t length;
1033                 unsigned int segs_n = buf->nb_segs;
1034                 uint8_t cs_flags;
1035
1036                 /*
1037                  * Make sure there is enough room to store this packet and
1038                  * that one ring entry remains unused.
1039                  */
1040                 assert(segs_n);
1041                 if (max_elts < segs_n)
1042                         break;
1043                 /* Do not bother with large packets MPW cannot handle. */
1044                 if (segs_n > MLX5_MPW_DSEG_MAX) {
1045                         txq->stats.oerrors++;
1046                         break;
1047                 }
1048                 max_elts -= segs_n;
1049                 --pkts_n;
1050                 /*
1051                  * Compute max_wqe in case fewer WQEs were consumed in the
1052                  * previous iteration.
1053                  */
1054                 max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1055                 cs_flags = txq_ol_cksum_to_cs(txq, buf);
1056                 /* Retrieve packet information. */
1057                 length = PKT_LEN(buf);
1058                 /* Start new session if packet differs. */
1059                 if (mpw.state == MLX5_MPW_STATE_OPENED) {
1060                         if ((mpw.len != length) ||
1061                             (segs_n != 1) ||
1062                             (mpw.wqe->eseg.cs_flags != cs_flags))
1063                                 mlx5_mpw_close(txq, &mpw);
1064                 } else if (mpw.state == MLX5_MPW_INL_STATE_OPENED) {
1065                         if ((mpw.len != length) ||
1066                             (segs_n != 1) ||
1067                             (length > inline_room) ||
1068                             (mpw.wqe->eseg.cs_flags != cs_flags)) {
1069                                 mlx5_mpw_inline_close(txq, &mpw);
1070                                 inline_room =
1071                                         txq->max_inline * RTE_CACHE_LINE_SIZE;
1072                         }
1073                 }
1074                 if (mpw.state == MLX5_MPW_STATE_CLOSED) {
1075                         if ((segs_n != 1) ||
1076                             (length > inline_room)) {
1077                                 /*
1078                                  * A Multi-Packet WQE consumes at most two WQEs.
1079                                  * mlx5_mpw_new() expects to be able to use
1080                                  * such resources.
1081                                  */
1082                                 if (unlikely(max_wqe < 2))
1083                                         break;
1084                                 max_wqe -= 2;
1085                                 mlx5_mpw_new(txq, &mpw, length);
1086                                 mpw.wqe->eseg.cs_flags = cs_flags;
1087                         } else {
1088                                 if (unlikely(max_wqe < wqe_inl_n))
1089                                         break;
1090                                 max_wqe -= wqe_inl_n;
1091                                 mlx5_mpw_inline_new(txq, &mpw, length);
1092                                 mpw.wqe->eseg.cs_flags = cs_flags;
1093                         }
1094                 }
1095                 /* Multi-segment packets must be alone in their MPW. */
1096                 assert((segs_n == 1) || (mpw.pkts_n == 0));
1097                 if (mpw.state == MLX5_MPW_STATE_OPENED) {
1098                         assert(inline_room ==
1099                                txq->max_inline * RTE_CACHE_LINE_SIZE);
1100 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1101                         length = 0;
1102 #endif
1103                         do {
1104                                 volatile struct mlx5_wqe_data_seg *dseg;
1105
1106                                 assert(buf);
1107                                 (*txq->elts)[elts_head++ & elts_m] = buf;
1108                                 dseg = mpw.data.dseg[mpw.pkts_n];
1109                                 addr = rte_pktmbuf_mtod(buf, uintptr_t);
1110                                 *dseg = (struct mlx5_wqe_data_seg){
1111                                         .byte_count =
1112                                                rte_cpu_to_be_32(DATA_LEN(buf)),
1113                                         .lkey = mlx5_tx_mb2mr(txq, buf),
1114                                         .addr = rte_cpu_to_be_64(addr),
1115                                 };
1116 #if defined(MLX5_PMD_SOFT_COUNTERS) || !defined(NDEBUG)
1117                                 length += DATA_LEN(buf);
1118 #endif
1119                                 buf = buf->next;
1120                                 ++mpw.pkts_n;
1121                                 ++j;
1122                         } while (--segs_n);
1123                         assert(length == mpw.len);
1124                         if (mpw.pkts_n == MLX5_MPW_DSEG_MAX)
1125                                 mlx5_mpw_close(txq, &mpw);
1126                 } else {
1127                         unsigned int max;
1128
1129                         assert(mpw.state == MLX5_MPW_INL_STATE_OPENED);
1130                         assert(length <= inline_room);
1131                         assert(length == DATA_LEN(buf));
1132                         addr = rte_pktmbuf_mtod(buf, uintptr_t);
1133                         (*txq->elts)[elts_head++ & elts_m] = buf;
1134                         /* Maximum number of bytes before wrapping. */
1135                         max = ((((uintptr_t)(txq->wqes)) +
1136                                 (1 << txq->wqe_n) *
1137                                 MLX5_WQE_SIZE) -
1138                                (uintptr_t)mpw.data.raw);
1139                         if (length > max) {
1140                                 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1141                                            (void *)addr,
1142                                            max);
1143                                 mpw.data.raw = (volatile void *)txq->wqes;
1144                                 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1145                                            (void *)(addr + max),
1146                                            length - max);
1147                                 mpw.data.raw += length - max;
1148                         } else {
1149                                 rte_memcpy((void *)(uintptr_t)mpw.data.raw,
1150                                            (void *)addr,
1151                                            length);
1152
1153                                 if (length == max)
1154                                         mpw.data.raw =
1155                                                 (volatile void *)txq->wqes;
1156                                 else
1157                                         mpw.data.raw += length;
1158                         }
1159                         ++mpw.pkts_n;
1160                         mpw.total_len += length;
1161                         ++j;
1162                         if (mpw.pkts_n == MLX5_MPW_DSEG_MAX) {
1163                                 mlx5_mpw_inline_close(txq, &mpw);
1164                                 inline_room =
1165                                         txq->max_inline * RTE_CACHE_LINE_SIZE;
1166                         } else {
1167                                 inline_room -= length;
1168                         }
1169                 }
1170 #ifdef MLX5_PMD_SOFT_COUNTERS
1171                 /* Increment sent bytes counter. */
1172                 txq->stats.obytes += length;
1173 #endif
1174                 ++i;
1175         } while (pkts_n);
1176         /* Take a shortcut if nothing must be sent. */
1177         if (unlikely(i == 0))
1178                 return 0;
1179         /* Check whether completion threshold has been reached. */
1180         /* "j" includes both packets and segments. */
1181         comp = txq->elts_comp + j;
1182         if (comp >= MLX5_TX_COMP_THRESH) {
1183                 volatile struct mlx5_wqe *wqe = mpw.wqe;
1184
1185                 /* Request completion on last WQE. */
1186                 wqe->ctrl[2] = rte_cpu_to_be_32(8);
1187                 /* Save elts_head in unused "immediate" field of WQE. */
1188                 wqe->ctrl[3] = elts_head;
1189                 txq->elts_comp = 0;
1190 #ifndef NDEBUG
1191                 ++txq->cq_pi;
1192 #endif
1193         } else {
1194                 txq->elts_comp = comp;
1195         }
1196 #ifdef MLX5_PMD_SOFT_COUNTERS
1197         /* Increment sent packets counter. */
1198         txq->stats.opackets += i;
1199 #endif
1200         /* Ring QP doorbell. */
1201         if (mpw.state == MLX5_MPW_INL_STATE_OPENED)
1202                 mlx5_mpw_inline_close(txq, &mpw);
1203         else if (mpw.state == MLX5_MPW_STATE_OPENED)
1204                 mlx5_mpw_close(txq, &mpw);
1205         mlx5_tx_dbrec(txq, mpw.wqe);
1206         txq->elts_head = elts_head;
1207         return i;
1208 }
1209
1210 /**
1211  * Open an Enhanced MPW session.
1212  *
1213  * @param txq
1214  *   Pointer to TX queue structure.
1215  * @param mpw
1216  *   Pointer to MPW session structure.
1217  * @param padding
1218  *   Non-zero to pad the session start with a zero-length inline header.
1219  */
1220 static inline void
1221 mlx5_empw_new(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw, int padding)
1222 {
1223         uint16_t idx = txq->wqe_ci & ((1 << txq->wqe_n) - 1);
1224
1225         mpw->state = MLX5_MPW_ENHANCED_STATE_OPENED;
1226         mpw->pkts_n = 0;
1227         mpw->total_len = sizeof(struct mlx5_wqe);
1228         mpw->wqe = (volatile struct mlx5_wqe *)tx_mlx5_wqe(txq, idx);
1229         mpw->wqe->ctrl[0] =
1230                 rte_cpu_to_be_32((MLX5_OPC_MOD_ENHANCED_MPSW << 24) |
1231                                  (txq->wqe_ci << 8) |
1232                                  MLX5_OPCODE_ENHANCED_MPSW);
1233         mpw->wqe->ctrl[2] = 0;
1234         mpw->wqe->ctrl[3] = 0;
1235         memset((void *)(uintptr_t)&mpw->wqe->eseg, 0, MLX5_WQE_DWORD_SIZE);
1236         if (unlikely(padding)) {
1237                 uintptr_t addr = (uintptr_t)(mpw->wqe + 1);
1238
1239                 /* Pad the first 2 DWORDs with zero-length inline header. */
1240                 *(volatile uint32_t *)addr = rte_cpu_to_be_32(MLX5_INLINE_SEG);
1241                 *(volatile uint32_t *)(addr + MLX5_WQE_DWORD_SIZE) =
1242                         rte_cpu_to_be_32(MLX5_INLINE_SEG);
1243                 mpw->total_len += 2 * MLX5_WQE_DWORD_SIZE;
1244                 /* Start from the next WQEBB. */
1245                 mpw->data.raw = (volatile void *)(tx_mlx5_wqe(txq, idx + 1));
1246         } else {
1247                 mpw->data.raw = (volatile void *)(mpw->wqe + 1);
1248         }
1249 }
1250
1251 /**
1252  * Close an Enhanced MPW session.
1253  *
1254  * @param txq
1255  *   Pointer to TX queue structure.
1256  * @param mpw
1257  *   Pointer to MPW session structure.
1258  *
1259  * @return
1260  *   Number of consumed WQEs.
1261  */
1262 static inline uint16_t
1263 mlx5_empw_close(struct mlx5_txq_data *txq, struct mlx5_mpw *mpw)
1264 {
1265         uint16_t ret;
1266
1267         /* Store size in multiple of 16 bytes. Control and Ethernet segments
1268          * count as 2.
1269          */
1270         mpw->wqe->ctrl[1] = rte_cpu_to_be_32(txq->qp_num_8s |
1271                                              MLX5_WQE_DS(mpw->total_len));
1272         mpw->state = MLX5_MPW_STATE_CLOSED;
1273         ret = (mpw->total_len + (MLX5_WQE_SIZE - 1)) / MLX5_WQE_SIZE;
1274         txq->wqe_ci += ret;
1275         return ret;
1276 }
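/*
 * Worked example of the rounding above: an Enhanced MPW session whose
 * total_len is 96 bytes advertises MLX5_WQE_DS(96) = 6 sixteen-byte units in
 * ctrl[1] and consumes (96 + 63) / 64 = 2 WQEBBs, so wqe_ci advances by two.
 */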
1277
1278 /**
1279  * TX with Enhanced MPW support.
1280  *
1281  * @param txq
1282  *   Pointer to TX queue structure.
1283  * @param[in] pkts
1284  *   Packets to transmit.
1285  * @param pkts_n
1286  *   Number of packets in array.
1287  *
1288  * @return
1289  *   Number of packets successfully transmitted (<= pkts_n).
1290  */
1291 static inline uint16_t
1292 txq_burst_empw(struct mlx5_txq_data *txq, struct rte_mbuf **pkts,
1293                uint16_t pkts_n)
1294 {
1295         uint16_t elts_head = txq->elts_head;
1296         const uint16_t elts_n = 1 << txq->elts_n;
1297         const uint16_t elts_m = elts_n - 1;
1298         unsigned int i = 0;
1299         unsigned int j = 0;
1300         uint16_t max_elts;
1301         uint16_t max_wqe;
1302         unsigned int max_inline = txq->max_inline * RTE_CACHE_LINE_SIZE;
1303         unsigned int mpw_room = 0;
1304         unsigned int inl_pad = 0;
1305         uint32_t inl_hdr;
1306         struct mlx5_mpw mpw = {
1307                 .state = MLX5_MPW_STATE_CLOSED,
1308         };
1309
1310         if (unlikely(!pkts_n))
1311                 return 0;
1312         /* Start processing. */
1313         mlx5_tx_complete(txq);
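        /* Number of free slots in the elts ring. */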
1314         max_elts = (elts_n - (elts_head - txq->elts_tail));
1315         /* A CQE slot must always be available. */
1316         assert((1u << txq->cqe_n) - (txq->cq_pi - txq->cq_ci));
1317         max_wqe = (1u << txq->wqe_n) - (txq->wqe_ci - txq->wqe_pi);
1318         if (unlikely(!max_wqe))
1319                 return 0;
1320         do {
1321                 struct rte_mbuf *buf = *(pkts++);
1322                 uintptr_t addr;
1323                 unsigned int n;
1324                 unsigned int do_inline = 0; /* Whether inline is possible. */
1325                 uint32_t length;
1326                 uint8_t cs_flags;
1327
1328                 /* Multi-segment packets are handled by the slow path outside. */
1329                 assert(NB_SEGS(buf) == 1);
1330                 /* Make sure there is enough room to store this packet. */
1331                 if (max_elts - j == 0)
1332                         break;
1333                 cs_flags = txq_ol_cksum_to_cs(txq, buf);
1334                 /* Retrieve packet information. */
1335                 length = PKT_LEN(buf);
1336                 /* Close the ongoing session and start a new one if:
1337                  * - there is no room left even for a dseg
1338                  * - the packet could be inlined but does not fit in the
1339                  *   remaining room
1340                  * - cs_flags differ
1341                  */
1342                 if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED) {
1343                         if ((inl_pad + sizeof(struct mlx5_wqe_data_seg) >
1344                              mpw_room) ||
1345                             (length <= txq->inline_max_packet_sz &&
1346                              inl_pad + sizeof(inl_hdr) + length >
1347                              mpw_room) ||
1348                             (mpw.wqe->eseg.cs_flags != cs_flags))
1349                                 max_wqe -= mlx5_empw_close(txq, &mpw);
1350                 }
1351                 if (unlikely(mpw.state == MLX5_MPW_STATE_CLOSED)) {
1352                         /* In Enhanced MPW, inline as much as the budget
1353                          * allows. The remaining space is filled with dsegs.
1354                          * If the title WQEBB isn't padded, it can hold two
1355                          * dsegs.
1356                          */
1357                         mpw_room = RTE_MIN(MLX5_WQE_SIZE_MAX,
1358                                            (max_inline ? max_inline :
1359                                             pkts_n * MLX5_WQE_DWORD_SIZE) +
1360                                            MLX5_WQE_SIZE);
1361                         if (unlikely(max_wqe * MLX5_WQE_SIZE < mpw_room))
1362                                 break;
1363                         /* Don't pad the title WQEBB, to avoid wasting WQ space. */
1364                         mlx5_empw_new(txq, &mpw, 0);
1365                         mpw_room -= mpw.total_len;
1366                         inl_pad = 0;
1367                         do_inline = length <= txq->inline_max_packet_sz &&
1368                                     sizeof(inl_hdr) + length <= mpw_room &&
1369                                     !txq->mpw_hdr_dseg;
1370                         mpw.wqe->eseg.cs_flags = cs_flags;
1371                 } else {
1372                         /* Evaluate whether the next packet can be inlined.
1373                          * Inlining is possible when:
1374                          * - its length does not exceed the configured value
1375                          * - it fits in the remaining space
1376                          * - the title WQEBB is not required to hold dsegs
1377                          */
1378                         do_inline =
1379                                 length <= txq->inline_max_packet_sz &&
1380                                 inl_pad + sizeof(inl_hdr) + length <=
1381                                  mpw_room &&
1382                                 (!txq->mpw_hdr_dseg ||
1383                                  mpw.total_len >= MLX5_WQE_SIZE);
1384                 }
1385                 if (do_inline) {
1386                         /* Inline packet into WQE. */
1387                         unsigned int max;
1388
1389                         assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1390                         assert(length == DATA_LEN(buf));
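                        /* Inline header: packet length tagged with the
                         * inline segment flag.
                         */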
1391                         inl_hdr = rte_cpu_to_be_32(length | MLX5_INLINE_SEG);
1392                         addr = rte_pktmbuf_mtod(buf, uintptr_t);
1393                         mpw.data.raw = (volatile void *)
1394                                 ((uintptr_t)mpw.data.raw + inl_pad);
1395                         max = tx_mlx5_wq_tailroom(txq,
1396                                         (void *)(uintptr_t)mpw.data.raw);
1397                         /* Copy inline header. */
1398                         mpw.data.raw = (volatile void *)
1399                                 mlx5_copy_to_wq(
1400                                           (void *)(uintptr_t)mpw.data.raw,
1401                                           &inl_hdr,
1402                                           sizeof(inl_hdr),
1403                                           (void *)(uintptr_t)txq->wqes,
1404                                           max);
1405                         max = tx_mlx5_wq_tailroom(txq,
1406                                         (void *)(uintptr_t)mpw.data.raw);
1407                         /* Copy packet data. */
1408                         mpw.data.raw = (volatile void *)
1409                                 mlx5_copy_to_wq(
1410                                           (void *)(uintptr_t)mpw.data.raw,
1411                                           (void *)addr,
1412                                           length,
1413                                           (void *)(uintptr_t)txq->wqes,
1414                                           max);
1415                         ++mpw.pkts_n;
1416                         mpw.total_len += (inl_pad + sizeof(inl_hdr) + length);
1417                         /* No completion is needed as the entire packet has
1418                          * been copied to the WQ. Free the mbuf right away.
1419                          */
1420                         rte_pktmbuf_free_seg(buf);
1421                         mpw_room -= (inl_pad + sizeof(inl_hdr) + length);
1422                         /* Pad to DWORD-align the next packet, if any. */
1423                         inl_pad = (((uintptr_t)mpw.data.raw +
1424                                         (MLX5_WQE_DWORD_SIZE - 1)) &
1425                                         ~(MLX5_WQE_DWORD_SIZE - 1)) -
1426                                   (uintptr_t)mpw.data.raw;
1427                 } else {
1428                         /* No inline. Load a dseg of packet pointer. */
1429                         volatile rte_v128u32_t *dseg;
1430
1431                         assert(mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED);
1432                         assert((inl_pad + sizeof(*dseg)) <= mpw_room);
1433                         assert(length == DATA_LEN(buf));
1434                         if (!tx_mlx5_wq_tailroom(txq,
1435                                         (void *)((uintptr_t)mpw.data.raw
1436                                                 + inl_pad)))
1437                                 dseg = (volatile void *)txq->wqes;
1438                         else
1439                                 dseg = (volatile void *)
1440                                         ((uintptr_t)mpw.data.raw +
1441                                          inl_pad);
1442                         (*txq->elts)[elts_head++ & elts_m] = buf;
1443                         addr = rte_pktmbuf_mtod(buf, uintptr_t);
1444                         for (n = 0; n * RTE_CACHE_LINE_SIZE < length; n++)
1445                                 rte_prefetch2((void *)(addr +
1446                                                 n * RTE_CACHE_LINE_SIZE));
1447                         addr = rte_cpu_to_be_64(addr);
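                        /* Data segment: byte count, lkey, and the big-endian
                         * buffer address split across two 32-bit words.
                         */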
1448                         *dseg = (rte_v128u32_t) {
1449                                 rte_cpu_to_be_32(length),
1450                                 mlx5_tx_mb2mr(txq, buf),
1451                                 addr,
1452                                 addr >> 32,
1453                         };
1454                         mpw.data.raw = (volatile void *)(dseg + 1);
1455                         mpw.total_len += (inl_pad + sizeof(*dseg));
1456                         ++j;
1457                         ++mpw.pkts_n;
1458                         mpw_room -= (inl_pad + sizeof(*dseg));
1459                         inl_pad = 0;
1460                 }
1461 #ifdef MLX5_PMD_SOFT_COUNTERS
1462                 /* Increment sent bytes counter. */
1463                 txq->stats.obytes += length;
1464 #endif
1465                 ++i;
1466         } while (i < pkts_n);
1467         /* Take a shortcut if nothing must be sent. */
1468         if (unlikely(i == 0))
1469                 return 0;
1470         /* Check whether completion threshold has been reached. */
1471         if (txq->elts_comp + j >= MLX5_TX_COMP_THRESH ||
1472                         (uint16_t)(txq->wqe_ci - txq->mpw_comp) >=
1473                          (1 << txq->wqe_n) / MLX5_TX_COMP_THRESH_INLINE_DIV) {
1474                 volatile struct mlx5_wqe *wqe = mpw.wqe;
1475
1476                 /* Request completion on last WQE. */
1477                 wqe->ctrl[2] = rte_cpu_to_be_32(8);
1478                 /* Save elts_head in unused "immediate" field of WQE. */
1479                 wqe->ctrl[3] = elts_head;
1480                 txq->elts_comp = 0;
1481                 txq->mpw_comp = txq->wqe_ci;
1482 #ifndef NDEBUG
1483                 ++txq->cq_pi;
1484 #endif
1485         } else {
1486                 txq->elts_comp += j;
1487         }
1488 #ifdef MLX5_PMD_SOFT_COUNTERS
1489         /* Increment sent packets counter. */
1490         txq->stats.opackets += i;
1491 #endif
1492         if (mpw.state == MLX5_MPW_ENHANCED_STATE_OPENED)
1493                 mlx5_empw_close(txq, &mpw);
1494         /* Ring QP doorbell. */
1495         mlx5_tx_dbrec(txq, mpw.wqe);
1496         txq->elts_head = elts_head;
1497         return i;
1498 }
1499
1500 /**
1501  * DPDK callback for TX with Enhanced MPW support.
1502  *
1503  * @param dpdk_txq
1504  *   Generic pointer to TX queue structure.
1505  * @param[in] pkts
1506  *   Packets to transmit.
1507  * @param pkts_n
1508  *   Number of packets in array.
1509  *
1510  * @return
1511  *   Number of packets successfully transmitted (<= pkts_n).
1512  */
1513 uint16_t
1514 mlx5_tx_burst_empw(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1515 {
1516         struct mlx5_txq_data *txq = (struct mlx5_txq_data *)dpdk_txq;
1517         uint16_t nb_tx = 0;
1518
1519         while (pkts_n > nb_tx) {
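                /* Multi-segment runs go through mlx5_tx_burst() while runs of
                 * single-segment packets use Enhanced MPW.
                 */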
1520                 uint16_t n;
1521                 uint16_t ret;
1522
1523                 n = txq_count_contig_multi_seg(&pkts[nb_tx], pkts_n - nb_tx);
1524                 if (n) {
1525                         ret = mlx5_tx_burst(dpdk_txq, &pkts[nb_tx], n);
1526                         if (!ret)
1527                                 break;
1528                         nb_tx += ret;
1529                 }
1530                 n = txq_count_contig_single_seg(&pkts[nb_tx], pkts_n - nb_tx);
1531                 if (n) {
1532                         ret = txq_burst_empw(txq, &pkts[nb_tx], n);
1533                         if (!ret)
1534                                 break;
1535                         nb_tx += ret;
1536                 }
1537         }
1538         return nb_tx;
1539 }
1540
1541 /**
1542  * Translate RX completion flags to packet type.
1543  *
1544  * @param[in] cqe
1545  *   Pointer to CQE.
1546  *
1547  * @note: fix mlx5_dev_supported_ptypes_get() if any change here.
1548  *
1549  * @return
1550  *   Packet type for struct rte_mbuf.
1551  */
1552 static inline uint32_t
1553 rxq_cq_to_pkt_type(volatile struct mlx5_cqe *cqe)
1554 {
1555         uint8_t idx;
1556         uint8_t pinfo = cqe->pkt_info;
1557         uint16_t ptype = cqe->hdr_type_etc;
1558
1559         /*
1560          * The index to the array should have:
1561          * bit[1:0] = l3_hdr_type
1562          * bit[4:2] = l4_hdr_type
1563          * bit[5] = ip_frag
1564          * bit[6] = tunneled
1565          * bit[7] = outer_l3_type
1566          */
1567         idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
1568         return mlx5_ptype_table[idx];
1569 }
1570
1571 /**
1572  * Get the size of the next packet for a given CQE. For compressed CQEs, the
1573  * consumer index is updated only once all packets of the current compressed
1574  * CQE have been processed.
1575  *
1576  * @param rxq
1577  *   Pointer to RX queue.
1578  * @param cqe
1579  *   CQE to process.
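 * @param cqe_cnt
 *   Number of CQEs in the completion queue minus one (used as an index mask).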
1580  * @param[out] rss_hash
1581  *   Packet RSS Hash result.
1582  *
1583  * @return
1584  *   Packet size in bytes (0 if there is none), -1 in case of completion
1585  *   with error.
1586  */
1587 static inline int
1588 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
1589                  uint16_t cqe_cnt, uint32_t *rss_hash)
1590 {
1591         struct rxq_zip *zip = &rxq->zip;
1592         uint16_t cqe_n = cqe_cnt + 1;
1593         int len = 0;
1594         uint16_t idx, end;
1595
1596         /* Process compressed data in the CQE and mini arrays. */
1597         if (zip->ai) {
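                /* zip->ai indexes the current 8-entry mini-CQE array while
                 * zip->ca and zip->na hold the CQ indexes of the current and
                 * next mini-CQE arrays.
                 */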
1598                 volatile struct mlx5_mini_cqe8 (*mc)[8] =
1599                         (volatile struct mlx5_mini_cqe8 (*)[8])
1600                         (uintptr_t)(&(*rxq->cqes)[zip->ca & cqe_cnt].pkt_info);
1601
1602                 len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
1603                 *rss_hash = rte_be_to_cpu_32((*mc)[zip->ai & 7].rx_hash_result);
1604                 if ((++zip->ai & 7) == 0) {
1605                         /* Invalidate consumed CQEs */
1606                         idx = zip->ca;
1607                         end = zip->na;
1608                         while (idx != end) {
1609                                 (*rxq->cqes)[idx & cqe_cnt].op_own =
1610                                         MLX5_CQE_INVALIDATE;
1611                                 ++idx;
1612                         }
1613                         /*
1614                          * Increment consumer index to skip the number of
1615                          * CQEs consumed. Hardware leaves holes in the CQ
1616                          * ring for software use.
1617                          */
1618                         zip->ca = zip->na;
1619                         zip->na += 8;
1620                 }
1621                 if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
1622                         /* Invalidate the rest */
1623                         idx = zip->ca;
1624                         end = zip->cq_ci;
1625
1626                         while (idx != end) {
1627                                 (*rxq->cqes)[idx & cqe_cnt].op_own =
1628                                         MLX5_CQE_INVALIDATE;
1629                                 ++idx;
1630                         }
1631                         rxq->cq_ci = zip->cq_ci;
1632                         zip->ai = 0;
1633                 }
1634         /* No compressed data, get next CQE and verify if it is compressed. */
1635         } else {
1636                 int ret;
1637                 int8_t op_own;
1638
1639                 ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
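                /* No valid CQE to process yet, report no packet. */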
1640                 if (unlikely(ret == 1))
1641                         return 0;
1642                 ++rxq->cq_ci;
1643                 op_own = cqe->op_own;
1644                 rte_cio_rmb();
1645                 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
1646                         volatile struct mlx5_mini_cqe8 (*mc)[8] =
1647                                 (volatile struct mlx5_mini_cqe8 (*)[8])
1648                                 (uintptr_t)(&(*rxq->cqes)[rxq->cq_ci &
1649                                                           cqe_cnt].pkt_info);
1650
1651                         /* Number of compressed packets, fix endianness. */
1652                         zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
1653                         /*
1654                          * The current mini-array position is the one
1655                          * returned by check_cqe().
1656                          *
1657                          * If completion comprises several mini arrays, as a
1658                          * special case the second one is located 7 CQEs after
1659                          * the initial CQE instead of 8 for subsequent ones.
1660                          */
1661                         zip->ca = rxq->cq_ci;
1662                         zip->na = zip->ca + 7;
1663                         /* Compute the next non compressed CQE. */
1664                         --rxq->cq_ci;
1665                         zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
1666                         /* Get packet size to return. */
1667                         len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
1668                         *rss_hash = rte_be_to_cpu_32((*mc)[0].rx_hash_result);
1669                         zip->ai = 1;
1670                         /* Prefetch all the entries to be invalidated */
1671                         idx = zip->ca;
1672                         end = zip->cq_ci;
1673                         while (idx != end) {
1674                                 rte_prefetch0(&(*rxq->cqes)[(idx) & cqe_cnt]);
1675                                 ++idx;
1676                         }
1677                 } else {
1678                         len = rte_be_to_cpu_32(cqe->byte_cnt);
1679                         *rss_hash = rte_be_to_cpu_32(cqe->rx_hash_res);
1680                 }
1681                 /* Error while receiving packet. */
1682                 if (unlikely(MLX5_CQE_OPCODE(op_own) == MLX5_CQE_RESP_ERR))
1683                         return -1;
1684         }
1685         return len;
1686 }
1687
1688 /**
1689  * Translate RX completion flags to offload flags.
1690  *
1691  * @param[in] rxq
1692  *   Pointer to RX queue structure.
1693  * @param[in] cqe
1694  *   Pointer to CQE.
1695  *
1696  * @return
1697  *   Offload flags (ol_flags) for struct rte_mbuf.
1698  */
1699 static inline uint32_t
1700 rxq_cq_to_ol_flags(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
1701 {
1702         uint32_t ol_flags = 0;
1703         uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1704
1705         ol_flags =
1706                 TRANSPOSE(flags,
1707                           MLX5_CQE_RX_L3_HDR_VALID,
1708                           PKT_RX_IP_CKSUM_GOOD) |
1709                 TRANSPOSE(flags,
1710                           MLX5_CQE_RX_L4_HDR_VALID,
1711                           PKT_RX_L4_CKSUM_GOOD);
1712         if ((cqe->pkt_info & MLX5_CQE_RX_TUNNEL_PACKET) && (rxq->csum_l2tun))
1713                 ol_flags |=
1714                         TRANSPOSE(flags,
1715                                   MLX5_CQE_RX_L3_HDR_VALID,
1716                                   PKT_RX_IP_CKSUM_GOOD) |
1717                         TRANSPOSE(flags,
1718                                   MLX5_CQE_RX_L4_HDR_VALID,
1719                                   PKT_RX_L4_CKSUM_GOOD);
1720         return ol_flags;
1721 }
1722
1723 /**
1724  * DPDK callback for RX.
1725  *
1726  * @param dpdk_rxq
1727  *   Generic pointer to RX queue structure.
1728  * @param[out] pkts
1729  *   Array to store received packets.
1730  * @param pkts_n
1731  *   Maximum number of packets in array.
1732  *
1733  * @return
1734  *   Number of packets successfully received (<= pkts_n).
1735  */
1736 uint16_t
1737 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1738 {
1739         struct mlx5_rxq_data *rxq = dpdk_rxq;
1740         const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1741         const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1742         const unsigned int sges_n = rxq->sges_n;
1743         struct rte_mbuf *pkt = NULL;
1744         struct rte_mbuf *seg = NULL;
1745         volatile struct mlx5_cqe *cqe =
1746                 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1747         unsigned int i = 0;
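        /* Consumer index counted in individual SGEs (rxq->rq_ci counts
         * whole strides of 1 << sges_n SGEs).
         */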
1748         unsigned int rq_ci = rxq->rq_ci << sges_n;
1749         int len = 0; /* keep its value across iterations. */
1750
1751         while (pkts_n) {
1752                 unsigned int idx = rq_ci & wqe_cnt;
1753                 volatile struct mlx5_wqe_data_seg *wqe = &(*rxq->wqes)[idx];
1754                 struct rte_mbuf *rep = (*rxq->elts)[idx];
1755                 uint32_t rss_hash_res = 0;
1756
1757                 if (pkt)
1758                         NEXT(seg) = rep;
1759                 seg = rep;
1760                 rte_prefetch0(seg);
1761                 rte_prefetch0(cqe);
1762                 rte_prefetch0(wqe);
1763                 rep = rte_mbuf_raw_alloc(rxq->mp);
1764                 if (unlikely(rep == NULL)) {
1765                         ++rxq->stats.rx_nombuf;
1766                         if (!pkt) {
1767                                 /*
1768                                  * no buffers before we even started,
1769                                  * bail out silently.
1770                                  */
1771                                 break;
1772                         }
1773                         while (pkt != seg) {
1774                                 assert(pkt != (*rxq->elts)[idx]);
1775                                 rep = NEXT(pkt);
1776                                 NEXT(pkt) = NULL;
1777                                 NB_SEGS(pkt) = 1;
1778                                 rte_mbuf_raw_free(pkt);
1779                                 pkt = rep;
1780                         }
1781                         break;
1782                 }
1783                 if (!pkt) {
1784                         cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1785                         len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt,
1786                                                &rss_hash_res);
1787                         if (!len) {
1788                                 rte_mbuf_raw_free(rep);
1789                                 break;
1790                         }
1791                         if (unlikely(len == -1)) {
1792                                 /* RX error, packet is likely too large. */
1793                                 rte_mbuf_raw_free(rep);
1794                                 ++rxq->stats.idropped;
1795                                 goto skip;
1796                         }
1797                         pkt = seg;
1798                         assert(len >= (rxq->crc_present << 2));
1799                         /* Update packet information. */
1800                         pkt->packet_type = rxq_cq_to_pkt_type(cqe);
1801                         pkt->ol_flags = 0;
1802                         if (rss_hash_res && rxq->rss_hash) {
1803                                 pkt->hash.rss = rss_hash_res;
1804                                 pkt->ol_flags = PKT_RX_RSS_HASH;
1805                         }
1806                         if (rxq->mark &&
1807                             MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1808                                 pkt->ol_flags |= PKT_RX_FDIR;
1809                                 if (cqe->sop_drop_qpn !=
1810                                     rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1811                                         uint32_t mark = cqe->sop_drop_qpn;
1812
1813                                         pkt->ol_flags |= PKT_RX_FDIR_ID;
1814                                         pkt->hash.fdir.hi =
1815                                                 mlx5_flow_mark_get(mark);
1816                                 }
1817                         }
1818                         if (rxq->csum | rxq->csum_l2tun)
1819                                 pkt->ol_flags |= rxq_cq_to_ol_flags(rxq, cqe);
1820                         if (rxq->vlan_strip &&
1821                             (cqe->hdr_type_etc &
1822                              rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1823                                 pkt->ol_flags |= PKT_RX_VLAN |
1824                                         PKT_RX_VLAN_STRIPPED;
1825                                 pkt->vlan_tci =
1826                                         rte_be_to_cpu_16(cqe->vlan_info);
1827                         }
1828                         if (rxq->hw_timestamp) {
1829                                 pkt->timestamp =
1830                                         rte_be_to_cpu_64(cqe->timestamp);
1831                                 pkt->ol_flags |= PKT_RX_TIMESTAMP;
1832                         }
1833                         if (rxq->crc_present)
1834                                 len -= ETHER_CRC_LEN;
1835                         PKT_LEN(pkt) = len;
1836                 }
1837                 DATA_LEN(rep) = DATA_LEN(seg);
1838                 PKT_LEN(rep) = PKT_LEN(seg);
1839                 SET_DATA_OFF(rep, DATA_OFF(seg));
1840                 PORT(rep) = PORT(seg);
1841                 (*rxq->elts)[idx] = rep;
1842                 /*
1843                  * Fill the NIC descriptor with the new buffer. The lkey and
1844                  * size of the buffers are already known; only the buffer
1845                  * address changes.
1846                  */
1847                 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
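                /* When the packet spills past this SGE, keep scattering into
                 * the next one.
                 */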
1848                 if (len > DATA_LEN(seg)) {
1849                         len -= DATA_LEN(seg);
1850                         ++NB_SEGS(pkt);
1851                         ++rq_ci;
1852                         continue;
1853                 }
1854                 DATA_LEN(seg) = len;
1855 #ifdef MLX5_PMD_SOFT_COUNTERS
1856                 /* Increment bytes counter. */
1857                 rxq->stats.ibytes += PKT_LEN(pkt);
1858 #endif
1859                 /* Return packet. */
1860                 *(pkts++) = pkt;
1861                 pkt = NULL;
1862                 --pkts_n;
1863                 ++i;
1864 skip:
1865                 /* Align consumer index to the next stride. */
1866                 rq_ci >>= sges_n;
1867                 ++rq_ci;
1868                 rq_ci <<= sges_n;
1869         }
1870         if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1871                 return 0;
1872         /* Update the consumer index. */
1873         rxq->rq_ci = rq_ci >> sges_n;
1874         rte_cio_wmb();
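        /* Ring the CQ and RQ doorbell records with the updated indexes. */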
1875         *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1876         rte_cio_wmb();
1877         *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1878 #ifdef MLX5_PMD_SOFT_COUNTERS
1879         /* Increment packets counter. */
1880         rxq->stats.ipackets += i;
1881 #endif
1882         return i;
1883 }
1884
1885 /**
1886  * Dummy DPDK callback for TX.
1887  *
1888  * This function is used to temporarily replace the real callback during
1889  * unsafe control operations on the queue, or in case of error.
1890  *
1891  * @param dpdk_txq
1892  *   Generic pointer to TX queue structure.
1893  * @param[in] pkts
1894  *   Packets to transmit.
1895  * @param pkts_n
1896  *   Number of packets in array.
1897  *
1898  * @return
1899  *   Number of packets successfully transmitted (<= pkts_n).
1900  */
1901 uint16_t
1902 removed_tx_burst(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1903 {
1904         (void)dpdk_txq;
1905         (void)pkts;
1906         (void)pkts_n;
1907         return 0;
1908 }
1909
1910 /**
1911  * Dummy DPDK callback for RX.
1912  *
1913  * This function is used to temporarily replace the real callback during
1914  * unsafe control operations on the queue, or in case of error.
1915  *
1916  * @param dpdk_rxq
1917  *   Generic pointer to RX queue structure.
1918  * @param[out] pkts
1919  *   Array to store received packets.
1920  * @param pkts_n
1921  *   Maximum number of packets in array.
1922  *
1923  * @return
1924  *   Number of packets successfully received (<= pkts_n).
1925  */
1926 uint16_t
1927 removed_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1928 {
1929         (void)dpdk_rxq;
1930         (void)pkts;
1931         (void)pkts_n;
1932         return 0;
1933 }
1934
1935 /*
1936  * Vectorized Rx/Tx routines are not compiled in when the required vector
1937  * instructions are not supported on the target architecture. The following
1938  * null stubs are needed for linkage when the vectorized routines (e.g.
1939  * mlx5_rxtx_vec_sse.c for x86) are not compiled in.
1940  */
1941
1942 uint16_t __attribute__((weak))
1943 mlx5_tx_burst_raw_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1944 {
1945         (void)dpdk_txq;
1946         (void)pkts;
1947         (void)pkts_n;
1948         return 0;
1949 }
1950
1951 uint16_t __attribute__((weak))
1952 mlx5_tx_burst_vec(void *dpdk_txq, struct rte_mbuf **pkts, uint16_t pkts_n)
1953 {
1954         (void)dpdk_txq;
1955         (void)pkts;
1956         (void)pkts_n;
1957         return 0;
1958 }
1959
1960 uint16_t __attribute__((weak))
1961 mlx5_rx_burst_vec(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1962 {
1963         (void)dpdk_rxq;
1964         (void)pkts;
1965         (void)pkts_n;
1966         return 0;
1967 }
1968
1969 int __attribute__((weak))
1970 priv_check_raw_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev)
1971 {
1972         (void)priv;
1973         (void)dev;
1974         return -ENOTSUP;
1975 }
1976
1977 int __attribute__((weak))
1978 priv_check_vec_tx_support(struct priv *priv, struct rte_eth_dev *dev)
1979 {
1980         (void)priv;
1981         (void)dev;
1982         return -ENOTSUP;
1983 }
1984
1985 int __attribute__((weak))
1986 rxq_check_vec_support(struct mlx5_rxq_data *rxq)
1987 {
1988         (void)rxq;
1989         return -ENOTSUP;
1990 }
1991
1992 int __attribute__((weak))
1993 priv_check_vec_rx_support(struct priv *priv)
1994 {
1995         (void)priv;
1996         return -ENOTSUP;
1997 }