net/mlx5: add Tx configuration and setup
[dpdk.git] drivers/net/mlx5/mlx5_rxtx.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2015 6WIND S.A.
3  * Copyright 2015 Mellanox Technologies, Ltd
4  */
5
6 #include <assert.h>
7 #include <stdint.h>
8 #include <string.h>
9 #include <stdlib.h>
10
11 /* Verbs header. */
12 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
13 #ifdef PEDANTIC
14 #pragma GCC diagnostic ignored "-Wpedantic"
15 #endif
16 #include <infiniband/verbs.h>
17 #include <infiniband/mlx5dv.h>
18 #ifdef PEDANTIC
19 #pragma GCC diagnostic error "-Wpedantic"
20 #endif
21
22 #include <rte_mbuf.h>
23 #include <rte_mempool.h>
24 #include <rte_prefetch.h>
25 #include <rte_common.h>
26 #include <rte_branch_prediction.h>
27 #include <rte_ether.h>
28 #include <rte_cycles.h>
29
30 #include "mlx5.h"
31 #include "mlx5_utils.h"
32 #include "mlx5_rxtx.h"
33 #include "mlx5_autoconf.h"
34 #include "mlx5_defs.h"
35 #include "mlx5_prm.h"
36
37 static __rte_always_inline uint32_t
38 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe);
39
40 static __rte_always_inline int
41 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
42                  uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe);
43
44 static __rte_always_inline uint32_t
45 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe);
46
47 static __rte_always_inline void
48 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
49                volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res);
50
51 static __rte_always_inline void
52 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx);
53
54 static int
55 mlx5_queue_state_modify(struct rte_eth_dev *dev,
56                         struct mlx5_mp_arg_queue_state_modify *sm);
57
58 uint32_t mlx5_ptype_table[] __rte_cache_aligned = {
59         [0xff] = RTE_PTYPE_ALL_MASK, /* Last entry for errored packet. */
60 };
61
62 uint8_t mlx5_cksum_table[1 << 10] __rte_cache_aligned;
63 uint8_t mlx5_swp_types_table[1 << 10] __rte_cache_aligned;
64
65 /**
66  * Build a table to translate Rx completion flags to packet type.
67  *
68  * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
69  */
70 void
71 mlx5_set_ptype_table(void)
72 {
73         unsigned int i;
74         uint32_t (*p)[RTE_DIM(mlx5_ptype_table)] = &mlx5_ptype_table;
75
76         /* Last entry must not be overwritten, reserved for errored packet. */
77         for (i = 0; i < RTE_DIM(mlx5_ptype_table) - 1; ++i)
78                 (*p)[i] = RTE_PTYPE_UNKNOWN;
79         /*
80          * The index to the array should have:
81          * bit[1:0] = l3_hdr_type
82          * bit[4:2] = l4_hdr_type
83          * bit[5] = ip_frag
84          * bit[6] = tunneled
85          * bit[7] = outer_l3_type
86          */
87         /* L2 */
88         (*p)[0x00] = RTE_PTYPE_L2_ETHER;
89         /* L3 */
90         (*p)[0x01] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
91                      RTE_PTYPE_L4_NONFRAG;
92         (*p)[0x02] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
93                      RTE_PTYPE_L4_NONFRAG;
94         /* Fragmented */
95         (*p)[0x21] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
96                      RTE_PTYPE_L4_FRAG;
97         (*p)[0x22] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
98                      RTE_PTYPE_L4_FRAG;
99         /* TCP */
100         (*p)[0x05] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
101                      RTE_PTYPE_L4_TCP;
102         (*p)[0x06] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
103                      RTE_PTYPE_L4_TCP;
104         (*p)[0x0d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
105                      RTE_PTYPE_L4_TCP;
106         (*p)[0x0e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
107                      RTE_PTYPE_L4_TCP;
108         (*p)[0x11] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
109                      RTE_PTYPE_L4_TCP;
110         (*p)[0x12] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
111                      RTE_PTYPE_L4_TCP;
112         /* UDP */
113         (*p)[0x09] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
114                      RTE_PTYPE_L4_UDP;
115         (*p)[0x0a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
116                      RTE_PTYPE_L4_UDP;
117         /* Repeat with outer_l3_type being set. Just in case. */
118         (*p)[0x81] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
119                      RTE_PTYPE_L4_NONFRAG;
120         (*p)[0x82] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
121                      RTE_PTYPE_L4_NONFRAG;
122         (*p)[0xa1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
123                      RTE_PTYPE_L4_FRAG;
124         (*p)[0xa2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
125                      RTE_PTYPE_L4_FRAG;
126         (*p)[0x85] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
127                      RTE_PTYPE_L4_TCP;
128         (*p)[0x86] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
129                      RTE_PTYPE_L4_TCP;
130         (*p)[0x8d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
131                      RTE_PTYPE_L4_TCP;
132         (*p)[0x8e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
133                      RTE_PTYPE_L4_TCP;
134         (*p)[0x91] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
135                      RTE_PTYPE_L4_TCP;
136         (*p)[0x92] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
137                      RTE_PTYPE_L4_TCP;
138         (*p)[0x89] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
139                      RTE_PTYPE_L4_UDP;
140         (*p)[0x8a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
141                      RTE_PTYPE_L4_UDP;
142         /* Tunneled - L3 */
143         (*p)[0x40] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN;
144         (*p)[0x41] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
145                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
146                      RTE_PTYPE_INNER_L4_NONFRAG;
147         (*p)[0x42] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
148                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
149                      RTE_PTYPE_INNER_L4_NONFRAG;
150         (*p)[0xc0] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN;
151         (*p)[0xc1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
152                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
153                      RTE_PTYPE_INNER_L4_NONFRAG;
154         (*p)[0xc2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
155                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
156                      RTE_PTYPE_INNER_L4_NONFRAG;
157         /* Tunneled - Fragmented */
158         (*p)[0x61] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
159                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
160                      RTE_PTYPE_INNER_L4_FRAG;
161         (*p)[0x62] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
162                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
163                      RTE_PTYPE_INNER_L4_FRAG;
164         (*p)[0xe1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
165                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
166                      RTE_PTYPE_INNER_L4_FRAG;
167         (*p)[0xe2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
168                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
169                      RTE_PTYPE_INNER_L4_FRAG;
170         /* Tunneled - TCP */
171         (*p)[0x45] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
172                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
173                      RTE_PTYPE_INNER_L4_TCP;
174         (*p)[0x46] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
175                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
176                      RTE_PTYPE_INNER_L4_TCP;
177         (*p)[0x4d] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
178                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
179                      RTE_PTYPE_INNER_L4_TCP;
180         (*p)[0x4e] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
181                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
182                      RTE_PTYPE_INNER_L4_TCP;
183         (*p)[0x51] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
184                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
185                      RTE_PTYPE_INNER_L4_TCP;
186         (*p)[0x52] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
187                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
188                      RTE_PTYPE_INNER_L4_TCP;
189         (*p)[0xc5] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
190                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
191                      RTE_PTYPE_INNER_L4_TCP;
192         (*p)[0xc6] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
193                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
194                      RTE_PTYPE_INNER_L4_TCP;
195         (*p)[0xcd] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
196                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
197                      RTE_PTYPE_INNER_L4_TCP;
198         (*p)[0xce] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
199                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
200                      RTE_PTYPE_INNER_L4_TCP;
201         (*p)[0xd1] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
202                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
203                      RTE_PTYPE_INNER_L4_TCP;
204         (*p)[0xd2] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
205                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
206                      RTE_PTYPE_INNER_L4_TCP;
207         /* Tunneled - UDP */
208         (*p)[0x49] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
209                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
210                      RTE_PTYPE_INNER_L4_UDP;
211         (*p)[0x4a] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN |
212                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
213                      RTE_PTYPE_INNER_L4_UDP;
214         (*p)[0xc9] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
215                      RTE_PTYPE_INNER_L3_IPV6_EXT_UNKNOWN |
216                      RTE_PTYPE_INNER_L4_UDP;
217         (*p)[0xca] = RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV6_EXT_UNKNOWN |
218                      RTE_PTYPE_INNER_L3_IPV4_EXT_UNKNOWN |
219                      RTE_PTYPE_INNER_L4_UDP;
220 }
221
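/*
 * Illustration (not part of the driver): with the index layout documented
 * above, a lookup from a completion entry mirrors the bit packing done
 * later in rxq_cq_to_pkt_type(), roughly:
 *
 *   uint8_t pinfo = cqe->pkt_info;
 *   uint16_t ptype = cqe->hdr_type_etc;
 *   uint8_t idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
 *   uint32_t packet_type = mlx5_ptype_table[idx];
 *
 * so e.g. a plain (non-tunneled) IPv4/TCP completion resolves to
 * RTE_PTYPE_L2_ETHER | RTE_PTYPE_L3_IPV4_EXT_UNKNOWN | RTE_PTYPE_L4_TCP.
 */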
222 /**
223  * Build a table to translate packet checksum flags to the Verbs checksum type.
224  */
225 void
226 mlx5_set_cksum_table(void)
227 {
228         unsigned int i;
229         uint8_t v;
230
231         /*
232          * The index should have:
233          * bit[0] = PKT_TX_TCP_SEG
234          * bit[2:3] = PKT_TX_UDP_CKSUM, PKT_TX_TCP_CKSUM
235          * bit[4] = PKT_TX_IP_CKSUM
236          * bit[8] = PKT_TX_OUTER_IP_CKSUM
237          * bit[9] = tunnel
238          */
239         for (i = 0; i < RTE_DIM(mlx5_cksum_table); ++i) {
240                 v = 0;
241                 if (i & (1 << 9)) {
242                         /* Tunneled packet. */
243                         if (i & (1 << 8)) /* Outer IP. */
244                                 v |= MLX5_ETH_WQE_L3_CSUM;
245                         if (i & (1 << 4)) /* Inner IP. */
246                                 v |= MLX5_ETH_WQE_L3_INNER_CSUM;
247                         if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
248                                 v |= MLX5_ETH_WQE_L4_INNER_CSUM;
249                 } else {
250                         /* No tunnel. */
251                         if (i & (1 << 4)) /* IP. */
252                                 v |= MLX5_ETH_WQE_L3_CSUM;
253                         if (i & (3 << 2 | 1 << 0)) /* L4 or TSO. */
254                                 v |= MLX5_ETH_WQE_L4_CSUM;
255                 }
256                 mlx5_cksum_table[i] = v;
257         }
258 }
259
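/*
 * Usage sketch (hypothetical, not part of this file): the Tx path can
 * derive the index above straight from the mbuf offload flags, assuming
 * PKT_TX_TCP_SEG occupies bit 50 of ol_flags so that a right shift by 50
 * lines the flags up with the documented index layout:
 *
 *   const uint64_t mask = PKT_TX_TCP_SEG | PKT_TX_L4_MASK |
 *                         PKT_TX_IP_CKSUM | PKT_TX_OUTER_IP_CKSUM;
 *   uint32_t idx = ((buf->ol_flags & mask) >> 50) |
 *                  (!!(buf->ol_flags & PKT_TX_TUNNEL_MASK) << 9);
 *   uint8_t cs_flags = mlx5_cksum_table[idx];
 */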
260 /**
261  * Build a table to translate the packet type of an mbuf to the Verbs SWP type.
262  */
263 void
264 mlx5_set_swp_types_table(void)
265 {
266         unsigned int i;
267         uint8_t v;
268
269         /*
270          * The index should have:
271          * bit[0:1] = PKT_TX_L4_MASK
272          * bit[4] = PKT_TX_IPV6
273          * bit[8] = PKT_TX_OUTER_IPV6
274          * bit[9] = PKT_TX_OUTER_UDP
275          */
276         for (i = 0; i < RTE_DIM(mlx5_swp_types_table); ++i) {
277                 v = 0;
278                 if (i & (1 << 8))
279                         v |= MLX5_ETH_WQE_L3_OUTER_IPV6;
280                 if (i & (1 << 9))
281                         v |= MLX5_ETH_WQE_L4_OUTER_UDP;
282                 if (i & (1 << 4))
283                         v |= MLX5_ETH_WQE_L3_INNER_IPV6;
284                 if ((i & 3) == (PKT_TX_UDP_CKSUM >> 52))
285                         v |= MLX5_ETH_WQE_L4_INNER_UDP;
286                 mlx5_swp_types_table[i] = v;
287         }
288 }
289
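/*
 * Illustration (hypothetical, not part of this file): an index matching the
 * layout above, e.g. for an inner UDP datagram inside an IPv6-over-UDP
 * tunnel, could be composed as
 *
 *   uint32_t idx = ((buf->ol_flags & PKT_TX_L4_MASK) >> 52) |
 *                  (!!(buf->ol_flags & PKT_TX_IPV6) << 4) |
 *                  (!!(buf->ol_flags & PKT_TX_OUTER_IPV6) << 8) |
 *                  (outer_udp << 9);
 *   uint8_t swp_types = mlx5_swp_types_table[idx];
 *
 * where outer_udp is 1 when the tunnel uses an outer UDP header; it is
 * derived from the tunnel type since there is no dedicated PKT_TX_OUTER_UDP
 * mbuf flag (the name above only mirrors the table comment).
 */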
290 /**
291  * Internal function to compute the number of used descriptors in an Rx queue.
292  *
293  * @param rxq
294  *   The Rx queue.
295  *
296  * @return
297  *   The number of used Rx descriptors.
298  */
299 static uint32_t
300 rx_queue_count(struct mlx5_rxq_data *rxq)
301 {
302         struct rxq_zip *zip = &rxq->zip;
303         volatile struct mlx5_cqe *cqe;
304         const unsigned int cqe_n = (1 << rxq->cqe_n);
305         const unsigned int cqe_cnt = cqe_n - 1;
306         unsigned int cq_ci;
307         unsigned int used;
308
309         /* If we are processing a compressed CQE. */
310         if (zip->ai) {
311                 used = zip->cqe_cnt - zip->ca;
312                 cq_ci = zip->cq_ci;
313         } else {
314                 used = 0;
315                 cq_ci = rxq->cq_ci;
316         }
317         cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
318         while (check_cqe(cqe, cqe_n, cq_ci) != MLX5_CQE_STATUS_HW_OWN) {
319                 int8_t op_own;
320                 unsigned int n;
321
322                 op_own = cqe->op_own;
323                 if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED)
324                         n = rte_be_to_cpu_32(cqe->byte_cnt);
325                 else
326                         n = 1;
327                 cq_ci += n;
328                 used += n;
329                 cqe = &(*rxq->cqes)[cq_ci & cqe_cnt];
330         }
331         used = RTE_MIN(used, (1U << rxq->elts_n) - 1);
332         return used;
333 }
334
335 /**
336  * DPDK callback to check the status of an Rx descriptor.
337  *
338  * @param rx_queue
339  *   The Rx queue.
340  * @param[in] offset
341  *   The index of the descriptor in the ring.
342  *
343  * @return
344  *   The status of the Rx descriptor.
345  */
346 int
347 mlx5_rx_descriptor_status(void *rx_queue, uint16_t offset)
348 {
349         struct mlx5_rxq_data *rxq = rx_queue;
350         struct mlx5_rxq_ctrl *rxq_ctrl =
351                         container_of(rxq, struct mlx5_rxq_ctrl, rxq);
352         struct rte_eth_dev *dev = ETH_DEV(rxq_ctrl->priv);
353
354         if (dev->rx_pkt_burst != mlx5_rx_burst) {
355                 rte_errno = ENOTSUP;
356                 return -rte_errno;
357         }
358         if (offset >= (1 << rxq->elts_n)) {
359                 rte_errno = EINVAL;
360                 return -rte_errno;
361         }
362         if (offset < rx_queue_count(rxq))
363                 return RTE_ETH_RX_DESC_DONE;
364         return RTE_ETH_RX_DESC_AVAIL;
365 }
366
367 /**
368  * DPDK callback to get the number of used descriptors in an Rx queue.
369  *
370  * @param dev
371  *   Pointer to the device structure.
372  *
373  * @param rx_queue_id
374  *   The Rx queue.
375  *
376  * @return
377  *   The number of used Rx descriptors.
378  *   -EINVAL if the queue is invalid.
379  */
380 uint32_t
381 mlx5_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
382 {
383         struct mlx5_priv *priv = dev->data->dev_private;
384         struct mlx5_rxq_data *rxq;
385
386         if (dev->rx_pkt_burst != mlx5_rx_burst) {
387                 rte_errno = ENOTSUP;
388                 return -rte_errno;
389         }
390         rxq = (*priv->rxqs)[rx_queue_id];
391         if (!rxq) {
392                 rte_errno = EINVAL;
393                 return -rte_errno;
394         }
395         return rx_queue_count(rxq);
396 }
397
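/*
 * Application-side usage sketch (hypothetical): these callbacks are reached
 * through the generic ethdev API, e.g.
 *
 *   int used = rte_eth_rx_queue_count(port_id, queue_id);
 *   int status = rte_eth_rx_descriptor_status(port_id, queue_id, offset);
 *   if (status == RTE_ETH_RX_DESC_DONE)
 *       ...;  a received packet is waiting at that offset
 */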
398 #define MLX5_SYSTEM_LOG_DIR "/var/log"
399 /**
400  * Dump debug information to log file.
401  *
402  * @param fname
403  *   The file name.
404  * @param hex_title
405  *   If not NULL, this string is printed as a header to the output
406  *   and the buffer is dumped in hexadecimal view.
407  * @param buf
408  *   This is the buffer address to print out.
409  * @param hex_len
410  *   The number of bytes to dump out.
411  */
412 void
413 mlx5_dump_debug_information(const char *fname, const char *hex_title,
414                             const void *buf, unsigned int hex_len)
415 {
416         FILE *fd;
417
418         MKSTR(path, "%s/%s", MLX5_SYSTEM_LOG_DIR, fname);
419         fd = fopen(path, "a+");
420         if (!fd) {
421                 DRV_LOG(WARNING, "cannot open %s for debug dump\n",
422                         path);
423                 MKSTR(path2, "./%s", fname);
424                 fd = fopen(path2, "a+");
425                 if (!fd) {
426                         DRV_LOG(ERR, "cannot open %s for debug dump\n",
427                                 path2);
428                         return;
429                 }
430                 DRV_LOG(INFO, "New debug dump in file %s\n", path2);
431         } else {
432                 DRV_LOG(INFO, "New debug dump in file %s\n", path);
433         }
434         if (hex_title)
435                 rte_hexdump(fd, hex_title, buf, hex_len);
436         else
437                 fprintf(fd, "%s", (const char *)buf);
438         fprintf(fd, "\n\n\n");
439         fclose(fd);
440 }
441
442 /**
443  * Move QP from error state to running state and initialize indexes.
444  *
445  * @param txq_ctrl
446  *   Pointer to TX queue control structure.
447  *
448  * @return
449  *   0 on success, else -1.
450  */
451 static int
452 tx_recover_qp(struct mlx5_txq_ctrl *txq_ctrl)
453 {
454         struct mlx5_mp_arg_queue_state_modify sm = {
455                         .is_wq = 0,
456                         .queue_id = txq_ctrl->txq.idx,
457         };
458
459         if (mlx5_queue_state_modify(ETH_DEV(txq_ctrl->priv), &sm))
460                 return -1;
461         txq_ctrl->txq.wqe_ci = 0;
462         txq_ctrl->txq.wqe_pi = 0;
463         txq_ctrl->txq.elts_comp = 0;
464         return 0;
465 }
466
467 /* Return 1 if the error CQE is already signed; otherwise, sign it and return 0. */
468 static int
469 check_err_cqe_seen(volatile struct mlx5_err_cqe *err_cqe)
470 {
471         static const uint8_t magic[] = "seen";
472         int ret = 1;
473         unsigned int i;
474
475         for (i = 0; i < sizeof(magic); ++i)
476                 if (!ret || err_cqe->rsvd1[i] != magic[i]) {
477                         ret = 0;
478                         err_cqe->rsvd1[i] = magic[i];
479                 }
480         return ret;
481 }
482
483 /**
484  * Handle error CQE.
485  *
486  * @param txq
487  *   Pointer to TX queue structure.
488  * @param err_cqe
489  *   Pointer to the error CQE.
490  *
491  * @return
492  *   The last Tx buffer element to free.
493  */
494 uint16_t
495 mlx5_tx_error_cqe_handle(struct mlx5_txq_data *txq,
496                          volatile struct mlx5_err_cqe *err_cqe)
497 {
498         if (err_cqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR) {
499                 const uint16_t wqe_m = ((1 << txq->wqe_n) - 1);
500                 struct mlx5_txq_ctrl *txq_ctrl =
501                                 container_of(txq, struct mlx5_txq_ctrl, txq);
502                 uint16_t new_wqe_pi = rte_be_to_cpu_16(err_cqe->wqe_counter);
503                 int seen = check_err_cqe_seen(err_cqe);
504
505                 if (!seen && txq_ctrl->dump_file_n <
506                     txq_ctrl->priv->config.max_dump_files_num) {
507                         MKSTR(err_str, "Unexpected CQE error syndrome "
508                               "0x%02x CQN = %u SQN = %u wqe_counter = %u "
509                               "wq_ci = %u cq_ci = %u", err_cqe->syndrome,
510                               txq->cqe_s, txq->qp_num_8s >> 8,
511                               rte_be_to_cpu_16(err_cqe->wqe_counter),
512                               txq->wqe_ci, txq->cq_ci);
513                         MKSTR(name, "dpdk_mlx5_port_%u_txq_%u_index_%u_%u",
514                               PORT_ID(txq_ctrl->priv), txq->idx,
515                               txq_ctrl->dump_file_n, (uint32_t)rte_rdtsc());
516                         mlx5_dump_debug_information(name, NULL, err_str, 0);
517                         mlx5_dump_debug_information(name, "MLX5 Error CQ:",
518                                                     (const void *)((uintptr_t)
519                                                     txq->cqes),
520                                                     sizeof(*err_cqe) *
521                                                     (1 << txq->cqe_n));
522                         mlx5_dump_debug_information(name, "MLX5 Error SQ:",
523                                                     (const void *)((uintptr_t)
524                                                     txq->wqes),
525                                                     MLX5_WQE_SIZE *
526                                                     (1 << txq->wqe_n));
527                         txq_ctrl->dump_file_n++;
528                 }
529                 if (!seen)
530                         /*
531                          * Count errors in units of WQEs.
532                          * This can later be improved to count error
533                          * packets, for example by parsing the SQ to find
534                          * how many packets should be counted for each WQE.
535                          */
536                         txq->stats.oerrors += ((txq->wqe_ci & wqe_m) -
537                                                 new_wqe_pi) & wqe_m;
538                 if (tx_recover_qp(txq_ctrl) == 0) {
539                         txq->cq_ci++;
540                         /* Release all the remaining buffers. */
541                         return txq->elts_head;
542                 }
543                 /* Recovering failed - try again later on the same WQE. */
544         } else {
545                 txq->cq_ci++;
546         }
547         /* Do not release buffers. */
548         return txq->elts_tail;
549 }
550
551 /**
552  * Translate RX completion flags to packet type.
553  *
554  * @param[in] rxq
555  *   Pointer to RX queue structure.
556  * @param[in] cqe
557  *   Pointer to CQE.
558  *
559  * @note: fix mlx5_dev_supported_ptypes_get() if anything changes here.
560  *
561  * @return
562  *   Packet type for struct rte_mbuf.
563  */
564 static inline uint32_t
565 rxq_cq_to_pkt_type(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe)
566 {
567         uint8_t idx;
568         uint8_t pinfo = cqe->pkt_info;
569         uint16_t ptype = cqe->hdr_type_etc;
570
571         /*
572          * The index to the array should have:
573          * bit[1:0] = l3_hdr_type
574          * bit[4:2] = l4_hdr_type
575          * bit[5] = ip_frag
576          * bit[6] = tunneled
577          * bit[7] = outer_l3_type
578          */
579         idx = ((pinfo & 0x3) << 6) | ((ptype & 0xfc00) >> 10);
580         return mlx5_ptype_table[idx] | rxq->tunnel * !!(idx & (1 << 6));
581 }
582
583 /**
584  * Initialize Rx WQ and indexes.
585  *
586  * @param[in] rxq
587  *   Pointer to RX queue structure.
588  */
589 void
590 mlx5_rxq_initialize(struct mlx5_rxq_data *rxq)
591 {
592         const unsigned int wqe_n = 1 << rxq->elts_n;
593         unsigned int i;
594
595         for (i = 0; (i != wqe_n); ++i) {
596                 volatile struct mlx5_wqe_data_seg *scat;
597                 uintptr_t addr;
598                 uint32_t byte_count;
599
600                 if (mlx5_rxq_mprq_enabled(rxq)) {
601                         struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[i];
602
603                         scat = &((volatile struct mlx5_wqe_mprq *)
604                                 rxq->wqes)[i].dseg;
605                         addr = (uintptr_t)mlx5_mprq_buf_addr(buf);
606                         byte_count = (1 << rxq->strd_sz_n) *
607                                         (1 << rxq->strd_num_n);
608                 } else {
609                         struct rte_mbuf *buf = (*rxq->elts)[i];
610
611                         scat = &((volatile struct mlx5_wqe_data_seg *)
612                                         rxq->wqes)[i];
613                         addr = rte_pktmbuf_mtod(buf, uintptr_t);
614                         byte_count = DATA_LEN(buf);
615                 }
616                 /* scat->addr must be able to store a pointer. */
617                 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
618                 *scat = (struct mlx5_wqe_data_seg){
619                         .addr = rte_cpu_to_be_64(addr),
620                         .byte_count = rte_cpu_to_be_32(byte_count),
621                         .lkey = mlx5_rx_addr2mr(rxq, addr),
622                 };
623         }
624         rxq->consumed_strd = 0;
625         rxq->decompressed = 0;
626         rxq->rq_pi = 0;
627         rxq->zip = (struct rxq_zip){
628                 .ai = 0,
629         };
630         /* Update doorbell counter. */
631         rxq->rq_ci = wqe_n >> rxq->sges_n;
632         rte_cio_wmb();
633         *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
634 }
635
636 /**
637  * Modify a Verbs queue state.
638  * This must be called from the primary process.
639  *
640  * @param dev
641  *   Pointer to Ethernet device.
642  * @param sm
643  *   State modify request parameters.
644  *
645  * @return
646  *   0 in case of success, otherwise a non-zero value and rte_errno is set.
647  */
648 int
649 mlx5_queue_state_modify_primary(struct rte_eth_dev *dev,
650                         const struct mlx5_mp_arg_queue_state_modify *sm)
651 {
652         int ret;
653         struct mlx5_priv *priv = dev->data->dev_private;
654
655         if (sm->is_wq) {
656                 struct ibv_wq_attr mod = {
657                         .attr_mask = IBV_WQ_ATTR_STATE,
658                         .wq_state = sm->state,
659                 };
660                 struct mlx5_rxq_data *rxq = (*priv->rxqs)[sm->queue_id];
661                 struct mlx5_rxq_ctrl *rxq_ctrl =
662                         container_of(rxq, struct mlx5_rxq_ctrl, rxq);
663
664                 ret = mlx5_glue->modify_wq(rxq_ctrl->ibv->wq, &mod);
665                 if (ret) {
666                         DRV_LOG(ERR, "Cannot change Rx WQ state to %u  - %s\n",
667                                         sm->state, strerror(errno));
668                         rte_errno = errno;
669                         return ret;
670                 }
671         } else {
672                 struct mlx5_txq_data *txq = (*priv->txqs)[sm->queue_id];
673                 struct mlx5_txq_ctrl *txq_ctrl =
674                         container_of(txq, struct mlx5_txq_ctrl, txq);
675                 struct ibv_qp_attr mod = {
676                         .qp_state = IBV_QPS_RESET,
677                         .port_num = (uint8_t)priv->ibv_port,
678                 };
679                 struct ibv_qp *qp = txq_ctrl->ibv->qp;
680
681                 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
682                 if (ret) {
683                         DRV_LOG(ERR, "Cannot change the Tx QP state to RESET "
684                                 "%s\n", strerror(errno));
685                         rte_errno = errno;
686                         return ret;
687                 }
688                 mod.qp_state = IBV_QPS_INIT;
689                 ret = mlx5_glue->modify_qp(qp, &mod,
690                                            (IBV_QP_STATE | IBV_QP_PORT));
691                 if (ret) {
692                         DRV_LOG(ERR, "Cannot change Tx QP state to INIT %s\n",
693                                 strerror(errno));
694                         rte_errno = errno;
695                         return ret;
696                 }
697                 mod.qp_state = IBV_QPS_RTR;
698                 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
699                 if (ret) {
700                         DRV_LOG(ERR, "Cannot change Tx QP state to RTR %s\n",
701                                 strerror(errno));
702                         rte_errno = errno;
703                         return ret;
704                 }
705                 mod.qp_state = IBV_QPS_RTS;
706                 ret = mlx5_glue->modify_qp(qp, &mod, IBV_QP_STATE);
707                 if (ret) {
708                         DRV_LOG(ERR, "Cannot change Tx QP state to RTS %s\n",
709                                 strerror(errno));
710                         rte_errno = errno;
711                         return ret;
712                 }
713         }
714         return 0;
715 }
716
717 /**
718  * Modify a Verbs queue state.
719  *
720  * @param dev
721  *   Pointer to Ethernet device.
722  * @param sm
723  *   State modify request parameters.
724  *
725  * @return
726  *   0 in case of success, otherwise a non-zero value.
727  */
728 static int
729 mlx5_queue_state_modify(struct rte_eth_dev *dev,
730                         struct mlx5_mp_arg_queue_state_modify *sm)
731 {
732         int ret = 0;
733
734         switch (rte_eal_process_type()) {
735         case RTE_PROC_PRIMARY:
736                 ret = mlx5_queue_state_modify_primary(dev, sm);
737                 break;
738         case RTE_PROC_SECONDARY:
739                 ret = mlx5_mp_req_queue_state_modify(dev, sm);
740                 break;
741         default:
742                 break;
743         }
744         return ret;
745 }
746
747 /**
748  * Handle a Rx error.
749  * The function moves the RQ state to reset when the first error CQE is
750  * seen, then lets the caller's loop drain the CQ. When the CQ is empty, it
751  * moves the RQ state back to ready and reinitializes the RQ.
752  * Identifying the next CQE and counting errors remain the caller's responsibility.
753  *
754  * @param[in] rxq
755  *   Pointer to RX queue structure.
756  * @param[in] mbuf_prepare
757  *   Whether to prepare mbufs for the RQ.
758  *
759  * @return
760  *   -1 in case of recovery error, otherwise the CQE status.
761  */
762 int
763 mlx5_rx_err_handle(struct mlx5_rxq_data *rxq, uint8_t mbuf_prepare)
764 {
765         const uint16_t cqe_n = 1 << rxq->cqe_n;
766         const uint16_t cqe_mask = cqe_n - 1;
767         const unsigned int wqe_n = 1 << rxq->elts_n;
768         struct mlx5_rxq_ctrl *rxq_ctrl =
769                         container_of(rxq, struct mlx5_rxq_ctrl, rxq);
770         union {
771                 volatile struct mlx5_cqe *cqe;
772                 volatile struct mlx5_err_cqe *err_cqe;
773         } u = {
774                 .cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_mask],
775         };
776         struct mlx5_mp_arg_queue_state_modify sm;
777         int ret;
778
779         switch (rxq->err_state) {
780         case MLX5_RXQ_ERR_STATE_NO_ERROR:
781                 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_RESET;
782                 /* Fall-through */
783         case MLX5_RXQ_ERR_STATE_NEED_RESET:
784                 sm.is_wq = 1;
785                 sm.queue_id = rxq->idx;
786                 sm.state = IBV_WQS_RESET;
787                 if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv), &sm))
788                         return -1;
789                 if (rxq_ctrl->dump_file_n <
790                     rxq_ctrl->priv->config.max_dump_files_num) {
791                         MKSTR(err_str, "Unexpected CQE error syndrome "
792                               "0x%02x CQN = %u RQN = %u wqe_counter = %u"
793                               " rq_ci = %u cq_ci = %u", u.err_cqe->syndrome,
794                               rxq->cqn, rxq_ctrl->wqn,
795                               rte_be_to_cpu_16(u.err_cqe->wqe_counter),
796                               rxq->rq_ci << rxq->sges_n, rxq->cq_ci);
797                         MKSTR(name, "dpdk_mlx5_port_%u_rxq_%u_%u",
798                               rxq->port_id, rxq->idx, (uint32_t)rte_rdtsc());
799                         mlx5_dump_debug_information(name, NULL, err_str, 0);
800                         mlx5_dump_debug_information(name, "MLX5 Error CQ:",
801                                                     (const void *)((uintptr_t)
802                                                                     rxq->cqes),
803                                                     sizeof(*u.cqe) * cqe_n);
804                         mlx5_dump_debug_information(name, "MLX5 Error RQ:",
805                                                     (const void *)((uintptr_t)
806                                                                     rxq->wqes),
807                                                     16 * wqe_n);
808                         rxq_ctrl->dump_file_n++;
809                 }
810                 rxq->err_state = MLX5_RXQ_ERR_STATE_NEED_READY;
811                 /* Fall-through */
812         case MLX5_RXQ_ERR_STATE_NEED_READY:
813                 ret = check_cqe(u.cqe, cqe_n, rxq->cq_ci);
814                 if (ret == MLX5_CQE_STATUS_HW_OWN) {
815                         rte_cio_wmb();
816                         *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
817                         rte_cio_wmb();
818                         /*
819                          * The RQ consumer index must be zeroed while moving
820                          * from RESET state to RDY state.
821                          */
822                         *rxq->rq_db = rte_cpu_to_be_32(0);
823                         rte_cio_wmb();
824                         sm.is_wq = 1;
825                         sm.queue_id = rxq->idx;
826                         sm.state = IBV_WQS_RDY;
827                         if (mlx5_queue_state_modify(ETH_DEV(rxq_ctrl->priv),
828                                                     &sm))
829                                 return -1;
830                         if (mbuf_prepare) {
831                                 const uint16_t q_mask = wqe_n - 1;
832                                 uint16_t elt_idx;
833                                 struct rte_mbuf **elt;
834                                 int i;
835                                 unsigned int n = wqe_n - (rxq->rq_ci -
836                                                           rxq->rq_pi);
837
838                                 for (i = 0; i < (int)n; ++i) {
839                                         elt_idx = (rxq->rq_ci + i) & q_mask;
840                                         elt = &(*rxq->elts)[elt_idx];
841                                         *elt = rte_mbuf_raw_alloc(rxq->mp);
842                                         if (!*elt) {
843                                                 for (i--; i >= 0; --i) {
844                                                         elt_idx = (rxq->rq_ci +
845                                                                    i) & q_mask;
846                                                         elt = &(*rxq->elts)
847                                                                 [elt_idx];
848                                                         rte_pktmbuf_free_seg
849                                                                 (*elt);
850                                                 }
851                                                 return -1;
852                                         }
853                                 }
854                         }
855                         mlx5_rxq_initialize(rxq);
856                         rxq->err_state = MLX5_RXQ_ERR_STATE_NO_ERROR;
857                 }
858                 return ret;
859         default:
860                 return -1;
861         }
862 }
863
864 /**
865  * Get the size of the next packet for a given CQE. For compressed CQEs,
866  * the consumer index is only updated once all packets of the current
867  * compression session have been processed.
868  *
869  * @param rxq
870  *   Pointer to RX queue.
871  * @param cqe
872  *   CQE to process.
873  * @param[out] mcqe
874  *   Store pointer to mini-CQE if compressed. Otherwise, the pointer is not
875  *   written.
876  *
877  * @return
878  *   0 in case of empty CQE, otherwise the packet size in bytes.
879  */
880 static inline int
881 mlx5_rx_poll_len(struct mlx5_rxq_data *rxq, volatile struct mlx5_cqe *cqe,
882                  uint16_t cqe_cnt, volatile struct mlx5_mini_cqe8 **mcqe)
883 {
884         struct rxq_zip *zip = &rxq->zip;
885         uint16_t cqe_n = cqe_cnt + 1;
886         int len;
887         uint16_t idx, end;
888
889         do {
890                 len = 0;
891                 /* Process compressed data in the CQE and mini arrays. */
892                 if (zip->ai) {
893                         volatile struct mlx5_mini_cqe8 (*mc)[8] =
894                                 (volatile struct mlx5_mini_cqe8 (*)[8])
895                                 (uintptr_t)(&(*rxq->cqes)[zip->ca &
896                                                           cqe_cnt].pkt_info);
897
898                         len = rte_be_to_cpu_32((*mc)[zip->ai & 7].byte_cnt);
899                         *mcqe = &(*mc)[zip->ai & 7];
900                         if ((++zip->ai & 7) == 0) {
901                                 /* Invalidate consumed CQEs */
902                                 idx = zip->ca;
903                                 end = zip->na;
904                                 while (idx != end) {
905                                         (*rxq->cqes)[idx & cqe_cnt].op_own =
906                                                 MLX5_CQE_INVALIDATE;
907                                         ++idx;
908                                 }
909                                 /*
910                                  * Increment consumer index to skip the number
911                                  * of CQEs consumed. Hardware leaves holes in
912                                  * the CQ ring for software use.
913                                  */
914                                 zip->ca = zip->na;
915                                 zip->na += 8;
916                         }
917                         if (unlikely(rxq->zip.ai == rxq->zip.cqe_cnt)) {
918                                 /* Invalidate the rest */
919                                 idx = zip->ca;
920                                 end = zip->cq_ci;
921
922                                 while (idx != end) {
923                                         (*rxq->cqes)[idx & cqe_cnt].op_own =
924                                                 MLX5_CQE_INVALIDATE;
925                                         ++idx;
926                                 }
927                                 rxq->cq_ci = zip->cq_ci;
928                                 zip->ai = 0;
929                         }
930                 /*
931                  * No compressed data, get next CQE and verify if it is
932                  * compressed.
933                  */
934                 } else {
935                         int ret;
936                         int8_t op_own;
937
938                         ret = check_cqe(cqe, cqe_n, rxq->cq_ci);
939                         if (unlikely(ret != MLX5_CQE_STATUS_SW_OWN)) {
940                                 if (unlikely(ret == MLX5_CQE_STATUS_ERR ||
941                                              rxq->err_state)) {
942                                         ret = mlx5_rx_err_handle(rxq, 0);
943                                         if (ret == MLX5_CQE_STATUS_HW_OWN ||
944                                             ret == -1)
945                                                 return 0;
946                                 } else {
947                                         return 0;
948                                 }
949                         }
950                         ++rxq->cq_ci;
951                         op_own = cqe->op_own;
952                         if (MLX5_CQE_FORMAT(op_own) == MLX5_COMPRESSED) {
953                                 volatile struct mlx5_mini_cqe8 (*mc)[8] =
954                                         (volatile struct mlx5_mini_cqe8 (*)[8])
955                                         (uintptr_t)(&(*rxq->cqes)
956                                                 [rxq->cq_ci &
957                                                  cqe_cnt].pkt_info);
958
959                                 /* Fix endianness. */
960                                 zip->cqe_cnt = rte_be_to_cpu_32(cqe->byte_cnt);
961                                 /*
962                                  * The current mini array position is
963                                  * the one returned by check_cqe().
964                                  *
965                                  * If the completion comprises several
966                                  * mini arrays, as a special case the
967                                  * second one is located 7 CQEs after the
968                                  * initial CQE instead of 8 for subsequent ones.
969                                  */
970                                 zip->ca = rxq->cq_ci;
971                                 zip->na = zip->ca + 7;
972                                 /* Compute the next non compressed CQE. */
973                                 --rxq->cq_ci;
974                                 zip->cq_ci = rxq->cq_ci + zip->cqe_cnt;
975                                 /* Get packet size to return. */
976                                 len = rte_be_to_cpu_32((*mc)[0].byte_cnt);
977                                 *mcqe = &(*mc)[0];
978                                 zip->ai = 1;
979                                 /* Prefetch all to be invalidated */
980                                 idx = zip->ca;
981                                 end = zip->cq_ci;
982                                 while (idx != end) {
983                                         rte_prefetch0(&(*rxq->cqes)[(idx) &
984                                                                     cqe_cnt]);
985                                         ++idx;
986                                 }
987                         } else {
988                                 len = rte_be_to_cpu_32(cqe->byte_cnt);
989                         }
990                 }
991                 if (unlikely(rxq->err_state)) {
992                         cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
993                         ++rxq->stats.idropped;
994                 } else {
995                         return len;
996                 }
997         } while (1);
998 }
999
1000 /**
1001  * Translate RX completion flags to offload flags.
1002  *
1003  * @param[in] cqe
1004  *   Pointer to CQE.
1005  *
1006  * @return
1007  *   Offload flags (ol_flags) for struct rte_mbuf.
1008  */
1009 static inline uint32_t
1010 rxq_cq_to_ol_flags(volatile struct mlx5_cqe *cqe)
1011 {
1012         uint32_t ol_flags = 0;
1013         uint16_t flags = rte_be_to_cpu_16(cqe->hdr_type_etc);
1014
1015         ol_flags =
1016                 TRANSPOSE(flags,
1017                           MLX5_CQE_RX_L3_HDR_VALID,
1018                           PKT_RX_IP_CKSUM_GOOD) |
1019                 TRANSPOSE(flags,
1020                           MLX5_CQE_RX_L4_HDR_VALID,
1021                           PKT_RX_L4_CKSUM_GOOD);
1022         return ol_flags;
1023 }
1024
1025 /**
1026  * Fill in mbuf fields from RX completion flags.
1027  * Note that pkt->ol_flags should be initialized outside of this function.
1028  *
1029  * @param rxq
1030  *   Pointer to RX queue.
1031  * @param pkt
1032  *   mbuf to fill.
1033  * @param cqe
1034  *   CQE to process.
1035  * @param rss_hash_res
1036  *   Packet RSS Hash result.
1037  */
1038 static inline void
1039 rxq_cq_to_mbuf(struct mlx5_rxq_data *rxq, struct rte_mbuf *pkt,
1040                volatile struct mlx5_cqe *cqe, uint32_t rss_hash_res)
1041 {
1042         /* Update packet information. */
1043         pkt->packet_type = rxq_cq_to_pkt_type(rxq, cqe);
1044         if (rss_hash_res && rxq->rss_hash) {
1045                 pkt->hash.rss = rss_hash_res;
1046                 pkt->ol_flags |= PKT_RX_RSS_HASH;
1047         }
1048         if (rxq->mark && MLX5_FLOW_MARK_IS_VALID(cqe->sop_drop_qpn)) {
1049                 pkt->ol_flags |= PKT_RX_FDIR;
1050                 if (cqe->sop_drop_qpn !=
1051                     rte_cpu_to_be_32(MLX5_FLOW_MARK_DEFAULT)) {
1052                         uint32_t mark = cqe->sop_drop_qpn;
1053
1054                         pkt->ol_flags |= PKT_RX_FDIR_ID;
1055                         pkt->hash.fdir.hi = mlx5_flow_mark_get(mark);
1056                 }
1057         }
1058         if (rxq->csum)
1059                 pkt->ol_flags |= rxq_cq_to_ol_flags(cqe);
1060         if (rxq->vlan_strip &&
1061             (cqe->hdr_type_etc & rte_cpu_to_be_16(MLX5_CQE_VLAN_STRIPPED))) {
1062                 pkt->ol_flags |= PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED;
1063                 pkt->vlan_tci = rte_be_to_cpu_16(cqe->vlan_info);
1064         }
1065         if (rxq->hw_timestamp) {
1066                 pkt->timestamp = rte_be_to_cpu_64(cqe->timestamp);
1067                 pkt->ol_flags |= PKT_RX_TIMESTAMP;
1068         }
1069 }
1070
1071 /**
1072  * DPDK callback for RX.
1073  *
1074  * @param dpdk_rxq
1075  *   Generic pointer to RX queue structure.
1076  * @param[out] pkts
1077  *   Array to store received packets.
1078  * @param pkts_n
1079  *   Maximum number of packets in array.
1080  *
1081  * @return
1082  *   Number of packets successfully received (<= pkts_n).
1083  */
1084 uint16_t
1085 mlx5_rx_burst(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1086 {
1087         struct mlx5_rxq_data *rxq = dpdk_rxq;
1088         const unsigned int wqe_cnt = (1 << rxq->elts_n) - 1;
1089         const unsigned int cqe_cnt = (1 << rxq->cqe_n) - 1;
1090         const unsigned int sges_n = rxq->sges_n;
1091         struct rte_mbuf *pkt = NULL;
1092         struct rte_mbuf *seg = NULL;
1093         volatile struct mlx5_cqe *cqe =
1094                 &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1095         unsigned int i = 0;
1096         unsigned int rq_ci = rxq->rq_ci << sges_n;
1097         int len = 0; /* keep its value across iterations. */
1098
1099         while (pkts_n) {
1100                 unsigned int idx = rq_ci & wqe_cnt;
1101                 volatile struct mlx5_wqe_data_seg *wqe =
1102                         &((volatile struct mlx5_wqe_data_seg *)rxq->wqes)[idx];
1103                 struct rte_mbuf *rep = (*rxq->elts)[idx];
1104                 volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1105                 uint32_t rss_hash_res;
1106
1107                 if (pkt)
1108                         NEXT(seg) = rep;
1109                 seg = rep;
1110                 rte_prefetch0(seg);
1111                 rte_prefetch0(cqe);
1112                 rte_prefetch0(wqe);
1113                 rep = rte_mbuf_raw_alloc(rxq->mp);
1114                 if (unlikely(rep == NULL)) {
1115                         ++rxq->stats.rx_nombuf;
1116                         if (!pkt) {
1117                                 /*
1118                                  * no buffers before we even started,
1119                                  * bail out silently.
1120                                  */
1121                                 break;
1122                         }
1123                         while (pkt != seg) {
1124                                 assert(pkt != (*rxq->elts)[idx]);
1125                                 rep = NEXT(pkt);
1126                                 NEXT(pkt) = NULL;
1127                                 NB_SEGS(pkt) = 1;
1128                                 rte_mbuf_raw_free(pkt);
1129                                 pkt = rep;
1130                         }
1131                         break;
1132                 }
1133                 if (!pkt) {
1134                         cqe = &(*rxq->cqes)[rxq->cq_ci & cqe_cnt];
1135                         len = mlx5_rx_poll_len(rxq, cqe, cqe_cnt, &mcqe);
1136                         if (!len) {
1137                                 rte_mbuf_raw_free(rep);
1138                                 break;
1139                         }
1140                         pkt = seg;
1141                         assert(len >= (rxq->crc_present << 2));
1142                         pkt->ol_flags = 0;
1143                         /* If compressed, take hash result from mini-CQE. */
1144                         rss_hash_res = rte_be_to_cpu_32(mcqe == NULL ?
1145                                                         cqe->rx_hash_res :
1146                                                         mcqe->rx_hash_result);
1147                         rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
1148                         if (rxq->crc_present)
1149                                 len -= RTE_ETHER_CRC_LEN;
1150                         PKT_LEN(pkt) = len;
1151                 }
1152                 DATA_LEN(rep) = DATA_LEN(seg);
1153                 PKT_LEN(rep) = PKT_LEN(seg);
1154                 SET_DATA_OFF(rep, DATA_OFF(seg));
1155                 PORT(rep) = PORT(seg);
1156                 (*rxq->elts)[idx] = rep;
1157                 /*
1158                  * Fill NIC descriptor with the new buffer.  The lkey and size
1159                  * of the buffers are already known, only the buffer address
1160                  * changes.
1161                  */
1162                 wqe->addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(rep, uintptr_t));
1163                 /* If there's only one MR, no need to replace LKey in WQE. */
1164                 if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
1165                         wqe->lkey = mlx5_rx_mb2mr(rxq, rep);
1166                 if (len > DATA_LEN(seg)) {
1167                         len -= DATA_LEN(seg);
1168                         ++NB_SEGS(pkt);
1169                         ++rq_ci;
1170                         continue;
1171                 }
1172                 DATA_LEN(seg) = len;
1173 #ifdef MLX5_PMD_SOFT_COUNTERS
1174                 /* Increment bytes counter. */
1175                 rxq->stats.ibytes += PKT_LEN(pkt);
1176 #endif
1177                 /* Return packet. */
1178                 *(pkts++) = pkt;
1179                 pkt = NULL;
1180                 --pkts_n;
1181                 ++i;
1182                 /* Align consumer index to the next stride. */
1183                 rq_ci >>= sges_n;
1184                 ++rq_ci;
1185                 rq_ci <<= sges_n;
1186         }
1187         if (unlikely((i == 0) && ((rq_ci >> sges_n) == rxq->rq_ci)))
1188                 return 0;
1189         /* Update the consumer index. */
1190         rxq->rq_ci = rq_ci >> sges_n;
1191         rte_cio_wmb();
1192         *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1193         rte_cio_wmb();
1194         *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1195 #ifdef MLX5_PMD_SOFT_COUNTERS
1196         /* Increment packets counter. */
1197         rxq->stats.ipackets += i;
1198 #endif
1199         return i;
1200 }
1201
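/*
 * Usage sketch (hypothetical): mlx5_rx_burst() is installed as the
 * rx_pkt_burst callback and is invoked through the generic ethdev API, e.g.
 *
 *   struct rte_mbuf *pkts[MAX_PKT_BURST];
 *   uint16_t nb_rx = rte_eth_rx_burst(port_id, queue_id, pkts,
 *                                     MAX_PKT_BURST);
 *   for (uint16_t k = 0; k < nb_rx; ++k)
 *       handle_packet(pkts[k]);
 *
 * where MAX_PKT_BURST and handle_packet() are application-defined.
 */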
1202 void
1203 mlx5_mprq_buf_free_cb(void *addr __rte_unused, void *opaque)
1204 {
1205         struct mlx5_mprq_buf *buf = opaque;
1206
1207         if (rte_atomic16_read(&buf->refcnt) == 1) {
1208                 rte_mempool_put(buf->mp, buf);
1209         } else if (rte_atomic16_add_return(&buf->refcnt, -1) == 0) {
1210                 rte_atomic16_set(&buf->refcnt, 1);
1211                 rte_mempool_put(buf->mp, buf);
1212         }
1213 }
1214
1215 void
1216 mlx5_mprq_buf_free(struct mlx5_mprq_buf *buf)
1217 {
1218         mlx5_mprq_buf_free_cb(NULL, buf);
1219 }
1220
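/*
 * Usage note (hypothetical sketch): mlx5_mprq_buf_free_cb() matches the
 * rte_mbuf_extbuf_free_callback_t signature, so part of an MPRQ buffer can
 * be attached to an mbuf as an external buffer and released through the
 * regular mbuf free path, roughly:
 *
 *   shinfo = rte_pktmbuf_ext_shinfo_init_helper(buf_addr, &buf_len,
 *                                               mlx5_mprq_buf_free_cb,
 *                                               buf);
 *   rte_pktmbuf_attach_extbuf(pkt, buf_addr, buf_iova, buf_len, shinfo);
 */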
1221 static inline void
1222 mprq_buf_replace(struct mlx5_rxq_data *rxq, uint16_t rq_idx)
1223 {
1224         struct mlx5_mprq_buf *rep = rxq->mprq_repl;
1225         volatile struct mlx5_wqe_data_seg *wqe =
1226                 &((volatile struct mlx5_wqe_mprq *)rxq->wqes)[rq_idx].dseg;
1227         void *addr;
1228
1229         assert(rep != NULL);
1230         /* Replace MPRQ buf. */
1231         (*rxq->mprq_bufs)[rq_idx] = rep;
1232         /* Replace WQE. */
1233         addr = mlx5_mprq_buf_addr(rep);
1234         wqe->addr = rte_cpu_to_be_64((uintptr_t)addr);
1235         /* If there's only one MR, no need to replace LKey in WQE. */
1236         if (unlikely(mlx5_mr_btree_len(&rxq->mr_ctrl.cache_bh) > 1))
1237                 wqe->lkey = mlx5_rx_addr2mr(rxq, (uintptr_t)addr);
1238         /* Stash a buffer for the next replacement. */
1239         if (likely(!rte_mempool_get(rxq->mprq_mp, (void **)&rep)))
1240                 rxq->mprq_repl = rep;
1241         else
1242                 rxq->mprq_repl = NULL;
1243 }
1244
1245 /**
1246  * DPDK callback for RX with Multi-Packet RQ support.
1247  *
1248  * @param dpdk_rxq
1249  *   Generic pointer to RX queue structure.
1250  * @param[out] pkts
1251  *   Array to store received packets.
1252  * @param pkts_n
1253  *   Maximum number of packets in array.
1254  *
1255  * @return
1256  *   Number of packets successfully received (<= pkts_n).
1257  */
1258 uint16_t
1259 mlx5_rx_burst_mprq(void *dpdk_rxq, struct rte_mbuf **pkts, uint16_t pkts_n)
1260 {
1261         struct mlx5_rxq_data *rxq = dpdk_rxq;
1262         const unsigned int strd_n = 1 << rxq->strd_num_n;
1263         const unsigned int strd_sz = 1 << rxq->strd_sz_n;
1264         const unsigned int strd_shift =
1265                 MLX5_MPRQ_STRIDE_SHIFT_BYTE * rxq->strd_shift_en;
1266         const unsigned int cq_mask = (1 << rxq->cqe_n) - 1;
1267         const unsigned int wq_mask = (1 << rxq->elts_n) - 1;
1268         volatile struct mlx5_cqe *cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1269         unsigned int i = 0;
1270         uint32_t rq_ci = rxq->rq_ci;
1271         uint16_t consumed_strd = rxq->consumed_strd;
1272         struct mlx5_mprq_buf *buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1273
1274         while (i < pkts_n) {
1275                 struct rte_mbuf *pkt;
1276                 void *addr;
1277                 int ret;
1278                 unsigned int len;
1279                 uint16_t strd_cnt;
1280                 uint16_t strd_idx;
1281                 uint32_t offset;
1282                 uint32_t byte_cnt;
1283                 volatile struct mlx5_mini_cqe8 *mcqe = NULL;
1284                 uint32_t rss_hash_res = 0;
1285
1286                 if (consumed_strd == strd_n) {
1287                         /* Replace WQE only if the buffer is still in use. */
1288                         if (rte_atomic16_read(&buf->refcnt) > 1) {
1289                                 mprq_buf_replace(rxq, rq_ci & wq_mask);
1290                                 /* Release the old buffer. */
1291                                 mlx5_mprq_buf_free(buf);
1292                         } else if (unlikely(rxq->mprq_repl == NULL)) {
1293                                 struct mlx5_mprq_buf *rep;
1294
1295                                 /*
1296                                  * The MPRQ mempool is currently out of
1297                                  * buffers, so packets are copied with
1298                                  * memcpy regardless of their size. Retry
1299                                  * the allocation to get back to normal.
1300                                  */
1301                                 if (!rte_mempool_get(rxq->mprq_mp,
1302                                                      (void **)&rep))
1303                                         rxq->mprq_repl = rep;
1304                         }
1305                         /* Advance to the next WQE. */
1306                         consumed_strd = 0;
1307                         ++rq_ci;
1308                         buf = (*rxq->mprq_bufs)[rq_ci & wq_mask];
1309                 }
1310                 cqe = &(*rxq->cqes)[rxq->cq_ci & cq_mask];
1311                 ret = mlx5_rx_poll_len(rxq, cqe, cq_mask, &mcqe);
1312                 if (!ret)
1313                         break;
1314                 byte_cnt = ret;
1315                 strd_cnt = (byte_cnt & MLX5_MPRQ_STRIDE_NUM_MASK) >>
1316                            MLX5_MPRQ_STRIDE_NUM_SHIFT;
1317                 assert(strd_cnt);
1318                 consumed_strd += strd_cnt;
1319                 if (byte_cnt & MLX5_MPRQ_FILLER_MASK)
1320                         continue;
1321                 if (mcqe == NULL) {
1322                         rss_hash_res = rte_be_to_cpu_32(cqe->rx_hash_res);
1323                         strd_idx = rte_be_to_cpu_16(cqe->wqe_counter);
1324                 } else {
1325                         /* mini-CQE for MPRQ doesn't have hash result. */
1326                         strd_idx = rte_be_to_cpu_16(mcqe->stride_idx);
1327                 }
1328                 assert(strd_idx < strd_n);
1329                 assert(!((rte_be_to_cpu_16(cqe->wqe_id) ^ rq_ci) & wq_mask));
1330                 /*
1331                  * The queue is currently configured to receive one packet per
1332                  * stride. However, if the MTU is adjusted through the kernel
1333                  * interface, the device may consume multiple strides without
1334                  * raising an error. In that case the packet must be dropped,
1335                  * as it exceeds max_rx_pkt_len.
1336                  */
1337                 if (unlikely(strd_cnt > 1)) {
1338                         ++rxq->stats.idropped;
1339                         continue;
1340                 }
1341                 pkt = rte_pktmbuf_alloc(rxq->mp);
1342                 if (unlikely(pkt == NULL)) {
1343                         ++rxq->stats.rx_nombuf;
1344                         break;
1345                 }
1346                 len = (byte_cnt & MLX5_MPRQ_LEN_MASK) >> MLX5_MPRQ_LEN_SHIFT;
1347                 assert((int)len >= (rxq->crc_present << 2));
1348                 if (rxq->crc_present)
1349                         len -= RTE_ETHER_CRC_LEN;
1350                 offset = strd_idx * strd_sz + strd_shift;
1351                 addr = RTE_PTR_ADD(mlx5_mprq_buf_addr(buf), offset);
1352                 /* Initialize the offload flag. */
1353                 pkt->ol_flags = 0;
1354                 /*
1355                  * Memcpy the packet into the target mbuf if either:
1356                  * - the packet is no larger than mprq_max_memcpy_len, or
1357                  * - the Multi-Packet RQ mempool is out of buffers.
1358                  */
1359                 if (len <= rxq->mprq_max_memcpy_len || rxq->mprq_repl == NULL) {
1360                         /*
1361                          * When memcpy'ing a packet because the mempool is out of
1362                          * buffers, the packet must fit in the target mbuf's tailroom.
1363                          */
1364                         if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
1365                                 rte_pktmbuf_free_seg(pkt);
1366                                 ++rxq->stats.idropped;
1367                                 continue;
1368                         }
1369                         rte_memcpy(rte_pktmbuf_mtod(pkt, void *), addr, len);
1370                 } else {
1371                         rte_iova_t buf_iova;
1372                         struct rte_mbuf_ext_shared_info *shinfo;
1373                         uint16_t buf_len = strd_cnt * strd_sz;
1374
1375                         /* Increment the refcnt of the whole chunk. */
1376                         rte_atomic16_add_return(&buf->refcnt, 1);
1377                         assert((uint16_t)rte_atomic16_read(&buf->refcnt) <=
1378                                strd_n + 1);
1379                         addr = RTE_PTR_SUB(addr, RTE_PKTMBUF_HEADROOM);
1380                         /*
1381                          * The MLX5 device does not use the iova itself, but it is
1382                          * required when the Rx packet is later transmitted through
1383                          * a different PMD.
1384                          */
1385                         buf_iova = rte_mempool_virt2iova(buf) +
1386                                    RTE_PTR_DIFF(addr, buf);
1387                         shinfo = rte_pktmbuf_ext_shinfo_init_helper(addr,
1388                                         &buf_len, mlx5_mprq_buf_free_cb, buf);
1389                         /*
1390                          * EXT_ATTACHED_MBUF will be set in pkt->ol_flags when the
1391                          * stride is attached to the mbuf, and more offload flags
1392                          * will be added below by rxq_cq_to_mbuf().
1393                          * Other fields will be overwritten.
1394                          */
1395                         rte_pktmbuf_attach_extbuf(pkt, addr, buf_iova, buf_len,
1396                                                   shinfo);
1397                         rte_pktmbuf_reset_headroom(pkt);
1398                         assert(pkt->ol_flags == EXT_ATTACHED_MBUF);
1399                         /*
1400                          * Prevent potential overflow due to MTU change through
1401                          * kernel interface.
1402                          */
1403                         if (unlikely(rte_pktmbuf_tailroom(pkt) < len)) {
1404                                 rte_pktmbuf_free_seg(pkt);
1405                                 ++rxq->stats.idropped;
1406                                 continue;
1407                         }
1408                 }
1409                 rxq_cq_to_mbuf(rxq, pkt, cqe, rss_hash_res);
1410                 PKT_LEN(pkt) = len;
1411                 DATA_LEN(pkt) = len;
1412                 PORT(pkt) = rxq->port_id;
1413 #ifdef MLX5_PMD_SOFT_COUNTERS
1414                 /* Increment bytes counter. */
1415                 rxq->stats.ibytes += PKT_LEN(pkt);
1416 #endif
1417                 /* Return packet. */
1418                 *(pkts++) = pkt;
1419                 ++i;
1420         }
1421         /* Update the consumer indexes. */
1422         rxq->consumed_strd = consumed_strd;
1423         rte_cio_wmb();
1424         *rxq->cq_db = rte_cpu_to_be_32(rxq->cq_ci);
1425         if (rq_ci != rxq->rq_ci) {
1426                 rxq->rq_ci = rq_ci;
1427                 rte_cio_wmb();
1428                 *rxq->rq_db = rte_cpu_to_be_32(rxq->rq_ci);
1429         }
1430 #ifdef MLX5_PMD_SOFT_COUNTERS
1431         /* Increment packets counter. */
1432         rxq->stats.ipackets += i;
1433 #endif
1434         return i;
1435 }
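
/*
 * Minimal usage sketch (illustrative only): an application never calls
 * mlx5_rx_burst_mprq() directly; it is installed as dev->rx_pkt_burst and
 * reached through the ethdev API. port_id and queue_id below are
 * placeholders, and a real application would process the packets instead
 * of freeing them immediately.
 *
 *   struct rte_mbuf *pkts[32];
 *   uint16_t nb = rte_eth_rx_burst(port_id, queue_id, pkts, 32);
 *   uint16_t j;
 *
 *   for (j = 0; j < nb; ++j)
 *           rte_pktmbuf_free(pkts[j]);
 */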
1436
1437 /**
1438  * Dummy DPDK callback for TX.
1439  *
1440  * This function is used to temporarily replace the real callback during
1441  * unsafe control operations on the queue, or in case of error.
1442  *
1443  * @param dpdk_txq
1444  *   Generic pointer to TX queue structure.
1445  * @param[in] pkts
1446  *   Packets to transmit.
1447  * @param pkts_n
1448  *   Number of packets in array.
1449  *
1450  * @return
1451  *   Number of packets successfully transmitted (<= pkts_n).
1452  */
1453 uint16_t
1454 removed_tx_burst(void *dpdk_txq __rte_unused,
1455                  struct rte_mbuf **pkts __rte_unused,
1456                  uint16_t pkts_n __rte_unused)
1457 {
1458         rte_mb();
1459         return 0;
1460 }
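
/*
 * Illustrative pattern only (a sketch, not the driver's exact control
 * path): before reconfiguring queues, the datapath callbacks can be
 * swapped for the dummy ones so that concurrent lcores stop touching the
 * hardware rings, e.g.:
 *
 *   dev->tx_pkt_burst = removed_tx_burst;
 *   dev->rx_pkt_burst = removed_rx_burst;
 *   rte_wmb();
 *
 * The real callbacks are restored once the queues are set up again.
 */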
1461
1462 /**
1463  * Dummy DPDK callback for RX.
1464  *
1465  * This function is used to temporarily replace the real callback during
1466  * unsafe control operations on the queue, or in case of error.
1467  *
1468  * @param dpdk_rxq
1469  *   Generic pointer to RX queue structure.
1470  * @param[out] pkts
1471  *   Array to store received packets.
1472  * @param pkts_n
1473  *   Maximum number of packets in array.
1474  *
1475  * @return
1476  *   Number of packets successfully received (<= pkts_n).
1477  */
1478 uint16_t
1479 removed_rx_burst(void *dpdk_rxq __rte_unused,
1480                  struct rte_mbuf **pkts __rte_unused,
1481                  uint16_t pkts_n __rte_unused)
1482 {
1483         rte_mb();
1484         return 0;
1485 }
1486
1487 /*
1488  * Vectorized Rx/Tx routines are not compiled in when the required vector
1489  * instructions are not supported on the target architecture. The following
1490  * null stubs provide the symbols for linkage when the vectorized
1491  * implementations (e.g. mlx5_rxtx_vec_sse.c for x86) are not built.
1492  */
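
/*
 * At link time, a non-weak definition of the same symbol provided by the
 * vectorized Rx objects (when they are built, e.g. for x86 with SSE)
 * overrides the __rte_weak stubs below. When no vectorized object is
 * linked, the stubs stand in: the burst stub receives nothing and the
 * support checks report -ENOTSUP, so the scalar Rx path is selected.
 */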
1493
1494 __rte_weak uint16_t
1495 mlx5_rx_burst_vec(void *dpdk_rxq __rte_unused,
1496                   struct rte_mbuf **pkts __rte_unused,
1497                   uint16_t pkts_n __rte_unused)
1498 {
1499         return 0;
1500 }
1501
1502 __rte_weak int
1503 mlx5_rxq_check_vec_support(struct mlx5_rxq_data *rxq __rte_unused)
1504 {
1505         return -ENOTSUP;
1506 }
1507
1508 __rte_weak int
1509 mlx5_check_vec_rx_support(struct rte_eth_dev *dev __rte_unused)
1510 {
1511         return -ENOTSUP;
1512 }
1513
1514 /**
1515  * DPDK callback to check the status of a Tx descriptor.
1516  *
1517  * @param tx_queue
1518  *   The Tx queue.
1519  * @param[in] offset
1520  *   The index of the descriptor in the ring.
1521  *
1522  * @return
1523  *   The status of the Tx descriptor.
1524  */
1525 int
1526 mlx5_tx_descriptor_status(void *tx_queue, uint16_t offset)
1527 {
1528         (void)tx_queue;
1529         (void)offset;
1530         return RTE_ETH_TX_DESC_FULL;
1531 }
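
/*
 * Illustrative use through the ethdev API (port_id, queue_id and offset
 * are placeholders). Note that this stub currently reports every
 * descriptor as RTE_ETH_TX_DESC_FULL:
 *
 *   int status = rte_eth_tx_descriptor_status(port_id, queue_id, offset);
 *
 * RTE_ETH_TX_DESC_DONE would mean the descriptor has been processed by the
 * NIC and can be reused; this stub never reports that state.
 */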
1532
1533 /**
1534  * Configure the TX function to use.
1535  *
1536  * @param dev
1537  *   Pointer to Ethernet device structure.
1538  *
1539  * @return
1540  *   Pointer to selected Tx burst function.
1541  */
1542 eth_tx_burst_t
1543 mlx5_select_tx_function(struct rte_eth_dev *dev)
1544 {
1545         (void)dev;
1546         return removed_tx_burst;
1547 }
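
/*
 * The selected function is typically installed on device start, for
 * example:
 *
 *   dev->tx_pkt_burst = mlx5_select_tx_function(dev);
 *
 * With the stub above in place, every transmit attempt returns 0 until the
 * new Tx burst routines are wired back in.
 */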
1548
1549