net/mlx5: use flow to enable all multi mode
[dpdk.git] / drivers / net / mlx5 / mlx5_rxq.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright 2015 6WIND S.A.
5  *   Copyright 2015 Mellanox.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of 6WIND S.A. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33
34 #include <stddef.h>
35 #include <assert.h>
36 #include <errno.h>
37 #include <string.h>
38 #include <stdint.h>
39 #include <fcntl.h>
40 #include <sys/queue.h>
41
42 /* Verbs header. */
43 /* ISO C doesn't support unnamed structs/unions, disabling -pedantic. */
44 #ifdef PEDANTIC
45 #pragma GCC diagnostic ignored "-Wpedantic"
46 #endif
47 #include <infiniband/verbs.h>
48 #include <infiniband/mlx5dv.h>
49 #ifdef PEDANTIC
50 #pragma GCC diagnostic error "-Wpedantic"
51 #endif
52
53 #include <rte_mbuf.h>
54 #include <rte_malloc.h>
55 #include <rte_ethdev.h>
56 #include <rte_common.h>
57 #include <rte_interrupts.h>
58 #include <rte_debug.h>
59 #include <rte_io.h>
60
61 #include "mlx5.h"
62 #include "mlx5_rxtx.h"
63 #include "mlx5_utils.h"
64 #include "mlx5_autoconf.h"
65 #include "mlx5_defs.h"
66
67 /* Initialization data for hash RX queues. */
68 const struct hash_rxq_init hash_rxq_init[] = {
69         [HASH_RXQ_TCPV4] = {
70                 .hash_fields = (IBV_RX_HASH_SRC_IPV4 |
71                                 IBV_RX_HASH_DST_IPV4 |
72                                 IBV_RX_HASH_SRC_PORT_TCP |
73                                 IBV_RX_HASH_DST_PORT_TCP),
74                 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_TCP,
75                 .flow_priority = 0,
76                 .flow_spec.tcp_udp = {
77                         .type = IBV_FLOW_SPEC_TCP,
78                         .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
79                 },
80                 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
81         },
82         [HASH_RXQ_UDPV4] = {
83                 .hash_fields = (IBV_RX_HASH_SRC_IPV4 |
84                                 IBV_RX_HASH_DST_IPV4 |
85                                 IBV_RX_HASH_SRC_PORT_UDP |
86                                 IBV_RX_HASH_DST_PORT_UDP),
87                 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV4_UDP,
88                 .flow_priority = 0,
89                 .flow_spec.tcp_udp = {
90                         .type = IBV_FLOW_SPEC_UDP,
91                         .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
92                 },
93                 .underlayer = &hash_rxq_init[HASH_RXQ_IPV4],
94         },
95         [HASH_RXQ_IPV4] = {
96                 .hash_fields = (IBV_RX_HASH_SRC_IPV4 |
97                                 IBV_RX_HASH_DST_IPV4),
98                 .dpdk_rss_hf = (ETH_RSS_IPV4 |
99                                 ETH_RSS_FRAG_IPV4),
100                 .flow_priority = 1,
101                 .flow_spec.ipv4 = {
102                         .type = IBV_FLOW_SPEC_IPV4,
103                         .size = sizeof(hash_rxq_init[0].flow_spec.ipv4),
104                 },
105                 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
106         },
107         [HASH_RXQ_TCPV6] = {
108                 .hash_fields = (IBV_RX_HASH_SRC_IPV6 |
109                                 IBV_RX_HASH_DST_IPV6 |
110                                 IBV_RX_HASH_SRC_PORT_TCP |
111                                 IBV_RX_HASH_DST_PORT_TCP),
112                 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_TCP,
113                 .flow_priority = 0,
114                 .flow_spec.tcp_udp = {
115                         .type = IBV_FLOW_SPEC_TCP,
116                         .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
117                 },
118                 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
119         },
120         [HASH_RXQ_UDPV6] = {
121                 .hash_fields = (IBV_RX_HASH_SRC_IPV6 |
122                                 IBV_RX_HASH_DST_IPV6 |
123                                 IBV_RX_HASH_SRC_PORT_UDP |
124                                 IBV_RX_HASH_DST_PORT_UDP),
125                 .dpdk_rss_hf = ETH_RSS_NONFRAG_IPV6_UDP,
126                 .flow_priority = 0,
127                 .flow_spec.tcp_udp = {
128                         .type = IBV_FLOW_SPEC_UDP,
129                         .size = sizeof(hash_rxq_init[0].flow_spec.tcp_udp),
130                 },
131                 .underlayer = &hash_rxq_init[HASH_RXQ_IPV6],
132         },
133         [HASH_RXQ_IPV6] = {
134                 .hash_fields = (IBV_RX_HASH_SRC_IPV6 |
135                                 IBV_RX_HASH_DST_IPV6),
136                 .dpdk_rss_hf = (ETH_RSS_IPV6 |
137                                 ETH_RSS_FRAG_IPV6),
138                 .flow_priority = 1,
139                 .flow_spec.ipv6 = {
140                         .type = IBV_FLOW_SPEC_IPV6,
141                         .size = sizeof(hash_rxq_init[0].flow_spec.ipv6),
142                 },
143                 .underlayer = &hash_rxq_init[HASH_RXQ_ETH],
144         },
145         [HASH_RXQ_ETH] = {
146                 .hash_fields = 0,
147                 .dpdk_rss_hf = 0,
148                 .flow_priority = 2,
149                 .flow_spec.eth = {
150                         .type = IBV_FLOW_SPEC_ETH,
151                         .size = sizeof(hash_rxq_init[0].flow_spec.eth),
152                 },
153                 .underlayer = NULL,
154         },
155 };
156
157 /* Number of entries in hash_rxq_init[]. */
158 const unsigned int hash_rxq_init_n = RTE_DIM(hash_rxq_init);
159
160 /* Initialization data for hash RX queue indirection tables. */
161 static const struct ind_table_init ind_table_init[] = {
162         {
163                 .max_size = -1u, /* Superseded by HW limitations. */
164                 .hash_types =
165                         1 << HASH_RXQ_TCPV4 |
166                         1 << HASH_RXQ_UDPV4 |
167                         1 << HASH_RXQ_IPV4 |
168                         1 << HASH_RXQ_TCPV6 |
169                         1 << HASH_RXQ_UDPV6 |
170                         1 << HASH_RXQ_IPV6 |
171                         0,
172                 .hash_types_n = 6,
173         },
174         {
175                 .max_size = 1,
176                 .hash_types = 1 << HASH_RXQ_ETH,
177                 .hash_types_n = 1,
178         },
179 };
180
181 #define IND_TABLE_INIT_N RTE_DIM(ind_table_init)
182
183 /* Default RSS hash key also used for ConnectX-3. */
184 uint8_t rss_hash_default_key[] = {
185         0x2c, 0xc6, 0x81, 0xd1,
186         0x5b, 0xdb, 0xf4, 0xf7,
187         0xfc, 0xa2, 0x83, 0x19,
188         0xdb, 0x1a, 0x3e, 0x94,
189         0x6b, 0x9e, 0x38, 0xd9,
190         0x2c, 0x9c, 0x03, 0xd1,
191         0xad, 0x99, 0x44, 0xa7,
192         0xd9, 0x56, 0x3d, 0x59,
193         0x06, 0x3c, 0x25, 0xf3,
194         0xfc, 0x1f, 0xdc, 0x2a,
195 };
196
197 /* Length of the default RSS hash key. */
198 const size_t rss_hash_default_key_len = sizeof(rss_hash_default_key);
199
200 /**
201  * Populate flow steering rule for a given hash RX queue type using
202  * information from hash_rxq_init[]. Nothing is written to flow_attr when
203  * flow_attr_size is not large enough, but the required size is still returned.
204  *
205  * @param priv
206  *   Pointer to private structure.
207  * @param[out] flow_attr
208  *   Pointer to flow attribute structure to fill. Note that the allocated
209  *   area must be large enough to hold all flow specifications.
210  * @param flow_attr_size
211  *   Entire size of flow_attr and trailing room for flow specifications.
212  * @param type
213  *   Hash RX queue type to use for flow steering rule.
214  *
215  * @return
216  *   Total size of the flow attribute buffer. No errors are defined.
217  */
218 size_t
219 priv_flow_attr(struct priv *priv, struct ibv_flow_attr *flow_attr,
220                size_t flow_attr_size, enum hash_rxq_type type)
221 {
222         size_t offset = sizeof(*flow_attr);
223         const struct hash_rxq_init *init = &hash_rxq_init[type];
224
225         assert(priv != NULL);
226         assert((size_t)type < RTE_DIM(hash_rxq_init));
227         do {
228                 offset += init->flow_spec.hdr.size;
229                 init = init->underlayer;
230         } while (init != NULL);
231         if (offset > flow_attr_size)
232                 return offset;
233         flow_attr_size = offset;
234         init = &hash_rxq_init[type];
235         *flow_attr = (struct ibv_flow_attr){
236                 .type = IBV_FLOW_ATTR_NORMAL,
237                 /* Priorities < 3 are reserved for flow director. */
238                 .priority = init->flow_priority + 3,
239                 .num_of_specs = 0,
240                 .port = priv->port,
241                 .flags = 0,
242         };
243         do {
244                 offset -= init->flow_spec.hdr.size;
245                 memcpy((void *)((uintptr_t)flow_attr + offset),
246                        &init->flow_spec,
247                        init->flow_spec.hdr.size);
248                 ++flow_attr->num_of_specs;
249                 init = init->underlayer;
250         } while (init != NULL);
251         return flow_attr_size;
252 }
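/*
 * Illustrative usage sketch (not part of the driver): since nothing is
 * written when flow_attr_size is too small, the required size can be
 * queried first and the call repeated on a buffer of that size:
 *
 *     size_t size = priv_flow_attr(priv, NULL, 0, HASH_RXQ_TCPV4);
 *     struct ibv_flow_attr *attr = rte_malloc(__func__, size, 0);
 *
 *     if (attr != NULL)
 *             priv_flow_attr(priv, attr, size, HASH_RXQ_TCPV4);
 *
 * For HASH_RXQ_TCPV4 the resulting attribute carries three specifications
 * copied from the underlayer chain (Ethernet, then IPv4, then TCP),
 * outermost first.
 */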
253
254 /**
255  * Convert hash type position in indirection table initializer to
256  * hash RX queue type.
257  *
258  * @param table
259  *   Indirection table initializer.
260  * @param pos
261  *   Hash type position.
262  *
263  * @return
264  *   Hash RX queue type.
265  */
266 static enum hash_rxq_type
267 hash_rxq_type_from_pos(const struct ind_table_init *table, unsigned int pos)
268 {
269         enum hash_rxq_type type = HASH_RXQ_TCPV4;
270
271         assert(pos < table->hash_types_n);
272         do {
273                 if ((table->hash_types & (1 << type)) && (pos-- == 0))
274                         break;
275                 ++type;
276         } while (1);
277         return type;
278 }
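/*
 * Worked example (illustrative): for an entry whose hash_types is
 * (1 << HASH_RXQ_TCPV4) | (1 << HASH_RXQ_UDPV4) | (1 << HASH_RXQ_IPV4),
 * position 0 maps to HASH_RXQ_TCPV4, position 1 to HASH_RXQ_UDPV4 and
 * position 2 to HASH_RXQ_IPV4; types whose bit is cleared are skipped
 * without consuming a position.
 */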
279
280 /**
281  * Filter out disabled hash RX queue types from ind_table_init[].
282  *
283  * @param priv
284  *   Pointer to private structure.
285  * @param[out] table
286  *   Output table.
287  *
288  * @return
289  *   Number of table entries.
290  */
291 static unsigned int
292 priv_make_ind_table_init(struct priv *priv,
293                          struct ind_table_init (*table)[IND_TABLE_INIT_N])
294 {
295         uint64_t rss_hf;
296         unsigned int i;
297         unsigned int j;
298         unsigned int table_n = 0;
299         /* Mandatory to receive frames not handled by normal hash RX queues. */
300         unsigned int hash_types_sup = 1 << HASH_RXQ_ETH;
301
302         rss_hf = priv->rss_hf;
303         /* Process other protocols only if more than one queue. */
304         if (priv->rxqs_n > 1)
305                 for (i = 0; (i != hash_rxq_init_n); ++i)
306                         if (rss_hf & hash_rxq_init[i].dpdk_rss_hf)
307                                 hash_types_sup |= (1 << i);
308
309         /* Filter out entries whose protocols are not in the set. */
310         for (i = 0, j = 0; (i != IND_TABLE_INIT_N); ++i) {
311                 unsigned int nb;
312                 unsigned int h;
313
314                 /* j is increased only if the table has valid protocols. */
315                 assert(j <= i);
316                 (*table)[j] = ind_table_init[i];
317                 (*table)[j].hash_types &= hash_types_sup;
318                 for (h = 0, nb = 0; (h != hash_rxq_init_n); ++h)
319                         if (((*table)[j].hash_types >> h) & 0x1)
320                                 ++nb;
321                 (*table)[j].hash_types_n = nb;
322                 if (nb) {
323                         ++table_n;
324                         ++j;
325                 }
326         }
327         return table_n;
328 }
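/*
 * Worked example (illustrative): with several Rx queues and
 * priv->rss_hf == ETH_RSS_NONFRAG_IPV4_TCP, hash_types_sup becomes
 * (1 << HASH_RXQ_ETH) | (1 << HASH_RXQ_TCPV4). The first entry of
 * ind_table_init[] is then reduced to HASH_RXQ_TCPV4 alone
 * (hash_types_n == 1), the second entry keeps HASH_RXQ_ETH and the
 * function returns 2. With a single Rx queue only HASH_RXQ_ETH survives
 * and the function returns 1.
 */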
329
330 /**
331  * Initialize hash RX queues and indirection table.
332  *
333  * @param priv
334  *   Pointer to private structure.
335  *
336  * @return
337  *   0 on success, errno value on failure.
338  */
339 int
340 priv_create_hash_rxqs(struct priv *priv)
341 {
342         struct ibv_wq *wqs[priv->reta_idx_n];
343         struct ind_table_init ind_table_init[IND_TABLE_INIT_N];
344         unsigned int ind_tables_n =
345                 priv_make_ind_table_init(priv, &ind_table_init);
346         unsigned int hash_rxqs_n = 0;
347         struct hash_rxq (*hash_rxqs)[] = NULL;
348         struct ibv_rwq_ind_table *(*ind_tables)[] = NULL;
349         unsigned int i;
350         unsigned int j;
351         unsigned int k;
352         int err = 0;
353
354         assert(priv->ind_tables == NULL);
355         assert(priv->ind_tables_n == 0);
356         assert(priv->hash_rxqs == NULL);
357         assert(priv->hash_rxqs_n == 0);
358         assert(priv->pd != NULL);
359         assert(priv->ctx != NULL);
360         if (priv->isolated)
361                 return 0;
362         if (priv->rxqs_n == 0)
363                 return EINVAL;
364         assert(priv->rxqs != NULL);
365         if (ind_tables_n == 0) {
366                 ERROR("all hash RX queue types have been filtered out,"
367                       " indirection table cannot be created");
368                 return EINVAL;
369         }
370         if (priv->rxqs_n & (priv->rxqs_n - 1)) {
371                 INFO("%u RX queues are configured, consider rounding this"
372                      " number to the next power of two for better balancing",
373                      priv->rxqs_n);
374                 DEBUG("indirection table extended to assume %u WQs",
375                       priv->reta_idx_n);
376         }
377         for (i = 0; (i != priv->reta_idx_n); ++i) {
378                 struct mlx5_rxq_ctrl *rxq_ctrl;
379
380                 rxq_ctrl = container_of((*priv->rxqs)[(*priv->reta_idx)[i]],
381                                         struct mlx5_rxq_ctrl, rxq);
382                 wqs[i] = rxq_ctrl->ibv->wq;
383         }
384         /* Get number of hash RX queues to configure. */
385         for (i = 0, hash_rxqs_n = 0; (i != ind_tables_n); ++i)
386                 hash_rxqs_n += ind_table_init[i].hash_types_n;
387         DEBUG("allocating %u hash RX queues for %u WQs, %u indirection tables",
388               hash_rxqs_n, priv->rxqs_n, ind_tables_n);
389         /* Create indirection tables. */
390         ind_tables = rte_calloc(__func__, ind_tables_n,
391                                 sizeof((*ind_tables)[0]), 0);
392         if (ind_tables == NULL) {
393                 err = ENOMEM;
394                 ERROR("cannot allocate indirection tables container: %s",
395                       strerror(err));
396                 goto error;
397         }
398         for (i = 0; (i != ind_tables_n); ++i) {
399                 struct ibv_rwq_ind_table_init_attr ind_init_attr = {
400                         .log_ind_tbl_size = 0, /* Set below. */
401                         .ind_tbl = wqs,
402                         .comp_mask = 0,
403                 };
404                 unsigned int ind_tbl_size = ind_table_init[i].max_size;
405                 struct ibv_rwq_ind_table *ind_table;
406
407                 if (priv->reta_idx_n < ind_tbl_size)
408                         ind_tbl_size = priv->reta_idx_n;
409                 ind_init_attr.log_ind_tbl_size = log2above(ind_tbl_size);
410                 errno = 0;
411                 ind_table = ibv_create_rwq_ind_table(priv->ctx,
412                                                      &ind_init_attr);
413                 if (ind_table != NULL) {
414                         (*ind_tables)[i] = ind_table;
415                         continue;
416                 }
417                 /* Not clear whether errno is set. */
418                 err = (errno ? errno : EINVAL);
419                 ERROR("RX indirection table creation failed with error %d: %s",
420                       err, strerror(err));
421                 goto error;
422         }
423         /* Allocate array that holds hash RX queues and related data. */
424         hash_rxqs = rte_calloc(__func__, hash_rxqs_n,
425                                sizeof((*hash_rxqs)[0]), 0);
426         if (hash_rxqs == NULL) {
427                 err = ENOMEM;
428                 ERROR("cannot allocate hash RX queues container: %s",
429                       strerror(err));
430                 goto error;
431         }
432         for (i = 0, j = 0, k = 0;
433              ((i != hash_rxqs_n) && (j != ind_tables_n));
434              ++i) {
435                 struct hash_rxq *hash_rxq = &(*hash_rxqs)[i];
436                 enum hash_rxq_type type =
437                         hash_rxq_type_from_pos(&ind_table_init[j], k);
438                 struct rte_eth_rss_conf *priv_rss_conf =
439                         (*priv->rss_conf)[type];
440                 struct ibv_rx_hash_conf hash_conf = {
441                         .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
442                         .rx_hash_key_len = (priv_rss_conf ?
443                                             priv_rss_conf->rss_key_len :
444                                             rss_hash_default_key_len),
445                         .rx_hash_key = (priv_rss_conf ?
446                                         priv_rss_conf->rss_key :
447                                         rss_hash_default_key),
448                         .rx_hash_fields_mask = hash_rxq_init[type].hash_fields,
449                 };
450                 struct ibv_qp_init_attr_ex qp_init_attr = {
451                         .qp_type = IBV_QPT_RAW_PACKET,
452                         .comp_mask = (IBV_QP_INIT_ATTR_PD |
453                                       IBV_QP_INIT_ATTR_IND_TABLE |
454                                       IBV_QP_INIT_ATTR_RX_HASH),
455                         .rx_hash_conf = hash_conf,
456                         .rwq_ind_tbl = (*ind_tables)[j],
457                         .pd = priv->pd,
458                 };
459
460                 DEBUG("using indirection table %u for hash RX queue %u type %d",
461                       j, i, type);
462                 *hash_rxq = (struct hash_rxq){
463                         .priv = priv,
464                         .qp = ibv_create_qp_ex(priv->ctx, &qp_init_attr),
465                         .type = type,
466                 };
467                 if (hash_rxq->qp == NULL) {
468                         err = (errno ? errno : EINVAL);
469                         ERROR("Hash RX QP creation failure: %s",
470                               strerror(err));
471                         goto error;
472                 }
473                 if (++k < ind_table_init[j].hash_types_n)
474                         continue;
475                 /* Switch to the next indirection table and reset hash RX
476                  * queue type array index. */
477                 ++j;
478                 k = 0;
479         }
480         priv->ind_tables = ind_tables;
481         priv->ind_tables_n = ind_tables_n;
482         priv->hash_rxqs = hash_rxqs;
483         priv->hash_rxqs_n = hash_rxqs_n;
484         assert(err == 0);
485         return 0;
486 error:
487         if (hash_rxqs != NULL) {
488                 for (i = 0; (i != hash_rxqs_n); ++i) {
489                         struct ibv_qp *qp = (*hash_rxqs)[i].qp;
490
491                         if (qp == NULL)
492                                 continue;
493                         claim_zero(ibv_destroy_qp(qp));
494                 }
495                 rte_free(hash_rxqs);
496         }
497         if (ind_tables != NULL) {
498                 for (j = 0; (j != ind_tables_n); ++j) {
499                         struct ibv_rwq_ind_table *ind_table =
500                                 (*ind_tables)[j];
501
502                         if (ind_table == NULL)
503                                 continue;
504                         claim_zero(ibv_destroy_rwq_ind_table(ind_table));
505                 }
506                 rte_free(ind_tables);
507         }
508         return err;
509 }
510
511 /**
512  * Clean up hash RX queues and indirection table.
513  *
514  * @param priv
515  *   Pointer to private structure.
516  */
517 void
518 priv_destroy_hash_rxqs(struct priv *priv)
519 {
520         unsigned int i;
521
522         DEBUG("destroying %u hash RX queues", priv->hash_rxqs_n);
523         if (priv->hash_rxqs_n == 0) {
524                 assert(priv->hash_rxqs == NULL);
525                 assert(priv->ind_tables == NULL);
526                 return;
527         }
528         for (i = 0; (i != priv->hash_rxqs_n); ++i) {
529                 struct hash_rxq *hash_rxq = &(*priv->hash_rxqs)[i];
530                 unsigned int j, k;
531
532                 assert(hash_rxq->priv == priv);
533                 assert(hash_rxq->qp != NULL);
534                 /* Also check that there are no remaining flows. */
535                 for (j = 0; (j != RTE_DIM(hash_rxq->special_flow)); ++j)
536                         for (k = 0;
537                              (k != RTE_DIM(hash_rxq->special_flow[j]));
538                              ++k)
539                                 assert(hash_rxq->special_flow[j][k] == NULL);
540                 for (j = 0; (j != RTE_DIM(hash_rxq->mac_flow)); ++j)
541                         for (k = 0; (k != RTE_DIM(hash_rxq->mac_flow[j])); ++k)
542                                 assert(hash_rxq->mac_flow[j][k] == NULL);
543                 claim_zero(ibv_destroy_qp(hash_rxq->qp));
544         }
545         priv->hash_rxqs_n = 0;
546         rte_free(priv->hash_rxqs);
547         priv->hash_rxqs = NULL;
548         for (i = 0; (i != priv->ind_tables_n); ++i) {
549                 struct ibv_rwq_ind_table *ind_table =
550                         (*priv->ind_tables)[i];
551
552                 assert(ind_table != NULL);
553                 claim_zero(ibv_destroy_rwq_ind_table(ind_table));
554         }
555         priv->ind_tables_n = 0;
556         rte_free(priv->ind_tables);
557         priv->ind_tables = NULL;
558 }
559
560 /**
561  * Check whether a given flow type is allowed.
562  *
563  * @param priv
564  *   Pointer to private structure.
565  * @param type
566  *   Flow type to check.
567  *
568  * @return
569  *   Nonzero if the given flow type is allowed.
570  */
571 int
572 priv_allow_flow_type(struct priv *priv, enum hash_rxq_flow_type type)
573 {
574         (void)priv;
575         switch (type) {
576         case HASH_RXQ_FLOW_TYPE_BROADCAST:
577         case HASH_RXQ_FLOW_TYPE_IPV6MULTI:
578         case HASH_RXQ_FLOW_TYPE_MAC:
579                 return 1;
581         default:
582                 /* Unsupported flow type is not allowed. */
583                 return 0;
584         }
585         return 0;
586 }
587
588 /**
589  * Automatically enable/disable flows according to configuration.
590  *
591  * @param priv
592  *   Private structure.
593  *
594  * @return
595  *   0 on success, errno value on failure.
596  */
597 int
598 priv_rehash_flows(struct priv *priv)
599 {
600         size_t i;
601
602         for (i = 0; i != RTE_DIM((*priv->hash_rxqs)[0].special_flow); ++i)
603                 if (!priv_allow_flow_type(priv, i)) {
604                         priv_special_flow_disable(priv, i);
605                 } else {
606                         int ret = priv_special_flow_enable(priv, i);
607
608                         if (ret)
609                                 return ret;
610                 }
611         if (priv_allow_flow_type(priv, HASH_RXQ_FLOW_TYPE_MAC))
612                 return priv_mac_addrs_enable(priv);
613         priv_mac_addrs_disable(priv);
614         return 0;
615 }
616
617 /**
618  * Allocate RX queue elements.
619  *
620  * @param rxq_ctrl
621  *   Pointer to RX queue structure.
622  *
623  * @return
624  *   0 on success, errno value on failure.
625  */
626 int
627 rxq_alloc_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
628 {
629         const unsigned int sges_n = 1 << rxq_ctrl->rxq.sges_n;
630         unsigned int elts_n = 1 << rxq_ctrl->rxq.elts_n;
631         unsigned int i;
632         int ret = 0;
633
634         /* Iterate on segments. */
635         for (i = 0; (i != elts_n); ++i) {
636                 struct rte_mbuf *buf;
637
638                 buf = rte_pktmbuf_alloc(rxq_ctrl->rxq.mp);
639                 if (buf == NULL) {
640                         ERROR("%p: empty mbuf pool", (void *)rxq_ctrl);
641                         ret = ENOMEM;
642                         goto error;
643                 }
644                 /* Headroom is reserved by rte_pktmbuf_alloc(). */
645                 assert(DATA_OFF(buf) == RTE_PKTMBUF_HEADROOM);
646                 /* Buffer is supposed to be empty. */
647                 assert(rte_pktmbuf_data_len(buf) == 0);
648                 assert(rte_pktmbuf_pkt_len(buf) == 0);
649                 assert(!buf->next);
650                 /* Only the first segment keeps headroom. */
651                 if (i % sges_n)
652                         SET_DATA_OFF(buf, 0);
653                 PORT(buf) = rxq_ctrl->rxq.port_id;
654                 DATA_LEN(buf) = rte_pktmbuf_tailroom(buf);
655                 PKT_LEN(buf) = DATA_LEN(buf);
656                 NB_SEGS(buf) = 1;
657                 (*rxq_ctrl->rxq.elts)[i] = buf;
658         }
659         /* If Rx vector is activated. */
660         if (rxq_check_vec_support(&rxq_ctrl->rxq) > 0) {
661                 struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
662                 struct rte_mbuf *mbuf_init = &rxq->fake_mbuf;
663                 int j;
664
665                 /* Initialize default rearm_data for vPMD. */
666                 mbuf_init->data_off = RTE_PKTMBUF_HEADROOM;
667                 rte_mbuf_refcnt_set(mbuf_init, 1);
668                 mbuf_init->nb_segs = 1;
669                 mbuf_init->port = rxq->port_id;
670                 /*
671                  * prevent compiler reordering:
672                  * rearm_data covers previous fields.
673                  */
674                 rte_compiler_barrier();
675                 rxq->mbuf_initializer =
676                         *(uint64_t *)&mbuf_init->rearm_data;
677                 /* Padding with a fake mbuf for vectorized Rx. */
678                 for (j = 0; j < MLX5_VPMD_DESCS_PER_LOOP; ++j)
679                         (*rxq->elts)[elts_n + j] = &rxq->fake_mbuf;
680         }
681         DEBUG("%p: allocated and configured %u segments (max %u packets)",
682               (void *)rxq_ctrl, elts_n, elts_n / (1 << rxq_ctrl->rxq.sges_n));
683         assert(ret == 0);
684         return 0;
685 error:
686         elts_n = i;
687         for (i = 0; (i != elts_n); ++i) {
688                 if ((*rxq_ctrl->rxq.elts)[i] != NULL)
689                         rte_pktmbuf_free_seg((*rxq_ctrl->rxq.elts)[i]);
690                 (*rxq_ctrl->rxq.elts)[i] = NULL;
691         }
692         DEBUG("%p: failed, freed everything", (void *)rxq_ctrl);
693         assert(ret > 0);
694         return ret;
695 }
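/*
 * Illustrative note: with elts_n == 8 and sges_n == 2, the ring filled
 * above holds 256 mbuf segments, i.e. at most 64 scattered packets. When
 * the vectorized Rx path is selected, the rearm_data of the fake mbuf
 * initialized here is also captured in rxq->mbuf_initializer so the burst
 * function can reset data_off, refcnt, nb_segs and port of returned mbufs
 * with a single 64-bit store, while the MLX5_VPMD_DESCS_PER_LOOP trailing
 * entries pointing to the fake mbuf keep the vector loop within bounds.
 */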
696
697 /**
698  * Free RX queue elements.
699  *
700  * @param rxq_ctrl
701  *   Pointer to RX queue structure.
702  */
703 static void
704 rxq_free_elts(struct mlx5_rxq_ctrl *rxq_ctrl)
705 {
706         struct mlx5_rxq_data *rxq = &rxq_ctrl->rxq;
707         const uint16_t q_n = (1 << rxq->elts_n);
708         const uint16_t q_mask = q_n - 1;
709         uint16_t used = q_n - (rxq->rq_ci - rxq->rq_pi);
710         uint16_t i;
711
712         DEBUG("%p: freeing WRs", (void *)rxq_ctrl);
713         if (rxq->elts == NULL)
714                 return;
715         /**
716          * Some mbufs in the ring still belong to the application; they
717          * cannot be freed.
718          */
719         if (rxq_check_vec_support(rxq) > 0) {
720                 for (i = 0; i < used; ++i)
721                         (*rxq->elts)[(rxq->rq_ci + i) & q_mask] = NULL;
722                 rxq->rq_pi = rxq->rq_ci;
723         }
724         for (i = 0; (i != (1u << rxq->elts_n)); ++i) {
725                 if ((*rxq->elts)[i] != NULL)
726                         rte_pktmbuf_free_seg((*rxq->elts)[i]);
727                 (*rxq->elts)[i] = NULL;
728         }
729 }
730
731 /**
732  * Clean up a RX queue.
733  *
734  * Destroy objects, free allocated memory and reset the structure for reuse.
735  *
736  * @param rxq_ctrl
737  *   Pointer to RX queue structure.
738  */
739 void
740 mlx5_rxq_cleanup(struct mlx5_rxq_ctrl *rxq_ctrl)
741 {
742         DEBUG("cleaning up %p", (void *)rxq_ctrl);
743         if (rxq_ctrl->ibv)
744                 mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
745         memset(rxq_ctrl, 0, sizeof(*rxq_ctrl));
746 }
747
748 /**
749  * DPDK callback to configure a RX queue.
750  * @param dev
751  *   Pointer to Ethernet device structure.
752  * @param idx
753  *   RX queue index.
754  * @param desc
755  *   Number of descriptors to configure in queue.
756  * @param socket
757  *   NUMA socket on which memory must be allocated.
758  * @param[in] conf
759  *   Thresholds parameters.
760  * @param mp
761  *   Memory pool for buffer allocations.
762  *
763  * @return
764  *   0 on success, negative errno value on failure.
765  */
766 int
767 mlx5_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
768                     unsigned int socket, const struct rte_eth_rxconf *conf,
769                     struct rte_mempool *mp)
770 {
771         struct priv *priv = dev->data->dev_private;
772         struct mlx5_rxq_data *rxq = (*priv->rxqs)[idx];
773         struct mlx5_rxq_ctrl *rxq_ctrl =
774                 container_of(rxq, struct mlx5_rxq_ctrl, rxq);
775         int ret = 0;
776
777         (void)conf;
778         if (mlx5_is_secondary())
779                 return -E_RTE_SECONDARY;
780         priv_lock(priv);
781         if (!rte_is_power_of_2(desc)) {
782                 desc = 1 << log2above(desc);
783                 WARN("%p: increased number of descriptors in RX queue %u"
784                      " to the next power of two (%d)",
785                      (void *)dev, idx, desc);
786         }
787         DEBUG("%p: configuring queue %u for %u descriptors",
788               (void *)dev, idx, desc);
789         if (idx >= priv->rxqs_n) {
790                 ERROR("%p: queue index out of range (%u >= %u)",
791                       (void *)dev, idx, priv->rxqs_n);
792                 priv_unlock(priv);
793                 return -EOVERFLOW;
794         }
795         if (!mlx5_priv_rxq_releasable(priv, idx)) {
796                 ret = EBUSY;
797                 ERROR("%p: unable to release queue index %u",
798                       (void *)dev, idx);
799                 goto out;
800         }
801         mlx5_priv_rxq_release(priv, idx);
802         rxq_ctrl = mlx5_priv_rxq_new(priv, idx, desc, socket, mp);
803         if (!rxq_ctrl) {
804                 ERROR("%p: unable to allocate queue index %u",
805                       (void *)dev, idx);
806                 ret = ENOMEM;
807                 goto out;
808         }
809         DEBUG("%p: adding RX queue %p to list",
810               (void *)dev, (void *)rxq_ctrl);
811         (*priv->rxqs)[idx] = &rxq_ctrl->rxq;
812 out:
813         priv_unlock(priv);
814         return -ret;
815 }
816
817 /**
818  * DPDK callback to release a RX queue.
819  *
820  * @param dpdk_rxq
821  *   Generic RX queue pointer.
822  */
823 void
824 mlx5_rx_queue_release(void *dpdk_rxq)
825 {
826         struct mlx5_rxq_data *rxq = (struct mlx5_rxq_data *)dpdk_rxq;
827         struct mlx5_rxq_ctrl *rxq_ctrl;
828         struct priv *priv;
829
830         if (mlx5_is_secondary())
831                 return;
832
833         if (rxq == NULL)
834                 return;
835         rxq_ctrl = container_of(rxq, struct mlx5_rxq_ctrl, rxq);
836         priv = rxq_ctrl->priv;
837         priv_lock(priv);
838         if (!mlx5_priv_rxq_releasable(priv, rxq_ctrl->rxq.stats.idx))
839                 rte_panic("Rx queue %p is still used by a flow and cannot be"
840                           " removed\n", (void *)rxq_ctrl);
841         mlx5_priv_rxq_release(priv, rxq_ctrl->rxq.stats.idx);
842         priv_unlock(priv);
843 }
844
845 /**
846  * Allocate queue vector and fill epoll fd list for Rx interrupts.
847  *
848  * @param priv
849  *   Pointer to private structure.
850  *
851  * @return
852  *   0 on success, negative on failure.
853  */
854 int
855 priv_rx_intr_vec_enable(struct priv *priv)
856 {
857         unsigned int i;
858         unsigned int rxqs_n = priv->rxqs_n;
859         unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
860         unsigned int count = 0;
861         struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
862
863         assert(!mlx5_is_secondary());
864         if (!priv->dev->data->dev_conf.intr_conf.rxq)
865                 return 0;
866         priv_rx_intr_vec_disable(priv);
867         intr_handle->intr_vec = malloc(n * sizeof(intr_handle->intr_vec[0]));
868         if (intr_handle->intr_vec == NULL) {
869                 ERROR("failed to allocate memory for interrupt vector,"
870                       " Rx interrupts will not be supported");
871                 return -ENOMEM;
872         }
873         intr_handle->type = RTE_INTR_HANDLE_EXT;
874         for (i = 0; i != n; ++i) {
875                 /* This rxq ibv must not be released in this function. */
876                 struct mlx5_rxq_ibv *rxq_ibv = mlx5_priv_rxq_ibv_get(priv, i);
877                 int fd;
878                 int flags;
879                 int rc;
880
881                 /* Skip queues that cannot request interrupts. */
882                 if (!rxq_ibv || !rxq_ibv->channel) {
883                         /* Use invalid intr_vec[] index to disable entry. */
884                         intr_handle->intr_vec[i] =
885                                 RTE_INTR_VEC_RXTX_OFFSET +
886                                 RTE_MAX_RXTX_INTR_VEC_ID;
887                         continue;
888                 }
889                 if (count >= RTE_MAX_RXTX_INTR_VEC_ID) {
890                         ERROR("too many Rx queues for interrupt vector size"
891                               " (%d), Rx interrupts cannot be enabled",
892                               RTE_MAX_RXTX_INTR_VEC_ID);
893                         priv_rx_intr_vec_disable(priv);
894                         return -1;
895                 }
896                 fd = rxq_ibv->channel->fd;
897                 flags = fcntl(fd, F_GETFL);
898                 rc = fcntl(fd, F_SETFL, flags | O_NONBLOCK);
899                 if (rc < 0) {
900                         ERROR("failed to make Rx interrupt file descriptor"
901                               " %d non-blocking for queue index %d", fd, i);
902                         priv_rx_intr_vec_disable(priv);
903                         return -1;
904                 }
905                 intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + count;
906                 intr_handle->efds[count] = fd;
907                 count++;
908         }
909         if (!count)
910                 priv_rx_intr_vec_disable(priv);
911         else
912                 intr_handle->nb_efd = count;
913         return 0;
914 }
915
916 /**
917  * Clean up Rx interrupts handler.
918  *
919  * @param priv
920  *   Pointer to private structure.
921  */
922 void
923 priv_rx_intr_vec_disable(struct priv *priv)
924 {
925         struct rte_intr_handle *intr_handle = priv->dev->intr_handle;
926         unsigned int i;
927         unsigned int rxqs_n = priv->rxqs_n;
928         unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
929
930         if (!priv->dev->data->dev_conf.intr_conf.rxq)
931                 return;
932         for (i = 0; i != n; ++i) {
933                 struct mlx5_rxq_ctrl *rxq_ctrl;
934                 struct mlx5_rxq_data *rxq_data;
935
936                 if (intr_handle->intr_vec[i] == RTE_INTR_VEC_RXTX_OFFSET +
937                     RTE_MAX_RXTX_INTR_VEC_ID)
938                         continue;
939                 /**
940                  * Need to access the queue directly to release the reference
941                  * kept in priv_rx_intr_vec_enable().
942                  */
943                 rxq_data = (*priv->rxqs)[i];
944                 rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
945                 mlx5_priv_rxq_ibv_release(priv, rxq_ctrl->ibv);
946         }
947         rte_intr_free_epoll_fd(intr_handle);
948         free(intr_handle->intr_vec);
949         intr_handle->nb_efd = 0;
950         intr_handle->intr_vec = NULL;
951 }
952
953 /**
954  * MLX5 CQ notification.
955  *
956  * @param rxq
957  *   Pointer to receive queue structure.
958  * @param sq_n_rxq
959  *   Sequence number per receive queue.
960  */
961 static inline void
962 mlx5_arm_cq(struct mlx5_rxq_data *rxq, int sq_n_rxq)
963 {
964         int sq_n = 0;
965         uint32_t doorbell_hi;
966         uint64_t doorbell;
967         void *cq_db_reg = (char *)rxq->cq_uar + MLX5_CQ_DOORBELL;
968
969         sq_n = sq_n_rxq & MLX5_CQ_SQN_MASK;
970         doorbell_hi = sq_n << MLX5_CQ_SQN_OFFSET | (rxq->cq_ci & MLX5_CI_MASK);
971         doorbell = (uint64_t)doorbell_hi << 32;
972         doorbell |=  rxq->cqn;
973         rxq->cq_db[MLX5_CQ_ARM_DB] = rte_cpu_to_be_32(doorbell_hi);
974         rte_wmb();
975         rte_write64(rte_cpu_to_be_64(doorbell), cq_db_reg);
976 }
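/*
 * Illustrative note: arming is published in two steps. The 32-bit arm
 * doorbell record (cq_db[MLX5_CQ_ARM_DB]) combines the arm sequence number
 * with the current CQ consumer index, and the 64-bit value written to the
 * UAR register appends the CQ number below it. The rte_wmb() in between
 * ensures the doorbell record is visible to the device before the register
 * write actually requests the notification.
 */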
977
978 /**
979  * DPDK callback for Rx queue interrupt enable.
980  *
981  * @param dev
982  *   Pointer to Ethernet device structure.
983  * @param rx_queue_id
984  *   Rx queue number.
985  *
986  * @return
987  *   0 on success, negative on failure.
988  */
989 int
990 mlx5_rx_intr_enable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
991 {
992         struct priv *priv = mlx5_get_priv(dev);
993         struct mlx5_rxq_data *rxq_data;
994         struct mlx5_rxq_ctrl *rxq_ctrl;
995         int ret = 0;
996
997         priv_lock(priv);
998         rxq_data = (*priv->rxqs)[rx_queue_id];
999         if (!rxq_data) {
1000                 ret = EINVAL;
1001                 goto exit;
1002         }
1003         rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
1004         if (rxq_ctrl->irq) {
1005                 struct mlx5_rxq_ibv *rxq_ibv;
1006
1007                 rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
1008                 if (!rxq_ibv) {
1009                         ret = EINVAL;
1010                         goto exit;
1011                 }
1012                 mlx5_arm_cq(rxq_data, rxq_data->cq_arm_sn);
1013                 mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
1014         }
1015 exit:
1016         priv_unlock(priv);
1017         if (ret)
1018                 WARN("unable to arm interrupt on rx queue %d", rx_queue_id);
1019         return -ret;
1020 }
1021
1022 /**
1023  * DPDK callback for Rx queue interrupt disable.
1024  *
1025  * @param dev
1026  *   Pointer to Ethernet device structure.
1027  * @param rx_queue_id
1028  *   Rx queue number.
1029  *
1030  * @return
1031  *   0 on success, negative on failure.
1032  */
1033 int
1034 mlx5_rx_intr_disable(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1035 {
1036         struct priv *priv = mlx5_get_priv(dev);
1037         struct mlx5_rxq_data *rxq_data;
1038         struct mlx5_rxq_ctrl *rxq_ctrl;
1039         struct mlx5_rxq_ibv *rxq_ibv = NULL;
1040         struct ibv_cq *ev_cq;
1041         void *ev_ctx;
1042         int ret = 0;
1043
1044         priv_lock(priv);
1045         rxq_data = (*priv->rxqs)[rx_queue_id];
1046         if (!rxq_data) {
1047                 ret = EINVAL;
1048                 goto exit;
1049         }
1050         rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
1051         if (!rxq_ctrl->irq)
1052                 goto exit;
1053         rxq_ibv = mlx5_priv_rxq_ibv_get(priv, rx_queue_id);
1054         if (!rxq_ibv) {
1055                 ret = EINVAL;
1056                 goto exit;
1057         }
1058         ret = ibv_get_cq_event(rxq_ibv->channel, &ev_cq, &ev_ctx);
1059         if (ret || ev_cq != rxq_ibv->cq) {
1060                 ret = EINVAL;
1061                 goto exit;
1062         }
1063         rxq_data->cq_arm_sn++;
1064         ibv_ack_cq_events(rxq_ibv->cq, 1);
1065 exit:
1066         if (rxq_ibv)
1067                 mlx5_priv_rxq_ibv_release(priv, rxq_ibv);
1068         priv_unlock(priv);
1069         if (ret)
1070                 WARN("unable to disable interrupt on rx queue %d",
1071                      rx_queue_id);
1072         return -ret;
1073 }
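/*
 * Illustrative application-side sequence (not part of the driver),
 * assuming intr_conf.rxq was set so that priv_rx_intr_vec_enable() built
 * the interrupt vector; "pkts" and "N" stand for the application's own
 * burst buffer and size:
 *
 *     rte_eth_dev_rx_intr_ctl_q(port_id, queue_id, RTE_EPOLL_PER_THREAD,
 *                               RTE_INTR_EVENT_ADD, NULL);
 *     while (rte_eth_rx_burst(port_id, queue_id, pkts, N) == 0) {
 *             struct rte_epoll_event ev;
 *
 *             rte_eth_dev_rx_intr_enable(port_id, queue_id);
 *             rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
 *             rte_eth_dev_rx_intr_disable(port_id, queue_id);
 *     }
 */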
1074
1075 /**
1076  * Create the Rx queue Verbs object.
1077  *
1078  * @param priv
1079  *   Pointer to private structure.
1080  * @param idx
1081  *   Queue index in DPDK Rx queue array
1082  *
1083  * @return
1084  *   The Verbs object initialised on success, NULL otherwise.
1085  */
1086 struct mlx5_rxq_ibv*
1087 mlx5_priv_rxq_ibv_new(struct priv *priv, uint16_t idx)
1088 {
1089         struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
1090         struct mlx5_rxq_ctrl *rxq_ctrl =
1091                 container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
1092         struct ibv_wq_attr mod;
1093         union {
1094                 struct ibv_cq_init_attr_ex cq;
1095                 struct ibv_wq_init_attr wq;
1096                 struct ibv_cq_ex cq_attr;
1097         } attr;
1098         unsigned int cqe_n = (1 << rxq_data->elts_n) - 1;
1099         struct mlx5_rxq_ibv *tmpl;
1100         struct mlx5dv_cq cq_info;
1101         struct mlx5dv_rwq rwq;
1102         unsigned int i;
1103         int ret = 0;
1104         struct mlx5dv_obj obj;
1105
1106         assert(rxq_data);
1107         assert(!rxq_ctrl->ibv);
1108         tmpl = rte_calloc_socket(__func__, 1, sizeof(*tmpl), 0,
1109                                  rxq_ctrl->socket);
1110         if (!tmpl) {
1111                 ERROR("%p: cannot allocate verbs resources",
1112                        (void *)rxq_ctrl);
1113                 return NULL;
1114         }
1115         tmpl->rxq_ctrl = rxq_ctrl;
1116         /* Use the entire RX mempool as the memory region. */
1117         tmpl->mr = priv_mr_get(priv, rxq_data->mp);
1118         if (!tmpl->mr) {
1119                 tmpl->mr = priv_mr_new(priv, rxq_data->mp);
1120                 if (!tmpl->mr) {
1121                         ERROR("%p: MR creation failure", (void *)rxq_ctrl);
1122                         goto error;
1123                 }
1124         }
1125         if (rxq_ctrl->irq) {
1126                 tmpl->channel = ibv_create_comp_channel(priv->ctx);
1127                 if (!tmpl->channel) {
1128                         ERROR("%p: Comp Channel creation failure",
1129                               (void *)rxq_ctrl);
1130                         goto error;
1131                 }
1132         }
1133         attr.cq = (struct ibv_cq_init_attr_ex){
1134                 .comp_mask = 0,
1135         };
1136         if (priv->cqe_comp) {
1137                 attr.cq.comp_mask |= IBV_CQ_INIT_ATTR_MASK_FLAGS;
1138                 attr.cq.flags |= MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE;
1139                 /*
1140                  * For vectorized Rx, cqe_n must not be doubled in order to
1141                  * keep cq_ci and rq_ci aligned.
1142                  */
1143                 if (rxq_check_vec_support(rxq_data) < 0)
1144                         cqe_n *= 2;
1145         }
1146         tmpl->cq = ibv_create_cq(priv->ctx, cqe_n, NULL, tmpl->channel, 0);
1147         if (tmpl->cq == NULL) {
1148                 ERROR("%p: CQ creation failure", (void *)rxq_ctrl);
1149                 goto error;
1150         }
1151         DEBUG("priv->device_attr.max_qp_wr is %d",
1152               priv->device_attr.orig_attr.max_qp_wr);
1153         DEBUG("priv->device_attr.max_sge is %d",
1154               priv->device_attr.orig_attr.max_sge);
1155         attr.wq = (struct ibv_wq_init_attr){
1156                 .wq_context = NULL, /* Could be useful in the future. */
1157                 .wq_type = IBV_WQT_RQ,
1158                 /* Max number of outstanding WRs. */
1159                 .max_wr = (1 << rxq_data->elts_n) >> rxq_data->sges_n,
1160                 /* Max number of scatter/gather elements in a WR. */
1161                 .max_sge = 1 << rxq_data->sges_n,
1162                 .pd = priv->pd,
1163                 .cq = tmpl->cq,
1164                 .comp_mask =
1165                         IBV_WQ_INIT_ATTR_FLAGS |
1166                         0,
1167                 .create_flags = (rxq_data->vlan_strip ?
1168                                  IBV_WQ_FLAGS_CVLAN_STRIPPING :
1169                                  0),
1170         };
1171         /* By default, FCS (CRC) is stripped by hardware. */
1172         if (rxq_data->crc_present) {
1173                 attr.wq.create_flags |= IBV_WQ_FLAGS_SCATTER_FCS;
1174                 attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
1175         }
1176 #ifdef HAVE_IBV_WQ_FLAG_RX_END_PADDING
1177         if (priv->hw_padding) {
1178                 attr.wq.create_flags |= IBV_WQ_FLAG_RX_END_PADDING;
1179                 attr.wq.comp_mask |= IBV_WQ_INIT_ATTR_FLAGS;
1180         }
1181 #endif
1182         tmpl->wq = ibv_create_wq(priv->ctx, &attr.wq);
1183         if (tmpl->wq == NULL) {
1184                 ERROR("%p: WQ creation failure", (void *)rxq_ctrl);
1185                 goto error;
1186         }
1187         /*
1188          * Make sure the number of WRs*SGEs matches expectations since a
1189          * queue cannot allocate more than "desc" buffers.
1190          */
1191         if (((int)attr.wq.max_wr !=
1192              ((1 << rxq_data->elts_n) >> rxq_data->sges_n)) ||
1193             ((int)attr.wq.max_sge != (1 << rxq_data->sges_n))) {
1194                 ERROR("%p: requested %u*%u but got %u*%u WRs*SGEs",
1195                       (void *)rxq_ctrl,
1196                       ((1 << rxq_data->elts_n) >> rxq_data->sges_n),
1197                       (1 << rxq_data->sges_n),
1198                       attr.wq.max_wr, attr.wq.max_sge);
1199                 goto error;
1200         }
1201         /* Change queue state to ready. */
1202         mod = (struct ibv_wq_attr){
1203                 .attr_mask = IBV_WQ_ATTR_STATE,
1204                 .wq_state = IBV_WQS_RDY,
1205         };
1206         ret = ibv_modify_wq(tmpl->wq, &mod);
1207         if (ret) {
1208                 ERROR("%p: WQ state to IBV_WQS_RDY failed",
1209                       (void *)rxq_ctrl);
1210                 goto error;
1211         }
1212         obj.cq.in = tmpl->cq;
1213         obj.cq.out = &cq_info;
1214         obj.rwq.in = tmpl->wq;
1215         obj.rwq.out = &rwq;
1216         ret = mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ | MLX5DV_OBJ_RWQ);
1217         if (ret != 0)
1218                 goto error;
1219         if (cq_info.cqe_size != RTE_CACHE_LINE_SIZE) {
1220                 ERROR("Wrong MLX5_CQE_SIZE environment variable value: "
1221                       "it should be set to %u", RTE_CACHE_LINE_SIZE);
1222                 goto error;
1223         }
1224         /* Fill the rings. */
1225         rxq_data->wqes = (volatile struct mlx5_wqe_data_seg (*)[])
1226                 (uintptr_t)rwq.buf;
1227         for (i = 0; (i != (unsigned int)(1 << rxq_data->elts_n)); ++i) {
1228                 struct rte_mbuf *buf = (*rxq_data->elts)[i];
1229                 volatile struct mlx5_wqe_data_seg *scat = &(*rxq_data->wqes)[i];
1230
1231                 /* scat->addr must be able to store a pointer. */
1232                 assert(sizeof(scat->addr) >= sizeof(uintptr_t));
1233                 *scat = (struct mlx5_wqe_data_seg){
1234                         .addr = rte_cpu_to_be_64(rte_pktmbuf_mtod(buf,
1235                                                                   uintptr_t)),
1236                         .byte_count = rte_cpu_to_be_32(DATA_LEN(buf)),
1237                         .lkey = tmpl->mr->lkey,
1238                 };
1239         }
1240         rxq_data->rq_db = rwq.dbrec;
1241         rxq_data->cqe_n = log2above(cq_info.cqe_cnt);
1242         rxq_data->cq_ci = 0;
1243         rxq_data->rq_ci = 0;
1244         rxq_data->rq_pi = 0;
1245         rxq_data->zip = (struct rxq_zip){
1246                 .ai = 0,
1247         };
1248         rxq_data->cq_db = cq_info.dbrec;
1249         rxq_data->cqes = (volatile struct mlx5_cqe (*)[])(uintptr_t)cq_info.buf;
1250         /* Update doorbell counter. */
1251         rxq_data->rq_ci = (1 << rxq_data->elts_n) >> rxq_data->sges_n;
1252         rte_wmb();
1253         *rxq_data->rq_db = rte_cpu_to_be_32(rxq_data->rq_ci);
1254         DEBUG("%p: rxq updated with %p", (void *)rxq_ctrl, (void *)tmpl);
1255         rte_atomic32_inc(&tmpl->refcnt);
1256         DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
1257               (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
1258         LIST_INSERT_HEAD(&priv->rxqsibv, tmpl, next);
1259         return tmpl;
1260 error:
1261         if (tmpl->wq)
1262                 claim_zero(ibv_destroy_wq(tmpl->wq));
1263         if (tmpl->cq)
1264                 claim_zero(ibv_destroy_cq(tmpl->cq));
1265         if (tmpl->channel)
1266                 claim_zero(ibv_destroy_comp_channel(tmpl->channel));
1267         if (tmpl->mr)
1268                 priv_mr_release(priv, tmpl->mr);
1269         return NULL;
1270 }
1271
1272 /**
1273  * Get an Rx queue Verbs object.
1274  *
1275  * @param priv
1276  *   Pointer to private structure.
1277  * @param idx
1278  *   Queue index in DPDK Rx queue array
1279  *
1280  * @return
1281  *   The Verbs object if it exists.
1282  */
1283 struct mlx5_rxq_ibv*
1284 mlx5_priv_rxq_ibv_get(struct priv *priv, uint16_t idx)
1285 {
1286         struct mlx5_rxq_data *rxq_data = (*priv->rxqs)[idx];
1287         struct mlx5_rxq_ctrl *rxq_ctrl;
1288
1289         if (idx >= priv->rxqs_n)
1290                 return NULL;
1291         if (!rxq_data)
1292                 return NULL;
1293         rxq_ctrl = container_of(rxq_data, struct mlx5_rxq_ctrl, rxq);
1294         if (rxq_ctrl->ibv) {
1295                 priv_mr_get(priv, rxq_data->mp);
1296                 rte_atomic32_inc(&rxq_ctrl->ibv->refcnt);
1297                 DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
1298                       (void *)rxq_ctrl->ibv,
1299                       rte_atomic32_read(&rxq_ctrl->ibv->refcnt));
1300         }
1301         return rxq_ctrl->ibv;
1302 }
1303
1304 /**
1305  * Release an Rx verbs queue object.
1306  *
1307  * @param priv
1308  *   Pointer to private structure.
1309  * @param rxq_ibv
1310  *   Verbs Rx queue object.
1311  *
1312  * @return
1313  *   0 on success, errno value on failure.
1314  */
1315 int
1316 mlx5_priv_rxq_ibv_release(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
1317 {
1318         int ret;
1319
1320         assert(rxq_ibv);
1321         assert(rxq_ibv->wq);
1322         assert(rxq_ibv->cq);
1323         assert(rxq_ibv->mr);
1324         ret = priv_mr_release(priv, rxq_ibv->mr);
1325         if (!ret)
1326                 rxq_ibv->mr = NULL;
1327         DEBUG("%p: Verbs Rx queue %p: refcnt %d", (void *)priv,
1328               (void *)rxq_ibv, rte_atomic32_read(&rxq_ibv->refcnt));
1329         if (rte_atomic32_dec_and_test(&rxq_ibv->refcnt)) {
1330                 rxq_free_elts(rxq_ibv->rxq_ctrl);
1331                 claim_zero(ibv_destroy_wq(rxq_ibv->wq));
1332                 claim_zero(ibv_destroy_cq(rxq_ibv->cq));
1333                 if (rxq_ibv->channel)
1334                         claim_zero(ibv_destroy_comp_channel(rxq_ibv->channel));
1335                 LIST_REMOVE(rxq_ibv, next);
1336                 rte_free(rxq_ibv);
1337                 return 0;
1338         }
1339         return EBUSY;
1340 }
1341
1342 /**
1343  * Verify that the Verbs Rx queue list is empty.
1344  *
1345  * @param priv
1346  *   Pointer to private structure.
1347  *
1348  * @return The number of objects not released.
1349  */
1350 int
1351 mlx5_priv_rxq_ibv_verify(struct priv *priv)
1352 {
1353         int ret = 0;
1354         struct mlx5_rxq_ibv *rxq_ibv;
1355
1356         LIST_FOREACH(rxq_ibv, &priv->rxqsibv, next) {
1357                 DEBUG("%p: Verbs Rx queue %p still referenced", (void *)priv,
1358                       (void *)rxq_ibv);
1359                 ++ret;
1360         }
1361         return ret;
1362 }
1363
1364 /**
1365  * Return true if a single reference exists on the object.
1366  *
1367  * @param priv
1368  *   Pointer to private structure.
1369  * @param rxq_ibv
1370  *   Verbs Rx queue object.
1371  */
1372 int
1373 mlx5_priv_rxq_ibv_releasable(struct priv *priv, struct mlx5_rxq_ibv *rxq_ibv)
1374 {
1375         (void)priv;
1376         assert(rxq_ibv);
1377         return (rte_atomic32_read(&rxq_ibv->refcnt) == 1);
1378 }
1379
1380 /**
1381  * Create a DPDK Rx queue.
1382  *
1383  * @param priv
1384  *   Pointer to private structure.
1385  * @param idx
1386  *   RX queue index.
1387  * @param desc
1388  *   Number of descriptors to configure in queue.
1389  * @param socket
1390  *   NUMA socket on which memory must be allocated.
1391  *
1392  * @return
1393  *   A DPDK queue object on success.
1394  */
1395 struct mlx5_rxq_ctrl*
1396 mlx5_priv_rxq_new(struct priv *priv, uint16_t idx, uint16_t desc,
1397                   unsigned int socket, struct rte_mempool *mp)
1398 {
1399         struct rte_eth_dev *dev = priv->dev;
1400         struct mlx5_rxq_ctrl *tmpl;
1401         const uint16_t desc_n =
1402                 desc + priv->rx_vec_en * MLX5_VPMD_DESCS_PER_LOOP;
1403         unsigned int mb_len = rte_pktmbuf_data_room_size(mp);
1404
1405         tmpl = rte_calloc_socket("RXQ", 1,
1406                                  sizeof(*tmpl) +
1407                                  desc_n * sizeof(struct rte_mbuf *),
1408                                  0, socket);
1409         if (!tmpl)
1410                 return NULL;
1411         if (priv->dev->data->dev_conf.intr_conf.rxq)
1412                 tmpl->irq = 1;
1413         /* Enable scattered packets support for this queue if necessary. */
1414         assert(mb_len >= RTE_PKTMBUF_HEADROOM);
1415         if (dev->data->dev_conf.rxmode.max_rx_pkt_len <=
1416             (mb_len - RTE_PKTMBUF_HEADROOM)) {
1417                 tmpl->rxq.sges_n = 0;
1418         } else if (dev->data->dev_conf.rxmode.enable_scatter) {
1419                 unsigned int size =
1420                         RTE_PKTMBUF_HEADROOM +
1421                         dev->data->dev_conf.rxmode.max_rx_pkt_len;
1422                 unsigned int sges_n;
1423
1424                 /*
1425                  * Determine the number of SGEs needed for a full packet
1426                  * and round it to the next power of two.
1427                  */
1428                 sges_n = log2above((size / mb_len) + !!(size % mb_len));
1429                 tmpl->rxq.sges_n = sges_n;
1430                 /* Make sure rxq.sges_n did not overflow. */
1431                 size = mb_len * (1 << tmpl->rxq.sges_n);
1432                 size -= RTE_PKTMBUF_HEADROOM;
1433                 if (size < dev->data->dev_conf.rxmode.max_rx_pkt_len) {
1434                         ERROR("%p: too many SGEs (%u) needed to handle"
1435                               " requested maximum packet size %u",
1436                               (void *)dev,
1437                               1 << sges_n,
1438                               dev->data->dev_conf.rxmode.max_rx_pkt_len);
1439                         goto error;
1440                 }
1441         } else {
1442                 WARN("%p: the requested maximum Rx packet size (%u) is"
1443                      " larger than a single mbuf (%u) and scattered"
1444                      " mode has not been requested",
1445                      (void *)dev,
1446                      dev->data->dev_conf.rxmode.max_rx_pkt_len,
1447                      mb_len - RTE_PKTMBUF_HEADROOM);
1448         }
1449         DEBUG("%p: maximum number of segments per packet: %u",
1450               (void *)dev, 1 << tmpl->rxq.sges_n);
1451         if (desc % (1 << tmpl->rxq.sges_n)) {
1452                 ERROR("%p: number of RX queue descriptors (%u) is not a"
1453                       " multiple of SGEs per packet (%u)",
1454                       (void *)dev,
1455                       desc,
1456                       1 << tmpl->rxq.sges_n);
1457                 goto error;
1458         }
1459         /* Toggle RX checksum offload if hardware supports it. */
1460         if (priv->hw_csum)
1461                 tmpl->rxq.csum = !!dev->data->dev_conf.rxmode.hw_ip_checksum;
1462         if (priv->hw_csum_l2tun)
1463                 tmpl->rxq.csum_l2tun =
1464                         !!dev->data->dev_conf.rxmode.hw_ip_checksum;
1465         /* Configure VLAN stripping. */
1466         tmpl->rxq.vlan_strip = (priv->hw_vlan_strip &&
1467                                !!dev->data->dev_conf.rxmode.hw_vlan_strip);
1468         /* By default, FCS (CRC) is stripped by hardware. */
1469         if (dev->data->dev_conf.rxmode.hw_strip_crc) {
1470                 tmpl->rxq.crc_present = 0;
1471         } else if (priv->hw_fcs_strip) {
1472                 tmpl->rxq.crc_present = 1;
1473         } else {
1474                 WARN("%p: CRC stripping has been disabled but will still"
1475                      " be performed by hardware, make sure MLNX_OFED and"
1476                      " firmware are up to date",
1477                      (void *)dev);
1478                 tmpl->rxq.crc_present = 0;
1479         }
1480         DEBUG("%p: CRC stripping is %s, %u bytes will be subtracted from"
1481               " incoming frames to hide it",
1482               (void *)dev,
1483               tmpl->rxq.crc_present ? "disabled" : "enabled",
1484               tmpl->rxq.crc_present << 2);
1485         /* Only report the RSS hash result when multiple Rx queues are in use. */
1486         tmpl->rxq.rss_hash = priv->rxqs_n > 1;
1487         tmpl->rxq.port_id = dev->data->port_id;
1488         tmpl->priv = priv;
1489         tmpl->rxq.mp = mp;
1490         tmpl->rxq.stats.idx = idx;
1491         tmpl->rxq.elts_n = log2above(desc);
1492         tmpl->rxq.elts =
1493                 (struct rte_mbuf *(*)[1 << tmpl->rxq.elts_n])(tmpl + 1);
1494         rte_atomic32_inc(&tmpl->refcnt);
1495         DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
1496               (void *)tmpl, rte_atomic32_read(&tmpl->refcnt));
1497         LIST_INSERT_HEAD(&priv->rxqsctrl, tmpl, next);
1498         return tmpl;
1499 error:
1500         rte_free(tmpl);
1501         return NULL;
1502 }
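/*
 * Illustrative sketch, not part of the driver: how a hypothetical
 * rx_queue_setup callback could use mlx5_priv_rxq_new().  The "dev", "idx",
 * "desc", "socket" and "mp" names are assumed to come from the callback
 * arguments; (*priv->rxqs)[idx] is the per-port queue array used throughout
 * this file.
 *
 *	struct priv *priv = dev->data->dev_private;
 *	struct mlx5_rxq_ctrl *rxq_ctrl;
 *
 *	rxq_ctrl = mlx5_priv_rxq_new(priv, idx, desc, socket, mp);
 *	if (!rxq_ctrl)
 *		return -ENOMEM;
 *	(*priv->rxqs)[idx] = &rxq_ctrl->rxq;
 *	return 0;
 */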
1503
1504 /**
1505  * Get an Rx queue.
1506  *
1507  * @param priv
1508  *   Pointer to private structure.
1509  * @param idx
1510  *   RX queue index.
1511  *
1512  * @return
1513  *   A pointer to the queue if it exists.
1514  */
1515 struct mlx5_rxq_ctrl*
1516 mlx5_priv_rxq_get(struct priv *priv, uint16_t idx)
1517 {
1518         struct mlx5_rxq_ctrl *rxq_ctrl = NULL;
1519
1520         if ((*priv->rxqs)[idx]) {
1521                 rxq_ctrl = container_of((*priv->rxqs)[idx],
1522                                         struct mlx5_rxq_ctrl,
1523                                         rxq);
1524
1525                 mlx5_priv_rxq_ibv_get(priv, idx);
1526                 rte_atomic32_inc(&rxq_ctrl->refcnt);
1527                 DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
1528                       (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
1529         }
1530         return rxq_ctrl;
1531 }
1532
1533 /**
1534  * Release an Rx queue.
1535  *
1536  * @param priv
1537  *   Pointer to private structure.
1538  * @param idx
1539  *   RX queue index.
1540  *
1541  * @return
1542  *   0 on success, errno value on failure.
1543  */
1544 int
1545 mlx5_priv_rxq_release(struct priv *priv, uint16_t idx)
1546 {
1547         struct mlx5_rxq_ctrl *rxq_ctrl;
1548
1549         if (!(*priv->rxqs)[idx])
1550                 return 0;
1551         rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1552         assert(rxq_ctrl->priv);
1553         if (rxq_ctrl->ibv) {
1554                 int ret;
1555
1556                 ret = mlx5_priv_rxq_ibv_release(rxq_ctrl->priv, rxq_ctrl->ibv);
1557                 if (!ret)
1558                         rxq_ctrl->ibv = NULL;
1559         }
1560         DEBUG("%p: Rx queue %p: refcnt %d", (void *)priv,
1561               (void *)rxq_ctrl, rte_atomic32_read(&rxq_ctrl->refcnt));
1562         if (rte_atomic32_dec_and_test(&rxq_ctrl->refcnt)) {
1563                 LIST_REMOVE(rxq_ctrl, next);
1564                 rte_free(rxq_ctrl);
1565                 (*priv->rxqs)[idx] = NULL;
1566                 return 0;
1567         }
1568         return EBUSY;
1569 }
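/*
 * Illustrative sketch, not part of the driver: the reference-counting
 * contract of the control structure.  Every successful mlx5_priv_rxq_get()
 * must be balanced by mlx5_priv_rxq_release(); an EBUSY return only means
 * other users (e.g. the Verbs objects) still hold references.
 *
 *	struct mlx5_rxq_ctrl *rxq_ctrl = mlx5_priv_rxq_get(priv, idx);
 *
 *	if (rxq_ctrl) {
 *		// ... use rxq_ctrl ...
 *		if (mlx5_priv_rxq_release(priv, idx) == EBUSY)
 *			DEBUG("%p: Rx queue %u still referenced",
 *			      (void *)priv, idx);
 *	}
 */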
1570
1571 /**
1572  * Verify if the queue can be released.
1573  *
1574  * @param priv
1575  *   Pointer to private structure.
1576  * @param idx
1577  *   RX queue index.
1578  *
1579  * @return
1580  *   1 if the queue can be released.
1581  */
1582 int
1583 mlx5_priv_rxq_releasable(struct priv *priv, uint16_t idx)
1584 {
1585         struct mlx5_rxq_ctrl *rxq_ctrl;
1586
1587         if (!(*priv->rxqs)[idx])
1588                 return -1;
1589         rxq_ctrl = container_of((*priv->rxqs)[idx], struct mlx5_rxq_ctrl, rxq);
1590         return (rte_atomic32_read(&rxq_ctrl->refcnt) == 1);
1591 }
1592
1593 /**
1594  * Verify the Rx Queue list is empty
1595  *
1596  * @param priv
1597  *  Pointer to private structure.
1598  *
1599  * @return the number of objects not released.
1600  */
1601 int
1602 mlx5_priv_rxq_verify(struct priv *priv)
1603 {
1604         struct mlx5_rxq_ctrl *rxq_ctrl;
1605         int ret = 0;
1606
1607         LIST_FOREACH(rxq_ctrl, &priv->rxqsctrl, next) {
1608                 DEBUG("%p: Rx Queue %p still referenced", (void *)priv,
1609                       (void *)rxq_ctrl);
1610                 ++ret;
1611         }
1612         return ret;
1613 }
1614
1615 /**
1616  * Create an indirection table.
1617  *
1618  * @param priv
1619  *   Pointer to private structure.
1620  * @param queues
1621  *   Queues entering in the indirection table.
1622  * @param queues_n
1623  *   Number of queues in the array.
1624  *
1625  * @return
1626  *   A new indirection table.
1627  */
1628 struct mlx5_ind_table_ibv*
1629 mlx5_priv_ind_table_ibv_new(struct priv *priv, uint16_t queues[],
1630                             uint16_t queues_n)
1631 {
1632         struct mlx5_ind_table_ibv *ind_tbl;
1633         const unsigned int wq_n = rte_is_power_of_2(queues_n) ?
1634                 log2above(queues_n) :
1635                 log2above(priv->ind_table_max_size);
1636         struct ibv_wq *wq[1 << wq_n];
1637         unsigned int i;
1638         unsigned int j;
1639
1640         ind_tbl = rte_calloc(__func__, 1, sizeof(*ind_tbl) +
1641                              queues_n * sizeof(uint16_t), 0);
1642         if (!ind_tbl)
1643                 return NULL;
1644         for (i = 0; i != queues_n; ++i) {
1645                 struct mlx5_rxq_ctrl *rxq =
1646                         mlx5_priv_rxq_get(priv, queues[i]);
1647
1648                 if (!rxq)
1649                         goto error;
1650                 wq[i] = rxq->ibv->wq;
1651                 ind_tbl->queues[i] = queues[i];
1652         }
1653         ind_tbl->queues_n = queues_n;
1654         /* Finalise the indirection table: replicate the WQs cyclically to fill every entry. */
1655         for (j = 0; i != (unsigned int)(1 << wq_n); ++i, ++j)
1656                 wq[i] = wq[j];
1657         ind_tbl->ind_table = ibv_create_rwq_ind_table(
1658                 priv->ctx,
1659                 &(struct ibv_rwq_ind_table_init_attr){
1660                         .log_ind_tbl_size = wq_n,
1661                         .ind_tbl = wq,
1662                         .comp_mask = 0,
1663                 });
1664         if (!ind_tbl->ind_table)
1665                 goto error;
1666         rte_atomic32_inc(&ind_tbl->refcnt);
1667         LIST_INSERT_HEAD(&priv->ind_tbls, ind_tbl, next);
1668         DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
1669               (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
1670         return ind_tbl;
1671 error:
1672         rte_free(ind_tbl);
1673         DEBUG("%p cannot create indirection table", (void *)priv);
1674         return NULL;
1675 }
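/*
 * Worked example, assuming an indirection table maximum of 8 entries for
 * illustration only: with queues[] = {0, 1, 2} (not a power of two), the
 * replication loop above produces the table
 *
 *	WQ0 WQ1 WQ2 WQ0 WQ1 WQ2 WQ0 WQ1
 *
 * while a power-of-two count such as four queues yields a table of exactly
 * four entries, one per WQ.
 */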
1676
1677 /**
1678  * Get an indirection table.
1679  *
1680  * @param priv
1681  *   Pointer to private structure.
1682  * @param queues
1683  *   Queues entering in the indirection table.
1684  * @param queues_n
1685  *   Number of queues in the array.
1686  *
1687  * @return
1688  *   An indirection table if found.
1689  */
1690 struct mlx5_ind_table_ibv*
1691 mlx5_priv_ind_table_ibv_get(struct priv *priv, uint16_t queues[],
1692                             uint16_t queues_n)
1693 {
1694         struct mlx5_ind_table_ibv *ind_tbl;
1695
1696         LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1697                 if ((ind_tbl->queues_n == queues_n) &&
1698                     (memcmp(ind_tbl->queues, queues,
1699                             ind_tbl->queues_n * sizeof(ind_tbl->queues[0]))
1700                      == 0))
1701                         break;
1702         }
1703         if (ind_tbl) {
1704                 unsigned int i;
1705
1706                 rte_atomic32_inc(&ind_tbl->refcnt);
1707                 DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
1708                       (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
1709                 for (i = 0; i != ind_tbl->queues_n; ++i)
1710                         mlx5_priv_rxq_get(priv, ind_tbl->queues[i]);
1711         }
1712         return ind_tbl;
1713 }
1714
1715 /**
1716  * Release an indirection table.
1717  *
1718  * @param priv
1719  *   Pointer to private structure.
1720  * @param ind_table
1721  *   Indirection table to release.
1722  *
1723  * @return
1724  *   0 on success, errno value on failure.
1725  */
1726 int
1727 mlx5_priv_ind_table_ibv_release(struct priv *priv,
1728                                 struct mlx5_ind_table_ibv *ind_tbl)
1729 {
1730         unsigned int i;
1731
1732         DEBUG("%p: Indirection table %p: refcnt %d", (void *)priv,
1733               (void *)ind_tbl, rte_atomic32_read(&ind_tbl->refcnt));
1734         if (rte_atomic32_dec_and_test(&ind_tbl->refcnt))
1735                 claim_zero(ibv_destroy_rwq_ind_table(ind_tbl->ind_table));
1736         for (i = 0; i != ind_tbl->queues_n; ++i)
1737                 claim_nonzero(mlx5_priv_rxq_release(priv, ind_tbl->queues[i]));
1738         if (!rte_atomic32_read(&ind_tbl->refcnt)) {
1739                 LIST_REMOVE(ind_tbl, next);
1740                 rte_free(ind_tbl);
1741                 return 0;
1742         }
1743         return EBUSY;
1744 }
1745
1746 /**
1747  * Verify the Verbs indirection table list is empty
1748  *
1749  * @param priv
1750  *  Pointer to private structure.
1751  *
1752  * @return the number of objects not released.
1753  */
1754 int
1755 mlx5_priv_ind_table_ibv_verify(struct priv *priv)
1756 {
1757         struct mlx5_ind_table_ibv *ind_tbl;
1758         int ret = 0;
1759
1760         LIST_FOREACH(ind_tbl, &priv->ind_tbls, next) {
1761                 DEBUG("%p: Verbs indirection table %p still referenced",
1762                       (void *)priv, (void *)ind_tbl);
1763                 ++ret;
1764         }
1765         return ret;
1766 }
1767
1768 /**
1769  * Create an Rx Hash queue.
1770  *
1771  * @param priv
1772  *   Pointer to private structure.
1773  * @param rss_key
1774  *   RSS key for the Rx hash queue.
1775  * @param rss_key_len
1776  *   RSS key length.
1777  * @param hash_fields
1778  *   Verbs protocol hash field to make the RSS on.
1779  * @param queues
1780  *   Queues entering in hash queue.
1781  * @param queues_n
1782  *   Number of queues.
1783  *
1784  * @return
1785  *   A hash Rx queue on success.
1786  */
1787 struct mlx5_hrxq*
1788 mlx5_priv_hrxq_new(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
1789                    uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
1790 {
1791         struct mlx5_hrxq *hrxq;
1792         struct mlx5_ind_table_ibv *ind_tbl;
1793         struct ibv_qp *qp;
1794
1795         ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
1796         if (!ind_tbl)
1797                 ind_tbl = mlx5_priv_ind_table_ibv_new(priv, queues, queues_n);
1798         if (!ind_tbl)
1799                 return NULL;
1800         qp = ibv_create_qp_ex(
1801                 priv->ctx,
1802                 &(struct ibv_qp_init_attr_ex){
1803                         .qp_type = IBV_QPT_RAW_PACKET,
1804                         .comp_mask =
1805                                 IBV_QP_INIT_ATTR_PD |
1806                                 IBV_QP_INIT_ATTR_IND_TABLE |
1807                                 IBV_QP_INIT_ATTR_RX_HASH,
1808                         .rx_hash_conf = (struct ibv_rx_hash_conf){
1809                                 .rx_hash_function = IBV_RX_HASH_FUNC_TOEPLITZ,
1810                                 .rx_hash_key_len = rss_key_len,
1811                                 .rx_hash_key = rss_key,
1812                                 .rx_hash_fields_mask = hash_fields,
1813                         },
1814                         .rwq_ind_tbl = ind_tbl->ind_table,
1815                         .pd = priv->pd,
1816                 });
1817         if (!qp)
1818                 goto error;
1819         hrxq = rte_calloc(__func__, 1, sizeof(*hrxq) + rss_key_len, 0);
1820         if (!hrxq)
1821                 goto error;
1822         hrxq->ind_table = ind_tbl;
1823         hrxq->qp = qp;
1824         hrxq->rss_key_len = rss_key_len;
1825         hrxq->hash_fields = hash_fields;
1826         memcpy(hrxq->rss_key, rss_key, rss_key_len);
1827         rte_atomic32_inc(&hrxq->refcnt);
1828         LIST_INSERT_HEAD(&priv->hrxqs, hrxq, next);
1829         DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
1830               (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
1831         return hrxq;
1832 error:
1833         mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
1834         if (qp)
1835                 claim_zero(ibv_destroy_qp(qp));
1836         return NULL;
1837 }
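/*
 * Illustrative sketch, not part of the driver: how a consumer (e.g. the flow
 * layer) might obtain a hash Rx queue, reusing an existing one when the RSS
 * key, hash fields and queue list match, and dropping it again with
 * mlx5_priv_hrxq_release() once it is no longer needed.
 *
 *	struct mlx5_hrxq *hrxq;
 *
 *	hrxq = mlx5_priv_hrxq_get(priv, rss_key, rss_key_len,
 *				  hash_fields, queues, queues_n);
 *	if (!hrxq)
 *		hrxq = mlx5_priv_hrxq_new(priv, rss_key, rss_key_len,
 *					  hash_fields, queues, queues_n);
 *	if (!hrxq)
 *		return ENOMEM;
 */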
1838
1839 /**
1840  * Get an Rx Hash queue.
1841  *
1842  * @param priv
1843  *   Pointer to private structure.
1844  * @param rss_key
 *   RSS key for the Rx hash queue.
 * @param rss_key_len
 *   RSS key length.
 * @param hash_fields
 *   Verbs protocol hash field to make the RSS on.
1846  * @param queues
1847  *   Queues entering in hash queue.
1848  * @param queues_n
1849  *   Number of queues.
1850  *
1851  * @return
1852  *   A hash Rx queue on success.
1853  */
1854 struct mlx5_hrxq*
1855 mlx5_priv_hrxq_get(struct priv *priv, uint8_t *rss_key, uint8_t rss_key_len,
1856                    uint64_t hash_fields, uint16_t queues[], uint16_t queues_n)
1857 {
1858         struct mlx5_hrxq *hrxq;
1859
1860         LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1861                 struct mlx5_ind_table_ibv *ind_tbl;
1862
1863                 if (hrxq->rss_key_len != rss_key_len)
1864                         continue;
1865                 if (memcmp(hrxq->rss_key, rss_key, rss_key_len))
1866                         continue;
1867                 if (hrxq->hash_fields != hash_fields)
1868                         continue;
1869                 ind_tbl = mlx5_priv_ind_table_ibv_get(priv, queues, queues_n);
1870                 if (!ind_tbl)
1871                         continue;
1872                 if (ind_tbl != hrxq->ind_table) {
1873                         mlx5_priv_ind_table_ibv_release(priv, ind_tbl);
1874                         continue;
1875                 }
1876                 rte_atomic32_inc(&hrxq->refcnt);
1877                 DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
1878                       (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
1879                 return hrxq;
1880         }
1881         return NULL;
1882 }
1883
1884 /**
1885  * Release the hash Rx queue.
1886  *
1887  * @param priv
1888  *   Pointer to private structure.
1889  * @param hrxq
1890  *   Pointer to Hash Rx queue to release.
1891  *
1892  * @return
1893  *   0 on success, errno value on failure.
1894  */
1895 int
1896 mlx5_priv_hrxq_release(struct priv *priv, struct mlx5_hrxq *hrxq)
1897 {
1898         DEBUG("%p: Hash Rx queue %p: refcnt %d", (void *)priv,
1899               (void *)hrxq, rte_atomic32_read(&hrxq->refcnt));
1900         if (rte_atomic32_dec_and_test(&hrxq->refcnt)) {
1901                 claim_zero(ibv_destroy_qp(hrxq->qp));
1902                 mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table);
1903                 LIST_REMOVE(hrxq, next);
1904                 rte_free(hrxq);
1905                 return 0;
1906         }
1907         claim_nonzero(mlx5_priv_ind_table_ibv_release(priv, hrxq->ind_table));
1908         return EBUSY;
1909 }
1910
1911 /**
1912  * Verify the Verbs hash Rx queue list is empty
1913  *
1914  * @param priv
1915  *  Pointer to private structure.
1916  *
1917  * @return the number of objects not released.
1918  */
1919 int
1920 mlx5_priv_hrxq_ibv_verify(struct priv *priv)
1921 {
1922         struct mlx5_hrxq *hrxq;
1923         int ret = 0;
1924
1925         LIST_FOREACH(hrxq, &priv->hrxqs, next) {
1926                 DEBUG("%p: Verbs Hash Rx queue %p still referenced",
1927                       (void *)priv, (void *)hrxq);
1928                 ++ret;
1929         }
1930         return ret;
1931 }
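/*
 * Illustrative sketch, not part of the driver: the verify helpers above are
 * intended for device close time, once every flow, queue and indirection
 * table should have been released.  A hypothetical caller could simply sum
 * their return values to detect leaked objects.
 *
 *	int leaks = 0;
 *
 *	leaks += mlx5_priv_hrxq_ibv_verify(priv);
 *	leaks += mlx5_priv_ind_table_ibv_verify(priv);
 *	leaks += mlx5_priv_rxq_ibv_verify(priv);
 *	leaks += mlx5_priv_rxq_verify(priv);
 *	if (leaks)
 *		WARN("%p: %d Rx object(s) still referenced", (void *)priv,
 *		     leaks);
 */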