net/mlx5: manage modify actions with hashed list
drivers/net/mlx5/linux/mlx5_os.c (dpdk.git)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright 2015 6WIND S.A.
 * Copyright 2020 Mellanox Technologies, Ltd
 */

#include <stddef.h>
#include <unistd.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <errno.h>
#include <net/if.h>
#include <linux/rtnetlink.h>
#include <linux/sockios.h>
#include <linux/ethtool.h>
#include <fcntl.h>

#include <rte_malloc.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_pci.h>
#include <rte_pci.h>
#include <rte_bus_pci.h>
#include <rte_common.h>
#include <rte_kvargs.h>
#include <rte_rwlock.h>
#include <rte_spinlock.h>
#include <rte_string_fns.h>
#include <rte_alarm.h>
#include <rte_eal_paging.h>

#include <mlx5_glue.h>
#include <mlx5_devx_cmds.h>
#include <mlx5_common.h>
#include <mlx5_common_mp.h>
#include <mlx5_common_mr.h>
#include <mlx5_malloc.h>

#include "mlx5_defs.h"
#include "mlx5.h"
#include "mlx5_common_os.h"
#include "mlx5_utils.h"
#include "mlx5_rxtx.h"
#include "mlx5_autoconf.h"
#include "mlx5_mr.h"
#include "mlx5_flow.h"
#include "rte_pmd_mlx5.h"
#include "mlx5_verbs.h"

#define MLX5_TAGS_HLIST_ARRAY_SIZE 8192

#ifndef HAVE_IBV_MLX5_MOD_MPW
#define MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED (1 << 2)
#define MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW (1 << 3)
#endif

#ifndef HAVE_IBV_MLX5_MOD_CQE_128B_COMP
#define MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP (1 << 4)
#endif

static const char *MZ_MLX5_PMD_SHARED_DATA = "mlx5_pmd_shared_data";

/* Spinlock for mlx5_shared_data allocation. */
static rte_spinlock_t mlx5_shared_data_lock = RTE_SPINLOCK_INITIALIZER;

/* Process local data for secondary processes. */
static struct mlx5_local_data mlx5_local_data;

/**
 * Set the completion channel file descriptor interrupt as non-blocking.
 *
 * @param[in] fd
 *   The file descriptor (representing the interrupt) of the RQ completion
 *   channel to set as non-blocking.
 *
 * @return
 *   0 on successfully setting the fd to non-blocking, non-zero otherwise.
 */
int
mlx5_os_set_nonblock_channel_fd(int fd)
{
        int flags;

        flags = fcntl(fd, F_GETFL);
        return fcntl(fd, F_SETFL, flags | O_NONBLOCK);
}
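
/*
 * Usage sketch (illustrative only, the "rxq_obj" field name is an
 * assumption, not code from this file): the Rx interrupt path would
 * call this helper on the channel fd before polling it, e.g.:
 *
 *     if (mlx5_os_set_nonblock_channel_fd(rxq_obj->fd))
 *             DRV_LOG(WARNING, "failed to set channel fd non-blocking");
 */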

/**
 * Get mlx5 device attributes. The glue function query_device_ex() is called
 * with an out parameter of type 'struct ibv_device_attr_ex *'. The mlx5
 * device attributes are then filled in from that glue out parameter.
 *
 * @param ctx
 *   Pointer to ibv context.
 *
 * @param device_attr
 *   Pointer to mlx5 device attributes.
 *
 * @return
 *   0 on success, a non-zero error number otherwise.
 */
int
mlx5_os_get_dev_attr(void *ctx, struct mlx5_dev_attr *device_attr)
{
        int err;
        struct ibv_device_attr_ex attr_ex;
        memset(device_attr, 0, sizeof(*device_attr));
        err = mlx5_glue->query_device_ex(ctx, NULL, &attr_ex);
        if (err)
                return err;

        device_attr->device_cap_flags_ex = attr_ex.device_cap_flags_ex;
        device_attr->max_qp_wr = attr_ex.orig_attr.max_qp_wr;
        device_attr->max_sge = attr_ex.orig_attr.max_sge;
        device_attr->max_cq = attr_ex.orig_attr.max_cq;
        device_attr->max_qp = attr_ex.orig_attr.max_qp;
        device_attr->raw_packet_caps = attr_ex.raw_packet_caps;
        device_attr->max_rwq_indirection_table_size =
                attr_ex.rss_caps.max_rwq_indirection_table_size;
        device_attr->max_tso = attr_ex.tso_caps.max_tso;
        device_attr->tso_supported_qpts = attr_ex.tso_caps.supported_qpts;

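        /*
         * On top of the generic Verbs attributes above, query the
         * mlx5-specific (Direct Verbs) capabilities; both sets are merged
         * into the single mlx5_dev_attr structure used by the PMD.
         */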
        struct mlx5dv_context dv_attr = { .comp_mask = 0 };
        err = mlx5_glue->dv_query_device(ctx, &dv_attr);
        if (err)
                return err;

        device_attr->flags = dv_attr.flags;
        device_attr->comp_mask = dv_attr.comp_mask;
#ifdef HAVE_IBV_MLX5_MOD_SWP
        device_attr->sw_parsing_offloads =
                dv_attr.sw_parsing_caps.sw_parsing_offloads;
#endif
        device_attr->min_single_stride_log_num_of_bytes =
                dv_attr.striding_rq_caps.min_single_stride_log_num_of_bytes;
        device_attr->max_single_stride_log_num_of_bytes =
                dv_attr.striding_rq_caps.max_single_stride_log_num_of_bytes;
        device_attr->min_single_wqe_log_num_of_strides =
                dv_attr.striding_rq_caps.min_single_wqe_log_num_of_strides;
        device_attr->max_single_wqe_log_num_of_strides =
                dv_attr.striding_rq_caps.max_single_wqe_log_num_of_strides;
        device_attr->stride_supported_qpts =
                dv_attr.striding_rq_caps.supported_qpts;
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        device_attr->tunnel_offloads_caps = dv_attr.tunnel_offloads_caps;
#endif

        return err;
}

/**
 * Verbs callback to allocate memory. This function should allocate the
 * space according to the size provided, residing inside a huge page.
 * Please note that all allocations must respect the alignment from libmlx5
 * (i.e. currently rte_mem_page_size()).
 *
 * @param[in] size
 *   The size in bytes of the memory to allocate.
 * @param[in] data
 *   A pointer to the callback data.
 *
 * @return
 *   Allocated buffer, NULL otherwise and rte_errno is set.
 */
static void *
mlx5_alloc_verbs_buf(size_t size, void *data)
{
        struct mlx5_priv *priv = data;
        void *ret;
        unsigned int socket = SOCKET_ID_ANY;
        size_t alignment = rte_mem_page_size();
        if (alignment == (size_t)-1) {
                DRV_LOG(ERR, "Failed to get mem page size");
                rte_errno = ENOMEM;
                return NULL;
        }

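        /*
         * Resolve the NUMA socket of the queue being created, so that the
         * buffer handed back to libmlx5 is allocated local to that queue.
         */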
        if (priv->verbs_alloc_ctx.type == MLX5_VERBS_ALLOC_TYPE_TX_QUEUE) {
                const struct mlx5_txq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

                socket = ctrl->socket;
        } else if (priv->verbs_alloc_ctx.type ==
                   MLX5_VERBS_ALLOC_TYPE_RX_QUEUE) {
                const struct mlx5_rxq_ctrl *ctrl = priv->verbs_alloc_ctx.obj;

                socket = ctrl->socket;
        }
        MLX5_ASSERT(data != NULL);
        ret = mlx5_malloc(0, size, alignment, socket);
        if (!ret && size)
                rte_errno = ENOMEM;
        return ret;
}

/**
 * Verbs callback to free memory.
 *
 * @param[in] ptr
 *   A pointer to the memory to free.
 * @param[in] data
 *   A pointer to the callback data.
 */
static void
mlx5_free_verbs_buf(void *ptr, void *data __rte_unused)
{
        MLX5_ASSERT(data != NULL);
        mlx5_free(ptr);
}

/**
 * Initialize DR related data within private structure.
 * The routine checks the reference counter and performs the actual
 * resource creation/initialization only if the counter is zero.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 *
 * @return
 *   Zero on success, positive error code otherwise.
 */
static int
mlx5_alloc_shared_dr(struct mlx5_priv *priv)
{
        struct mlx5_dev_ctx_shared *sh = priv->sh;
        char s[MLX5_HLIST_NAMESIZE];
        int err = 0;

        if (!sh->flow_tbls)
                err = mlx5_alloc_table_hash_list(priv);
        else
                DRV_LOG(DEBUG, "sh->flow_tbls[%p] already created, reuse",
                        (void *)sh->flow_tbls);
        if (err)
                return err;
        /* Create tags hash list table. */
        snprintf(s, sizeof(s), "%s_tags", sh->ibdev_name);
        sh->tag_table = mlx5_hlist_create(s, MLX5_TAGS_HLIST_ARRAY_SIZE);
        if (!sh->tag_table) {
                DRV_LOG(ERR, "tags hash list creation failed.");
                err = ENOMEM;
                goto error;
        }
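        /*
         * Modify-header actions are kept in this hashed list so that
         * identical modify commands created for different flows can be
         * looked up and reused instead of being allocated per flow; this
         * is the "manage modify actions with hashed list" change.
         */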
        snprintf(s, sizeof(s), "%s_hdr_modify", sh->ibdev_name);
        sh->modify_cmds = mlx5_hlist_create(s, MLX5_FLOW_HDR_MODIFY_HTABLE_SZ);
        if (!sh->modify_cmds) {
                DRV_LOG(ERR, "hdr modify hash list creation failed");
                err = ENOMEM;
                goto error;
        }
#ifdef HAVE_MLX5DV_DR
        void *domain;

        if (sh->dv_refcnt) {
                /* Shared DV/DR structures are already initialized. */
                sh->dv_refcnt++;
                priv->dr_shared = 1;
                return 0;
        }
        /* Reference counter is zero, we should initialize structures. */
        domain = mlx5_glue->dr_create_domain(sh->ctx,
                                             MLX5DV_DR_DOMAIN_TYPE_NIC_RX);
        if (!domain) {
                DRV_LOG(ERR, "ingress mlx5dv_dr_create_domain failed");
                err = errno;
                goto error;
        }
        sh->rx_domain = domain;
        domain = mlx5_glue->dr_create_domain(sh->ctx,
                                             MLX5DV_DR_DOMAIN_TYPE_NIC_TX);
        if (!domain) {
                DRV_LOG(ERR, "egress mlx5dv_dr_create_domain failed");
                err = errno;
                goto error;
        }
        pthread_mutex_init(&sh->dv_mutex, NULL);
        sh->tx_domain = domain;
#ifdef HAVE_MLX5DV_DR_ESWITCH
        if (priv->config.dv_esw_en) {
                domain = mlx5_glue->dr_create_domain
                        (sh->ctx, MLX5DV_DR_DOMAIN_TYPE_FDB);
                if (!domain) {
                        DRV_LOG(ERR, "FDB mlx5dv_dr_create_domain failed");
                        err = errno;
                        goto error;
                }
                sh->fdb_domain = domain;
                sh->esw_drop_action = mlx5_glue->dr_create_flow_action_drop();
        }
#endif
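        /*
         * In aggressive reclaim mode, ask rdma-core to return rule memory
         * to the system as soon as rules are destroyed, trading some flow
         * insertion performance for a smaller memory footprint.
         */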
        if (priv->config.reclaim_mode == MLX5_RCM_AGGR) {
                mlx5_glue->dr_reclaim_domain_memory(sh->rx_domain, 1);
                mlx5_glue->dr_reclaim_domain_memory(sh->tx_domain, 1);
                if (sh->fdb_domain)
                        mlx5_glue->dr_reclaim_domain_memory(sh->fdb_domain, 1);
        }
        sh->pop_vlan_action = mlx5_glue->dr_create_flow_action_pop_vlan();
#endif /* HAVE_MLX5DV_DR */
        sh->dv_refcnt++;
        priv->dr_shared = 1;
        return 0;
error:
        /* Rollback the created objects. */
        if (sh->rx_domain) {
                mlx5_glue->dr_destroy_domain(sh->rx_domain);
                sh->rx_domain = NULL;
        }
        if (sh->tx_domain) {
                mlx5_glue->dr_destroy_domain(sh->tx_domain);
                sh->tx_domain = NULL;
        }
        if (sh->fdb_domain) {
                mlx5_glue->dr_destroy_domain(sh->fdb_domain);
                sh->fdb_domain = NULL;
        }
        if (sh->esw_drop_action) {
                mlx5_glue->destroy_flow_action(sh->esw_drop_action);
                sh->esw_drop_action = NULL;
        }
        if (sh->pop_vlan_action) {
                mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
                sh->pop_vlan_action = NULL;
        }
        if (sh->modify_cmds) {
                mlx5_hlist_destroy(sh->modify_cmds, NULL, NULL);
                sh->modify_cmds = NULL;
        }
        if (sh->tag_table) {
                /* Tags should have been destroyed with the flows already. */
                mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
                sh->tag_table = NULL;
        }
        mlx5_free_table_hash_list(priv);
        return err;
}

/**
 * Destroy DR related data within private structure.
 *
 * @param[in] priv
 *   Pointer to the private device data structure.
 */
void
mlx5_os_free_shared_dr(struct mlx5_priv *priv)
{
        struct mlx5_dev_ctx_shared *sh;

        if (!priv->dr_shared)
                return;
        priv->dr_shared = 0;
        sh = priv->sh;
        MLX5_ASSERT(sh);
#ifdef HAVE_MLX5DV_DR
        MLX5_ASSERT(sh->dv_refcnt);
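        /* Only the last reference holder tears the shared objects down. */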
        if (sh->dv_refcnt && --sh->dv_refcnt)
                return;
        if (sh->rx_domain) {
                mlx5_glue->dr_destroy_domain(sh->rx_domain);
                sh->rx_domain = NULL;
        }
        if (sh->tx_domain) {
                mlx5_glue->dr_destroy_domain(sh->tx_domain);
                sh->tx_domain = NULL;
        }
#ifdef HAVE_MLX5DV_DR_ESWITCH
        if (sh->fdb_domain) {
                mlx5_glue->dr_destroy_domain(sh->fdb_domain);
                sh->fdb_domain = NULL;
        }
        if (sh->esw_drop_action) {
                mlx5_glue->destroy_flow_action(sh->esw_drop_action);
                sh->esw_drop_action = NULL;
        }
#endif
        if (sh->pop_vlan_action) {
                mlx5_glue->destroy_flow_action(sh->pop_vlan_action);
                sh->pop_vlan_action = NULL;
        }
        pthread_mutex_destroy(&sh->dv_mutex);
#endif /* HAVE_MLX5DV_DR */
        if (sh->modify_cmds) {
                mlx5_hlist_destroy(sh->modify_cmds, NULL, NULL);
                sh->modify_cmds = NULL;
        }
        if (sh->tag_table) {
                /* Tags should have been destroyed with the flows already. */
                mlx5_hlist_destroy(sh->tag_table, NULL, NULL);
                sh->tag_table = NULL;
        }
        mlx5_free_table_hash_list(priv);
}

/**
 * Initialize shared data between primary and secondary process.
 *
 * A memzone is reserved by the primary process and secondary processes
 * attach to that memzone.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_shared_data(void)
{
        const struct rte_memzone *mz;
        int ret = 0;

        rte_spinlock_lock(&mlx5_shared_data_lock);
        if (mlx5_shared_data == NULL) {
                if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
                        /* Allocate shared memory. */
                        mz = rte_memzone_reserve(MZ_MLX5_PMD_SHARED_DATA,
                                                 sizeof(*mlx5_shared_data),
                                                 SOCKET_ID_ANY, 0);
                        if (mz == NULL) {
                                DRV_LOG(ERR,
                                        "Cannot allocate mlx5 shared data");
                                ret = -rte_errno;
                                goto error;
                        }
                        mlx5_shared_data = mz->addr;
                        memset(mlx5_shared_data, 0, sizeof(*mlx5_shared_data));
                        rte_spinlock_init(&mlx5_shared_data->lock);
                } else {
                        /* Lookup allocated shared memory. */
                        mz = rte_memzone_lookup(MZ_MLX5_PMD_SHARED_DATA);
                        if (mz == NULL) {
                                DRV_LOG(ERR,
                                        "Cannot attach mlx5 shared data");
                                ret = -rte_errno;
                                goto error;
                        }
                        mlx5_shared_data = mz->addr;
                        memset(&mlx5_local_data, 0, sizeof(mlx5_local_data));
                }
        }
error:
        rte_spinlock_unlock(&mlx5_shared_data_lock);
        return ret;
}

/**
 * PMD global initialization.
 *
 * Independent of any individual device, this function initializes global
 * per-PMD data structures, distinguishing primary and secondary processes.
 * Hence, it is called once per process.
 *
 * @return
 *   0 on success, a negative errno value otherwise and rte_errno is set.
 */
static int
mlx5_init_once(void)
{
        struct mlx5_shared_data *sd;
        struct mlx5_local_data *ld = &mlx5_local_data;
        int ret = 0;

        if (mlx5_init_shared_data())
                return -rte_errno;
        sd = mlx5_shared_data;
        MLX5_ASSERT(sd);
        rte_spinlock_lock(&sd->lock);
        switch (rte_eal_process_type()) {
        case RTE_PROC_PRIMARY:
                if (sd->init_done)
                        break;
                LIST_INIT(&sd->mem_event_cb_list);
                rte_rwlock_init(&sd->mem_event_rwlock);
                rte_mem_event_callback_register("MLX5_MEM_EVENT_CB",
                                                mlx5_mr_mem_event_cb, NULL);
                ret = mlx5_mp_init_primary(MLX5_MP_NAME,
                                           mlx5_mp_os_primary_handle);
                if (ret)
                        goto out;
                sd->init_done = true;
                break;
        case RTE_PROC_SECONDARY:
                if (ld->init_done)
                        break;
                ret = mlx5_mp_init_secondary(MLX5_MP_NAME,
                                             mlx5_mp_os_secondary_handle);
                if (ret)
                        goto out;
                ++sd->secondary_cnt;
                ld->init_done = true;
                break;
        default:
                break;
        }
out:
        rte_spinlock_unlock(&sd->lock);
        return ret;
}

/**
 * Spawn an Ethernet device from Verbs information.
 *
 * @param dpdk_dev
 *   Backing DPDK device.
 * @param spawn
 *   Verbs device parameters (name, port, switch_info) to spawn.
 * @param config
 *   Device configuration parameters.
 *
 * @return
 *   A valid Ethernet device object on success, NULL otherwise and rte_errno
 *   is set. The following errors are defined:
 *
 *   EBUSY: device is not supposed to be spawned.
 *   EEXIST: device is already spawned.
 */
static struct rte_eth_dev *
mlx5_dev_spawn(struct rte_device *dpdk_dev,
               struct mlx5_dev_spawn_data *spawn,
               struct mlx5_dev_config *config)
{
        const struct mlx5_switch_info *switch_info = &spawn->info;
        struct mlx5_dev_ctx_shared *sh = NULL;
        struct ibv_port_attr port_attr;
        struct mlx5dv_context dv_attr = { .comp_mask = 0 };
        struct rte_eth_dev *eth_dev = NULL;
        struct mlx5_priv *priv = NULL;
        int err = 0;
        unsigned int hw_padding = 0;
        unsigned int mps;
        unsigned int cqe_comp;
        unsigned int cqe_pad = 0;
        unsigned int tunnel_en = 0;
        unsigned int mpls_en = 0;
        unsigned int swp = 0;
        unsigned int mprq = 0;
        unsigned int mprq_min_stride_size_n = 0;
        unsigned int mprq_max_stride_size_n = 0;
        unsigned int mprq_min_stride_num_n = 0;
        unsigned int mprq_max_stride_num_n = 0;
        struct rte_ether_addr mac;
        char name[RTE_ETH_NAME_MAX_LEN];
        int own_domain_id = 0;
        uint16_t port_id;
        unsigned int i;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
        struct mlx5dv_devx_port devx_port = { .comp_mask = 0 };
#endif

        /* Determine if this port representor is supposed to be spawned. */
        if (switch_info->representor && dpdk_dev->devargs) {
                struct rte_eth_devargs eth_da;

                err = rte_eth_devargs_parse(dpdk_dev->devargs->args, &eth_da);
                if (err) {
                        rte_errno = -err;
                        DRV_LOG(ERR, "failed to process device arguments: %s",
                                strerror(rte_errno));
                        return NULL;
                }
                for (i = 0; i < eth_da.nb_representor_ports; ++i)
                        if (eth_da.representor_ports[i] ==
                            (uint16_t)switch_info->port_name)
                                break;
                if (i == eth_da.nb_representor_ports) {
                        rte_errno = EBUSY;
                        return NULL;
                }
        }
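        /*
         * For example (devargs syntax illustrative), a device argument of
         * "representor=[0-2]" lets only representors 0..2 pass the filter
         * above; any other representor port is rejected with EBUSY.
         */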
        /* Build device name. */
        if (spawn->pf_bond < 0) {
                /* Single device. */
                if (!switch_info->representor)
                        strlcpy(name, dpdk_dev->name, sizeof(name));
                else
                        snprintf(name, sizeof(name), "%s_representor_%u",
                                 dpdk_dev->name, switch_info->port_name);
        } else {
                /* Bonding device. */
                if (!switch_info->representor)
                        snprintf(name, sizeof(name), "%s_%s",
                                 dpdk_dev->name,
                                 mlx5_os_get_dev_device_name(spawn->phys_dev));
                else
                        snprintf(name, sizeof(name), "%s_%s_representor_%u",
                                 dpdk_dev->name,
                                 mlx5_os_get_dev_device_name(spawn->phys_dev),
                                 switch_info->port_name);
        }
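        /*
         * Resulting names look like (examples, not from a live run):
         * "0000:03:00.0" for a plain PF, "0000:03:00.0_representor_1" for
         * its VF representor, with the physical device name interposed
         * for bonding configurations.
         */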
        /* Check if the device is already spawned. */
        if (rte_eth_dev_get_port_by_name(name, &port_id) == 0) {
                rte_errno = EEXIST;
                return NULL;
        }
        DRV_LOG(DEBUG, "naming Ethernet device \"%s\"", name);
        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                struct mlx5_mp_id mp_id;

                eth_dev = rte_eth_dev_attach_secondary(name);
                if (eth_dev == NULL) {
                        DRV_LOG(ERR, "can not attach rte ethdev");
                        rte_errno = ENOMEM;
                        return NULL;
                }
                eth_dev->device = dpdk_dev;
                eth_dev->dev_ops = &mlx5_os_dev_sec_ops;
                err = mlx5_proc_priv_init(eth_dev);
                if (err)
                        return NULL;
                mp_id.port_id = eth_dev->data->port_id;
                strlcpy(mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
                /* Receive command fd from primary process. */
                err = mlx5_mp_req_verbs_cmd_fd(&mp_id);
                if (err < 0)
                        goto err_secondary;
                /* Remap UAR for Tx queues. */
                err = mlx5_tx_uar_init_secondary(eth_dev, err);
                if (err)
                        goto err_secondary;
                /*
                 * Ethdev pointer is still required as input since
                 * the primary device is not accessible from the
                 * secondary process.
                 */
                eth_dev->rx_pkt_burst = mlx5_select_rx_function(eth_dev);
                eth_dev->tx_pkt_burst = mlx5_select_tx_function(eth_dev);
                return eth_dev;
err_secondary:
                mlx5_dev_close(eth_dev);
                return NULL;
        }
        /*
         * Some parameters ("tx_db_nc" in particular) are needed in
         * advance to create the dv/verbs device context. We process the
         * devargs here to get those, and process the devargs again later
         * to override some hardware settings.
         */
        err = mlx5_args(config, dpdk_dev->devargs);
        if (err) {
                err = rte_errno;
                DRV_LOG(ERR, "failed to process device arguments: %s",
                        strerror(rte_errno));
                goto error;
        }
        mlx5_malloc_mem_select(config->sys_mem_en);
        sh = mlx5_alloc_shared_dev_ctx(spawn, config);
        if (!sh)
                return NULL;
        config->devx = sh->devx;
#ifdef HAVE_MLX5DV_DR_ACTION_DEST_DEVX_TIR
        config->dest_tir = 1;
#endif
#ifdef HAVE_IBV_MLX5_MOD_SWP
        dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_SWP;
#endif
        /*
         * Multi-packet send is supported by ConnectX-4 Lx PF as well
         * as all ConnectX-5 devices.
         */
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS;
#endif
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
        dv_attr.comp_mask |= MLX5DV_CONTEXT_MASK_STRIDING_RQ;
#endif
        mlx5_glue->dv_query_device(sh->ctx, &dv_attr);
        if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_MPW_ALLOWED) {
                if (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_ENHANCED_MPW) {
                        DRV_LOG(DEBUG, "enhanced MPW is supported");
                        mps = MLX5_MPW_ENHANCED;
                } else {
                        DRV_LOG(DEBUG, "MPW is supported");
                        mps = MLX5_MPW;
                }
        } else {
                DRV_LOG(DEBUG, "MPW isn't supported");
                mps = MLX5_MPW_DISABLED;
        }
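        /*
         * MPW (multi-packet write) lets one Tx WQE carry several packets,
         * and enhanced MPW relaxes the equal-length restriction of legacy
         * MPW. The detected level is applied to config->mps further below,
         * once the devargs have been taken into account.
         */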
#ifdef HAVE_IBV_MLX5_MOD_SWP
        if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_SWP)
                swp = dv_attr.sw_parsing_caps.sw_parsing_offloads;
        DRV_LOG(DEBUG, "SWP support: %u", swp);
#endif
        config->swp = !!swp;
#ifdef HAVE_IBV_DEVICE_STRIDING_RQ_SUPPORT
        if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_STRIDING_RQ) {
                struct mlx5dv_striding_rq_caps mprq_caps =
                        dv_attr.striding_rq_caps;

                DRV_LOG(DEBUG, "\tmin_single_stride_log_num_of_bytes: %d",
                        mprq_caps.min_single_stride_log_num_of_bytes);
                DRV_LOG(DEBUG, "\tmax_single_stride_log_num_of_bytes: %d",
                        mprq_caps.max_single_stride_log_num_of_bytes);
                DRV_LOG(DEBUG, "\tmin_single_wqe_log_num_of_strides: %d",
                        mprq_caps.min_single_wqe_log_num_of_strides);
                DRV_LOG(DEBUG, "\tmax_single_wqe_log_num_of_strides: %d",
                        mprq_caps.max_single_wqe_log_num_of_strides);
                DRV_LOG(DEBUG, "\tsupported_qpts: %d",
                        mprq_caps.supported_qpts);
                DRV_LOG(DEBUG, "device supports Multi-Packet RQ");
                mprq = 1;
                mprq_min_stride_size_n =
                        mprq_caps.min_single_stride_log_num_of_bytes;
                mprq_max_stride_size_n =
                        mprq_caps.max_single_stride_log_num_of_bytes;
                mprq_min_stride_num_n =
                        mprq_caps.min_single_wqe_log_num_of_strides;
                mprq_max_stride_num_n =
                        mprq_caps.max_single_wqe_log_num_of_strides;
        }
#endif
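        /*
         * CQE compression packs several completions into one CQE slot.
         * On 128-byte cache line targets it is only usable when the
         * device also reports the 128B compression capability, hence
         * the extra check below.
         */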
        if (RTE_CACHE_LINE_SIZE == 128 &&
            !(dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_COMP))
                cqe_comp = 0;
        else
                cqe_comp = 1;
        config->cqe_comp = cqe_comp;
#ifdef HAVE_IBV_MLX5_MOD_CQE_128B_PAD
        /* Whether device supports 128B Rx CQE padding. */
        cqe_pad = RTE_CACHE_LINE_SIZE == 128 &&
                  (dv_attr.flags & MLX5DV_CONTEXT_FLAGS_CQE_128B_PAD);
#endif
#ifdef HAVE_IBV_DEVICE_TUNNEL_SUPPORT
        if (dv_attr.comp_mask & MLX5DV_CONTEXT_MASK_TUNNEL_OFFLOADS) {
                tunnel_en = ((dv_attr.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_VXLAN) &&
                             (dv_attr.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GRE) &&
                             (dv_attr.tunnel_offloads_caps &
                              MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_GENEVE));
        }
        DRV_LOG(DEBUG, "tunnel offloading is %ssupported",
                tunnel_en ? "" : "not ");
#else
        DRV_LOG(WARNING,
                "tunnel offloading disabled due to old OFED/rdma-core version");
#endif
        config->tunnel_en = tunnel_en;
#ifdef HAVE_IBV_DEVICE_MPLS_SUPPORT
        mpls_en = ((dv_attr.tunnel_offloads_caps &
                    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_GRE) &&
                   (dv_attr.tunnel_offloads_caps &
                    MLX5DV_RAW_PACKET_CAP_TUNNELED_OFFLOAD_CW_MPLS_OVER_UDP));
        DRV_LOG(DEBUG, "MPLS over GRE/UDP tunnel offloading is %ssupported",
                mpls_en ? "" : "not ");
#else
        DRV_LOG(WARNING, "MPLS over GRE/UDP tunnel offloading disabled due to"
                " old OFED/rdma-core version or firmware configuration");
#endif
        config->mpls_en = mpls_en;
        /* Check port status. */
        err = mlx5_glue->query_port(sh->ctx, spawn->phys_port, &port_attr);
        if (err) {
                DRV_LOG(ERR, "port query failed: %s", strerror(err));
                goto error;
        }
        if (port_attr.link_layer != IBV_LINK_LAYER_ETHERNET) {
                DRV_LOG(ERR, "port is not configured in Ethernet mode");
                err = EINVAL;
                goto error;
        }
        if (port_attr.state != IBV_PORT_ACTIVE)
                DRV_LOG(DEBUG, "port is not active: \"%s\" (%d)",
                        mlx5_glue->port_state_str(port_attr.state),
                        port_attr.state);
        /* Allocate private eth device data. */
        priv = mlx5_malloc(MLX5_MEM_ZERO | MLX5_MEM_RTE,
                           sizeof(*priv),
                           RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
        if (priv == NULL) {
                DRV_LOG(ERR, "priv allocation failure");
                err = ENOMEM;
                goto error;
        }
        priv->sh = sh;
        priv->dev_port = spawn->phys_port;
        priv->pci_dev = spawn->pci_dev;
        priv->mtu = RTE_ETHER_MTU;
        priv->mp_id.port_id = port_id;
        strlcpy(priv->mp_id.name, MLX5_MP_NAME, RTE_MP_MAX_NAME_LEN);
        /* Some internal functions rely on Netlink sockets, open them now. */
        priv->nl_socket_rdma = mlx5_nl_init(NETLINK_RDMA);
        priv->nl_socket_route = mlx5_nl_init(NETLINK_ROUTE);
        priv->representor = !!switch_info->representor;
        priv->master = !!switch_info->master;
        priv->domain_id = RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID;
        priv->vport_meta_tag = 0;
        priv->vport_meta_mask = 0;
        priv->pf_bond = spawn->pf_bond;
#ifdef HAVE_MLX5DV_DR_DEVX_PORT
        /*
         * The DevX port query API is implemented. E-Switch may use
         * either vport or reg_c[0] metadata register to match on
         * vport index. The engaged part of metadata register is
         * defined by mask.
         */
        if (switch_info->representor || switch_info->master) {
                devx_port.comp_mask = MLX5DV_DEVX_PORT_VPORT |
                                      MLX5DV_DEVX_PORT_MATCH_REG_C_0;
                err = mlx5_glue->devx_port_query(sh->ctx, spawn->phys_port,
                                                 &devx_port);
                if (err) {
                        DRV_LOG(WARNING,
                                "can't query devx port %d on device %s",
                                spawn->phys_port,
                                mlx5_os_get_dev_device_name(spawn->phys_dev));
                        devx_port.comp_mask = 0;
                }
        }
        if (devx_port.comp_mask & MLX5DV_DEVX_PORT_MATCH_REG_C_0) {
                priv->vport_meta_tag = devx_port.reg_c_0.value;
                priv->vport_meta_mask = devx_port.reg_c_0.mask;
                if (!priv->vport_meta_mask) {
                        DRV_LOG(ERR, "vport zero mask for port %d"
                                     " on bonding device %s",
                                     spawn->phys_port,
                                     mlx5_os_get_dev_device_name
                                                        (spawn->phys_dev));
                        err = ENOTSUP;
                        goto error;
                }
                if (priv->vport_meta_tag & ~priv->vport_meta_mask) {
                        DRV_LOG(ERR, "invalid vport tag for port %d"
                                     " on bonding device %s",
                                     spawn->phys_port,
                                     mlx5_os_get_dev_device_name
                                                        (spawn->phys_dev));
                        err = ENOTSUP;
                        goto error;
                }
        }
        if (devx_port.comp_mask & MLX5DV_DEVX_PORT_VPORT) {
                priv->vport_id = devx_port.vport_num;
        } else if (spawn->pf_bond >= 0) {
                DRV_LOG(ERR, "can't deduce vport index for port %d"
                             " on bonding device %s",
                             spawn->phys_port,
                             mlx5_os_get_dev_device_name(spawn->phys_dev));
                err = ENOTSUP;
                goto error;
        } else {
                /* Deduce the vport index in the compatible (legacy) way. */
                priv->vport_id = switch_info->representor ?
                                 switch_info->port_name + 1 : -1;
        }
#else
        /*
         * Kernel/rdma_core supports single E-Switch per PF configurations
         * only and the vport_id field contains the vport index for the
         * associated VF, which is deduced from the representor port name.
         * For example, consider the IB device port 10 with the attached
         * network device eth0, which has the port name attribute pf0vf2;
         * we can deduce the VF number as 2 and set the vport index as
         * 3 (2 + 1). This assignment schema should be changed if multiple
         * E-Switch instances per PF configurations or/and PCI subfunctions
         * are added.
         */
        priv->vport_id = switch_info->representor ?
                         switch_info->port_name + 1 : -1;
#endif
        /* representor_id field keeps the unmodified VF index. */
        priv->representor_id = switch_info->representor ?
                               switch_info->port_name : -1;
        /*
         * Look for sibling devices in order to reuse their switch domain
         * if any, otherwise allocate one.
         */
        MLX5_ETH_FOREACH_DEV(port_id, priv->pci_dev) {
                const struct mlx5_priv *opriv =
                        rte_eth_devices[port_id].data->dev_private;

                if (!opriv ||
                    opriv->sh != priv->sh ||
                    opriv->domain_id ==
                    RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID)
                        continue;
                priv->domain_id = opriv->domain_id;
                break;
        }
        if (priv->domain_id == RTE_ETH_DEV_SWITCH_DOMAIN_ID_INVALID) {
                err = rte_eth_switch_domain_alloc(&priv->domain_id);
                if (err) {
                        err = rte_errno;
                        DRV_LOG(ERR, "unable to allocate switch domain: %s",
                                strerror(rte_errno));
                        goto error;
                }
                own_domain_id = 1;
        }
        /* Override some values set by hardware configuration. */
        mlx5_args(config, dpdk_dev->devargs);
        err = mlx5_dev_check_sibling_config(priv, config);
        if (err)
                goto error;
        config->hw_csum = !!(sh->device_attr.device_cap_flags_ex &
                            IBV_DEVICE_RAW_IP_CSUM);
        DRV_LOG(DEBUG, "checksum offloading is %ssupported",
                (config->hw_csum ? "" : "not "));
#if !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V42) && \
        !defined(HAVE_IBV_DEVICE_COUNTERS_SET_V45)
        DRV_LOG(DEBUG, "counters are not supported");
#endif
#if !defined(HAVE_IBV_FLOW_DV_SUPPORT) || !defined(HAVE_MLX5DV_DR)
        if (config->dv_flow_en) {
                DRV_LOG(WARNING, "DV flow is not supported");
                config->dv_flow_en = 0;
        }
#endif
        config->ind_table_max_size =
                sh->device_attr.max_rwq_indirection_table_size;
        /*
         * Remove this check once DPDK supports larger/variable
         * indirection tables.
         */
        if (config->ind_table_max_size > (unsigned int)ETH_RSS_RETA_SIZE_512)
                config->ind_table_max_size = ETH_RSS_RETA_SIZE_512;
        DRV_LOG(DEBUG, "maximum Rx indirection table size is %u",
                config->ind_table_max_size);
        config->hw_vlan_strip = !!(sh->device_attr.raw_packet_caps &
                                  IBV_RAW_PACKET_CAP_CVLAN_STRIPPING);
        DRV_LOG(DEBUG, "VLAN stripping is %ssupported",
                (config->hw_vlan_strip ? "" : "not "));
        config->hw_fcs_strip = !!(sh->device_attr.raw_packet_caps &
                                 IBV_RAW_PACKET_CAP_SCATTER_FCS);
#if defined(HAVE_IBV_WQ_FLAG_RX_END_PADDING)
        hw_padding = !!sh->device_attr.rx_pad_end_addr_align;
#elif defined(HAVE_IBV_WQ_FLAGS_PCI_WRITE_END_PADDING)
        hw_padding = !!(sh->device_attr.device_cap_flags_ex &
                        IBV_DEVICE_PCI_WRITE_END_PADDING);
#endif
        if (config->hw_padding && !hw_padding) {
                DRV_LOG(DEBUG, "Rx end alignment padding isn't supported");
                config->hw_padding = 0;
        } else if (config->hw_padding) {
                DRV_LOG(DEBUG, "Rx end alignment padding is enabled");
        }
        config->tso = (sh->device_attr.max_tso > 0 &&
                      (sh->device_attr.tso_supported_qpts &
                       (1 << IBV_QPT_RAW_PACKET)));
        if (config->tso)
                config->tso_max_payload_sz = sh->device_attr.max_tso;
        /*
         * MPW is disabled by default, while the Enhanced MPW is enabled
         * by default.
         */
        if (config->mps == MLX5_ARG_UNSET)
                config->mps = (mps == MLX5_MPW_ENHANCED) ? MLX5_MPW_ENHANCED :
                                                          MLX5_MPW_DISABLED;
        else
                config->mps = config->mps ? mps : MLX5_MPW_DISABLED;
        DRV_LOG(INFO, "%sMPS is %s",
                config->mps == MLX5_MPW_ENHANCED ? "enhanced " :
                config->mps == MLX5_MPW ? "legacy " : "",
                config->mps != MLX5_MPW_DISABLED ? "enabled" : "disabled");
        if (config->cqe_comp && !cqe_comp) {
                DRV_LOG(WARNING, "Rx CQE compression isn't supported");
                config->cqe_comp = 0;
        }
        if (config->cqe_pad && !cqe_pad) {
                DRV_LOG(WARNING, "Rx CQE padding isn't supported");
                config->cqe_pad = 0;
        } else if (config->cqe_pad) {
                DRV_LOG(INFO, "Rx CQE padding is enabled");
        }
        if (config->devx) {
                priv->counter_fallback = 0;
                err = mlx5_devx_cmd_query_hca_attr(sh->ctx, &config->hca_attr);
                if (err) {
                        err = -err;
                        goto error;
                }
                if (!config->hca_attr.flow_counters_dump)
                        priv->counter_fallback = 1;
#ifndef HAVE_IBV_DEVX_ASYNC
                priv->counter_fallback = 1;
#endif
                if (priv->counter_fallback)
                        DRV_LOG(INFO, "Use fall-back DV counter management");
                /* Check for LRO support. */
                if (config->dest_tir && config->hca_attr.lro_cap &&
                    config->dv_flow_en) {
                        /* TBD check tunnel lro caps. */
                        config->lro.supported = config->hca_attr.lro_cap;
                        DRV_LOG(DEBUG, "Device supports LRO");
                        /*
                         * If LRO timeout is not configured by application,
                         * use the minimal supported value.
                         */
                        if (!config->lro.timeout)
                                config->lro.timeout =
                                config->hca_attr.lro_timer_supported_periods[0];
                        DRV_LOG(DEBUG, "LRO session timeout set to %d usec",
                                config->lro.timeout);
                }
#if defined(HAVE_MLX5DV_DR) && defined(HAVE_MLX5_DR_CREATE_ACTION_FLOW_METER)
                if (config->hca_attr.qos.sup &&
                    config->hca_attr.qos.srtcm_sup &&
                    config->dv_flow_en) {
                        uint8_t reg_c_mask =
                                config->hca_attr.qos.flow_meter_reg_c_ids;
                        /*
                         * Meter needs two REG_C's for color match and pre-sfx
                         * flow match. Here get the REG_C for color match.
                         * REG_C_0 and REG_C_1 are reserved for metadata
                         * feature.
                         */
                        reg_c_mask &= 0xfc;
                        if (__builtin_popcount(reg_c_mask) < 1) {
                                priv->mtr_en = 0;
                                DRV_LOG(WARNING, "No available register for"
                                        " meter.");
                        } else {
                                priv->mtr_color_reg = ffs(reg_c_mask) - 1 +
                                                      REG_C_0;
                                priv->mtr_en = 1;
                                priv->mtr_reg_share =
                                      config->hca_attr.qos.flow_meter_reg_share;
                                DRV_LOG(DEBUG, "The REG_C used by the meter is %d",
                                        priv->mtr_color_reg);
                        }
                }
#endif
        }
        if (config->tx_pp) {
                DRV_LOG(DEBUG, "Timestamp counter frequency %u kHz",
                        config->hca_attr.dev_freq_khz);
                DRV_LOG(DEBUG, "Packet pacing is %ssupported",
                        config->hca_attr.qos.packet_pacing ? "" : "not ");
                DRV_LOG(DEBUG, "Cross channel ops are %ssupported",
                        config->hca_attr.cross_channel ? "" : "not ");
                DRV_LOG(DEBUG, "WQE index ignore is %ssupported",
                        config->hca_attr.wqe_index_ignore ? "" : "not ");
                DRV_LOG(DEBUG, "Non-wire SQ feature is %ssupported",
                        config->hca_attr.non_wire_sq ? "" : "not ");
                DRV_LOG(DEBUG, "Static WQE SQ feature is %ssupported (%d)",
                        config->hca_attr.log_max_static_sq_wq ? "" : "not ",
                        config->hca_attr.log_max_static_sq_wq);
                DRV_LOG(DEBUG, "WQE rate PP mode is %ssupported",
                        config->hca_attr.qos.wqe_rate_pp ? "" : "not ");
                if (!config->devx) {
                        DRV_LOG(ERR, "DevX is required for packet pacing");
                        err = ENODEV;
                        goto error;
                }
                if (!config->hca_attr.qos.packet_pacing) {
                        DRV_LOG(ERR, "Packet pacing is not supported");
                        err = ENODEV;
                        goto error;
                }
                if (!config->hca_attr.cross_channel) {
                        DRV_LOG(ERR, "Cross channel operations are"
                                     " required for packet pacing");
                        err = ENODEV;
                        goto error;
                }
                if (!config->hca_attr.wqe_index_ignore) {
                        DRV_LOG(ERR, "WQE index ignore feature is"
                                     " required for packet pacing");
                        err = ENODEV;
                        goto error;
                }
                if (!config->hca_attr.non_wire_sq) {
                        DRV_LOG(ERR, "Non-wire SQ feature is"
                                     " required for packet pacing");
                        err = ENODEV;
                        goto error;
                }
                if (!config->hca_attr.log_max_static_sq_wq) {
                        DRV_LOG(ERR, "Static WQE SQ feature is"
                                     " required for packet pacing");
                        err = ENODEV;
                        goto error;
                }
                if (!config->hca_attr.qos.wqe_rate_pp) {
                        DRV_LOG(ERR, "WQE rate mode is required"
                                     " for packet pacing");
                        err = ENODEV;
                        goto error;
                }
#ifndef HAVE_MLX5DV_DEVX_UAR_OFFSET
                DRV_LOG(ERR, "DevX does not provide UAR offset,"
                             " can't create queues for packet pacing");
                err = ENODEV;
                goto error;
#endif
        }
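        /*
         * Determine the timestamp mode. If user-space reading of the MTUTC
         * register is allowed, take the mode from the register directly;
         * otherwise a 1 GHz free-running counter frequency is used as an
         * indirect hint that the device runs in real-time mode.
         */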
        if (config->devx) {
                uint32_t reg[MLX5_ST_SZ_DW(register_mtutc)];

                err = config->hca_attr.access_register_user ?
                        mlx5_devx_cmd_register_read
                                (sh->ctx, MLX5_REGISTER_ID_MTUTC, 0,
                                reg, MLX5_ST_SZ_DW(register_mtutc)) : ENOTSUP;
                if (!err) {
                        uint32_t ts_mode;

                        /* MTUTC register is read successfully. */
                        ts_mode = MLX5_GET(register_mtutc, reg,
                                           time_stamp_mode);
                        if (ts_mode == MLX5_MTUTC_TIMESTAMP_MODE_REAL_TIME)
                                config->rt_timestamp = 1;
                } else {
                        /* Kernel does not support register reading. */
                        if (config->hca_attr.dev_freq_khz ==
                                                 (NS_PER_S / MS_PER_S))
                                config->rt_timestamp = 1;
                }
        }
        /*
         * If the HW has a bug working with tunnel packet decapsulation and
         * scatter FCS, and decapsulation is needed, clear the hw_fcs_strip
         * bit. Then DEV_RX_OFFLOAD_KEEP_CRC bit will not be set anymore.
         */
        if (config->hca_attr.scatter_fcs_w_decap_disable && config->decap_en)
                config->hw_fcs_strip = 0;
        DRV_LOG(DEBUG, "FCS stripping configuration is %ssupported",
                (config->hw_fcs_strip ? "" : "not "));
        if (config->mprq.enabled && mprq) {
                if (config->mprq.stride_num_n &&
                    (config->mprq.stride_num_n > mprq_max_stride_num_n ||
                     config->mprq.stride_num_n < mprq_min_stride_num_n)) {
                        config->mprq.stride_num_n =
                                RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_NUM_N,
                                                mprq_min_stride_num_n),
                                        mprq_max_stride_num_n);
                        DRV_LOG(WARNING,
                                "the number of strides"
                                " for Multi-Packet RQ is out of range,"
                                " setting default value (%u)",
                                1 << config->mprq.stride_num_n);
                }
                if (config->mprq.stride_size_n &&
                    (config->mprq.stride_size_n > mprq_max_stride_size_n ||
                     config->mprq.stride_size_n < mprq_min_stride_size_n)) {
                        config->mprq.stride_size_n =
                                RTE_MIN(RTE_MAX(MLX5_MPRQ_STRIDE_SIZE_N,
                                                mprq_min_stride_size_n),
                                        mprq_max_stride_size_n);
                        DRV_LOG(WARNING,
                                "the size of a stride"
                                " for Multi-Packet RQ is out of range,"
                                " setting default value (%u)",
                                1 << config->mprq.stride_size_n);
                }
                config->mprq.min_stride_size_n = mprq_min_stride_size_n;
                config->mprq.max_stride_size_n = mprq_max_stride_size_n;
        } else if (config->mprq.enabled && !mprq) {
                DRV_LOG(WARNING, "Multi-Packet RQ isn't supported");
                config->mprq.enabled = 0;
        }
        if (config->max_dump_files_num == 0)
                config->max_dump_files_num = 128;
        eth_dev = rte_eth_dev_allocate(name);
        if (eth_dev == NULL) {
                DRV_LOG(ERR, "can not allocate rte ethdev");
                err = ENOMEM;
                goto error;
        }
        /* Flag to call rte_eth_dev_release_port() in rte_eth_dev_close(). */
        eth_dev->data->dev_flags |= RTE_ETH_DEV_CLOSE_REMOVE;
        if (priv->representor) {
                eth_dev->data->dev_flags |= RTE_ETH_DEV_REPRESENTOR;
                eth_dev->data->representor_id = priv->representor_id;
        }
        /*
         * Store the associated network device interface index. This index
         * is permanent throughout the lifetime of the device, so we may
         * store the ifindex here and use the cached value further on.
         */
        MLX5_ASSERT(spawn->ifindex);
        priv->if_index = spawn->ifindex;
        eth_dev->data->dev_private = priv;
        priv->dev_data = eth_dev->data;
        eth_dev->data->mac_addrs = priv->mac;
        eth_dev->device = dpdk_dev;
        /* Configure the first MAC address by default. */
        if (mlx5_get_mac(eth_dev, &mac.addr_bytes)) {
                DRV_LOG(ERR,
                        "port %u cannot get MAC address, is mlx5_en"
                        " loaded? (errno: %s)",
                        eth_dev->data->port_id, strerror(rte_errno));
                err = ENODEV;
                goto error;
        }
        DRV_LOG(INFO,
                "port %u MAC address is %02x:%02x:%02x:%02x:%02x:%02x",
                eth_dev->data->port_id,
                mac.addr_bytes[0], mac.addr_bytes[1],
                mac.addr_bytes[2], mac.addr_bytes[3],
                mac.addr_bytes[4], mac.addr_bytes[5]);
#ifdef RTE_LIBRTE_MLX5_DEBUG
        {
                char ifname[IF_NAMESIZE];

                if (mlx5_get_ifname(eth_dev, &ifname) == 0)
                        DRV_LOG(DEBUG, "port %u ifname is \"%s\"",
                                eth_dev->data->port_id, ifname);
                else
                        DRV_LOG(DEBUG, "port %u ifname is unknown",
                                eth_dev->data->port_id);
        }
#endif
        /* Get actual MTU if possible. */
        err = mlx5_get_mtu(eth_dev, &priv->mtu);
        if (err) {
                err = rte_errno;
                goto error;
        }
        DRV_LOG(DEBUG, "port %u MTU is %u", eth_dev->data->port_id,
                priv->mtu);
        /* Initialize burst functions to prevent crashes before link-up. */
        eth_dev->rx_pkt_burst = removed_rx_burst;
        eth_dev->tx_pkt_burst = removed_tx_burst;
        eth_dev->dev_ops = &mlx5_os_dev_ops;
        /* Register MAC address. */
        claim_zero(mlx5_mac_addr_add(eth_dev, &mac, 0, 0));
        if (config->vf && config->vf_nl_en)
                mlx5_nl_mac_addr_sync(priv->nl_socket_route,
                                      mlx5_ifindex(eth_dev),
                                      eth_dev->data->mac_addrs,
                                      MLX5_MAX_MAC_ADDRESSES);
1217         priv->flows = 0;
1218         priv->ctrl_flows = 0;
1219         TAILQ_INIT(&priv->flow_meters);
1220         TAILQ_INIT(&priv->flow_meter_profiles);
1221         /* Hint libmlx5 to use PMD allocator for data plane resources */
1222         mlx5_glue->dv_set_context_attr(sh->ctx,
1223                         MLX5DV_CTX_ATTR_BUF_ALLOCATORS,
1224                         (void *)((uintptr_t)&(struct mlx5dv_ctx_allocators){
1225                                 .alloc = &mlx5_alloc_verbs_buf,
1226                                 .free = &mlx5_free_verbs_buf,
1227                                 .data = priv,
1228                         }));
1229         /* Bring Ethernet device up. */
1230         DRV_LOG(DEBUG, "port %u forcing Ethernet interface up",
1231                 eth_dev->data->port_id);
1232         mlx5_set_link_up(eth_dev);
1233         /*
1234          * Even though the interrupt handler is not installed yet,
1235          * interrupts will still trigger on the async_fd from
1236          * Verbs context returned by ibv_open_device().
1237          */
1238         mlx5_link_update(eth_dev, 0);
1239 #ifdef HAVE_MLX5DV_DR_ESWITCH
1240         if (!(config->hca_attr.eswitch_manager && config->dv_flow_en &&
1241               (switch_info->representor || switch_info->master)))
1242                 config->dv_esw_en = 0;
1243 #else
1244         config->dv_esw_en = 0;
1245 #endif
1246         /* Detect minimal data bytes to inline. */
1247         mlx5_set_min_inline(spawn, config);
1248         /* Store device configuration on private structure. */
1249         priv->config = *config;
1250         /* Create context for virtual machine VLAN workaround. */
1251         priv->vmwa_context = mlx5_vlan_vmwa_init(eth_dev, spawn->ifindex);
1252         if (config->dv_flow_en) {
1253                 err = mlx5_alloc_shared_dr(priv);
1254                 if (err)
1255                         goto error;
1256                 /*
1257                  * RSS id is shared with meter flow id. Meter flow id can only
1258                  * use the 24 MSB of the register.
1259                  */
1260                 priv->qrss_id_pool = mlx5_flow_id_pool_alloc(UINT32_MAX >>
1261                                      MLX5_MTR_COLOR_BITS);
1262                 if (!priv->qrss_id_pool) {
1263                         DRV_LOG(ERR, "can't create flow id pool");
1264                         err = ENOMEM;
1265                         goto error;
1266                 }
1267         }
1268         /* Supported Verbs flow priority number detection. */
1269         err = mlx5_flow_discover_priorities(eth_dev);
1270         if (err < 0) {
1271                 err = -err;
1272                 goto error;
1273         }
1274         priv->config.flow_prio = err;
1275         if (!priv->config.dv_esw_en &&
1276             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
1277                 DRV_LOG(WARNING, "metadata mode %u is not supported "
1278                                  "(no E-Switch)", priv->config.dv_xmeta_en);
1279                 priv->config.dv_xmeta_en = MLX5_XMETA_MODE_LEGACY;
1280         }
1281         mlx5_set_metadata_mask(eth_dev);
1282         if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
1283             !priv->sh->dv_regc0_mask) {
1284                 DRV_LOG(ERR, "metadata mode %u is not supported "
1285                              "(no metadata reg_c[0] is available)",
1286                              priv->config.dv_xmeta_en);
1287                 err = ENOTSUP;
1288                 goto error;
1289         }
1290         /*
1291          * Allocate the buffer for flow creation, just once.
1292          * The allocation must be done before any flow is created.
1293          */
1294         mlx5_flow_alloc_intermediate(eth_dev);
1295         /* Query availability of metadata reg_c's. */
1296         err = mlx5_flow_discover_mreg_c(eth_dev);
1297         if (err < 0) {
1298                 err = -err;
1299                 goto error;
1300         }
1301         if (!mlx5_flow_ext_mreg_supported(eth_dev)) {
1302                 DRV_LOG(DEBUG,
1303                         "port %u extensive metadata register is not supported",
1304                         eth_dev->data->port_id);
1305                 if (priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY) {
1306                         DRV_LOG(ERR, "metadata mode %u is not supported "
1307                                      "(no metadata registers available)",
1308                                      priv->config.dv_xmeta_en);
1309                         err = ENOTSUP;
1310                         goto error;
1311                 }
1312         }
1313         if (priv->config.dv_flow_en &&
1314             priv->config.dv_xmeta_en != MLX5_XMETA_MODE_LEGACY &&
1315             mlx5_flow_ext_mreg_supported(eth_dev) &&
1316             priv->sh->dv_regc0_mask) {
1317                 priv->mreg_cp_tbl = mlx5_hlist_create(MLX5_FLOW_MREG_HNAME,
1318                                                       MLX5_FLOW_MREG_HTABLE_SZ);
1319                 if (!priv->mreg_cp_tbl) {
1320                         err = ENOMEM;
1321                         goto error;
1322                 }
1323         }
1324         return eth_dev;
1325 error:
1326         if (priv) {
1327                 if (priv->mreg_cp_tbl)
1328                         mlx5_hlist_destroy(priv->mreg_cp_tbl, NULL, NULL);
1329                 if (priv->sh)
1330                         mlx5_os_free_shared_dr(priv);
1331                 if (priv->nl_socket_route >= 0)
1332                         close(priv->nl_socket_route);
1333                 if (priv->nl_socket_rdma >= 0)
1334                         close(priv->nl_socket_rdma);
1335                 if (priv->vmwa_context)
1336                         mlx5_vlan_vmwa_exit(priv->vmwa_context);
1337                 if (priv->qrss_id_pool)
1338                         mlx5_flow_id_pool_release(priv->qrss_id_pool);
1339                 if (own_domain_id)
1340                         claim_zero(rte_eth_switch_domain_free(priv->domain_id));
1341                 mlx5_free(priv);
1342                 if (eth_dev != NULL)
1343                         eth_dev->data->dev_private = NULL;
1344         }
1345         if (eth_dev != NULL) {
1346                 /* mac_addrs must not be freed alone because it is part
1347                  * of dev_private.
1348                  */
1349                 eth_dev->data->mac_addrs = NULL;
1350                 rte_eth_dev_release_port(eth_dev);
1351         }
1352         if (sh)
1353                 mlx5_free_shared_dev_ctx(sh);
1354         MLX5_ASSERT(err > 0);
1355         rte_errno = err;
1356         return NULL;
1357 }
1358
1359 /**
1360  * Comparison callback to sort device data.
1361  *
1362  * This is meant to be used with qsort().
1363  *
1364  * @param[in] a
1365  *   Pointer to pointer to first data object.
1366  * @param[in] b
1367  *   Pointer to pointer to second data object.
1368  *
1369  * @return
1370  *   0 if both objects are equal, less than 0 if the first argument is less
1371  *   than the second, greater than 0 otherwise.
1372  */
1373 static int
1374 mlx5_dev_spawn_data_cmp(const void *a, const void *b)
1375 {
1376         const struct mlx5_switch_info *si_a =
1377                 &((const struct mlx5_dev_spawn_data *)a)->info;
1378         const struct mlx5_switch_info *si_b =
1379                 &((const struct mlx5_dev_spawn_data *)b)->info;
1380         int ret;
1381
1382         /* Master device first. */
1383         ret = si_b->master - si_a->master;
1384         if (ret)
1385                 return ret;
1386         /* Then representor devices. */
1387         ret = si_b->representor - si_a->representor;
1388         if (ret)
1389                 return ret;
1390         /* Unidentified devices come last in no specific order. */
1391         if (!si_a->representor)
1392                 return 0;
1393         /* Order representors by name. */
1394         return si_a->port_name - si_b->port_name;
1395 }
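
/*
 * Illustrative ordering produced by the comparator above (a sketch, not
 * taken from a real probe): given spawn entries for a master port and two
 * representors with port names 2 and 0, qsort() with this callback yields
 * the master first, then the representors by ascending port name:
 *
 *   input:  { representor(port_name=2), master, representor(port_name=0) }
 *   output: { master, representor(port_name=0), representor(port_name=2) }
 */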
1396
1397 /**
1398  * Match PCI information for possible slaves of bonding device.
1399  *
1400  * @param[in] ibv_dev
1401  *   Pointer to Infiniband device structure.
1402  * @param[in] pci_dev
1403  *   Pointer to PCI device structure to match PCI address.
1404  * @param[in] nl_rdma
1405  *   Netlink RDMA group socket handle.
1406  *
1407  * @return
1408  *   Negative value if no bonding device is found, otherwise
1409  *   the non-negative index of the slave PF in the bonding.
1410  */
1411 static int
1412 mlx5_device_bond_pci_match(const struct ibv_device *ibv_dev,
1413                            const struct rte_pci_device *pci_dev,
1414                            int nl_rdma)
1415 {
1416         char ifname[IF_NAMESIZE + 1];
1417         unsigned int ifindex;
1418         unsigned int np, i;
1419         FILE *file = NULL;
1420         int pf = -1;
1421
1422         /*
1423          * Try to get the master device name. If something goes
1424          * wrong, assume there is no kernel support and no
1425          * bonding devices.
1426          */
1427         if (nl_rdma < 0)
1428                 return -1;
1429         if (!strstr(ibv_dev->name, "bond"))
1430                 return -1;
1431         np = mlx5_nl_portnum(nl_rdma, ibv_dev->name);
1432         if (!np)
1433                 return -1;
1434         /*
1435          * The master device might not be on the predefined
1436          * port (port index 1 is not guaranteed), so we have
1437          * to scan all Infiniband device ports to find the
1438          * master.
1439          */
1440         for (i = 1; i <= np; ++i) {
1441                 /* Check whether Infiniband port is populated. */
1442                 ifindex = mlx5_nl_ifindex(nl_rdma, ibv_dev->name, i);
1443                 if (!ifindex)
1444                         continue;
1445                 if (!if_indextoname(ifindex, ifname))
1446                         continue;
1447                 /* Try to read bonding slave names from sysfs. */
1448                 MKSTR(slaves,
1449                       "/sys/class/net/%s/master/bonding/slaves", ifname);
1450                 file = fopen(slaves, "r");
1451                 if (file)
1452                         break;
1453         }
1454         if (!file)
1455                 return -1;
1456         /* Use safe format to check maximal buffer length. */
1457         MLX5_ASSERT(atol(RTE_STR(IF_NAMESIZE)) == IF_NAMESIZE);
1458         while (fscanf(file, "%" RTE_STR(IF_NAMESIZE) "s", ifname) == 1) {
1459                 char tmp_str[IF_NAMESIZE + 32];
1460                 struct rte_pci_addr pci_addr;
1461                 struct mlx5_switch_info info;
1462
1463                 /* Process slave interface names in the loop. */
1464                 snprintf(tmp_str, sizeof(tmp_str),
1465                          "/sys/class/net/%s", ifname);
1466                 if (mlx5_dev_to_pci_addr(tmp_str, &pci_addr)) {
1467                         DRV_LOG(WARNING, "cannot get PCI address"
1468                                          " for netdev \"%s\"", ifname);
1469                         continue;
1470                 }
1471                 if (pci_dev->addr.domain != pci_addr.domain ||
1472                     pci_dev->addr.bus != pci_addr.bus ||
1473                     pci_dev->addr.devid != pci_addr.devid ||
1474                     pci_dev->addr.function != pci_addr.function)
1475                         continue;
1476                 /* Slave interface PCI address match found. */
1477                 fclose(file);
1478                 snprintf(tmp_str, sizeof(tmp_str),
1479                          "/sys/class/net/%s/phys_port_name", ifname);
1480                 file = fopen(tmp_str, "rb");
1481                 if (!file)
1482                         break;
1483                 info.name_type = MLX5_PHYS_PORT_NAME_TYPE_NOTSET;
1484                 if (fscanf(file, "%32s", tmp_str) == 1)
1485                         mlx5_translate_port_name(tmp_str, &info);
1486                 if (info.name_type == MLX5_PHYS_PORT_NAME_TYPE_LEGACY ||
1487                     info.name_type == MLX5_PHYS_PORT_NAME_TYPE_UPLINK)
1488                         pf = info.port_name;
1489                 break;
1490         }
1491         if (file)
1492                 fclose(file);
1493         return pf;
1494 }
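
/*
 * The function above relies on two sysfs entries per slave netdev:
 * "/sys/class/net/<ifname>/master/bonding/slaves" for the slave list and
 * "/sys/class/net/<ifname>/phys_port_name" for the port naming. A minimal
 * standalone sketch of the slave list scan (hypothetical tool, "eth0" is
 * an assumed interface name, error handling trimmed):
 *
 *   #include <stdio.h>
 *
 *   int main(void)
 *   {
 *           char name[64];
 *           FILE *f = fopen("/sys/class/net/eth0/master/bonding/slaves", "r");
 *
 *           if (!f)
 *                   return 1;
 *           while (fscanf(f, "%63s", name) == 1)
 *                   printf("slave: %s\n", name);
 *           fclose(f);
 *           return 0;
 *   }
 */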
1495
1496 /**
1497  * DPDK callback to register a PCI device.
1498  *
1499  * This function spawns Ethernet devices out of a given PCI device.
1500  *
1501  * @param[in] pci_drv
1502  *   PCI driver structure (mlx5_driver).
1503  * @param[in] pci_dev
1504  *   PCI device information.
1505  *
1506  * @return
1507  *   0 on success, a negative errno value otherwise and rte_errno is set.
1508  */
1509 int
1510 mlx5_os_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1511                   struct rte_pci_device *pci_dev)
1512 {
1513         struct ibv_device **ibv_list;
1514         /*
1515          * Number of found IB devices matching the requested PCI BDF.
1516          * nd != 1 means there are multiple IB devices over the same
1517          * PCI device and we have representors and master.
1518          */
1519         unsigned int nd = 0;
1520         /*
1521          * Number of found IB device ports. nd = 1 and np = 1..n means
1522          * we have a single multi-port IB device, and there may be
1523          * representors attached to some of the found ports.
1524          */
1525         unsigned int np = 0;
1526         /*
1527          * Number of DPDK ethernet devices to spawn, either over
1528          * multiple IB devices or multiple ports of a single IB device.
1529          * Actually this is the number of spawn iterations to perform.
1530          */
1531         unsigned int ns = 0;
1532         /*
1533          * Bonding device
1534          *   < 0 - no bonding device (single one)
1535          *  >= 0 - bonding device (value is slave PF index)
1536          */
1537         int bd = -1;
1538         struct mlx5_dev_spawn_data *list = NULL;
1539         struct mlx5_dev_config dev_config;
1540         unsigned int dev_config_vf;
1541         int ret;
1542
1543         if (rte_eal_process_type() == RTE_PROC_PRIMARY)
1544                 mlx5_pmd_socket_init();
1545         ret = mlx5_init_once();
1546         if (ret) {
1547                 DRV_LOG(ERR, "unable to init PMD global data: %s",
1548                         strerror(rte_errno));
1549                 return -rte_errno;
1550         }
1551         errno = 0;
1552         ibv_list = mlx5_glue->get_device_list(&ret);
1553         if (!ibv_list) {
1554                 rte_errno = errno ? errno : ENOSYS;
1555                 DRV_LOG(ERR, "cannot list devices, is ib_uverbs loaded?");
1556                 return -rte_errno;
1557         }
1558         /*
1559          * First scan the list of all Infiniband devices to find
1560          * matching ones, gathering them into the list.
1561          */
1562         struct ibv_device *ibv_match[ret + 1];
1563         int nl_route = mlx5_nl_init(NETLINK_ROUTE);
1564         int nl_rdma = mlx5_nl_init(NETLINK_RDMA);
1565         unsigned int i;
1566
1567         while (ret-- > 0) {
1568                 struct rte_pci_addr pci_addr;
1569
1570                 DRV_LOG(DEBUG, "checking device \"%s\"", ibv_list[ret]->name);
1571                 bd = mlx5_device_bond_pci_match
1572                                 (ibv_list[ret], pci_dev, nl_rdma);
1573                 if (bd >= 0) {
1574                         /*
1575                          * Bonding device detected. Only one match is allowed,
1576                          * bonding is supported over multi-port IB devices only,
1577                          * and there should be no matches on representor PCI
1578                          * functions or non-VF-LAG bonding devices with the
1579                          * specified address.
1580                          */
1581                         if (nd) {
1582                                 DRV_LOG(ERR,
1583                                         "multiple PCI match on bonding device"
1584                                         " \"%s\" found", ibv_list[ret]->name);
1585                                 rte_errno = ENOENT;
1586                                 ret = -rte_errno;
1587                                 goto exit;
1588                         }
1589                         DRV_LOG(INFO, "PCI information matches for"
1590                                       " slave %d bonding device \"%s\"",
1591                                       bd, ibv_list[ret]->name);
1592                         ibv_match[nd++] = ibv_list[ret];
1593                         break;
1594                 }
1595                 if (mlx5_dev_to_pci_addr
1596                         (ibv_list[ret]->ibdev_path, &pci_addr))
1597                         continue;
1598                 if (pci_dev->addr.domain != pci_addr.domain ||
1599                     pci_dev->addr.bus != pci_addr.bus ||
1600                     pci_dev->addr.devid != pci_addr.devid ||
1601                     pci_dev->addr.function != pci_addr.function)
1602                         continue;
1603                 DRV_LOG(INFO, "PCI information matches for device \"%s\"",
1604                         ibv_list[ret]->name);
1605                 ibv_match[nd++] = ibv_list[ret];
1606         }
1607         ibv_match[nd] = NULL;
1608         if (!nd) {
1609                 /* No device matches, just complain and bail out. */
1610                 DRV_LOG(WARNING,
1611                         "no Verbs device matches PCI device " PCI_PRI_FMT ","
1612                         " are kernel drivers loaded?",
1613                         pci_dev->addr.domain, pci_dev->addr.bus,
1614                         pci_dev->addr.devid, pci_dev->addr.function);
1615                 rte_errno = ENOENT;
1616                 ret = -rte_errno;
1617                 goto exit;
1618         }
1619         if (nd == 1) {
1620                 /*
1621                  * The single matching device found may have multiple ports.
1622                  * Each port may be a representor, so we have to check the
1623                  * port number and the representors' existence.
1624                  */
1625                 if (nl_rdma >= 0)
1626                         np = mlx5_nl_portnum(nl_rdma, ibv_match[0]->name);
1627                 if (!np)
1628                         DRV_LOG(WARNING, "cannot get the port number of"
1629                                          " IB device \"%s\"", ibv_match[0]->name);
1630                 if (bd >= 0 && !np) {
1631                         DRV_LOG(ERR, "cannot get ports"
1632                                      " for bonding device");
1633                         rte_errno = ENOENT;
1634                         ret = -rte_errno;
1635                         goto exit;
1636                 }
1637         }
1638 #ifndef HAVE_MLX5DV_DR_DEVX_PORT
1639         if (bd >= 0) {
1640                 /*
1641                  * This may happen if there is VF LAG kernel support and
1642                  * the application is compiled with an older rdma-core library.
1643                  */
1644                 DRV_LOG(ERR,
1645                         "No kernel/verbs support for VF LAG bonding found.");
1646                 rte_errno = ENOTSUP;
1647                 ret = -rte_errno;
1648                 goto exit;
1649         }
1650 #endif
1651         /*
1652          * Now we can determine the maximal
1653          * number of devices to be spawned.
1654          */
1655         list = mlx5_malloc(MLX5_MEM_ZERO,
1656                            sizeof(struct mlx5_dev_spawn_data) *
1657                            (np ? np : nd),
1658                            RTE_CACHE_LINE_SIZE, SOCKET_ID_ANY);
1659         if (!list) {
1660                 DRV_LOG(ERR, "spawn data array allocation failure");
1661                 rte_errno = ENOMEM;
1662                 ret = -rte_errno;
1663                 goto exit;
1664         }
1665         if (bd >= 0 || np > 1) {
1666                 /*
1667                  * A single IB device with multiple ports was found,
1668                  * it may be an E-Switch master device with representors.
1669                  * We have to perform identification through the ports.
1670                  */
1671                 MLX5_ASSERT(nl_rdma >= 0);
1672                 MLX5_ASSERT(ns == 0);
1673                 MLX5_ASSERT(nd == 1);
1674                 MLX5_ASSERT(np);
1675                 for (i = 1; i <= np; ++i) {
1676                         list[ns].max_port = np;
1677                         list[ns].phys_port = i;
1678                         list[ns].phys_dev = ibv_match[0];
1679                         list[ns].eth_dev = NULL;
1680                         list[ns].pci_dev = pci_dev;
1681                         list[ns].pf_bond = bd;
1682                         list[ns].ifindex = mlx5_nl_ifindex
1683                                 (nl_rdma,
1684                                 mlx5_os_get_dev_device_name
1685                                                 (list[ns].phys_dev), i);
1686                         if (!list[ns].ifindex) {
1687                                 /*
1688                                  * No network interface index found for the
1689                                  * specified port, it means there is no
1690                                  * representor on this port. It's OK,
1691                                  * there can be disabled ports, for example
1692                                  * if sriov_numvfs < sriov_totalvfs.
1693                                  */
1694                                 continue;
1695                         }
1696                         ret = -1;
1697                         if (nl_route >= 0)
1698                                 ret = mlx5_nl_switch_info
1699                                                (nl_route,
1700                                                 list[ns].ifindex,
1701                                                 &list[ns].info);
1702                         if (ret || (!list[ns].info.representor &&
1703                                     !list[ns].info.master)) {
1704                                 /*
1705                                  * We failed to recognize representors with
1706                                  * Netlink, let's try to perform the task
1707                                  * with sysfs.
1708                                  */
1709                                 ret = mlx5_sysfs_switch_info
1710                                                 (list[ns].ifindex,
1711                                                  &list[ns].info);
1712                         }
1713                         if (!ret && bd >= 0) {
1714                                 switch (list[ns].info.name_type) {
1715                                 case MLX5_PHYS_PORT_NAME_TYPE_UPLINK:
1716                                         if (list[ns].info.port_name == bd)
1717                                                 ns++;
1718                                         break;
1719                                 case MLX5_PHYS_PORT_NAME_TYPE_PFHPF:
1720                                         /* Fallthrough */
1721                                 case MLX5_PHYS_PORT_NAME_TYPE_PFVF:
1722                                         if (list[ns].info.pf_num == bd)
1723                                                 ns++;
1724                                         break;
1725                                 default:
1726                                         break;
1727                                 }
1728                                 continue;
1729                         }
1730                         if (!ret && (list[ns].info.representor ^
1731                                      list[ns].info.master))
1732                                 ns++;
1733                 }
1734                 if (!ns) {
1735                         DRV_LOG(ERR,
1736                                 "unable to recognize master/representors"
1737                                 " on the IB device with multiple ports");
1738                         rte_errno = ENOENT;
1739                         ret = -rte_errno;
1740                         goto exit;
1741                 }
1742         } else {
1743                 /*
1744                  * The existence of several matching entries (nd > 1) means
1745                  * port representors have been instantiated. No existing Verbs
1746                  * call nor sysfs entries can tell them apart, this can only
1747                  * be done through Netlink calls assuming kernel drivers are
1748                  * recent enough to support them.
1749                  *
1750                  * In the event of identification failure through Netlink,
1751                  * try again through sysfs, then:
1752                  *
1753                  * 1. A single IB device matches (nd == 1) with single
1754                  *    port (np=0/1) and is not a representor, assume
1755                  *    no switch support.
1756                  *
1757                  * 2. Otherwise no safe assumptions can be made;
1758                  *    complain louder and bail out.
1759                  */
1760                 for (i = 0; i != nd; ++i) {
1761                         memset(&list[ns].info, 0, sizeof(list[ns].info));
1762                         list[ns].max_port = 1;
1763                         list[ns].phys_port = 1;
1764                         list[ns].phys_dev = ibv_match[i];
1765                         list[ns].eth_dev = NULL;
1766                         list[ns].pci_dev = pci_dev;
1767                         list[ns].pf_bond = -1;
1768                         list[ns].ifindex = 0;
1769                         if (nl_rdma >= 0)
1770                                 list[ns].ifindex = mlx5_nl_ifindex
1771                                 (nl_rdma,
1772                                 mlx5_os_get_dev_device_name
1773                                                 (list[ns].phys_dev), 1);
1774                         if (!list[ns].ifindex) {
1775                                 char ifname[IF_NAMESIZE];
1776
1777                                 /*
1778                                  * Netlink failed, this may happen with an
1779                                  * old ib_core kernel driver (before 4.16).
1780                                  * We can assume the driver is old because
1781                                  * here we are processing single-port IB
1782                                  * devices. Let's try sysfs to retrieve
1783                                  * the ifindex. The method works for the
1784                                  * master device only.
1785                                  */
1786                                 if (nd > 1) {
1787                                         /*
1788                                          * Multiple devices found, assume
1789                                          * representors; we cannot distinguish
1790                                          * master from representor or retrieve
1791                                          * the ifindex via sysfs.
1792                                          */
1793                                         continue;
1794                                 }
1795                                 ret = mlx5_get_ifname_sysfs
1796                                         (ibv_match[i]->ibdev_path, ifname);
1797                                 if (!ret)
1798                                         list[ns].ifindex =
1799                                                 if_nametoindex(ifname);
1800                                 if (!list[ns].ifindex) {
1801                                         /*
1802                                          * No network interface index found
1803                                          * for the specified device, which
1804                                          * means it is neither a representor
1805                                          * nor a master.
1806                                          */
1807                                         continue;
1808                                 }
1809                         }
1810                         ret = -1;
1811                         if (nl_route >= 0)
1812                                 ret = mlx5_nl_switch_info
1813                                                (nl_route,
1814                                                 list[ns].ifindex,
1815                                                 &list[ns].info);
1816                         if (ret || (!list[ns].info.representor &&
1817                                     !list[ns].info.master)) {
1818                                 /*
1819                                  * We failed to recognize representors with
1820                                  * Netlink, let's try to perform the task
1821                                  * with sysfs.
1822                                  */
1823                                 ret = mlx5_sysfs_switch_info
1824                                                 (list[ns].ifindex,
1825                                                  &list[ns].info);
1826                         }
1827                         if (!ret && (list[ns].info.representor ^
1828                                      list[ns].info.master)) {
1829                                 ns++;
1830                         } else if ((nd == 1) &&
1831                                    !list[ns].info.representor &&
1832                                    !list[ns].info.master) {
1833                                 /*
1834                                  * Single IB device with
1835                                  * one physical port and an
1836                                  * attached network device.
1837                                  * Maybe SR-IOV is not enabled
1838                                  * or there are no representors.
1839                                  */
1840                                 DRV_LOG(INFO, "no E-Switch support detected");
1841                                 ns++;
1842                                 break;
1843                         }
1844                 }
1845                 if (!ns) {
1846                         DRV_LOG(ERR,
1847                                 "unable to recognize master/representors"
1848                                 " on the multiple IB devices");
1849                         rte_errno = ENOENT;
1850                         ret = -rte_errno;
1851                         goto exit;
1852                 }
1853         }
1854         MLX5_ASSERT(ns);
1855         /*
1856          * Sort the list to probe devices in natural order for the user's
1857          * convenience (i.e. master first, then representors by ascending ID).
1858          */
1859         qsort(list, ns, sizeof(*list), mlx5_dev_spawn_data_cmp);
1860         /* Device specific configuration. */
1861         switch (pci_dev->id.device_id) {
1862         case PCI_DEVICE_ID_MELLANOX_CONNECTX4VF:
1863         case PCI_DEVICE_ID_MELLANOX_CONNECTX4LXVF:
1864         case PCI_DEVICE_ID_MELLANOX_CONNECTX5VF:
1865         case PCI_DEVICE_ID_MELLANOX_CONNECTX5EXVF:
1866         case PCI_DEVICE_ID_MELLANOX_CONNECTX5BFVF:
1867         case PCI_DEVICE_ID_MELLANOX_CONNECTX6VF:
1868         case PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF:
1869                 dev_config_vf = 1;
1870                 break;
1871         default:
1872                 dev_config_vf = 0;
1873                 break;
1874         }
1875         for (i = 0; i != ns; ++i) {
1876                 uint32_t restore;
1877
1878                 /* Default configuration. */
1879                 memset(&dev_config, 0, sizeof(struct mlx5_dev_config));
1880                 dev_config.vf = dev_config_vf;
1881                 dev_config.mps = MLX5_ARG_UNSET;
1882                 dev_config.dbnc = MLX5_ARG_UNSET;
1883                 dev_config.rx_vec_en = 1;
1884                 dev_config.txq_inline_max = MLX5_ARG_UNSET;
1885                 dev_config.txq_inline_min = MLX5_ARG_UNSET;
1886                 dev_config.txq_inline_mpw = MLX5_ARG_UNSET;
1887                 dev_config.txqs_inline = MLX5_ARG_UNSET;
1888                 dev_config.vf_nl_en = 1;
1889                 dev_config.mr_ext_memseg_en = 1;
1890                 dev_config.mprq.max_memcpy_len = MLX5_MPRQ_MEMCPY_DEFAULT_LEN;
1891                 dev_config.mprq.min_rxqs_num = MLX5_MPRQ_MIN_RXQS;
1892                 dev_config.dv_esw_en = 1;
1893                 dev_config.dv_flow_en = 1;
1894                 dev_config.decap_en = 1;
1895                 dev_config.log_hp_size = MLX5_ARG_UNSET;
1896                 list[i].eth_dev = mlx5_dev_spawn(&pci_dev->device,
1897                                                  &list[i],
1898                                                  &dev_config);
1899                 if (!list[i].eth_dev) {
1900                         if (rte_errno != EBUSY && rte_errno != EEXIST)
1901                                 break;
1902                         /* Device is disabled or already spawned. Ignore it. */
1903                         continue;
1904                 }
1905                 restore = list[i].eth_dev->data->dev_flags;
1906                 rte_eth_copy_pci_info(list[i].eth_dev, pci_dev);
1907                 /* Restore non-PCI flags cleared by the above call. */
1908                 list[i].eth_dev->data->dev_flags |= restore;
1909                 rte_eth_dev_probing_finish(list[i].eth_dev);
1910         }
1911         if (i != ns) {
1912                 DRV_LOG(ERR,
1913                         "probe of PCI device " PCI_PRI_FMT " aborted after"
1914                         " encountering an error: %s",
1915                         pci_dev->addr.domain, pci_dev->addr.bus,
1916                         pci_dev->addr.devid, pci_dev->addr.function,
1917                         strerror(rte_errno));
1918                 ret = -rte_errno;
1919                 /* Roll back. */
1920                 while (i--) {
1921                         if (!list[i].eth_dev)
1922                                 continue;
1923                         mlx5_dev_close(list[i].eth_dev);
1924                         /* mac_addrs must not be freed: part of dev_private. */
1925                         list[i].eth_dev->data->mac_addrs = NULL;
1926                         claim_zero(rte_eth_dev_release_port(list[i].eth_dev));
1927                 }
1928                 /* Restore original error. */
1929                 rte_errno = -ret;
1930         } else {
1931                 ret = 0;
1932         }
1933 exit:
1934         /*
1935          * Do the routine cleanup:
1936          * - close opened Netlink sockets
1937          * - free allocated spawn data array
1938          * - free the Infiniband device list
1939          */
1940         if (nl_rdma >= 0)
1941                 close(nl_rdma);
1942         if (nl_route >= 0)
1943                 close(nl_route);
1944         if (list)
1945                 mlx5_free(list);
1946         MLX5_ASSERT(ibv_list);
1947         mlx5_glue->free_device_list(ibv_list);
1948         return ret;
1949 }
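
/*
 * Once the probe above succeeds, every spawned port is visible through the
 * generic ethdev iterator. A minimal application-side sketch (assumes EAL
 * is already initialized; not part of this driver):
 *
 *   uint16_t port_id;
 *
 *   RTE_ETH_FOREACH_DEV(port_id) {
 *           struct rte_eth_dev_info info;
 *
 *           if (rte_eth_dev_info_get(port_id, &info) == 0)
 *                   printf("port %u driver %s\n", port_id, info.driver_name);
 *   }
 */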
1950
1951 static int
1952 mlx5_config_doorbell_mapping_env(const struct mlx5_dev_config *config)
1953 {
1954         char *env;
1955         int value;
1956
1957         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
1958         /* Save the current environment variable state to restore later. */
1959         env = getenv(MLX5_SHUT_UP_BF);
1960         value = env ? !!strcmp(env, "0") : MLX5_ARG_UNSET;
1961         if (config->dbnc == MLX5_ARG_UNSET)
1962                 setenv(MLX5_SHUT_UP_BF, MLX5_SHUT_UP_BF_DEFAULT, 1);
1963         else
1964                 setenv(MLX5_SHUT_UP_BF,
1965                        config->dbnc == MLX5_TXDB_NCACHED ? "1" : "0", 1);
1966         return value;
1967 }
1968
1969 static void
1970 mlx5_restore_doorbell_mapping_env(int value)
1971 {
1972         MLX5_ASSERT(rte_eal_process_type() == RTE_PROC_PRIMARY);
1973         /* Restore the original environment variable state. */
1974         if (value == MLX5_ARG_UNSET)
1975                 unsetenv(MLX5_SHUT_UP_BF);
1976         else
1977                 setenv(MLX5_SHUT_UP_BF, value ? "1" : "0", 1);
1978 }
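
/*
 * The two helpers above implement a save/override/restore pattern around
 * device creation. Note the saved state is reduced to a boolean, so the
 * restore writes back "1"/"0" rather than the original string. A generic
 * sketch of the same idiom (hypothetical variable name, for illustration):
 *
 *   const char *old = getenv("SOME_VAR");    // NULL when unset
 *
 *   setenv("SOME_VAR", "1", 1);              // override for the call
 *   call_that_reads_env_at_init();
 *   if (old)
 *           setenv("SOME_VAR", old, 1);      // restore the previous value
 *   else
 *           unsetenv("SOME_VAR");            // was unset before
 */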
1979
1980 /**
1981  * Extract pdn of PD object using DV API.
1982  *
1983  * @param[in] pd
1984  *   Pointer to the verbs PD object.
1985  * @param[out] pdn
1986  *   Pointer to the PD object number variable.
1987  *
1988  * @return
1989  *   0 on success, error value otherwise.
1990  */
1991 int
1992 mlx5_os_get_pdn(void *pd, uint32_t *pdn)
1993 {
1994 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
1995         struct mlx5dv_obj obj;
1996         struct mlx5dv_pd pd_info;
1997         int ret = 0;
1998
1999         obj.pd.in = pd;
2000         obj.pd.out = &pd_info;
2001         ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
2002         if (ret) {
2003                 DRV_LOG(DEBUG, "Failed to get PD object info");
2004                 return ret;
2005         }
2006         *pdn = pd_info.pdn;
2007         return 0;
2008 #else
2009         (void)pd;
2010         (void)pdn;
2011         return -ENOTSUP;
2012 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
2013 }
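
/*
 * Typical call pattern for mlx5_os_get_pdn() (a sketch; "pd" is assumed
 * to be a valid protection domain obtained from ibv_alloc_pd()):
 *
 *   uint32_t pdn = 0;
 *
 *   if (mlx5_os_get_pdn(pd, &pdn) == 0)
 *           DRV_LOG(DEBUG, "PD object number is %u", pdn);
 *   else
 *           DRV_LOG(DEBUG, "DV API is not available for PD query");
 */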
2014
2015 /**
2016  * Function API to open IB device.
2017  *
2018  * This function calls the Linux glue APIs to open a device.
2019  *
2020  * @param[in] spawn
2021  *   Pointer to the IB device attributes (name, port, etc).
2022  * @param[out] config
2023  *   Pointer to device configuration structure.
2024  * @param[out] sh
2025  *   Pointer to shared context structure.
2026  *
2027  * @return
2028  *   0 on success, a positive error value otherwise.
2029  */
2030 int
2031 mlx5_os_open_device(const struct mlx5_dev_spawn_data *spawn,
2032                      const struct mlx5_dev_config *config,
2033                      struct mlx5_dev_ctx_shared *sh)
2034 {
2035         int dbmap_env;
2036         int err = 0;
2037
2038         sh->numa_node = spawn->pci_dev->device.numa_node;
2039         pthread_mutex_init(&sh->txpp.mutex, NULL);
2040         /*
2041          * Configure environment variable "MLX5_SHUT_UP_BF"
2042          * before the device creation. The rdma_core library
2043          * checks the variable at device creation and
2044          * stores the result internally.
2045          */
2046         dbmap_env = mlx5_config_doorbell_mapping_env(config);
2047         /* Try to open IB device with DV first, then usual Verbs. */
2048         errno = 0;
2049         sh->ctx = mlx5_glue->dv_open_device(spawn->phys_dev);
2050         if (sh->ctx) {
2051                 sh->devx = 1;
2052                 DRV_LOG(DEBUG, "DevX is supported");
2053                 /* The device is created, no need for environment. */
2054                 mlx5_restore_doorbell_mapping_env(dbmap_env);
2055         } else {
2056                 /* The environment variable is still configured. */
2057                 sh->ctx = mlx5_glue->open_device(spawn->phys_dev);
2058                 err = errno ? errno : ENODEV;
2059                 /*
2060                  * The environment variable is not needed anymore,
2061                  * all device creation attempts are completed.
2062                  */
2063                 mlx5_restore_doorbell_mapping_env(dbmap_env);
2064                 if (!sh->ctx)
2065                         return err;
2066                 DRV_LOG(DEBUG, "DevX is NOT supported");
2067                 err = 0;
2068         }
2069         return err;
2070 }
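
/*
 * The open sequence above follows a try-DV-first-then-fall-back idiom:
 * attempt the extended (DevX-capable) entry point and, on failure, retry
 * the basic one, capturing errno after the fallback so the caller sees
 * why the last attempt failed. A condensed sketch of the idiom
 * (hypothetical helper names, for illustration only):
 *
 *   errno = 0;
 *   ctx = try_extended_open(dev);
 *   if (!ctx) {
 *           ctx = try_basic_open(dev);
 *           err = errno ? errno : ENODEV;
 *           if (!ctx)
 *                   return err;
 *   }
 */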
2071
2072 /**
2073  * Install shared asynchronous device events handler.
2074  * This function is implemented to support event sharing
2075  * between multiple ports of a single IB device.
2076  *
2077  * @param sh
2078  *   Pointer to mlx5_dev_ctx_shared object.
2079  */
2080 void
2081 mlx5_os_dev_shared_handler_install(struct mlx5_dev_ctx_shared *sh)
2082 {
2083         int ret;
2084         int flags;
2085
2086         sh->intr_handle.fd = -1;
2087         flags = fcntl(((struct ibv_context *)sh->ctx)->async_fd, F_GETFL);
2088         ret = fcntl(((struct ibv_context *)sh->ctx)->async_fd,
2089                     F_SETFL, flags | O_NONBLOCK);
2090         if (ret) {
2091                 DRV_LOG(INFO, "failed to change the async event queue file"
2092                         " descriptor to non-blocking");
2093         } else {
2094                 sh->intr_handle.fd = ((struct ibv_context *)sh->ctx)->async_fd;
2095                 sh->intr_handle.type = RTE_INTR_HANDLE_EXT;
2096                 if (rte_intr_callback_register(&sh->intr_handle,
2097                                         mlx5_dev_interrupt_handler, sh)) {
2098                         DRV_LOG(INFO, "Failed to install the shared interrupt.");
2099                         sh->intr_handle.fd = -1;
2100                 }
2101         }
2102         if (sh->devx) {
2103 #ifdef HAVE_IBV_DEVX_ASYNC
2104                 sh->intr_handle_devx.fd = -1;
2105                 sh->devx_comp =
2106                         (void *)mlx5_glue->devx_create_cmd_comp(sh->ctx);
2107                 struct mlx5dv_devx_cmd_comp *devx_comp = sh->devx_comp;
2108                 if (!devx_comp) {
2109                         DRV_LOG(INFO, "failed to allocate devx_comp.");
2110                         return;
2111                 }
2112                 flags = fcntl(devx_comp->fd, F_GETFL);
2113                 ret = fcntl(devx_comp->fd, F_SETFL, flags | O_NONBLOCK);
2114                 if (ret) {
2115                         DRV_LOG(INFO, "failed to change the DevX completion file"
2116                                 " descriptor to non-blocking");
2117                         return;
2118                 }
2119                 sh->intr_handle_devx.fd = devx_comp->fd;
2120                 sh->intr_handle_devx.type = RTE_INTR_HANDLE_EXT;
2121                 if (rte_intr_callback_register(&sh->intr_handle_devx,
2122                                         mlx5_dev_interrupt_handler_devx, sh)) {
2123                         DRV_LOG(INFO, "Failed to install the DevX shared"
2124                                 " interrupt.");
2125                         sh->intr_handle_devx.fd = -1;
2126                 }
2127 #endif /* HAVE_IBV_DEVX_ASYNC */
2128         }
2129 }
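
/*
 * Both registrations above follow the same recipe: make the event file
 * descriptor non-blocking with fcntl() and hand it to the EAL interrupt
 * thread. A minimal sketch of that recipe with a generic fd (assumptions:
 * "fd" is an open event fd and "my_cb" matches rte_intr_callback_fn):
 *
 *   struct rte_intr_handle ih = { .fd = fd, .type = RTE_INTR_HANDLE_EXT };
 *   int flags = fcntl(fd, F_GETFL);
 *
 *   if (fcntl(fd, F_SETFL, flags | O_NONBLOCK) == 0)
 *           rte_intr_callback_register(&ih, my_cb, NULL);
 */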
2130
2131 /**
2132  * Uninstall shared asynchronous device events handler.
2133  * This function is implemented to support event sharing
2134  * between multiple ports of a single IB device.
2135  *
2136  * @param sh
2137  *   Pointer to mlx5_dev_ctx_shared object.
2138  */
2139 void
2140 mlx5_os_dev_shared_handler_uninstall(struct mlx5_dev_ctx_shared *sh)
2141 {
2142         if (sh->intr_handle.fd >= 0)
2143                 mlx5_intr_callback_unregister(&sh->intr_handle,
2144                                               mlx5_dev_interrupt_handler, sh);
2145 #ifdef HAVE_IBV_DEVX_ASYNC
2146         if (sh->intr_handle_devx.fd >= 0)
2147                 rte_intr_callback_unregister(&sh->intr_handle_devx,
2148                                   mlx5_dev_interrupt_handler_devx, sh);
2149         if (sh->devx_comp)
2150                 mlx5_glue->devx_destroy_cmd_comp(sh->devx_comp);
2151 #endif
2152 }
2153
2154 /**
2155  * Read statistics by a named counter.
2156  *
2157  * @param[in] priv
2158  *   Pointer to the private device data structure.
2159  * @param[in] ctr_name
2160  *   Pointer to the name of the statistic counter to read
2161  * @param[out] stat
2162  *   Pointer to read statistic value.
2163  * @return
2164  *   0 on success and stat is valid, 1 if the value could not be read
2165  *   and rte_errno is set.
2166  *
2167  */
2168 int
2169 mlx5_os_read_dev_stat(struct mlx5_priv *priv, const char *ctr_name,
2170                       uint64_t *stat)
2171 {
2172         int fd;
2173
2174         if (priv->sh) {
2175                 MKSTR(path, "%s/ports/%d/hw_counters/%s",
2176                       priv->sh->ibdev_path,
2177                       priv->dev_port,
2178                       ctr_name);
2179                 fd = open(path, O_RDONLY);
2180                 /*
2181                  * In switchdev mode the file location is not per port
2182                  * but rather <ibdev_path>/hw_counters/<file_name>.
2183                  */
2184                 if (fd == -1) {
2185                         MKSTR(path1, "%s/hw_counters/%s",
2186                               priv->sh->ibdev_path,
2187                               ctr_name);
2188                         fd = open(path1, O_RDONLY);
2189                 }
2190                 if (fd != -1) {
2191                         char buf[21] = {'\0'};
2192                         ssize_t n = read(fd, buf, sizeof(buf) - 1);
2193
2194                         close(fd);
2195                         if (n != -1) {
2196                                 *stat = strtoull(buf, NULL, 10);
2197                                 return 0;
2198                         }
2199                 }
2200         }
2201         *stat = 0;
2202         return 1;
2203 }
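
/*
 * Example use of the helper above (a sketch; "out_of_buffer" is one of
 * the hw_counters names exposed by mlx5 kernel drivers and is assumed to
 * be present on the running system):
 *
 *   uint64_t stat;
 *
 *   if (mlx5_os_read_dev_stat(priv, "out_of_buffer", &stat) == 0)
 *           DRV_LOG(DEBUG, "out_of_buffer counter: %" PRIu64, stat);
 */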
2204
2205 /**
2206  * Set the reg_mr and dereg_mr callbacks.
2207  *
2208  * @param[out] reg_mr_cb
2209  *   Pointer to the reg_mr function.
2210  * @param[out] dereg_mr_cb
2211  *   Pointer to the dereg_mr function.
2212  *
2213  */
2214 void
2215 mlx5_os_set_reg_mr_cb(mlx5_reg_mr_t *reg_mr_cb,
2216                       mlx5_dereg_mr_t *dereg_mr_cb)
2217 {
2218         *reg_mr_cb = mlx5_verbs_ops.reg_mr;
2219         *dereg_mr_cb = mlx5_verbs_ops.dereg_mr;
2220 }
2221
2222 /**
2223  * Remove a MAC address from the device.
2224  *
2225  * @param dev
2226  *   Pointer to Ethernet device structure.
2227  * @param index
2228  *   MAC address index.
2229  */
2230 void
2231 mlx5_os_mac_addr_remove(struct rte_eth_dev *dev, uint32_t index)
2232 {
2233         struct mlx5_priv *priv = dev->data->dev_private;
2234         const int vf = priv->config.vf;
2235
2236         if (vf)
2237                 mlx5_nl_mac_addr_remove(priv->nl_socket_route,
2238                                         mlx5_ifindex(dev), priv->mac_own,
2239                                         &dev->data->mac_addrs[index], index);
2240 }
2241
2242 /**
2243  * Add a MAC address to the device.
2244  *
2245  * @param dev
2246  *   Pointer to Ethernet device structure.
2247  * @param mac_addr
2248  *   MAC address to register.
2249  * @param index
2250  *   MAC address index.
2251  *
2252  * @return
2253  *   0 on success, a negative errno value otherwise
2254  */
2255 int
2256 mlx5_os_mac_addr_add(struct rte_eth_dev *dev, struct rte_ether_addr *mac,
2257                      uint32_t index)
2258 {
2259         struct mlx5_priv *priv = dev->data->dev_private;
2260         const int vf = priv->config.vf;
2261         int ret = 0;
2262
2263         if (vf)
2264                 ret = mlx5_nl_mac_addr_add(priv->nl_socket_route,
2265                                            mlx5_ifindex(dev), priv->mac_own,
2266                                            mac, index);
2267         return ret;
2268 }
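
/*
 * The two MAC helpers above are reached through the generic ethdev API;
 * on a VF they translate into Netlink requests toward the kernel driver.
 * An application-side sketch (the MAC value and pool 0 are assumptions
 * for illustration):
 *
 *   struct rte_ether_addr addr =
 *           { .addr_bytes = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 } };
 *
 *   rte_eth_dev_mac_addr_add(port_id, &addr, 0);
 *   rte_eth_dev_mac_addr_remove(port_id, &addr);
 */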
2269
2270 /**
2271  * Modify a VF MAC address
2272  *
2273  * @param priv
2274  *   Pointer to device private data.
2275  * @param mac_addr
2276  *   MAC address to modify into.
2277  * @param iface_idx
2278  *   Net device interface index
2279  * @param vf_index
2280  *   VF index
2281  *
2282  * @return
2283  *   0 on success, a negative errno value otherwise
2284  */
2285 int
2286 mlx5_os_vf_mac_addr_modify(struct mlx5_priv *priv,
2287                            unsigned int iface_idx,
2288                            struct rte_ether_addr *mac_addr,
2289                            int vf_index)
2290 {
2291         return mlx5_nl_vf_mac_addr_modify
2292                 (priv->nl_socket_route, iface_idx, mac_addr, vf_index);
2293 }
2294
2295 /**
2296  * Set device promiscuous mode
2297  *
2298  * @param dev
2299  *   Pointer to Ethernet device structure.
2300  * @param enable
2301  *   0 - promiscuous is disabled, otherwise - enabled
2302  *
2303  * @return
2304  *   0 on success, a negative error value otherwise
2305  */
2306 int
2307 mlx5_os_set_promisc(struct rte_eth_dev *dev, int enable)
2308 {
2309         struct mlx5_priv *priv = dev->data->dev_private;
2310
2311         return mlx5_nl_promisc(priv->nl_socket_route,
2312                                mlx5_ifindex(dev), !!enable);
2313 }
2314
2315 /**
2316  * Set device allmulticast mode
2317  *
2318  * @param dev
2319  *   Pointer to Ethernet device structure.
2320  * @param enable
2321  *   0 - all multicast is disabled, otherwise - enabled
2322  *
2323  * @return
2324  *   0 on success, a negative error value otherwise
2325  */
2326 int
2327 mlx5_os_set_allmulti(struct rte_eth_dev *dev, int enable)
2328 {
2329         struct mlx5_priv *priv = dev->data->dev_private;
2330
2331         return mlx5_nl_allmulti(priv->nl_socket_route,
2332                                 mlx5_ifindex(dev), !!enable);
2333 }
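
/*
 * Applications toggle the two modes above through the generic ethdev
 * calls, which on this OS may end up in the Netlink-backed helpers. A
 * short usage sketch (assumes a valid port_id):
 *
 *   rte_eth_promiscuous_enable(port_id);
 *   rte_eth_allmulticast_enable(port_id);
 *   ...
 *   rte_eth_allmulticast_disable(port_id);
 *   rte_eth_promiscuous_disable(port_id);
 */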
2334
2335 const struct eth_dev_ops mlx5_os_dev_ops = {
2336         .dev_configure = mlx5_dev_configure,
2337         .dev_start = mlx5_dev_start,
2338         .dev_stop = mlx5_dev_stop,
2339         .dev_set_link_down = mlx5_set_link_down,
2340         .dev_set_link_up = mlx5_set_link_up,
2341         .dev_close = mlx5_dev_close,
2342         .promiscuous_enable = mlx5_promiscuous_enable,
2343         .promiscuous_disable = mlx5_promiscuous_disable,
2344         .allmulticast_enable = mlx5_allmulticast_enable,
2345         .allmulticast_disable = mlx5_allmulticast_disable,
2346         .link_update = mlx5_link_update,
2347         .stats_get = mlx5_stats_get,
2348         .stats_reset = mlx5_stats_reset,
2349         .xstats_get = mlx5_xstats_get,
2350         .xstats_reset = mlx5_xstats_reset,
2351         .xstats_get_names = mlx5_xstats_get_names,
2352         .fw_version_get = mlx5_fw_version_get,
2353         .dev_infos_get = mlx5_dev_infos_get,
2354         .read_clock = mlx5_txpp_read_clock,
2355         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
2356         .vlan_filter_set = mlx5_vlan_filter_set,
2357         .rx_queue_setup = mlx5_rx_queue_setup,
2358         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
2359         .tx_queue_setup = mlx5_tx_queue_setup,
2360         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
2361         .rx_queue_release = mlx5_rx_queue_release,
2362         .tx_queue_release = mlx5_tx_queue_release,
2363         .rx_queue_start = mlx5_rx_queue_start,
2364         .rx_queue_stop = mlx5_rx_queue_stop,
2365         .tx_queue_start = mlx5_tx_queue_start,
2366         .tx_queue_stop = mlx5_tx_queue_stop,
2367         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
2368         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
2369         .mac_addr_remove = mlx5_mac_addr_remove,
2370         .mac_addr_add = mlx5_mac_addr_add,
2371         .mac_addr_set = mlx5_mac_addr_set,
2372         .set_mc_addr_list = mlx5_set_mc_addr_list,
2373         .mtu_set = mlx5_dev_set_mtu,
2374         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
2375         .vlan_offload_set = mlx5_vlan_offload_set,
2376         .reta_update = mlx5_dev_rss_reta_update,
2377         .reta_query = mlx5_dev_rss_reta_query,
2378         .rss_hash_update = mlx5_rss_hash_update,
2379         .rss_hash_conf_get = mlx5_rss_hash_conf_get,
2380         .filter_ctrl = mlx5_dev_filter_ctrl,
2381         .rx_descriptor_status = mlx5_rx_descriptor_status,
2382         .tx_descriptor_status = mlx5_tx_descriptor_status,
2383         .rxq_info_get = mlx5_rxq_info_get,
2384         .txq_info_get = mlx5_txq_info_get,
2385         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
2386         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
2387         .rx_queue_count = mlx5_rx_queue_count,
2388         .rx_queue_intr_enable = mlx5_rx_intr_enable,
2389         .rx_queue_intr_disable = mlx5_rx_intr_disable,
2390         .is_removed = mlx5_is_removed,
2391         .udp_tunnel_port_add  = mlx5_udp_tunnel_port_add,
2392         .get_module_info = mlx5_get_module_info,
2393         .get_module_eeprom = mlx5_get_module_eeprom,
2394         .hairpin_cap_get = mlx5_hairpin_cap_get,
2395         .mtr_ops_get = mlx5_flow_meter_ops_get,
2396 };
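
/*
 * How the table above is consumed (a sketch of the generic ethdev
 * dispatch, simplified from the real implementation): the ethdev layer
 * resolves each public call through dev->dev_ops, so rte_eth_dev_set_mtu()
 * ends up in mlx5_dev_set_mtu() via the .mtu_set slot:
 *
 *   int rte_eth_dev_set_mtu(uint16_t port_id, uint16_t mtu)
 *   {
 *           struct rte_eth_dev *dev = &rte_eth_devices[port_id];
 *
 *           // validation elided; the real code checks port and callback
 *           return dev->dev_ops->mtu_set(dev, mtu);
 *   }
 */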
2397
2398 /* Available operations from secondary process. */
2399 const struct eth_dev_ops mlx5_os_dev_sec_ops = {
2400         .stats_get = mlx5_stats_get,
2401         .stats_reset = mlx5_stats_reset,
2402         .xstats_get = mlx5_xstats_get,
2403         .xstats_reset = mlx5_xstats_reset,
2404         .xstats_get_names = mlx5_xstats_get_names,
2405         .fw_version_get = mlx5_fw_version_get,
2406         .dev_infos_get = mlx5_dev_infos_get,
2407         .read_clock = mlx5_txpp_read_clock,
2408         .rx_queue_start = mlx5_rx_queue_start,
2409         .rx_queue_stop = mlx5_rx_queue_stop,
2410         .tx_queue_start = mlx5_tx_queue_start,
2411         .tx_queue_stop = mlx5_tx_queue_stop,
2412         .rx_descriptor_status = mlx5_rx_descriptor_status,
2413         .tx_descriptor_status = mlx5_tx_descriptor_status,
2414         .rxq_info_get = mlx5_rxq_info_get,
2415         .txq_info_get = mlx5_txq_info_get,
2416         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
2417         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
2418         .get_module_info = mlx5_get_module_info,
2419         .get_module_eeprom = mlx5_get_module_eeprom,
2420 };
2421
2422 /* Available operations in flow isolated mode. */
2423 const struct eth_dev_ops mlx5_os_dev_ops_isolate = {
2424         .dev_configure = mlx5_dev_configure,
2425         .dev_start = mlx5_dev_start,
2426         .dev_stop = mlx5_dev_stop,
2427         .dev_set_link_down = mlx5_set_link_down,
2428         .dev_set_link_up = mlx5_set_link_up,
2429         .dev_close = mlx5_dev_close,
2430         .promiscuous_enable = mlx5_promiscuous_enable,
2431         .promiscuous_disable = mlx5_promiscuous_disable,
2432         .allmulticast_enable = mlx5_allmulticast_enable,
2433         .allmulticast_disable = mlx5_allmulticast_disable,
2434         .link_update = mlx5_link_update,
2435         .stats_get = mlx5_stats_get,
2436         .stats_reset = mlx5_stats_reset,
2437         .xstats_get = mlx5_xstats_get,
2438         .xstats_reset = mlx5_xstats_reset,
2439         .xstats_get_names = mlx5_xstats_get_names,
2440         .fw_version_get = mlx5_fw_version_get,
2441         .dev_infos_get = mlx5_dev_infos_get,
2442         .read_clock = mlx5_txpp_read_clock,
2443         .dev_supported_ptypes_get = mlx5_dev_supported_ptypes_get,
2444         .vlan_filter_set = mlx5_vlan_filter_set,
2445         .rx_queue_setup = mlx5_rx_queue_setup,
2446         .rx_hairpin_queue_setup = mlx5_rx_hairpin_queue_setup,
2447         .tx_queue_setup = mlx5_tx_queue_setup,
2448         .tx_hairpin_queue_setup = mlx5_tx_hairpin_queue_setup,
2449         .rx_queue_release = mlx5_rx_queue_release,
2450         .tx_queue_release = mlx5_tx_queue_release,
2451         .rx_queue_start = mlx5_rx_queue_start,
2452         .rx_queue_stop = mlx5_rx_queue_stop,
2453         .tx_queue_start = mlx5_tx_queue_start,
2454         .tx_queue_stop = mlx5_tx_queue_stop,
2455         .flow_ctrl_get = mlx5_dev_get_flow_ctrl,
2456         .flow_ctrl_set = mlx5_dev_set_flow_ctrl,
2457         .mac_addr_remove = mlx5_mac_addr_remove,
2458         .mac_addr_add = mlx5_mac_addr_add,
2459         .mac_addr_set = mlx5_mac_addr_set,
2460         .set_mc_addr_list = mlx5_set_mc_addr_list,
2461         .mtu_set = mlx5_dev_set_mtu,
2462         .vlan_strip_queue_set = mlx5_vlan_strip_queue_set,
2463         .vlan_offload_set = mlx5_vlan_offload_set,
2464         .filter_ctrl = mlx5_dev_filter_ctrl,
2465         .rx_descriptor_status = mlx5_rx_descriptor_status,
2466         .tx_descriptor_status = mlx5_tx_descriptor_status,
2467         .rxq_info_get = mlx5_rxq_info_get,
2468         .txq_info_get = mlx5_txq_info_get,
2469         .rx_burst_mode_get = mlx5_rx_burst_mode_get,
2470         .tx_burst_mode_get = mlx5_tx_burst_mode_get,
2471         .rx_queue_intr_enable = mlx5_rx_intr_enable,
2472         .rx_queue_intr_disable = mlx5_rx_intr_disable,
2473         .is_removed = mlx5_is_removed,
2474         .get_module_info = mlx5_get_module_info,
2475         .get_module_eeprom = mlx5_get_module_eeprom,
2476         .hairpin_cap_get = mlx5_hairpin_cap_get,
2477         .mtr_ops_get = mlx5_flow_meter_ops_get,
2478 };