9bb462e95c4f76c7c10db14c4c99bbc55ed99853
[dpdk.git] / drivers / vdpa / mlx5 / mlx5_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2019 Mellanox Technologies, Ltd
3  */
4 #include <unistd.h>
5 #include <net/if.h>
6 #include <sys/socket.h>
7 #include <sys/ioctl.h>
8 #include <fcntl.h>
9 #include <netinet/in.h>
10
11 #include <rte_malloc.h>
12 #include <rte_log.h>
13 #include <rte_errno.h>
14 #include <rte_bus_pci.h>
15 #include <rte_pci.h>
16 #include <rte_string_fns.h>
17
18 #include <mlx5_glue.h>
19 #include <mlx5_common.h>
20 #include <mlx5_devx_cmds.h>
21 #include <mlx5_prm.h>
22 #include <mlx5_nl.h>
23
24 #include "mlx5_vdpa_utils.h"
25 #include "mlx5_vdpa.h"
26
27
/*
 * Virtio feature bits always reported by mlx5_vdpa_get_vdpa_features();
 * capability-dependent bits are ORed on top of this mask there.
 */
#define MLX5_VDPA_DEFAULT_FEATURES ((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
                            (1ULL << VIRTIO_F_ANY_LAYOUT) | \
                            (1ULL << VIRTIO_NET_F_MQ) | \
                            (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
                            (1ULL << VIRTIO_F_ORDER_PLATFORM) | \
                            (1ULL << VHOST_F_LOG_ALL) | \
                            (1ULL << VIRTIO_NET_F_MTU))

/*
 * Vhost-user protocol feature bits reported unconditionally by
 * mlx5_vdpa_get_protocol_features().
 */
#define MLX5_VDPA_PROTOCOL_FEATURES \
                            ((1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
                             (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
                             (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
                             (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
                             (1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
                             (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))
43
/* Upper bound on the MTU-set and IB-device-rematch polling loops. */
#define MLX5_VDPA_MAX_RETRIES 20
/* Argument passed to usleep() between two polling attempts. */
#define MLX5_VDPA_USEC 1000
/* Default for priv->no_traffic_time_s when no devarg overrides it. */
#define MLX5_VDPA_DEFAULT_NO_TRAFFIC_TIME_S 2LLU
47
/*
 * List of all probed device private contexts. Entries are inserted by the
 * probe callback and removed by the remove callback; every access in this
 * file is done with priv_list_lock held.
 * NOTE(review): priv_list has external linkage - confirm whether any other
 * translation unit relies on it before making it static.
 */
TAILQ_HEAD(mlx5_vdpa_privs, mlx5_vdpa_priv) priv_list =
                                              TAILQ_HEAD_INITIALIZER(priv_list);
static pthread_mutex_t priv_list_lock = PTHREAD_MUTEX_INITIALIZER;
51
52 static struct mlx5_vdpa_priv *
53 mlx5_vdpa_find_priv_resource_by_vdev(struct rte_vdpa_device *vdev)
54 {
55         struct mlx5_vdpa_priv *priv;
56         int found = 0;
57
58         pthread_mutex_lock(&priv_list_lock);
59         TAILQ_FOREACH(priv, &priv_list, next) {
60                 if (vdev == priv->vdev) {
61                         found = 1;
62                         break;
63                 }
64         }
65         pthread_mutex_unlock(&priv_list_lock);
66         if (!found) {
67                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
68                 rte_errno = EINVAL;
69                 return NULL;
70         }
71         return priv;
72 }
73
74 static int
75 mlx5_vdpa_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
76 {
77         struct mlx5_vdpa_priv *priv =
78                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
79
80         if (priv == NULL) {
81                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
82                 return -1;
83         }
84         *queue_num = priv->caps.max_num_virtio_queues;
85         return 0;
86 }
87
88 static int
89 mlx5_vdpa_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
90 {
91         struct mlx5_vdpa_priv *priv =
92                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
93
94         if (priv == NULL) {
95                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
96                 return -1;
97         }
98         *features = MLX5_VDPA_DEFAULT_FEATURES;
99         if (priv->caps.virtio_queue_type & (1 << MLX5_VIRTQ_TYPE_PACKED))
100                 *features |= (1ULL << VIRTIO_F_RING_PACKED);
101         if (priv->caps.tso_ipv4)
102                 *features |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
103         if (priv->caps.tso_ipv6)
104                 *features |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
105         if (priv->caps.tx_csum)
106                 *features |= (1ULL << VIRTIO_NET_F_CSUM);
107         if (priv->caps.rx_csum)
108                 *features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM);
109         if (priv->caps.virtio_version_1_0)
110                 *features |= (1ULL << VIRTIO_F_VERSION_1);
111         return 0;
112 }
113
114 static int
115 mlx5_vdpa_get_protocol_features(struct rte_vdpa_device *vdev,
116                 uint64_t *features)
117 {
118         struct mlx5_vdpa_priv *priv =
119                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
120
121         if (priv == NULL) {
122                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
123                 return -1;
124         }
125         *features = MLX5_VDPA_PROTOCOL_FEATURES;
126         return 0;
127 }
128
129 static int
130 mlx5_vdpa_set_vring_state(int vid, int vring, int state)
131 {
132         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
133         struct mlx5_vdpa_priv *priv =
134                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
135
136         if (priv == NULL) {
137                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
138                 return -EINVAL;
139         }
140         if (vring >= (int)priv->caps.max_num_virtio_queues * 2) {
141                 DRV_LOG(ERR, "Too big vring id: %d.", vring);
142                 return -E2BIG;
143         }
144         return mlx5_vdpa_virtq_enable(priv, vring, state);
145 }
146
147 static int
148 mlx5_vdpa_features_set(int vid)
149 {
150         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
151         struct mlx5_vdpa_priv *priv =
152                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
153         uint64_t log_base, log_size;
154         uint64_t features;
155         int ret;
156
157         if (priv == NULL) {
158                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
159                 return -EINVAL;
160         }
161         ret = rte_vhost_get_negotiated_features(vid, &features);
162         if (ret) {
163                 DRV_LOG(ERR, "Failed to get negotiated features.");
164                 return ret;
165         }
166         if (RTE_VHOST_NEED_LOG(features)) {
167                 ret = rte_vhost_get_log_base(vid, &log_base, &log_size);
168                 if (ret) {
169                         DRV_LOG(ERR, "Failed to get log base.");
170                         return ret;
171                 }
172                 ret = mlx5_vdpa_dirty_bitmap_set(priv, log_base, log_size);
173                 if (ret) {
174                         DRV_LOG(ERR, "Failed to set dirty bitmap.");
175                         return ret;
176                 }
177                 DRV_LOG(INFO, "mlx5 vdpa: enabling dirty logging...");
178                 ret = mlx5_vdpa_logging_enable(priv, 1);
179                 if (ret) {
180                         DRV_LOG(ERR, "Failed t enable dirty logging.");
181                         return ret;
182                 }
183         }
184         return 0;
185 }
186
187 static int
188 mlx5_vdpa_pd_create(struct mlx5_vdpa_priv *priv)
189 {
190 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
191         priv->pd = mlx5_glue->alloc_pd(priv->ctx);
192         if (priv->pd == NULL) {
193                 DRV_LOG(ERR, "Failed to allocate PD.");
194                 return errno ? -errno : -ENOMEM;
195         }
196         struct mlx5dv_obj obj;
197         struct mlx5dv_pd pd_info;
198         int ret = 0;
199
200         obj.pd.in = priv->pd;
201         obj.pd.out = &pd_info;
202         ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
203         if (ret) {
204                 DRV_LOG(ERR, "Fail to get PD object info.");
205                 mlx5_glue->dealloc_pd(priv->pd);
206                 priv->pd = NULL;
207                 return -errno;
208         }
209         priv->pdn = pd_info.pdn;
210         return 0;
211 #else
212         (void)priv;
213         DRV_LOG(ERR, "Cannot get pdn - no DV support.");
214         return -ENOTSUP;
215 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
216 }
217
218 static int
219 mlx5_vdpa_mtu_set(struct mlx5_vdpa_priv *priv)
220 {
221         struct ifreq request;
222         uint16_t vhost_mtu = 0;
223         uint16_t kern_mtu = 0;
224         int ret = rte_vhost_get_mtu(priv->vid, &vhost_mtu);
225         int sock;
226         int retries = MLX5_VDPA_MAX_RETRIES;
227
228         if (ret) {
229                 DRV_LOG(DEBUG, "Cannot get vhost MTU - %d.", ret);
230                 return ret;
231         }
232         if (!vhost_mtu) {
233                 DRV_LOG(DEBUG, "Vhost MTU is 0.");
234                 return ret;
235         }
236         ret = mlx5_get_ifname_sysfs(priv->ctx->device->ibdev_path,
237                                     request.ifr_name);
238         if (ret) {
239                 DRV_LOG(DEBUG, "Cannot get kernel IF name - %d.", ret);
240                 return ret;
241         }
242         sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
243         if (sock == -1) {
244                 DRV_LOG(DEBUG, "Cannot open IF socket.");
245                 return sock;
246         }
247         while (retries--) {
248                 ret = ioctl(sock, SIOCGIFMTU, &request);
249                 if (ret == -1)
250                         break;
251                 kern_mtu = request.ifr_mtu;
252                 DRV_LOG(DEBUG, "MTU: current %d requested %d.", (int)kern_mtu,
253                         (int)vhost_mtu);
254                 if (kern_mtu == vhost_mtu)
255                         break;
256                 request.ifr_mtu = vhost_mtu;
257                 ret = ioctl(sock, SIOCSIFMTU, &request);
258                 if (ret == -1)
259                         break;
260                 request.ifr_mtu = 0;
261                 usleep(MLX5_VDPA_USEC);
262         }
263         close(sock);
264         return kern_mtu == vhost_mtu ? 0 : -1;
265 }
266
267 static int
268 mlx5_vdpa_dev_close(int vid)
269 {
270         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
271         struct mlx5_vdpa_priv *priv =
272                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
273         int ret = 0;
274
275         if (priv == NULL) {
276                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
277                 return -1;
278         }
279         if (priv->configured)
280                 ret |= mlx5_vdpa_lm_log(priv);
281         mlx5_vdpa_cqe_event_unset(priv);
282         mlx5_vdpa_steer_unset(priv);
283         mlx5_vdpa_virtqs_release(priv);
284         mlx5_vdpa_event_qp_global_release(priv);
285         mlx5_vdpa_mem_dereg(priv);
286         if (priv->pd) {
287                 claim_zero(mlx5_glue->dealloc_pd(priv->pd));
288                 priv->pd = NULL;
289         }
290         priv->configured = 0;
291         priv->vid = 0;
292         DRV_LOG(INFO, "vDPA device %d was closed.", vid);
293         return ret;
294 }
295
296 static int
297 mlx5_vdpa_dev_config(int vid)
298 {
299         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
300         struct mlx5_vdpa_priv *priv =
301                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
302
303         if (priv == NULL) {
304                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
305                 return -EINVAL;
306         }
307         if (priv->configured && mlx5_vdpa_dev_close(vid)) {
308                 DRV_LOG(ERR, "Failed to reconfigure vid %d.", vid);
309                 return -1;
310         }
311         priv->vid = vid;
312         if (mlx5_vdpa_mtu_set(priv))
313                 DRV_LOG(WARNING, "MTU cannot be set on device %s.",
314                                 vdev->device->name);
315         if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
316             mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
317             mlx5_vdpa_cqe_event_setup(priv)) {
318                 mlx5_vdpa_dev_close(vid);
319                 return -1;
320         }
321         priv->configured = 1;
322         DRV_LOG(INFO, "vDPA device %d was configured.", vid);
323         return 0;
324 }
325
326 static int
327 mlx5_vdpa_get_device_fd(int vid)
328 {
329         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
330         struct mlx5_vdpa_priv *priv =
331                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
332
333         if (priv == NULL) {
334                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
335                 return -EINVAL;
336         }
337         return priv->ctx->cmd_fd;
338 }
339
340 static int
341 mlx5_vdpa_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
342 {
343         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
344         struct mlx5_vdpa_priv *priv =
345                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
346
347         RTE_SET_USED(qid);
348         if (priv == NULL) {
349                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
350                 return -EINVAL;
351         }
352         if (!priv->var) {
353                 DRV_LOG(ERR, "VAR was not created for device %s, is the device"
354                         " configured?.", vdev->device->name);
355                 return -EINVAL;
356         }
357         *offset = priv->var->mmap_off;
358         *size = priv->var->length;
359         return 0;
360 }
361
362 static int
363 mlx5_vdpa_get_stats_names(struct rte_vdpa_device *vdev,
364                 struct rte_vdpa_stat_name *stats_names,
365                 unsigned int size)
366 {
367         static const char *mlx5_vdpa_stats_names[MLX5_VDPA_STATS_MAX] = {
368                 "received_descriptors",
369                 "completed_descriptors",
370                 "bad descriptor errors",
371                 "exceed max chain",
372                 "invalid buffer",
373                 "completion errors",
374         };
375         struct mlx5_vdpa_priv *priv =
376                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
377         unsigned int i;
378
379         if (priv == NULL) {
380                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
381                 return -ENODEV;
382         }
383         if (!stats_names)
384                 return MLX5_VDPA_STATS_MAX;
385         size = RTE_MIN(size, (unsigned int)MLX5_VDPA_STATS_MAX);
386         for (i = 0; i < size; ++i)
387                 strlcpy(stats_names[i].name, mlx5_vdpa_stats_names[i],
388                         RTE_VDPA_STATS_NAME_SIZE);
389         return size;
390 }
391
392 static int
393 mlx5_vdpa_get_stats(struct rte_vdpa_device *vdev, int qid,
394                 struct rte_vdpa_stat *stats, unsigned int n)
395 {
396         struct mlx5_vdpa_priv *priv =
397                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
398
399         if (priv == NULL) {
400                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
401                 return -ENODEV;
402         }
403         if (!priv->configured) {
404                 DRV_LOG(ERR, "Device %s was not configured.",
405                                 vdev->device->name);
406                 return -ENODATA;
407         }
408         if (qid >= (int)priv->nr_virtqs) {
409                 DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
410                                 vdev->device->name);
411                 return -E2BIG;
412         }
413         if (!priv->caps.queue_counters_valid) {
414                 DRV_LOG(ERR, "Virtq statistics is not supported for device %s.",
415                         vdev->device->name);
416                 return -ENOTSUP;
417         }
418         return mlx5_vdpa_virtq_stats_get(priv, qid, stats, n);
419 }
420
421 static int
422 mlx5_vdpa_reset_stats(struct rte_vdpa_device *vdev, int qid)
423 {
424         struct mlx5_vdpa_priv *priv =
425                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
426
427         if (priv == NULL) {
428                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
429                 return -ENODEV;
430         }
431         if (!priv->configured) {
432                 DRV_LOG(ERR, "Device %s was not configured.",
433                                 vdev->device->name);
434                 return -ENODATA;
435         }
436         if (qid >= (int)priv->nr_virtqs) {
437                 DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
438                                 vdev->device->name);
439                 return -E2BIG;
440         }
441         if (!priv->caps.queue_counters_valid) {
442                 DRV_LOG(ERR, "Virtq statistics is not supported for device %s.",
443                         vdev->device->name);
444                 return -ENOTSUP;
445         }
446         return mlx5_vdpa_virtq_stats_reset(priv, qid);
447 }
448
/*
 * vDPA device operations implemented by this driver. The table is handed to
 * rte_vdpa_register_device() by the probe callback. migration_done and
 * get_vfio_group_fd are left NULL (not implemented here).
 */
static struct rte_vdpa_dev_ops mlx5_vdpa_ops = {
        .get_queue_num = mlx5_vdpa_get_queue_num,
        .get_features = mlx5_vdpa_get_vdpa_features,
        .get_protocol_features = mlx5_vdpa_get_protocol_features,
        .dev_conf = mlx5_vdpa_dev_config,
        .dev_close = mlx5_vdpa_dev_close,
        .set_vring_state = mlx5_vdpa_set_vring_state,
        .set_features = mlx5_vdpa_features_set,
        .migration_done = NULL,
        .get_vfio_group_fd = NULL,
        /* Returns the command fd of the opened IB device context. */
        .get_vfio_device_fd = mlx5_vdpa_get_device_fd,
        .get_notify_area = mlx5_vdpa_get_notify_area,
        .get_stats_names = mlx5_vdpa_get_stats_names,
        .get_stats = mlx5_vdpa_get_stats,
        .reset_stats = mlx5_vdpa_reset_stats,
};
465
466 static struct ibv_device *
467 mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr)
468 {
469         int n;
470         struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
471         struct ibv_device *ibv_match = NULL;
472
473         if (!ibv_list) {
474                 rte_errno = ENOSYS;
475                 return NULL;
476         }
477         while (n-- > 0) {
478                 struct rte_pci_addr pci_addr;
479
480                 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
481                 if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr))
482                         continue;
483                 if (rte_pci_addr_cmp(addr, &pci_addr))
484                         continue;
485                 ibv_match = ibv_list[n];
486                 break;
487         }
488         if (!ibv_match)
489                 rte_errno = ENOENT;
490         mlx5_glue->free_device_list(ibv_list);
491         return ibv_match;
492 }
493
494 /* Try to disable ROCE by Netlink\Devlink. */
495 static int
496 mlx5_vdpa_nl_roce_disable(const char *addr)
497 {
498         int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
499         int devlink_id;
500         int enable;
501         int ret;
502
503         if (nlsk_fd < 0)
504                 return nlsk_fd;
505         devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
506         if (devlink_id < 0) {
507                 ret = devlink_id;
508                 DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by"
509                         " Netlink.");
510                 goto close;
511         }
512         ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
513         if (ret) {
514                 DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
515                         ret);
516                 goto close;
517         } else if (!enable) {
518                 DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
519                 goto close;
520         }
521         ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
522         if (ret)
523                 DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
524         else
525                 DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
526 close:
527         close(nlsk_fd);
528         return ret;
529 }
530
531 /* Try to disable ROCE by sysfs. */
532 static int
533 mlx5_vdpa_sys_roce_disable(const char *addr)
534 {
535         FILE *file_o;
536         int enable;
537         int ret;
538
539         MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
540         file_o = fopen(file_p, "rb");
541         if (!file_o) {
542                 rte_errno = ENOTSUP;
543                 return -ENOTSUP;
544         }
545         ret = fscanf(file_o, "%d", &enable);
546         if (ret != 1) {
547                 rte_errno = EINVAL;
548                 ret = EINVAL;
549                 goto close;
550         } else if (!enable) {
551                 ret = 0;
552                 DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
553                 goto close;
554         }
555         fclose(file_o);
556         file_o = fopen(file_p, "wb");
557         if (!file_o) {
558                 rte_errno = ENOTSUP;
559                 return -ENOTSUP;
560         }
561         fprintf(file_o, "0\n");
562         ret = 0;
563 close:
564         if (ret)
565                 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
566         else
567                 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
568         fclose(file_o);
569         return ret;
570 }
571
572 static int
573 mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
574 {
575         char addr_name[64] = {0};
576
577         rte_pci_device_name(addr, addr_name, sizeof(addr_name));
578         /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
579         if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
580             mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
581                 /*
582                  * Succeed to disable ROCE, wait for the IB device to appear
583                  * again after reload.
584                  */
585                 int r;
586                 struct ibv_device *ibv_new;
587
588                 for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
589                         ibv_new = mlx5_vdpa_get_ib_device_match(addr);
590                         if (ibv_new) {
591                                 *ibv = ibv_new;
592                                 return 0;
593                         }
594                         usleep(MLX5_VDPA_USEC);
595                 }
596                 DRV_LOG(ERR, "Cannot much device %s after ROCE disable, "
597                         "retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES);
598                 rte_errno = EAGAIN;
599         }
600         return -rte_errno;
601 }
602
603 static int
604 mlx5_vdpa_args_check_handler(const char *key, const char *val, void *opaque)
605 {
606         struct mlx5_vdpa_priv *priv = opaque;
607         unsigned long tmp;
608
609         if (strcmp(key, "class") == 0)
610                 return 0;
611         errno = 0;
612         tmp = strtoul(val, NULL, 0);
613         if (errno) {
614                 DRV_LOG(WARNING, "%s: \"%s\" is an invalid integer.", key, val);
615                 return -errno;
616         }
617         if (strcmp(key, "event_mode") == 0) {
618                 if (tmp <= MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT)
619                         priv->event_mode = (int)tmp;
620                 else
621                         DRV_LOG(WARNING, "Invalid event_mode %s.", val);
622         } else if (strcmp(key, "event_us") == 0) {
623                 priv->event_us = (uint32_t)tmp;
624         } else if (strcmp(key, "no_traffic_time") == 0) {
625                 priv->no_traffic_time_s = (uint32_t)tmp;
626         } else {
627                 DRV_LOG(WARNING, "Invalid key %s.", key);
628         }
629         return 0;
630 }
631
632 static void
633 mlx5_vdpa_config_get(struct rte_devargs *devargs, struct mlx5_vdpa_priv *priv)
634 {
635         struct rte_kvargs *kvlist;
636
637         priv->event_mode = MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER;
638         priv->event_us = 0;
639         priv->no_traffic_time_s = MLX5_VDPA_DEFAULT_NO_TRAFFIC_TIME_S;
640         if (devargs == NULL)
641                 return;
642         kvlist = rte_kvargs_parse(devargs->args, NULL);
643         if (kvlist == NULL)
644                 return;
645         rte_kvargs_process(kvlist, NULL, mlx5_vdpa_args_check_handler, priv);
646         rte_kvargs_free(kvlist);
647         if (!priv->event_us) {
648                 if (priv->event_mode == MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER)
649                         priv->event_us = MLX5_VDPA_DEFAULT_TIMER_STEP_US;
650                 else if (priv->event_mode == MLX5_VDPA_EVENT_MODE_FIXED_TIMER)
651                         priv->event_us = MLX5_VDPA_DEFAULT_TIMER_DELAY_US;
652         }
653         DRV_LOG(DEBUG, "event mode is %d.", priv->event_mode);
654         DRV_LOG(DEBUG, "event_us is %u us.", priv->event_us);
655         DRV_LOG(DEBUG, "no traffic time is %u s.", priv->no_traffic_time_s);
656 }
657
658 /**
659  * DPDK callback to register a PCI device.
660  *
661  * This function spawns vdpa device out of a given PCI device.
662  *
663  * @param[in] pci_drv
664  *   PCI driver structure (mlx5_vpda_driver).
665  * @param[in] pci_dev
666  *   PCI device information.
667  *
668  * @return
669  *   0 on success, 1 to skip this driver, a negative errno value otherwise
670  *   and rte_errno is set.
671  */
672 static int
673 mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
674                     struct rte_pci_device *pci_dev __rte_unused)
675 {
676         struct ibv_device *ibv;
677         struct mlx5_vdpa_priv *priv = NULL;
678         struct ibv_context *ctx = NULL;
679         struct mlx5_hca_attr attr;
680         int ret;
681
682         if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_VDPA) {
683                 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
684                         " driver.");
685                 return 1;
686         }
687         ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
688         if (!ibv) {
689                 DRV_LOG(ERR, "No matching IB device for PCI slot "
690                         PCI_PRI_FMT ".", pci_dev->addr.domain,
691                         pci_dev->addr.bus, pci_dev->addr.devid,
692                         pci_dev->addr.function);
693                 return -rte_errno;
694         } else {
695                 DRV_LOG(INFO, "PCI information matches for device \"%s\".",
696                         ibv->name);
697         }
698         if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
699                 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
700                         ibv->name);
701                 return -rte_errno;
702         }
703         ctx = mlx5_glue->dv_open_device(ibv);
704         if (!ctx) {
705                 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
706                 rte_errno = ENODEV;
707                 return -rte_errno;
708         }
709         ret = mlx5_devx_cmd_query_hca_attr(ctx, &attr);
710         if (ret) {
711                 DRV_LOG(ERR, "Unable to read HCA capabilities.");
712                 rte_errno = ENOTSUP;
713                 goto error;
714         } else if (!attr.vdpa.valid || !attr.vdpa.max_num_virtio_queues) {
715                 DRV_LOG(ERR, "Not enough capabilities to support vdpa, maybe "
716                         "old FW/OFED version?");
717                 rte_errno = ENOTSUP;
718                 goto error;
719         }
720         if (!attr.vdpa.queue_counters_valid)
721                 DRV_LOG(DEBUG, "No capability to support virtq statistics.");
722         priv = rte_zmalloc("mlx5 vDPA device private", sizeof(*priv) +
723                            sizeof(struct mlx5_vdpa_virtq) *
724                            attr.vdpa.max_num_virtio_queues * 2,
725                            RTE_CACHE_LINE_SIZE);
726         if (!priv) {
727                 DRV_LOG(ERR, "Failed to allocate private memory.");
728                 rte_errno = ENOMEM;
729                 goto error;
730         }
731         priv->caps = attr.vdpa;
732         priv->log_max_rqt_size = attr.log_max_rqt_size;
733         priv->ctx = ctx;
734         priv->pci_dev = pci_dev;
735         priv->var = mlx5_glue->dv_alloc_var(ctx, 0);
736         if (!priv->var) {
737                 DRV_LOG(ERR, "Failed to allocate VAR %u.\n", errno);
738                 goto error;
739         }
740         priv->vdev = rte_vdpa_register_device(&pci_dev->device,
741                         &mlx5_vdpa_ops);
742         if (priv->vdev == NULL) {
743                 DRV_LOG(ERR, "Failed to register vDPA device.");
744                 rte_errno = rte_errno ? rte_errno : EINVAL;
745                 goto error;
746         }
747         mlx5_vdpa_config_get(pci_dev->device.devargs, priv);
748         SLIST_INIT(&priv->mr_list);
749         pthread_mutex_lock(&priv_list_lock);
750         TAILQ_INSERT_TAIL(&priv_list, priv, next);
751         pthread_mutex_unlock(&priv_list_lock);
752         return 0;
753
754 error:
755         if (priv) {
756                 if (priv->var)
757                         mlx5_glue->dv_free_var(priv->var);
758                 rte_free(priv);
759         }
760         if (ctx)
761                 mlx5_glue->close_device(ctx);
762         return -rte_errno;
763 }
764
765 /**
766  * DPDK callback to remove a PCI device.
767  *
768  * This function removes all vDPA devices belong to a given PCI device.
769  *
770  * @param[in] pci_dev
771  *   Pointer to the PCI device.
772  *
773  * @return
774  *   0 on success, the function cannot fail.
775  */
776 static int
777 mlx5_vdpa_pci_remove(struct rte_pci_device *pci_dev)
778 {
779         struct mlx5_vdpa_priv *priv = NULL;
780         int found = 0;
781
782         pthread_mutex_lock(&priv_list_lock);
783         TAILQ_FOREACH(priv, &priv_list, next) {
784                 if (!rte_pci_addr_cmp(&priv->pci_dev->addr, &pci_dev->addr)) {
785                         found = 1;
786                         break;
787                 }
788         }
789         if (found)
790                 TAILQ_REMOVE(&priv_list, priv, next);
791         pthread_mutex_unlock(&priv_list_lock);
792         if (found) {
793                 if (priv->configured)
794                         mlx5_vdpa_dev_close(priv->vid);
795                 if (priv->var) {
796                         mlx5_glue->dv_free_var(priv->var);
797                         priv->var = NULL;
798                 }
799                 mlx5_glue->close_device(priv->ctx);
800                 rte_free(priv);
801         }
802         return 0;
803 }
804
/*
 * PCI IDs handled by this driver (Mellanox ConnectX-6 and ConnectX-6 Dx
 * variants, going by the macro names). The zero vendor_id entry terminates
 * the table.
 */
static const struct rte_pci_id mlx5_vdpa_pci_id_map[] = {
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                                PCI_DEVICE_ID_MELLANOX_CONNECTX6)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                                PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                                PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                                PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
        },
        {
                RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
                                PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
        },
        {
                /* Sentinel. */
                .vendor_id = 0
        }
};
830
/*
 * PCI driver descriptor: binds the ID table above to the probe and remove
 * callbacks of this file. It is registered from rte_mlx5_vdpa_init().
 */
static struct rte_pci_driver mlx5_vdpa_driver = {
        .driver = {
                .name = "mlx5_vdpa",
        },
        .id_table = mlx5_vdpa_pci_id_map,
        .probe = mlx5_vdpa_pci_probe,
        .remove = mlx5_vdpa_pci_remove,
        .drv_flags = 0,
};
840
/*
 * Declare the log type used by this driver; presumably it is registered
 * under the name "pmd.vdpa.mlx5" with NOTICE as the default level - confirm
 * against the RTE_LOG_REGISTER() definition.
 */
RTE_LOG_REGISTER(mlx5_vdpa_logtype, pmd.vdpa.mlx5, NOTICE)
842
/**
 * Driver initialization routine.
 *
 * Registers the PCI driver only when mlx5_glue is non-NULL; otherwise the
 * driver stays unregistered and no device will be probed by it.
 */
RTE_INIT(rte_mlx5_vdpa_init)
{
        if (mlx5_glue)
                rte_pci_register(&mlx5_vdpa_driver);
}
851
/*
 * PMD metadata published under the name net_mlx5_vdpa: the exported driver
 * name, the PCI ID table and the kernel module dependency string.
 * NOTE(review): the exact consumer of this metadata is defined by the
 * RTE_PMD_* macros, not visible here.
 */
RTE_PMD_EXPORT_NAME(net_mlx5_vdpa, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(net_mlx5_vdpa, mlx5_vdpa_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(net_mlx5_vdpa, "* ib_uverbs & mlx5_core & mlx5_ib");