9ea032d57b5906c02a666176f88e16aa2e5682df
[dpdk.git] / drivers / vdpa / mlx5 / mlx5_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2019 Mellanox Technologies, Ltd
3  */
4 #include <unistd.h>
5 #include <net/if.h>
6 #include <sys/socket.h>
7 #include <sys/ioctl.h>
8 #include <fcntl.h>
9 #include <netinet/in.h>
10
11 #include <rte_malloc.h>
12 #include <rte_log.h>
13 #include <rte_errno.h>
14 #include <rte_bus_pci.h>
15 #include <rte_pci.h>
16 #include <rte_string_fns.h>
17
18 #include <mlx5_glue.h>
19 #include <mlx5_common.h>
20 #include <mlx5_devx_cmds.h>
21 #include <mlx5_prm.h>
22 #include <mlx5_nl.h>
23
24 #include "mlx5_vdpa_utils.h"
25 #include "mlx5_vdpa.h"
26
27
/*
 * Virtio feature bits that mlx5_vdpa_get_vdpa_features() always reports,
 * before the capability-dependent bits are added.
 */
#define MLX5_VDPA_DEFAULT_FEATURES \
	((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
	 (1ULL << VIRTIO_F_ANY_LAYOUT) | \
	 (1ULL << VIRTIO_NET_F_MQ) | \
	 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
	 (1ULL << VIRTIO_F_ORDER_PLATFORM) | \
	 (1ULL << VHOST_F_LOG_ALL) | \
	 (1ULL << VIRTIO_NET_F_MTU))

/* Vhost-user protocol feature bits reported by the driver. */
#define MLX5_VDPA_PROTOCOL_FEATURES \
	((1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
	 (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
	 (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
	 (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
	 (1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
	 (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU))

/* Iteration bound of the polling loops (MTU setup, IB device re-discovery). */
#define MLX5_VDPA_MAX_RETRIES 20
/* Delay handed to usleep() between two polling iterations. */
#define MLX5_VDPA_USEC 1000
46
/* Log type identifier, assigned by the driver initialization routine. */
int mlx5_vdpa_logtype;

/* List of all probed driver instances; accesses are done under priv_list_lock. */
TAILQ_HEAD(mlx5_vdpa_privs, mlx5_vdpa_priv) priv_list =
	TAILQ_HEAD_INITIALIZER(priv_list);
static pthread_mutex_t priv_list_lock = PTHREAD_MUTEX_INITIALIZER;
51
52 static struct mlx5_vdpa_priv *
53 mlx5_vdpa_find_priv_resource_by_vdev(struct rte_vdpa_device *vdev)
54 {
55         struct mlx5_vdpa_priv *priv;
56         int found = 0;
57
58         pthread_mutex_lock(&priv_list_lock);
59         TAILQ_FOREACH(priv, &priv_list, next) {
60                 if (vdev == priv->vdev) {
61                         found = 1;
62                         break;
63                 }
64         }
65         pthread_mutex_unlock(&priv_list_lock);
66         if (!found) {
67                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
68                 rte_errno = EINVAL;
69                 return NULL;
70         }
71         return priv;
72 }
73
74 static int
75 mlx5_vdpa_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
76 {
77         struct mlx5_vdpa_priv *priv =
78                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
79
80         if (priv == NULL) {
81                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
82                 return -1;
83         }
84         *queue_num = priv->caps.max_num_virtio_queues;
85         return 0;
86 }
87
88 static int
89 mlx5_vdpa_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
90 {
91         struct mlx5_vdpa_priv *priv =
92                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
93
94         if (priv == NULL) {
95                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
96                 return -1;
97         }
98         *features = MLX5_VDPA_DEFAULT_FEATURES;
99         if (priv->caps.virtio_queue_type & (1 << MLX5_VIRTQ_TYPE_PACKED))
100                 *features |= (1ULL << VIRTIO_F_RING_PACKED);
101         if (priv->caps.tso_ipv4)
102                 *features |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
103         if (priv->caps.tso_ipv6)
104                 *features |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
105         if (priv->caps.tx_csum)
106                 *features |= (1ULL << VIRTIO_NET_F_CSUM);
107         if (priv->caps.rx_csum)
108                 *features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM);
109         if (priv->caps.virtio_version_1_0)
110                 *features |= (1ULL << VIRTIO_F_VERSION_1);
111         return 0;
112 }
113
114 static int
115 mlx5_vdpa_get_protocol_features(struct rte_vdpa_device *vdev,
116                 uint64_t *features)
117 {
118         struct mlx5_vdpa_priv *priv =
119                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
120
121         if (priv == NULL) {
122                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
123                 return -1;
124         }
125         *features = MLX5_VDPA_PROTOCOL_FEATURES;
126         return 0;
127 }
128
129 static int
130 mlx5_vdpa_set_vring_state(int vid, int vring, int state)
131 {
132         struct rte_vdpa_device *vdev = rte_vdpa_get_device(
133                         rte_vhost_get_vdpa_device_id(vid));
134         struct mlx5_vdpa_priv *priv =
135                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
136
137         if (priv == NULL) {
138                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
139                 return -EINVAL;
140         }
141         if (vring >= (int)priv->caps.max_num_virtio_queues * 2) {
142                 DRV_LOG(ERR, "Too big vring id: %d.", vring);
143                 return -E2BIG;
144         }
145         return mlx5_vdpa_virtq_enable(priv, vring, state);
146 }
147
148 static int
149 mlx5_vdpa_direct_db_prepare(struct mlx5_vdpa_priv *priv)
150 {
151         int ret;
152
153         if (priv->direct_notifier) {
154                 ret = rte_vhost_host_notifier_ctrl(priv->vid, false);
155                 if (ret != 0) {
156                         DRV_LOG(INFO, "Direct HW notifier FD cannot be "
157                                 "destroyed for device %d: %d.", priv->vid, ret);
158                         return -1;
159                 }
160                 priv->direct_notifier = 0;
161         }
162         ret = rte_vhost_host_notifier_ctrl(priv->vid, true);
163         if (ret != 0)
164                 DRV_LOG(INFO, "Direct HW notifier FD cannot be configured for"
165                         " device %d: %d.", priv->vid, ret);
166         else
167                 priv->direct_notifier = 1;
168         return 0;
169 }
170
171 static int
172 mlx5_vdpa_features_set(int vid)
173 {
174         struct rte_vdpa_device *vdev = rte_vdpa_get_device(
175                         rte_vhost_get_vdpa_device_id(vid));
176         struct mlx5_vdpa_priv *priv =
177                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
178         uint64_t log_base, log_size;
179         uint64_t features;
180         int ret;
181
182         if (priv == NULL) {
183                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
184                 return -EINVAL;
185         }
186         ret = rte_vhost_get_negotiated_features(vid, &features);
187         if (ret) {
188                 DRV_LOG(ERR, "Failed to get negotiated features.");
189                 return ret;
190         }
191         if (RTE_VHOST_NEED_LOG(features)) {
192                 ret = rte_vhost_get_log_base(vid, &log_base, &log_size);
193                 if (ret) {
194                         DRV_LOG(ERR, "Failed to get log base.");
195                         return ret;
196                 }
197                 ret = mlx5_vdpa_dirty_bitmap_set(priv, log_base, log_size);
198                 if (ret) {
199                         DRV_LOG(ERR, "Failed to set dirty bitmap.");
200                         return ret;
201                 }
202                 DRV_LOG(INFO, "mlx5 vdpa: enabling dirty logging...");
203                 ret = mlx5_vdpa_logging_enable(priv, 1);
204                 if (ret) {
205                         DRV_LOG(ERR, "Failed t enable dirty logging.");
206                         return ret;
207                 }
208         }
209         return 0;
210 }
211
212 static int
213 mlx5_vdpa_pd_create(struct mlx5_vdpa_priv *priv)
214 {
215 #ifdef HAVE_IBV_FLOW_DV_SUPPORT
216         priv->pd = mlx5_glue->alloc_pd(priv->ctx);
217         if (priv->pd == NULL) {
218                 DRV_LOG(ERR, "Failed to allocate PD.");
219                 return errno ? -errno : -ENOMEM;
220         }
221         struct mlx5dv_obj obj;
222         struct mlx5dv_pd pd_info;
223         int ret = 0;
224
225         obj.pd.in = priv->pd;
226         obj.pd.out = &pd_info;
227         ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
228         if (ret) {
229                 DRV_LOG(ERR, "Fail to get PD object info.");
230                 mlx5_glue->dealloc_pd(priv->pd);
231                 priv->pd = NULL;
232                 return -errno;
233         }
234         priv->pdn = pd_info.pdn;
235         return 0;
236 #else
237         (void)priv;
238         DRV_LOG(ERR, "Cannot get pdn - no DV support.");
239         return -ENOTSUP;
240 #endif /* HAVE_IBV_FLOW_DV_SUPPORT */
241 }
242
243 static int
244 mlx5_vdpa_mtu_set(struct mlx5_vdpa_priv *priv)
245 {
246         struct ifreq request;
247         uint16_t vhost_mtu = 0;
248         uint16_t kern_mtu = 0;
249         int ret = rte_vhost_get_mtu(priv->vid, &vhost_mtu);
250         int sock;
251         int retries = MLX5_VDPA_MAX_RETRIES;
252
253         if (ret) {
254                 DRV_LOG(DEBUG, "Cannot get vhost MTU - %d.", ret);
255                 return ret;
256         }
257         if (!vhost_mtu) {
258                 DRV_LOG(DEBUG, "Vhost MTU is 0.");
259                 return ret;
260         }
261         ret = mlx5_get_ifname_sysfs(priv->ctx->device->ibdev_path,
262                                     request.ifr_name);
263         if (ret) {
264                 DRV_LOG(DEBUG, "Cannot get kernel IF name - %d.", ret);
265                 return ret;
266         }
267         sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
268         if (sock == -1) {
269                 DRV_LOG(DEBUG, "Cannot open IF socket.");
270                 return sock;
271         }
272         while (retries--) {
273                 ret = ioctl(sock, SIOCGIFMTU, &request);
274                 if (ret == -1)
275                         break;
276                 kern_mtu = request.ifr_mtu;
277                 DRV_LOG(DEBUG, "MTU: current %d requested %d.", (int)kern_mtu,
278                         (int)vhost_mtu);
279                 if (kern_mtu == vhost_mtu)
280                         break;
281                 request.ifr_mtu = vhost_mtu;
282                 ret = ioctl(sock, SIOCSIFMTU, &request);
283                 if (ret == -1)
284                         break;
285                 request.ifr_mtu = 0;
286                 usleep(MLX5_VDPA_USEC);
287         }
288         close(sock);
289         return kern_mtu == vhost_mtu ? 0 : -1;
290 }
291
292 static int
293 mlx5_vdpa_dev_close(int vid)
294 {
295         struct rte_vdpa_device *vdev = rte_vdpa_get_device(
296                         rte_vhost_get_vdpa_device_id(vid));
297         struct mlx5_vdpa_priv *priv =
298                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
299         int ret = 0;
300
301         if (priv == NULL) {
302                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
303                 return -1;
304         }
305         if (priv->configured)
306                 ret |= mlx5_vdpa_lm_log(priv);
307         mlx5_vdpa_cqe_event_unset(priv);
308         mlx5_vdpa_steer_unset(priv);
309         mlx5_vdpa_virtqs_release(priv);
310         mlx5_vdpa_event_qp_global_release(priv);
311         mlx5_vdpa_mem_dereg(priv);
312         if (priv->pd) {
313                 claim_zero(mlx5_glue->dealloc_pd(priv->pd));
314                 priv->pd = NULL;
315         }
316         priv->configured = 0;
317         priv->vid = 0;
318         DRV_LOG(INFO, "vDPA device %d was closed.", vid);
319         return ret;
320 }
321
322 static int
323 mlx5_vdpa_dev_config(int vid)
324 {
325         struct rte_vdpa_device *vdev = rte_vdpa_get_device(
326                         rte_vhost_get_vdpa_device_id(vid));
327         struct mlx5_vdpa_priv *priv =
328                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
329
330         if (priv == NULL) {
331                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
332                 return -EINVAL;
333         }
334         if (priv->configured && mlx5_vdpa_dev_close(vid)) {
335                 DRV_LOG(ERR, "Failed to reconfigure vid %d.", vid);
336                 return -1;
337         }
338         priv->vid = vid;
339         if (mlx5_vdpa_mtu_set(priv))
340                 DRV_LOG(WARNING, "MTU cannot be set on device %s.",
341                                 vdev->device->name);
342         if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
343             mlx5_vdpa_direct_db_prepare(priv) ||
344             mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
345             mlx5_vdpa_cqe_event_setup(priv)) {
346                 mlx5_vdpa_dev_close(vid);
347                 return -1;
348         }
349         priv->configured = 1;
350         DRV_LOG(INFO, "vDPA device %d was configured.", vid);
351         return 0;
352 }
353
354 static int
355 mlx5_vdpa_get_device_fd(int vid)
356 {
357         struct rte_vdpa_device *vdev = rte_vdpa_get_device(
358                         rte_vhost_get_vdpa_device_id(vid));
359         struct mlx5_vdpa_priv *priv =
360                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
361
362         if (priv == NULL) {
363                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
364                 return -EINVAL;
365         }
366         return priv->ctx->cmd_fd;
367 }
368
369 static int
370 mlx5_vdpa_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
371 {
372         struct rte_vdpa_device *vdev = rte_vdpa_get_device(
373                         rte_vhost_get_vdpa_device_id(vid));
374         struct mlx5_vdpa_priv *priv =
375                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
376
377         RTE_SET_USED(qid);
378         if (priv == NULL) {
379                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
380                 return -EINVAL;
381         }
382         if (!priv->var) {
383                 DRV_LOG(ERR, "VAR was not created for device %s, is the device"
384                         " configured?.", vdev->device->name);
385                 return -EINVAL;
386         }
387         *offset = priv->var->mmap_off;
388         *size = priv->var->length;
389         return 0;
390 }
391
392 static int
393 mlx5_vdpa_get_stats_names(struct rte_vdpa_device *vdev,
394                 struct rte_vdpa_stat_name *stats_names,
395                 unsigned int size)
396 {
397         static const char *mlx5_vdpa_stats_names[MLX5_VDPA_STATS_MAX] = {
398                 "received_descriptors",
399                 "completed_descriptors",
400                 "bad descriptor errors",
401                 "exceed max chain",
402                 "invalid buffer",
403                 "completion errors",
404         };
405         struct mlx5_vdpa_priv *priv =
406                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
407         unsigned int i;
408
409         if (priv == NULL) {
410                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
411                 return -ENODEV;
412         }
413         if (!stats_names)
414                 return MLX5_VDPA_STATS_MAX;
415         size = RTE_MIN(size, (unsigned int)MLX5_VDPA_STATS_MAX);
416         for (i = 0; i < size; ++i)
417                 strlcpy(stats_names[i].name, mlx5_vdpa_stats_names[i],
418                         RTE_VDPA_STATS_NAME_SIZE);
419         return size;
420 }
421
422 static int
423 mlx5_vdpa_get_stats(struct rte_vdpa_device *vdev, int qid,
424                 struct rte_vdpa_stat *stats, unsigned int n)
425 {
426         struct mlx5_vdpa_priv *priv =
427                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
428
429         if (priv == NULL) {
430                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
431                 return -ENODEV;
432         }
433         if (!priv->configured) {
434                 DRV_LOG(ERR, "Device %s was not configured.",
435                                 vdev->device->name);
436                 return -ENODATA;
437         }
438         if (qid >= (int)priv->nr_virtqs) {
439                 DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
440                                 vdev->device->name);
441                 return -E2BIG;
442         }
443         if (!priv->caps.queue_counters_valid) {
444                 DRV_LOG(ERR, "Virtq statistics is not supported for device %s.",
445                         vdev->device->name);
446                 return -ENOTSUP;
447         }
448         return mlx5_vdpa_virtq_stats_get(priv, qid, stats, n);
449 }
450
451 static int
452 mlx5_vdpa_reset_stats(struct rte_vdpa_device *vdev, int qid)
453 {
454         struct mlx5_vdpa_priv *priv =
455                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
456
457         if (priv == NULL) {
458                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
459                 return -ENODEV;
460         }
461         if (!priv->configured) {
462                 DRV_LOG(ERR, "Device %s was not configured.",
463                                 vdev->device->name);
464                 return -ENODATA;
465         }
466         if (qid >= (int)priv->nr_virtqs) {
467                 DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
468                                 vdev->device->name);
469                 return -E2BIG;
470         }
471         if (!priv->caps.queue_counters_valid) {
472                 DRV_LOG(ERR, "Virtq statistics is not supported for device %s.",
473                         vdev->device->name);
474                 return -ENOTSUP;
475         }
476         return mlx5_vdpa_virtq_stats_reset(priv, qid);
477 }
478
479 static struct rte_vdpa_dev_ops mlx5_vdpa_ops = {
480         .get_queue_num = mlx5_vdpa_get_queue_num,
481         .get_features = mlx5_vdpa_get_vdpa_features,
482         .get_protocol_features = mlx5_vdpa_get_protocol_features,
483         .dev_conf = mlx5_vdpa_dev_config,
484         .dev_close = mlx5_vdpa_dev_close,
485         .set_vring_state = mlx5_vdpa_set_vring_state,
486         .set_features = mlx5_vdpa_features_set,
487         .migration_done = NULL,
488         .get_vfio_group_fd = NULL,
489         .get_vfio_device_fd = mlx5_vdpa_get_device_fd,
490         .get_notify_area = mlx5_vdpa_get_notify_area,
491         .get_stats_names = mlx5_vdpa_get_stats_names,
492         .get_stats = mlx5_vdpa_get_stats,
493         .reset_stats = mlx5_vdpa_reset_stats,
494 };
495
496 static struct ibv_device *
497 mlx5_vdpa_get_ib_device_match(struct rte_pci_addr *addr)
498 {
499         int n;
500         struct ibv_device **ibv_list = mlx5_glue->get_device_list(&n);
501         struct ibv_device *ibv_match = NULL;
502
503         if (!ibv_list) {
504                 rte_errno = ENOSYS;
505                 return NULL;
506         }
507         while (n-- > 0) {
508                 struct rte_pci_addr pci_addr;
509
510                 DRV_LOG(DEBUG, "Checking device \"%s\"..", ibv_list[n]->name);
511                 if (mlx5_dev_to_pci_addr(ibv_list[n]->ibdev_path, &pci_addr))
512                         continue;
513                 if (rte_pci_addr_cmp(addr, &pci_addr))
514                         continue;
515                 ibv_match = ibv_list[n];
516                 break;
517         }
518         if (!ibv_match)
519                 rte_errno = ENOENT;
520         mlx5_glue->free_device_list(ibv_list);
521         return ibv_match;
522 }
523
524 /* Try to disable ROCE by Netlink\Devlink. */
525 static int
526 mlx5_vdpa_nl_roce_disable(const char *addr)
527 {
528         int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
529         int devlink_id;
530         int enable;
531         int ret;
532
533         if (nlsk_fd < 0)
534                 return nlsk_fd;
535         devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
536         if (devlink_id < 0) {
537                 ret = devlink_id;
538                 DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by"
539                         " Netlink.");
540                 goto close;
541         }
542         ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
543         if (ret) {
544                 DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
545                         ret);
546                 goto close;
547         } else if (!enable) {
548                 DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
549                 goto close;
550         }
551         ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
552         if (ret)
553                 DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
554         else
555                 DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
556 close:
557         close(nlsk_fd);
558         return ret;
559 }
560
561 /* Try to disable ROCE by sysfs. */
562 static int
563 mlx5_vdpa_sys_roce_disable(const char *addr)
564 {
565         FILE *file_o;
566         int enable;
567         int ret;
568
569         MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
570         file_o = fopen(file_p, "rb");
571         if (!file_o) {
572                 rte_errno = ENOTSUP;
573                 return -ENOTSUP;
574         }
575         ret = fscanf(file_o, "%d", &enable);
576         if (ret != 1) {
577                 rte_errno = EINVAL;
578                 ret = EINVAL;
579                 goto close;
580         } else if (!enable) {
581                 ret = 0;
582                 DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
583                 goto close;
584         }
585         fclose(file_o);
586         file_o = fopen(file_p, "wb");
587         if (!file_o) {
588                 rte_errno = ENOTSUP;
589                 return -ENOTSUP;
590         }
591         fprintf(file_o, "0\n");
592         ret = 0;
593 close:
594         if (ret)
595                 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
596         else
597                 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
598         fclose(file_o);
599         return ret;
600 }
601
602 static int
603 mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
604 {
605         char addr_name[64] = {0};
606
607         rte_pci_device_name(addr, addr_name, sizeof(addr_name));
608         /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
609         if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
610             mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
611                 /*
612                  * Succeed to disable ROCE, wait for the IB device to appear
613                  * again after reload.
614                  */
615                 int r;
616                 struct ibv_device *ibv_new;
617
618                 for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
619                         ibv_new = mlx5_vdpa_get_ib_device_match(addr);
620                         if (ibv_new) {
621                                 *ibv = ibv_new;
622                                 return 0;
623                         }
624                         usleep(MLX5_VDPA_USEC);
625                 }
626                 DRV_LOG(ERR, "Cannot much device %s after ROCE disable, "
627                         "retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES);
628                 rte_errno = EAGAIN;
629         }
630         return -rte_errno;
631 }
632
633 /**
634  * DPDK callback to register a PCI device.
635  *
636  * This function spawns vdpa device out of a given PCI device.
637  *
638  * @param[in] pci_drv
639  *   PCI driver structure (mlx5_vpda_driver).
640  * @param[in] pci_dev
641  *   PCI device information.
642  *
643  * @return
644  *   0 on success, 1 to skip this driver, a negative errno value otherwise
645  *   and rte_errno is set.
646  */
647 static int
648 mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
649                     struct rte_pci_device *pci_dev __rte_unused)
650 {
651         struct ibv_device *ibv;
652         struct mlx5_vdpa_priv *priv = NULL;
653         struct ibv_context *ctx = NULL;
654         struct mlx5_hca_attr attr;
655         int ret;
656
657         if (mlx5_class_get(pci_dev->device.devargs) != MLX5_CLASS_VDPA) {
658                 DRV_LOG(DEBUG, "Skip probing - should be probed by other mlx5"
659                         " driver.");
660                 return 1;
661         }
662         ibv = mlx5_vdpa_get_ib_device_match(&pci_dev->addr);
663         if (!ibv) {
664                 DRV_LOG(ERR, "No matching IB device for PCI slot "
665                         PCI_PRI_FMT ".", pci_dev->addr.domain,
666                         pci_dev->addr.bus, pci_dev->addr.devid,
667                         pci_dev->addr.function);
668                 return -rte_errno;
669         } else {
670                 DRV_LOG(INFO, "PCI information matches for device \"%s\".",
671                         ibv->name);
672         }
673         if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
674                 DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
675                         ibv->name);
676                 return -rte_errno;
677         }
678         ctx = mlx5_glue->dv_open_device(ibv);
679         if (!ctx) {
680                 DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
681                 rte_errno = ENODEV;
682                 return -rte_errno;
683         }
684         ret = mlx5_devx_cmd_query_hca_attr(ctx, &attr);
685         if (ret) {
686                 DRV_LOG(ERR, "Unable to read HCA capabilities.");
687                 rte_errno = ENOTSUP;
688                 goto error;
689         } else if (!attr.vdpa.valid || !attr.vdpa.max_num_virtio_queues) {
690                 DRV_LOG(ERR, "Not enough capabilities to support vdpa, maybe "
691                         "old FW/OFED version?");
692                 rte_errno = ENOTSUP;
693                 goto error;
694         }
695         if (!attr.vdpa.queue_counters_valid)
696                 DRV_LOG(DEBUG, "No capability to support virtq statistics.");
697         priv = rte_zmalloc("mlx5 vDPA device private", sizeof(*priv) +
698                            sizeof(struct mlx5_vdpa_virtq) *
699                            attr.vdpa.max_num_virtio_queues * 2,
700                            RTE_CACHE_LINE_SIZE);
701         if (!priv) {
702                 DRV_LOG(ERR, "Failed to allocate private memory.");
703                 rte_errno = ENOMEM;
704                 goto error;
705         }
706         priv->caps = attr.vdpa;
707         priv->log_max_rqt_size = attr.log_max_rqt_size;
708         priv->ctx = ctx;
709         priv->pci_dev = pci_dev;
710         priv->var = mlx5_glue->dv_alloc_var(ctx, 0);
711         if (!priv->var) {
712                 DRV_LOG(ERR, "Failed to allocate VAR %u.\n", errno);
713                 goto error;
714         }
715         priv->vdev = rte_vdpa_register_device(&pci_dev->device,
716                         &mlx5_vdpa_ops);
717         if (priv->vdev == NULL) {
718                 DRV_LOG(ERR, "Failed to register vDPA device.");
719                 rte_errno = rte_errno ? rte_errno : EINVAL;
720                 goto error;
721         }
722         SLIST_INIT(&priv->mr_list);
723         pthread_mutex_lock(&priv_list_lock);
724         TAILQ_INSERT_TAIL(&priv_list, priv, next);
725         pthread_mutex_unlock(&priv_list_lock);
726         return 0;
727
728 error:
729         if (priv) {
730                 if (priv->var)
731                         mlx5_glue->dv_free_var(priv->var);
732                 rte_free(priv);
733         }
734         if (ctx)
735                 mlx5_glue->close_device(ctx);
736         return -rte_errno;
737 }
738
739 /**
740  * DPDK callback to remove a PCI device.
741  *
742  * This function removes all vDPA devices belong to a given PCI device.
743  *
744  * @param[in] pci_dev
745  *   Pointer to the PCI device.
746  *
747  * @return
748  *   0 on success, the function cannot fail.
749  */
750 static int
751 mlx5_vdpa_pci_remove(struct rte_pci_device *pci_dev)
752 {
753         struct mlx5_vdpa_priv *priv = NULL;
754         int found = 0;
755
756         pthread_mutex_lock(&priv_list_lock);
757         TAILQ_FOREACH(priv, &priv_list, next) {
758                 if (!rte_pci_addr_cmp(&priv->pci_dev->addr, &pci_dev->addr)) {
759                         found = 1;
760                         break;
761                 }
762         }
763         if (found)
764                 TAILQ_REMOVE(&priv_list, priv, next);
765         pthread_mutex_unlock(&priv_list_lock);
766         if (found) {
767                 if (priv->configured)
768                         mlx5_vdpa_dev_close(priv->vid);
769                 if (priv->var) {
770                         mlx5_glue->dv_free_var(priv->var);
771                         priv->var = NULL;
772                 }
773                 mlx5_glue->close_device(priv->ctx);
774                 rte_free(priv);
775         }
776         return 0;
777 }
778
779 static const struct rte_pci_id mlx5_vdpa_pci_id_map[] = {
780         {
781                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
782                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6)
783         },
784         {
785                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
786                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
787         },
788         {
789                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
790                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
791         },
792         {
793                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
794                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXVF)
795         },
796         {
797                 RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
798                                 PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
799         },
800         {
801                 .vendor_id = 0
802         }
803 };
804
805 static struct rte_pci_driver mlx5_vdpa_driver = {
806         .driver = {
807                 .name = "mlx5_vdpa",
808         },
809         .id_table = mlx5_vdpa_pci_id_map,
810         .probe = mlx5_vdpa_pci_probe,
811         .remove = mlx5_vdpa_pci_remove,
812         .drv_flags = 0,
813 };
814
815 /**
816  * Driver initialization routine.
817  */
818 RTE_INIT(rte_mlx5_vdpa_init)
819 {
820         /* Initialize common log type. */
821         mlx5_vdpa_logtype = rte_log_register("pmd.vdpa.mlx5");
822         if (mlx5_vdpa_logtype >= 0)
823                 rte_log_set_level(mlx5_vdpa_logtype, RTE_LOG_NOTICE);
824         if (mlx5_glue)
825                 rte_pci_register(&mlx5_vdpa_driver);
826 }
827
828 RTE_PMD_EXPORT_NAME(net_mlx5_vdpa, __COUNTER__);
829 RTE_PMD_REGISTER_PCI_TABLE(net_mlx5_vdpa, mlx5_vdpa_pci_id_map);
830 RTE_PMD_REGISTER_KMOD_DEP(net_mlx5_vdpa, "* ib_uverbs & mlx5_core & mlx5_ib");