vdpa/mlx5: define driver name as macro
[dpdk.git] / drivers / vdpa / mlx5 / mlx5_vdpa.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2019 Mellanox Technologies, Ltd
3  */
4 #include <unistd.h>
5 #include <net/if.h>
6 #include <sys/socket.h>
7 #include <sys/ioctl.h>
8 #include <fcntl.h>
9 #include <netinet/in.h>
10
11 #include <rte_malloc.h>
12 #include <rte_log.h>
13 #include <rte_errno.h>
14 #include <rte_pci.h>
15 #include <rte_string_fns.h>
16
17 #include <mlx5_glue.h>
18 #include <mlx5_common.h>
19 #include <mlx5_common_pci.h>
20 #include <mlx5_devx_cmds.h>
21 #include <mlx5_prm.h>
22 #include <mlx5_nl.h>
23
24 #include "mlx5_vdpa_utils.h"
25 #include "mlx5_vdpa.h"
26
/* Driver name; also used for PMD export/registration macros below. */
#define MLX5_VDPA_DRIVER_NAME vdpa_mlx5

/* Virtio/vhost feature bits this driver always advertises. */
#define MLX5_VDPA_DEFAULT_FEATURES ((1ULL << VHOST_USER_F_PROTOCOL_FEATURES) | \
			    (1ULL << VIRTIO_F_ANY_LAYOUT) | \
			    (1ULL << VIRTIO_NET_F_MQ) | \
			    (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) | \
			    (1ULL << VIRTIO_F_ORDER_PLATFORM) | \
			    (1ULL << VHOST_F_LOG_ALL) | \
			    (1ULL << VIRTIO_NET_F_MTU))

/* vhost-user protocol feature bits supported by this driver. */
#define MLX5_VDPA_PROTOCOL_FEATURES \
			    ((1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_NET_MTU) | \
			     (1ULL << VHOST_USER_PROTOCOL_F_STATUS))

/* Retry count and per-retry sleep (us) for MTU set and ROCE reload waits. */
#define MLX5_VDPA_MAX_RETRIES 20
#define MLX5_VDPA_USEC 1000
/* Default for the "no_traffic_time" devarg (see mlx5_vdpa_config_get). */
#define MLX5_VDPA_DEFAULT_NO_TRAFFIC_MAX 16LLU
49
/* Global list of per-device private contexts, protected by priv_list_lock. */
TAILQ_HEAD(mlx5_vdpa_privs, mlx5_vdpa_priv) priv_list =
					      TAILQ_HEAD_INITIALIZER(priv_list);
static pthread_mutex_t priv_list_lock = PTHREAD_MUTEX_INITIALIZER;
53
54 static struct mlx5_vdpa_priv *
55 mlx5_vdpa_find_priv_resource_by_vdev(struct rte_vdpa_device *vdev)
56 {
57         struct mlx5_vdpa_priv *priv;
58         int found = 0;
59
60         pthread_mutex_lock(&priv_list_lock);
61         TAILQ_FOREACH(priv, &priv_list, next) {
62                 if (vdev == priv->vdev) {
63                         found = 1;
64                         break;
65                 }
66         }
67         pthread_mutex_unlock(&priv_list_lock);
68         if (!found) {
69                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
70                 rte_errno = EINVAL;
71                 return NULL;
72         }
73         return priv;
74 }
75
76 static int
77 mlx5_vdpa_get_queue_num(struct rte_vdpa_device *vdev, uint32_t *queue_num)
78 {
79         struct mlx5_vdpa_priv *priv =
80                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
81
82         if (priv == NULL) {
83                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
84                 return -1;
85         }
86         *queue_num = priv->caps.max_num_virtio_queues;
87         return 0;
88 }
89
90 static int
91 mlx5_vdpa_get_vdpa_features(struct rte_vdpa_device *vdev, uint64_t *features)
92 {
93         struct mlx5_vdpa_priv *priv =
94                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
95
96         if (priv == NULL) {
97                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
98                 return -1;
99         }
100         *features = MLX5_VDPA_DEFAULT_FEATURES;
101         if (priv->caps.virtio_queue_type & (1 << MLX5_VIRTQ_TYPE_PACKED))
102                 *features |= (1ULL << VIRTIO_F_RING_PACKED);
103         if (priv->caps.tso_ipv4)
104                 *features |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
105         if (priv->caps.tso_ipv6)
106                 *features |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
107         if (priv->caps.tx_csum)
108                 *features |= (1ULL << VIRTIO_NET_F_CSUM);
109         if (priv->caps.rx_csum)
110                 *features |= (1ULL << VIRTIO_NET_F_GUEST_CSUM);
111         if (priv->caps.virtio_version_1_0)
112                 *features |= (1ULL << VIRTIO_F_VERSION_1);
113         return 0;
114 }
115
116 static int
117 mlx5_vdpa_get_protocol_features(struct rte_vdpa_device *vdev,
118                 uint64_t *features)
119 {
120         struct mlx5_vdpa_priv *priv =
121                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
122
123         if (priv == NULL) {
124                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
125                 return -1;
126         }
127         *features = MLX5_VDPA_PROTOCOL_FEATURES;
128         return 0;
129 }
130
131 static int
132 mlx5_vdpa_set_vring_state(int vid, int vring, int state)
133 {
134         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
135         struct mlx5_vdpa_priv *priv =
136                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
137         int ret;
138
139         if (priv == NULL) {
140                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
141                 return -EINVAL;
142         }
143         if (vring >= (int)priv->caps.max_num_virtio_queues * 2) {
144                 DRV_LOG(ERR, "Too big vring id: %d.", vring);
145                 return -E2BIG;
146         }
147         pthread_mutex_lock(&priv->vq_config_lock);
148         ret = mlx5_vdpa_virtq_enable(priv, vring, state);
149         pthread_mutex_unlock(&priv->vq_config_lock);
150         return ret;
151 }
152
153 static int
154 mlx5_vdpa_features_set(int vid)
155 {
156         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
157         struct mlx5_vdpa_priv *priv =
158                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
159         uint64_t log_base, log_size;
160         uint64_t features;
161         int ret;
162
163         if (priv == NULL) {
164                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
165                 return -EINVAL;
166         }
167         ret = rte_vhost_get_negotiated_features(vid, &features);
168         if (ret) {
169                 DRV_LOG(ERR, "Failed to get negotiated features.");
170                 return ret;
171         }
172         if (RTE_VHOST_NEED_LOG(features)) {
173                 ret = rte_vhost_get_log_base(vid, &log_base, &log_size);
174                 if (ret) {
175                         DRV_LOG(ERR, "Failed to get log base.");
176                         return ret;
177                 }
178                 ret = mlx5_vdpa_dirty_bitmap_set(priv, log_base, log_size);
179                 if (ret) {
180                         DRV_LOG(ERR, "Failed to set dirty bitmap.");
181                         return ret;
182                 }
183                 DRV_LOG(INFO, "mlx5 vdpa: enabling dirty logging...");
184                 ret = mlx5_vdpa_logging_enable(priv, 1);
185                 if (ret) {
186                         DRV_LOG(ERR, "Failed t enable dirty logging.");
187                         return ret;
188                 }
189         }
190         return 0;
191 }
192
/*
 * Allocate a protection domain on the device context and cache its PD
 * number (priv->pdn) via the DV object-info query.
 *
 * Requires DV (direct verbs) support; without HAVE_IBV_FLOW_DV_SUPPORT the
 * PD number cannot be obtained, so the function fails with -ENOTSUP.
 * Returns 0 on success, a negative errno value otherwise.
 */
static int
mlx5_vdpa_pd_create(struct mlx5_vdpa_priv *priv)
{
#ifdef HAVE_IBV_FLOW_DV_SUPPORT
	priv->pd = mlx5_glue->alloc_pd(priv->ctx);
	if (priv->pd == NULL) {
		DRV_LOG(ERR, "Failed to allocate PD.");
		return errno ? -errno : -ENOMEM;
	}
	struct mlx5dv_obj obj;
	struct mlx5dv_pd pd_info;
	int ret = 0;

	/* Query the DV PD object to learn the PD number used by DevX. */
	obj.pd.in = priv->pd;
	obj.pd.out = &pd_info;
	ret = mlx5_glue->dv_init_obj(&obj, MLX5DV_OBJ_PD);
	if (ret) {
		DRV_LOG(ERR, "Fail to get PD object info.");
		/* Undo the allocation so priv->pd never dangles. */
		mlx5_glue->dealloc_pd(priv->pd);
		priv->pd = NULL;
		return -errno;
	}
	priv->pdn = pd_info.pdn;
	return 0;
#else
	(void)priv;
	DRV_LOG(ERR, "Cannot get pdn - no DV support.");
	return -ENOTSUP;
#endif /* HAVE_IBV_FLOW_DV_SUPPORT */
}
223
/*
 * Propagate the vhost-negotiated MTU to the kernel netdev backing this
 * device, retrying up to MLX5_VDPA_MAX_RETRIES times (the set may race
 * with other MTU changes, hence the read-compare-write loop).
 *
 * Returns 0 when the kernel MTU matches the vhost MTU, a negative value
 * on any failure; a vhost MTU of 0 is treated as "nothing to do".
 */
static int
mlx5_vdpa_mtu_set(struct mlx5_vdpa_priv *priv)
{
	struct ifreq request;
	uint16_t vhost_mtu = 0;
	uint16_t kern_mtu = 0;
	int ret = rte_vhost_get_mtu(priv->vid, &vhost_mtu);
	int sock;
	int retries = MLX5_VDPA_MAX_RETRIES;

	if (ret) {
		DRV_LOG(DEBUG, "Cannot get vhost MTU - %d.", ret);
		return ret;
	}
	if (!vhost_mtu) {
		/* No MTU negotiated - leave the kernel setting untouched. */
		DRV_LOG(DEBUG, "Vhost MTU is 0.");
		return ret;
	}
	/* Resolve the kernel interface name from the IB device sysfs path. */
	ret = mlx5_get_ifname_sysfs(priv->ctx->device->ibdev_path,
				    request.ifr_name);
	if (ret) {
		DRV_LOG(DEBUG, "Cannot get kernel IF name - %d.", ret);
		return ret;
	}
	sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
	if (sock == -1) {
		DRV_LOG(DEBUG, "Cannot open IF socket.");
		return sock;
	}
	while (retries--) {
		/* Read current kernel MTU; stop once it matches vhost. */
		ret = ioctl(sock, SIOCGIFMTU, &request);
		if (ret == -1)
			break;
		kern_mtu = request.ifr_mtu;
		DRV_LOG(DEBUG, "MTU: current %d requested %d.", (int)kern_mtu,
			(int)vhost_mtu);
		if (kern_mtu == vhost_mtu)
			break;
		request.ifr_mtu = vhost_mtu;
		ret = ioctl(sock, SIOCSIFMTU, &request);
		if (ret == -1)
			break;
		request.ifr_mtu = 0;
		usleep(MLX5_VDPA_USEC);
	}
	close(sock);
	return kern_mtu == vhost_mtu ? 0 : -1;
}
272
/*
 * vhost .dev_close callback: release all datapath resources created by
 * mlx5_vdpa_dev_config (events, steering, virtqs, memory, PD) and reset
 * the configured state. Returns 0 on success, non-zero if flushing the
 * live-migration log failed.
 */
static int
mlx5_vdpa_dev_close(int vid)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);
	int ret = 0;

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -1;
	}
	/* Flush the dirty log only for a fully configured device. */
	if (priv->configured)
		ret |= mlx5_vdpa_lm_log(priv);
	mlx5_vdpa_err_event_unset(priv);
	mlx5_vdpa_cqe_event_unset(priv);
	mlx5_vdpa_steer_unset(priv);
	mlx5_vdpa_virtqs_release(priv);
	mlx5_vdpa_event_qp_global_release(priv);
	mlx5_vdpa_mem_dereg(priv);
	if (priv->pd) {
		claim_zero(mlx5_glue->dealloc_pd(priv->pd));
		priv->pd = NULL;
	}
	priv->configured = 0;
	priv->vid = 0;
	/* The mutex may stay locked after event thread cancel - initiate it. */
	pthread_mutex_init(&priv->vq_config_lock, NULL);
	DRV_LOG(INFO, "vDPA device %d was closed.", vid);
	return ret;
}
304
/*
 * vhost .dev_conf callback: (re)configure the device for vhost session
 * @vid. An already configured device is closed first. Setup steps run in
 * order - PD, memory registration, error events, virtqs, steering, CQE
 * events - and any failure rolls everything back through dev_close.
 * Returns 0 on success, a negative value on failure.
 */
static int
mlx5_vdpa_dev_config(int vid)
{
	struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
	struct mlx5_vdpa_priv *priv =
		mlx5_vdpa_find_priv_resource_by_vdev(vdev);

	if (priv == NULL) {
		DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
		return -EINVAL;
	}
	if (priv->configured && mlx5_vdpa_dev_close(vid)) {
		DRV_LOG(ERR, "Failed to reconfigure vid %d.", vid);
		return -1;
	}
	priv->vid = vid;
	/* MTU mismatch is not fatal - only a warning is emitted. */
	if (mlx5_vdpa_mtu_set(priv))
		DRV_LOG(WARNING, "MTU cannot be set on device %s.",
				vdev->device->name);
	if (mlx5_vdpa_pd_create(priv) || mlx5_vdpa_mem_register(priv) ||
	    mlx5_vdpa_err_event_setup(priv) ||
	    mlx5_vdpa_virtqs_prepare(priv) || mlx5_vdpa_steer_setup(priv) ||
	    mlx5_vdpa_cqe_event_setup(priv)) {
		mlx5_vdpa_dev_close(vid);
		return -1;
	}
	priv->configured = 1;
	DRV_LOG(INFO, "vDPA device %d was configured.", vid);
	return 0;
}
335
336 static int
337 mlx5_vdpa_get_device_fd(int vid)
338 {
339         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
340         struct mlx5_vdpa_priv *priv =
341                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
342
343         if (priv == NULL) {
344                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
345                 return -EINVAL;
346         }
347         return priv->ctx->cmd_fd;
348 }
349
350 static int
351 mlx5_vdpa_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
352 {
353         struct rte_vdpa_device *vdev = rte_vhost_get_vdpa_device(vid);
354         struct mlx5_vdpa_priv *priv =
355                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
356
357         RTE_SET_USED(qid);
358         if (priv == NULL) {
359                 DRV_LOG(ERR, "Invalid vDPA device: %s.", vdev->device->name);
360                 return -EINVAL;
361         }
362         if (!priv->var) {
363                 DRV_LOG(ERR, "VAR was not created for device %s, is the device"
364                         " configured?.", vdev->device->name);
365                 return -EINVAL;
366         }
367         *offset = priv->var->mmap_off;
368         *size = priv->var->length;
369         return 0;
370 }
371
372 static int
373 mlx5_vdpa_get_stats_names(struct rte_vdpa_device *vdev,
374                 struct rte_vdpa_stat_name *stats_names,
375                 unsigned int size)
376 {
377         static const char *mlx5_vdpa_stats_names[MLX5_VDPA_STATS_MAX] = {
378                 "received_descriptors",
379                 "completed_descriptors",
380                 "bad descriptor errors",
381                 "exceed max chain",
382                 "invalid buffer",
383                 "completion errors",
384         };
385         struct mlx5_vdpa_priv *priv =
386                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
387         unsigned int i;
388
389         if (priv == NULL) {
390                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
391                 return -ENODEV;
392         }
393         if (!stats_names)
394                 return MLX5_VDPA_STATS_MAX;
395         size = RTE_MIN(size, (unsigned int)MLX5_VDPA_STATS_MAX);
396         for (i = 0; i < size; ++i)
397                 strlcpy(stats_names[i].name, mlx5_vdpa_stats_names[i],
398                         RTE_VDPA_STATS_NAME_SIZE);
399         return size;
400 }
401
402 static int
403 mlx5_vdpa_get_stats(struct rte_vdpa_device *vdev, int qid,
404                 struct rte_vdpa_stat *stats, unsigned int n)
405 {
406         struct mlx5_vdpa_priv *priv =
407                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
408
409         if (priv == NULL) {
410                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
411                 return -ENODEV;
412         }
413         if (!priv->configured) {
414                 DRV_LOG(ERR, "Device %s was not configured.",
415                                 vdev->device->name);
416                 return -ENODATA;
417         }
418         if (qid >= (int)priv->nr_virtqs) {
419                 DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
420                                 vdev->device->name);
421                 return -E2BIG;
422         }
423         if (!priv->caps.queue_counters_valid) {
424                 DRV_LOG(ERR, "Virtq statistics is not supported for device %s.",
425                         vdev->device->name);
426                 return -ENOTSUP;
427         }
428         return mlx5_vdpa_virtq_stats_get(priv, qid, stats, n);
429 }
430
431 static int
432 mlx5_vdpa_reset_stats(struct rte_vdpa_device *vdev, int qid)
433 {
434         struct mlx5_vdpa_priv *priv =
435                 mlx5_vdpa_find_priv_resource_by_vdev(vdev);
436
437         if (priv == NULL) {
438                 DRV_LOG(ERR, "Invalid device: %s.", vdev->device->name);
439                 return -ENODEV;
440         }
441         if (!priv->configured) {
442                 DRV_LOG(ERR, "Device %s was not configured.",
443                                 vdev->device->name);
444                 return -ENODATA;
445         }
446         if (qid >= (int)priv->nr_virtqs) {
447                 DRV_LOG(ERR, "Too big vring id: %d for device %s.", qid,
448                                 vdev->device->name);
449                 return -E2BIG;
450         }
451         if (!priv->caps.queue_counters_valid) {
452                 DRV_LOG(ERR, "Virtq statistics is not supported for device %s.",
453                         vdev->device->name);
454                 return -ENOTSUP;
455         }
456         return mlx5_vdpa_virtq_stats_reset(priv, qid);
457 }
458
/* vDPA callback table registered with the vhost library at probe time. */
static struct rte_vdpa_dev_ops mlx5_vdpa_ops = {
	.get_queue_num = mlx5_vdpa_get_queue_num,
	.get_features = mlx5_vdpa_get_vdpa_features,
	.get_protocol_features = mlx5_vdpa_get_protocol_features,
	.dev_conf = mlx5_vdpa_dev_config,
	.dev_close = mlx5_vdpa_dev_close,
	.set_vring_state = mlx5_vdpa_set_vring_state,
	.set_features = mlx5_vdpa_features_set,
	.migration_done = NULL,		/* Not implemented by this driver. */
	.get_vfio_group_fd = NULL,	/* Not implemented by this driver. */
	.get_vfio_device_fd = mlx5_vdpa_get_device_fd,
	.get_notify_area = mlx5_vdpa_get_notify_area,
	.get_stats_names = mlx5_vdpa_get_stats_names,
	.get_stats = mlx5_vdpa_get_stats,
	.reset_stats = mlx5_vdpa_reset_stats,
};
475
/* Try to disable ROCE by Netlink\Devlink. */
/*
 * Open a generic Netlink socket, resolve the devlink family id, query the
 * current ROCE state for @addr and turn it off if it is enabled.
 * Returns 0 on success (including "already disabled"), a negative value
 * on any Netlink failure.
 */
static int
mlx5_vdpa_nl_roce_disable(const char *addr)
{
	int nlsk_fd = mlx5_nl_init(NETLINK_GENERIC);
	int devlink_id;
	int enable;
	int ret;

	if (nlsk_fd < 0)
		return nlsk_fd;
	devlink_id = mlx5_nl_devlink_family_id_get(nlsk_fd);
	if (devlink_id < 0) {
		ret = devlink_id;
		DRV_LOG(DEBUG, "Failed to get devlink id for ROCE operations by"
			" Netlink.");
		goto close;
	}
	ret = mlx5_nl_enable_roce_get(nlsk_fd, devlink_id, addr, &enable);
	if (ret) {
		DRV_LOG(DEBUG, "Failed to get ROCE enable by Netlink: %d.",
			ret);
		goto close;
	} else if (!enable) {
		/* Nothing to do; ret is 0 from the successful get. */
		DRV_LOG(INFO, "ROCE has already disabled(Netlink).");
		goto close;
	}
	ret = mlx5_nl_enable_roce_set(nlsk_fd, devlink_id, addr, 0);
	if (ret)
		DRV_LOG(DEBUG, "Failed to disable ROCE by Netlink: %d.", ret);
	else
		DRV_LOG(INFO, "ROCE is disabled by Netlink successfully.");
close:
	/* Single exit point so the Netlink socket is always closed. */
	close(nlsk_fd);
	return ret;
}
512
513 /* Try to disable ROCE by sysfs. */
514 static int
515 mlx5_vdpa_sys_roce_disable(const char *addr)
516 {
517         FILE *file_o;
518         int enable;
519         int ret;
520
521         MKSTR(file_p, "/sys/bus/pci/devices/%s/roce_enable", addr);
522         file_o = fopen(file_p, "rb");
523         if (!file_o) {
524                 rte_errno = ENOTSUP;
525                 return -ENOTSUP;
526         }
527         ret = fscanf(file_o, "%d", &enable);
528         if (ret != 1) {
529                 rte_errno = EINVAL;
530                 ret = EINVAL;
531                 goto close;
532         } else if (!enable) {
533                 ret = 0;
534                 DRV_LOG(INFO, "ROCE has already disabled(sysfs).");
535                 goto close;
536         }
537         fclose(file_o);
538         file_o = fopen(file_p, "wb");
539         if (!file_o) {
540                 rte_errno = ENOTSUP;
541                 return -ENOTSUP;
542         }
543         fprintf(file_o, "0\n");
544         ret = 0;
545 close:
546         if (ret)
547                 DRV_LOG(DEBUG, "Failed to disable ROCE by sysfs: %d.", ret);
548         else
549                 DRV_LOG(INFO, "ROCE is disabled by sysfs successfully.");
550         fclose(file_o);
551         return ret;
552 }
553
554 static int
555 mlx5_vdpa_roce_disable(struct rte_pci_addr *addr, struct ibv_device **ibv)
556 {
557         char addr_name[64] = {0};
558
559         rte_pci_device_name(addr, addr_name, sizeof(addr_name));
560         /* Firstly try to disable ROCE by Netlink and fallback to sysfs. */
561         if (mlx5_vdpa_nl_roce_disable(addr_name) == 0 ||
562             mlx5_vdpa_sys_roce_disable(addr_name) == 0) {
563                 /*
564                  * Succeed to disable ROCE, wait for the IB device to appear
565                  * again after reload.
566                  */
567                 int r;
568                 struct ibv_device *ibv_new;
569
570                 for (r = MLX5_VDPA_MAX_RETRIES; r; r--) {
571                         ibv_new = mlx5_os_get_ibv_device(addr);
572                         if (ibv_new) {
573                                 *ibv = ibv_new;
574                                 return 0;
575                         }
576                         usleep(MLX5_VDPA_USEC);
577                 }
578                 DRV_LOG(ERR, "Cannot much device %s after ROCE disable, "
579                         "retries exceed %d", addr_name, MLX5_VDPA_MAX_RETRIES);
580                 rte_errno = EAGAIN;
581         }
582         return -rte_errno;
583 }
584
585 static int
586 mlx5_vdpa_args_check_handler(const char *key, const char *val, void *opaque)
587 {
588         struct mlx5_vdpa_priv *priv = opaque;
589         unsigned long tmp;
590         int n_cores = sysconf(_SC_NPROCESSORS_ONLN);
591
592         if (strcmp(key, RTE_DEVARGS_KEY_CLASS) == 0)
593                 return 0;
594         errno = 0;
595         tmp = strtoul(val, NULL, 0);
596         if (errno) {
597                 DRV_LOG(WARNING, "%s: \"%s\" is an invalid integer.", key, val);
598                 return -errno;
599         }
600         if (strcmp(key, "event_mode") == 0) {
601                 if (tmp <= MLX5_VDPA_EVENT_MODE_ONLY_INTERRUPT)
602                         priv->event_mode = (int)tmp;
603                 else
604                         DRV_LOG(WARNING, "Invalid event_mode %s.", val);
605         } else if (strcmp(key, "event_us") == 0) {
606                 priv->event_us = (uint32_t)tmp;
607         } else if (strcmp(key, "no_traffic_time") == 0) {
608                 priv->no_traffic_max = (uint32_t)tmp;
609         } else if (strcmp(key, "event_core") == 0) {
610                 if (tmp >= (unsigned long)n_cores)
611                         DRV_LOG(WARNING, "Invalid event_core %s.", val);
612                 else
613                         priv->event_core = tmp;
614         } else if (strcmp(key, "hw_latency_mode") == 0) {
615                 priv->hw_latency_mode = (uint32_t)tmp;
616         } else if (strcmp(key, "hw_max_latency_us") == 0) {
617                 priv->hw_max_latency_us = (uint32_t)tmp;
618         } else if (strcmp(key, "hw_max_pending_comp") == 0) {
619                 priv->hw_max_pending_comp = (uint32_t)tmp;
620         } else {
621                 DRV_LOG(WARNING, "Invalid key %s.", key);
622         }
623         return 0;
624 }
625
626 static void
627 mlx5_vdpa_config_get(struct rte_devargs *devargs, struct mlx5_vdpa_priv *priv)
628 {
629         struct rte_kvargs *kvlist;
630
631         priv->event_mode = MLX5_VDPA_EVENT_MODE_FIXED_TIMER;
632         priv->event_us = 0;
633         priv->event_core = -1;
634         priv->no_traffic_max = MLX5_VDPA_DEFAULT_NO_TRAFFIC_MAX;
635         if (devargs == NULL)
636                 return;
637         kvlist = rte_kvargs_parse(devargs->args, NULL);
638         if (kvlist == NULL)
639                 return;
640         rte_kvargs_process(kvlist, NULL, mlx5_vdpa_args_check_handler, priv);
641         rte_kvargs_free(kvlist);
642         if (!priv->event_us &&
643             priv->event_mode == MLX5_VDPA_EVENT_MODE_DYNAMIC_TIMER)
644                 priv->event_us = MLX5_VDPA_DEFAULT_TIMER_STEP_US;
645         DRV_LOG(DEBUG, "event mode is %d.", priv->event_mode);
646         DRV_LOG(DEBUG, "event_us is %u us.", priv->event_us);
647         DRV_LOG(DEBUG, "no traffic max is %u.", priv->no_traffic_max);
648 }
649
/**
 * DPDK callback to register a mlx5 PCI device.
 *
 * This function spawns vdpa device out of a given PCI device.
 *
 * @param[in] pci_drv
 *   PCI driver structure (mlx5_vpda_driver).
 * @param[in] pci_dev
 *   PCI device information.
 *
 * @return
 *   0 on success, 1 to skip this driver, a negative errno value otherwise
 *   and rte_errno is set.
 */
static int
mlx5_vdpa_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		    struct rte_pci_device *pci_dev __rte_unused)
{
	struct ibv_device *ibv;
	struct mlx5_vdpa_priv *priv = NULL;
	struct ibv_context *ctx = NULL;
	struct mlx5_hca_attr attr;
	int ret;

	ibv = mlx5_os_get_ibv_device(&pci_dev->addr);
	if (!ibv) {
		DRV_LOG(ERR, "No matching IB device for PCI slot "
			PCI_PRI_FMT ".", pci_dev->addr.domain,
			pci_dev->addr.bus, pci_dev->addr.devid,
			pci_dev->addr.function);
		return -rte_errno;
	} else {
		DRV_LOG(INFO, "PCI information matches for device \"%s\".",
			ibv->name);
	}
	/* Disabling ROCE reloads the IB device; ibv is refreshed on success. */
	if (mlx5_vdpa_roce_disable(&pci_dev->addr, &ibv) != 0) {
		DRV_LOG(WARNING, "Failed to disable ROCE for \"%s\".",
			ibv->name);
		return -rte_errno;
	}
	ctx = mlx5_glue->dv_open_device(ibv);
	if (!ctx) {
		DRV_LOG(ERR, "Failed to open IB device \"%s\".", ibv->name);
		rte_errno = ENODEV;
		return -rte_errno;
	}
	ret = mlx5_devx_cmd_query_hca_attr(ctx, &attr);
	if (ret) {
		DRV_LOG(ERR, "Unable to read HCA capabilities.");
		rte_errno = ENOTSUP;
		goto error;
	} else if (!attr.vdpa.valid || !attr.vdpa.max_num_virtio_queues) {
		DRV_LOG(ERR, "Not enough capabilities to support vdpa, maybe "
			"old FW/OFED version?");
		rte_errno = ENOTSUP;
		goto error;
	}
	if (!attr.vdpa.queue_counters_valid)
		DRV_LOG(DEBUG, "No capability to support virtq statistics.");
	/* Trailing virtq array: two rings (RX + TX) per queue pair. */
	priv = rte_zmalloc("mlx5 vDPA device private", sizeof(*priv) +
			   sizeof(struct mlx5_vdpa_virtq) *
			   attr.vdpa.max_num_virtio_queues * 2,
			   RTE_CACHE_LINE_SIZE);
	if (!priv) {
		DRV_LOG(ERR, "Failed to allocate private memory.");
		rte_errno = ENOMEM;
		goto error;
	}
	priv->caps = attr.vdpa;
	priv->log_max_rqt_size = attr.log_max_rqt_size;
	priv->num_lag_ports = attr.num_lag_ports;
	priv->qp_ts_format = attr.qp_ts_format;
	/* A zero LAG port count is normalized to a single port. */
	if (attr.num_lag_ports == 0)
		priv->num_lag_ports = 1;
	priv->ctx = ctx;
	priv->pci_dev = pci_dev;
	/* VAR (doorbell area) later exposed through get_notify_area. */
	priv->var = mlx5_glue->dv_alloc_var(ctx, 0);
	if (!priv->var) {
		DRV_LOG(ERR, "Failed to allocate VAR %u.", errno);
		goto error;
	}
	priv->vdev = rte_vdpa_register_device(&pci_dev->device,
			&mlx5_vdpa_ops);
	if (priv->vdev == NULL) {
		DRV_LOG(ERR, "Failed to register vDPA device.");
		rte_errno = rte_errno ? rte_errno : EINVAL;
		goto error;
	}
	mlx5_vdpa_config_get(pci_dev->device.devargs, priv);
	SLIST_INIT(&priv->mr_list);
	pthread_mutex_init(&priv->vq_config_lock, NULL);
	pthread_mutex_lock(&priv_list_lock);
	TAILQ_INSERT_TAIL(&priv_list, priv, next);
	pthread_mutex_unlock(&priv_list_lock);
	return 0;

error:
	/* Unwind whatever was acquired before the failure. */
	if (priv) {
		if (priv->var)
			mlx5_glue->dv_free_var(priv->var);
		rte_free(priv);
	}
	if (ctx)
		mlx5_glue->close_device(ctx);
	return -rte_errno;
}
756
/**
 * DPDK callback to remove a PCI device.
 *
 * This function removes all vDPA devices belong to a given PCI device.
 *
 * @param[in] pci_dev
 *   Pointer to the PCI device.
 *
 * @return
 *   0 on success, the function cannot fail.
 */
static int
mlx5_vdpa_pci_remove(struct rte_pci_device *pci_dev)
{
	struct mlx5_vdpa_priv *priv = NULL;
	int found = 0;

	/* Unlink the context under the list lock, release it outside. */
	pthread_mutex_lock(&priv_list_lock);
	TAILQ_FOREACH(priv, &priv_list, next) {
		if (!rte_pci_addr_cmp(&priv->pci_dev->addr, &pci_dev->addr)) {
			found = 1;
			break;
		}
	}
	if (found)
		TAILQ_REMOVE(&priv_list, priv, next);
	pthread_mutex_unlock(&priv_list_lock);
	if (found) {
		/* Close the active vhost session before freeing resources. */
		if (priv->configured)
			mlx5_vdpa_dev_close(priv->vid);
		if (priv->var) {
			mlx5_glue->dv_free_var(priv->var);
			priv->var = NULL;
		}
		if (priv->vdev)
			rte_vdpa_unregister_device(priv->vdev);
		mlx5_glue->close_device(priv->ctx);
		pthread_mutex_destroy(&priv->vq_config_lock);
		rte_free(priv);
	}
	return 0;
}
799
/* PCI IDs handled by this driver; the zero vendor_id entry terminates. */
static const struct rte_pci_id mlx5_vdpa_pci_id_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
				PCI_DEVICE_ID_MELLANOX_CONNECTX6)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
				PCI_DEVICE_ID_MELLANOX_CONNECTX6VF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
				PCI_DEVICE_ID_MELLANOX_CONNECTX6DX)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
				PCI_DEVICE_ID_MELLANOX_CONNECTXVF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
				PCI_DEVICE_ID_MELLANOX_CONNECTX6DXBF)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
				PCI_DEVICE_ID_MELLANOX_CONNECTX7)
	},
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_MELLANOX,
				PCI_DEVICE_ID_MELLANOX_CONNECTX7BF)
	},
	{
		.vendor_id = 0
	}
};
833
/* mlx5 common-PCI driver descriptor; class VDPA routes probe to us. */
static struct mlx5_pci_driver mlx5_vdpa_driver = {
	.driver_class = MLX5_CLASS_VDPA,
	.pci_driver = {
		.driver = {
			.name = RTE_STR(MLX5_VDPA_DRIVER_NAME),
		},
		.id_table = mlx5_vdpa_pci_id_map,
		.probe = mlx5_vdpa_pci_probe,
		.remove = mlx5_vdpa_pci_remove,
		.drv_flags = 0,
	},
};
846
/* Register the driver-wide log type with default level NOTICE. */
RTE_LOG_REGISTER_DEFAULT(mlx5_vdpa_logtype, NOTICE)
848
/**
 * Driver initialization routine (constructor, runs at load time).
 */
RTE_INIT(rte_mlx5_vdpa_init)
{
	mlx5_common_init();
	/* Register only when the mlx5 glue layer is available. */
	if (mlx5_glue)
		mlx5_pci_driver_register(&mlx5_vdpa_driver);
}
858
/* Export PMD name, PCI ID table and kernel module dependencies. */
RTE_PMD_EXPORT_NAME(MLX5_VDPA_DRIVER_NAME, __COUNTER__);
RTE_PMD_REGISTER_PCI_TABLE(MLX5_VDPA_DRIVER_NAME, mlx5_vdpa_pci_id_map);
RTE_PMD_REGISTER_KMOD_DEP(MLX5_VDPA_DRIVER_NAME, "* ib_uverbs & mlx5_core & mlx5_ib");