/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>
#include <stdbool.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
        rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
                "IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

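/*
 * Byte length of a used ring with @size entries: the ring itself plus
 * three uint16_t fields (flags, idx and the trailing avail event).
 */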
#define IFCVF_USED_RING_LEN(size) \
        ((size) * sizeof(struct vring_used_elem) + sizeof(uint16_t) * 3)

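/*
 * Devargs accepted by this driver: "vdpa=1" selects vDPA mode and
 * "sw-live-migration=1" requests the software live-migration fallback,
 * e.g. (the PCI address is illustrative):
 *   -w 0000:06:00.3,vdpa=1,sw-live-migration=1
 */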
#define IFCVF_VDPA_MODE         "vdpa"
#define IFCVF_SW_FALLBACK_LM    "sw-live-migration"

static const char * const ifcvf_valid_arguments[] = {
        IFCVF_VDPA_MODE,
        IFCVF_SW_FALLBACK_LM,
        NULL
};

static int ifcvf_vdpa_logtype;

struct ifcvf_internal {
        struct rte_vdpa_dev_addr dev_addr;
        struct rte_pci_device *pdev;
        struct ifcvf_hw hw;
        int vfio_container_fd;
        int vfio_group_fd;
        int vfio_dev_fd;
        pthread_t tid;  /* thread for notify relay */
        int epfd;
        int vid;
        int did;
        uint16_t max_queues;
        uint64_t features;
        rte_atomic32_t started;
        rte_atomic32_t dev_attached;
        rte_atomic32_t running;
        rte_spinlock_t lock;
        bool sw_lm;
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct internal_list *
find_internal_resource_by_did(int did)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (did == list->internal->did) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (pdev == list->internal->pdev) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

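/*
 * Create a dedicated VFIO container for this device, bind its IOMMU
 * group to the container, map the PCI BARs and mirror them into the HW
 * abstraction.
 */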
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
        struct rte_pci_device *dev = internal->pdev;
        char devname[RTE_DEV_NAME_MAX_LEN] = {0};
        int iommu_group_num;
        int i;

        internal->vfio_dev_fd = -1;
        internal->vfio_group_fd = -1;
        internal->vfio_container_fd = -1;

        rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
        if (rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
                        &iommu_group_num) <= 0) {
                DRV_LOG(ERR, "%s failed to get IOMMU group", devname);
                return -1;
        }

        internal->vfio_container_fd = rte_vfio_container_create();
        if (internal->vfio_container_fd < 0)
                return -1;

        internal->vfio_group_fd = rte_vfio_container_group_bind(
                        internal->vfio_container_fd, iommu_group_num);
        if (internal->vfio_group_fd < 0)
                goto err;

        if (rte_pci_map_device(dev))
                goto err;

        internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

        for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
                        i++) {
                internal->hw.mem_resource[i].addr =
                        internal->pdev->mem_resource[i].addr;
                internal->hw.mem_resource[i].phys_addr =
                        internal->pdev->mem_resource[i].phys_addr;
                internal->hw.mem_resource[i].len =
                        internal->pdev->mem_resource[i].len;
        }

        return 0;

err:
        rte_vfio_container_destroy(internal->vfio_container_fd);
        return -1;
}

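/*
 * Map or unmap every guest memory region in the VFIO container, using
 * the guest physical address as IOVA, so that the VF can DMA directly
 * to and from guest memory.
 */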
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
        uint32_t i;
        int ret;
        struct rte_vhost_memory *mem = NULL;
        int vfio_container_fd;

        ret = rte_vhost_get_mem_table(internal->vid, &mem);
        if (ret < 0) {
                DRV_LOG(ERR, "failed to get VM memory layout.");
                goto exit;
        }

        vfio_container_fd = internal->vfio_container_fd;

        for (i = 0; i < mem->nregions; i++) {
                struct rte_vhost_mem_region *reg;

                reg = &mem->regions[i];
                DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
                        "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
                        do_map ? "DMA map" : "DMA unmap", i,
                        reg->host_user_addr, reg->guest_phys_addr, reg->size);

                if (do_map) {
                        ret = rte_vfio_container_dma_map(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA map failed.");
                                goto exit;
                        }
                } else {
                        ret = rte_vfio_container_dma_unmap(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA unmap failed.");
                                goto exit;
                        }
                }
        }

exit:
        if (mem)
                free(mem);
        return ret;
}

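/*
 * Translate a host virtual address to a guest physical address using
 * the vhost memory table. Returns 0 if no region contains the address.
 */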
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
        struct rte_vhost_memory *mem = NULL;
        struct rte_vhost_mem_region *reg;
        uint32_t i;
        uint64_t gpa = 0;

        if (rte_vhost_get_mem_table(vid, &mem) < 0)
                goto exit;

        for (i = 0; i < mem->nregions; i++) {
                reg = &mem->regions[i];

                if (hva >= reg->host_user_addr &&
                                hva < reg->host_user_addr + reg->size) {
                        gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
                        break;
                }
        }

exit:
        if (mem)
                free(mem);
        return gpa;
}

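/*
 * Push the negotiated features, the GPAs of each vring's desc/avail/used
 * structures and the last ring indexes into the HW, then start it.
 */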
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        int i, nr_vring;
        int vid;
        struct rte_vhost_vring vq;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for used ring.");
                        return -1;
                }
                hw->vring[i].used = gpa;

                hw->vring[i].size = vq.size;
                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);
        }
        hw->nr_vring = i;

        return ifcvf_start_hw(&internal->hw);
}

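/*
 * Stop the HW and write the final ring indexes back to vhost so the
 * virtqueues can be resumed elsewhere, e.g. after live migration.
 */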
static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i;
        int vid;
        uint64_t features;
        uint64_t log_base, log_size;
        uint64_t len;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++)
                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);

        rte_vhost_get_negotiated_features(vid, &features);
        if (RTE_VHOST_NEED_LOG(features)) {
                ifcvf_disable_logging(hw);
                rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                /*
                 * IFCVF marks dirty pages only for packet buffers, so
                 * software marks the used rings as dirty after the
                 * device stops.
                 */
                for (i = 0; i < hw->nr_vring; i++) {
                        len = IFCVF_USED_RING_LEN(hw->vring[i].size);
                        rte_vhost_log_used_vring(vid, i, 0, len);
                }
        }
}

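/*
 * The IRQ set buffer holds one eventfd per MSI-X vector: vector 0 is
 * the device config interrupt, vectors 1..N carry each vring's callfd.
 */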
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;
        struct rte_vhost_vring vring;

        nr_vring = rte_vhost_get_vring_num(internal->vid);

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = nr_vring + 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *)&irq_set->data;
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(internal->vid, i, &vring);
                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

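/*
 * Notify relay thread: poll every vring's kickfd and, on each guest
 * kick, drain the eventfd and ring the matching HW queue doorbell.
 * This is the fallback path when the guest cannot write the device
 * notify area directly.
 */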
static void *
notify_relay(void *arg)
{
        int i, kickfd, epfd, nfds = 0;
        uint32_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 2];
        struct epoll_event ev;
        uint64_t buf;
        int nbytes;
        struct rte_vhost_vring vring;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct ifcvf_hw *hw = &internal->hw;

        q_num = rte_vhost_get_vring_num(internal->vid);

        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
                ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (;;) {
                nfds = epoll_wait(epfd, events, q_num, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait failed: %s",
                                        strerror(errno));
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        qid = events[i].data.u32;
                        kickfd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(kickfd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        ifcvf_notify_queue(hw, qid);
                }
        }

        return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, notify_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create notify relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

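/*
 * Reconcile the datapath with the started/dev_attached flags: when both
 * are set, bring it up (DMA map, MSI-X, HW start, notify relay); when
 * either clears, tear it down in reverse order. "running" tracks the
 * current state.
 */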
static int
update_datapath(struct ifcvf_internal *internal)
{
        int ret;

        rte_spinlock_lock(&internal->lock);

        if (!rte_atomic32_read(&internal->running) &&
            (rte_atomic32_read(&internal->started) &&
             rte_atomic32_read(&internal->dev_attached))) {
                ret = ifcvf_dma_map(internal, 1);
                if (ret)
                        goto err;

                ret = vdpa_enable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = vdpa_ifcvf_start(internal);
                if (ret)
                        goto err;

                ret = setup_notify_relay(internal);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 1);
        } else if (rte_atomic32_read(&internal->running) &&
                   (!rte_atomic32_read(&internal->started) ||
                    !rte_atomic32_read(&internal->dev_attached))) {
                ret = unset_notify_relay(internal);
                if (ret)
                        goto err;

                vdpa_ifcvf_stop(internal);

                ret = vdpa_disable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = ifcvf_dma_map(internal, 0);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 0);
        }

        rte_spinlock_unlock(&internal->lock);
        return 0;
err:
        rte_spinlock_unlock(&internal->lock);
        return ret;
}

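/*
 * Called by vhost when the virtio device becomes ready: attach it to
 * the VF and try to install host notifiers so the guest can kick the
 * HW directly; on failure the kicks go through the notify relay.
 */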
static int
ifcvf_dev_config(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        internal->vid = vid;
        rte_atomic32_set(&internal->dev_attached, 1);
        update_datapath(internal);

        if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
                DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

        return 0;
}

static int
ifcvf_dev_close(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->dev_attached, 0);
        update_datapath(internal);

        return 0;
}

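/*
 * If VHOST_F_LOG_ALL was negotiated, map the dirty-page log buffer into
 * the VFIO container at IFCVF_LOG_BASE and enable logging in the HW.
 */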
static int
ifcvf_set_features(int vid)
{
        uint64_t features;
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        uint64_t log_base, log_size;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        rte_vhost_get_negotiated_features(vid, &features);

        if (RTE_VHOST_NEED_LOG(features)) {
                rte_vhost_get_log_base(vid, &log_base, &log_size);
                rte_vfio_container_dma_map(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
        }

        return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_dev_fd;
}

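/*
 * Report the offset and size of the queue's notify register within the
 * VFIO device fd, so that it can be mmap'ed as a host notifier.
 */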
static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        struct vfio_region_info reg = { .argsz = sizeof(reg) };
        int ret;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;

        reg.index = ifcvf_get_notify_region(&internal->hw);
        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
        if (ret) {
                DRV_LOG(ERR, "Failed to get device region info: %s",
                                strerror(errno));
                return -1;
        }

        *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
        *size = 0x1000;

        return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *queue_num = list->internal->max_queues;

        return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *features = list->internal->features;

        return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
                (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
                 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
                 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
        *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
        return 0;
}

static struct rte_vdpa_dev_ops ifcvf_ops = {
        .get_queue_num = ifcvf_get_queue_num,
        .get_features = ifcvf_get_vdpa_features,
        .get_protocol_features = ifcvf_get_protocol_features,
        .dev_conf = ifcvf_dev_config,
        .dev_close = ifcvf_dev_close,
        .set_vring_state = NULL,
        .set_features = ifcvf_set_features,
        .migration_done = NULL,
        .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
        .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
        .get_notify_area = ifcvf_get_notify_area,
};

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

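/*
 * Probe only in the primary process and only when "vdpa=1" is present
 * in the devargs; returning 1 leaves the device to other drivers (it
 * shares its PCI IDs with virtio-net).
 */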
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                struct rte_pci_device *pci_dev)
{
        uint64_t features;
        struct ifcvf_internal *internal = NULL;
        struct internal_list *list = NULL;
        int vdpa_mode = 0;
        int sw_fallback_lm = 0;
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
                        ifcvf_valid_arguments);
        if (kvlist == NULL)
                return 1;

        /* probe only when vdpa mode is specified */
        if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
                        &vdpa_mode);
        if (ret < 0 || vdpa_mode == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        list = rte_zmalloc("ifcvf", sizeof(*list), 0);
        if (list == NULL)
                goto error;

        internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
        if (internal == NULL)
                goto error;

        internal->pdev = pci_dev;
        rte_spinlock_init(&internal->lock);

        if (ifcvf_vfio_setup(internal) < 0) {
                DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
                goto error;
        }

        if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
                DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
                goto error;
        }

        internal->max_queues = IFCVF_MAX_QUEUES;
        features = ifcvf_get_features(&internal->hw);
        internal->features = (features &
                ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
                (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
                (1ULL << VIRTIO_NET_F_CTRL_VQ) |
                (1ULL << VIRTIO_NET_F_STATUS) |
                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
                (1ULL << VHOST_F_LOG_ALL);

        internal->dev_addr.pci_addr = pci_dev->addr;
        internal->dev_addr.type = PCI_ADDR;
        list->internal = internal;

        if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
                ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
                                &open_int, &sw_fallback_lm);
                if (ret < 0)
                        goto error;
        }
        internal->sw_lm = sw_fallback_lm;

        internal->did = rte_vdpa_register_device(&internal->dev_addr,
                                &ifcvf_ops);
        if (internal->did < 0) {
                DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
                goto error;
        }

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_atomic32_set(&internal->started, 1);
        update_datapath(internal);

        rte_kvargs_free(kvlist);
        return 0;

error:
        rte_kvargs_free(kvlist);
        rte_free(list);
        rte_free(internal);
        return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
        struct ifcvf_internal *internal;
        struct internal_list *list;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        list = find_internal_resource_by_dev(pci_dev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->started, 0);
        update_datapath(internal);

        rte_pci_unmap_device(internal->pdev);
        rte_vfio_container_destroy(internal->vfio_container_fd);
        rte_vdpa_unregister_device(internal->did);

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_free(list);
        rte_free(internal);

        return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as the virtio-net PCI
 * device, but is identified by its own subsystem vendor and device IDs.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
        { .class_id = RTE_CLASS_ANY_ID,
          .vendor_id = IFCVF_VENDOR_ID,
          .device_id = IFCVF_DEVICE_ID,
          .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
          .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
        },

        { .vendor_id = 0, /* sentinel */
        },
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
        .id_table = pci_id_ifcvf_map,
        .drv_flags = 0,
        .probe = ifcvf_pci_probe,
        .remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
        ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
        if (ifcvf_vdpa_logtype >= 0)
                rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}