1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2018 Intel Corporation
3  */
4
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <fcntl.h>
8 #include <sys/ioctl.h>
9 #include <sys/epoll.h>
10 #include <linux/virtio_net.h>
11
12 #include <rte_malloc.h>
13 #include <rte_memory.h>
14 #include <rte_bus_pci.h>
15 #include <rte_vhost.h>
16 #include <rte_vdpa.h>
17 #include <rte_vfio.h>
18 #include <rte_spinlock.h>
19 #include <rte_log.h>
20
21 #include "base/ifcvf.h"
22
23 #define DRV_LOG(level, fmt, args...) \
24         rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
25                 "IFCVF %s(): " fmt "\n", __func__, ##args)
26
27 #ifndef PAGE_SIZE
28 #define PAGE_SIZE 4096
29 #endif
30
31 static int ifcvf_vdpa_logtype;
32
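   /* Per-device driver context: VFIO fds, HW handle and datapath state flags. */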
33 struct ifcvf_internal {
34         struct rte_vdpa_dev_addr dev_addr;
35         struct rte_pci_device *pdev;
36         struct ifcvf_hw hw;
37         int vfio_container_fd;
38         int vfio_group_fd;
39         int vfio_dev_fd;
40         pthread_t tid;  /* thread for notify relay */
41         int epfd;
42         int vid;
43         int did;
44         uint16_t max_queues;
45         uint64_t features;
46         rte_atomic32_t started;
47         rte_atomic32_t dev_attached;
48         rte_atomic32_t running;
49         rte_spinlock_t lock;
50 };
51
52 struct internal_list {
53         TAILQ_ENTRY(internal_list) next;
54         struct ifcvf_internal *internal;
55 };
56
57 TAILQ_HEAD(internal_list_head, internal_list);
58 static struct internal_list_head internal_list =
59         TAILQ_HEAD_INITIALIZER(internal_list);
60
61 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
62
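   /* Look up the driver context registered under a given vDPA device id. */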
63 static struct internal_list *
64 find_internal_resource_by_did(int did)
65 {
66         int found = 0;
67         struct internal_list *list;
68
69         pthread_mutex_lock(&internal_list_lock);
70
71         TAILQ_FOREACH(list, &internal_list, next) {
72                 if (did == list->internal->did) {
73                         found = 1;
74                         break;
75                 }
76         }
77
78         pthread_mutex_unlock(&internal_list_lock);
79
80         if (!found)
81                 return NULL;
82
83         return list;
84 }
85
86 static struct internal_list *
87 find_internal_resource_by_dev(struct rte_pci_device *pdev)
88 {
89         int found = 0;
90         struct internal_list *list;
91
92         pthread_mutex_lock(&internal_list_lock);
93
94         TAILQ_FOREACH(list, &internal_list, next) {
95                 if (pdev == list->internal->pdev) {
96                         found = 1;
97                         break;
98                 }
99         }
100
101         pthread_mutex_unlock(&internal_list_lock);
102
103         if (!found)
104                 return NULL;
105
106         return list;
107 }
108
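    /* Set up a private VFIO container/group for the VF and map its PCI BARs. */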
109 static int
110 ifcvf_vfio_setup(struct ifcvf_internal *internal)
111 {
112         struct rte_pci_device *dev = internal->pdev;
113         char devname[RTE_DEV_NAME_MAX_LEN] = {0};
114         int iommu_group_num;
115         int i;
116
117         internal->vfio_dev_fd = -1;
118         internal->vfio_group_fd = -1;
119         internal->vfio_container_fd = -1;
120
121         rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
122         rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
123                         &iommu_group_num);
124
125         internal->vfio_container_fd = rte_vfio_container_create();
126         if (internal->vfio_container_fd < 0)
127                 return -1;
128
129         internal->vfio_group_fd = rte_vfio_container_group_bind(
130                         internal->vfio_container_fd, iommu_group_num);
131         if (internal->vfio_group_fd < 0)
132                 goto err;
133
134         if (rte_pci_map_device(dev))
135                 goto err;
136
137         internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;
138
139         for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
140                         i++) {
141                 internal->hw.mem_resource[i].addr =
142                         internal->pdev->mem_resource[i].addr;
143                 internal->hw.mem_resource[i].phys_addr =
144                         internal->pdev->mem_resource[i].phys_addr;
145                 internal->hw.mem_resource[i].len =
146                         internal->pdev->mem_resource[i].len;
147         }
148
149         return 0;
150
151 err:
152         rte_vfio_container_destroy(internal->vfio_container_fd);
153         return -1;
154 }
155
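    /* DMA map/unmap all guest memory regions, using the GPA as IOVA for device DMA. */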
156 static int
157 ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
158 {
159         uint32_t i;
160         int ret;
161         struct rte_vhost_memory *mem = NULL;
162         int vfio_container_fd;
163
164         ret = rte_vhost_get_mem_table(internal->vid, &mem);
165         if (ret < 0) {
166                 DRV_LOG(ERR, "failed to get VM memory layout.");
167                 goto exit;
168         }
169
170         vfio_container_fd = internal->vfio_container_fd;
171
172         for (i = 0; i < mem->nregions; i++) {
173                 struct rte_vhost_mem_region *reg;
174
175                 reg = &mem->regions[i];
176                 DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
177                         "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
178                         do_map ? "DMA map" : "DMA unmap", i,
179                         reg->host_user_addr, reg->guest_phys_addr, reg->size);
180
181                 if (do_map) {
182                         ret = rte_vfio_container_dma_map(vfio_container_fd,
183                                 reg->host_user_addr, reg->guest_phys_addr,
184                                 reg->size);
185                         if (ret < 0) {
186                                 DRV_LOG(ERR, "DMA map failed.");
187                                 goto exit;
188                         }
189                 } else {
190                         ret = rte_vfio_container_dma_unmap(vfio_container_fd,
191                                 reg->host_user_addr, reg->guest_phys_addr,
192                                 reg->size);
193                         if (ret < 0) {
194                                 DRV_LOG(ERR, "DMA unmap failed.");
195                                 goto exit;
196                         }
197                 }
198         }
199
200 exit:
201         if (mem)
202                 free(mem);
203         return ret;
204 }
205
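    /* Translate a host virtual address to a guest physical address; 0 if unmapped. */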
206 static uint64_t
207 hva_to_gpa(int vid, uint64_t hva)
208 {
209         struct rte_vhost_memory *mem = NULL;
210         struct rte_vhost_mem_region *reg;
211         uint32_t i;
212         uint64_t gpa = 0;
213
214         if (rte_vhost_get_mem_table(vid, &mem) < 0)
215                 goto exit;
216
217         for (i = 0; i < mem->nregions; i++) {
218                 reg = &mem->regions[i];
219
220                 if (hva >= reg->host_user_addr &&
221                                 hva < reg->host_user_addr + reg->size) {
222                         gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
223                         break;
224                 }
225         }
226
227 exit:
228         if (mem)
229                 free(mem);
230         return gpa;
231 }
232
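    /* Program negotiated features and vring GPAs into HW, then start the datapath. */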
233 static int
234 vdpa_ifcvf_start(struct ifcvf_internal *internal)
235 {
236         struct ifcvf_hw *hw = &internal->hw;
237         int i, nr_vring;
238         int vid;
239         struct rte_vhost_vring vq;
240         uint64_t gpa;
241
242         vid = internal->vid;
243         nr_vring = rte_vhost_get_vring_num(vid);
244         rte_vhost_get_negotiated_features(vid, &hw->req_features);
245
246         for (i = 0; i < nr_vring; i++) {
247                 rte_vhost_get_vhost_vring(vid, i, &vq);
248                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
249                 if (gpa == 0) {
250                         DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
251                         return -1;
252                 }
253                 hw->vring[i].desc = gpa;
254
255                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
256                 if (gpa == 0) {
257                         DRV_LOG(ERR, "Failed to get GPA for available ring.");
258                         return -1;
259                 }
260                 hw->vring[i].avail = gpa;
261
262                 gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
263                 if (gpa == 0) {
264                         DRV_LOG(ERR, "Failed to get GPA for used ring.");
265                         return -1;
266                 }
267                 hw->vring[i].used = gpa;
268
269                 hw->vring[i].size = vq.size;
270                 rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
271                                 &hw->vring[i].last_used_idx);
272         }
273         hw->nr_vring = i;
274
275         return ifcvf_start_hw(&internal->hw);
276 }
277
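    /* Mark the pages backing one used ring as dirty in the vhost log buffer. */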
278 static void
279 ifcvf_used_ring_log(struct ifcvf_hw *hw, uint32_t queue, uint8_t *log_buf)
280 {
281         uint32_t i, size;
282         uint64_t pfn;
283
284         pfn = hw->vring[queue].used / PAGE_SIZE;
285         size = hw->vring[queue].size * sizeof(struct vring_used_elem) +
286                         sizeof(uint16_t) * 3;
287
288         for (i = 0; i <= size / PAGE_SIZE; i++)
289                 __sync_fetch_and_or_8(&log_buf[(pfn + i) / 8],
290                                 1 << ((pfn + i) % 8));
291 }
292
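    /*
     * Stop the device, sync vring indexes back to vhost and, if dirty logging
     * was negotiated, disable HW logging and mark the used rings dirty.
     */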
293 static void
294 vdpa_ifcvf_stop(struct ifcvf_internal *internal)
295 {
296         struct ifcvf_hw *hw = &internal->hw;
297         uint32_t i;
298         int vid;
299         uint64_t features;
300         uint64_t log_base, log_size;
301         uint8_t *log_buf;
302
303         vid = internal->vid;
304         ifcvf_stop_hw(hw);
305
306         for (i = 0; i < hw->nr_vring; i++)
307                 rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
308                                 hw->vring[i].last_used_idx);
309
310         rte_vhost_get_negotiated_features(vid, &features);
311         if (RTE_VHOST_NEED_LOG(features)) {
312                 ifcvf_disable_logging(hw);
313                 rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
314                 rte_vfio_container_dma_unmap(internal->vfio_container_fd,
315                                 log_base, IFCVF_LOG_BASE, log_size);
316                 /*
317                  * IFCVF only logs dirty pages for packet buffers, so
318                  * software marks the used rings as dirty once the device stops.
319                  */
320                 log_buf = (uint8_t *)(uintptr_t)log_base;
321                 for (i = 0; i < hw->nr_vring; i++)
322                         ifcvf_used_ring_log(hw, i, log_buf);
323         }
324 }
325
326 #define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
327                 sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
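    /* Route MSI-X: vector 0 to the PCI intr handle, one vector per vring callfd. */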
328 static int
329 vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
330 {
331         int ret;
332         uint32_t i, nr_vring;
333         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
334         struct vfio_irq_set *irq_set;
335         int *fd_ptr;
336         struct rte_vhost_vring vring;
337
338         nr_vring = rte_vhost_get_vring_num(internal->vid);
339
340         irq_set = (struct vfio_irq_set *)irq_set_buf;
341         irq_set->argsz = sizeof(irq_set_buf);
342         irq_set->count = nr_vring + 1;
343         irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
344                          VFIO_IRQ_SET_ACTION_TRIGGER;
345         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
346         irq_set->start = 0;
347         fd_ptr = (int *)&irq_set->data;
348         fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;
349
350         for (i = 0; i < nr_vring; i++) {
351                 rte_vhost_get_vhost_vring(internal->vid, i, &vring);
352                 fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
353         }
354
355         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
356         if (ret) {
357                 DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
358                                 strerror(errno));
359                 return -1;
360         }
361
362         return 0;
363 }
364
365 static int
366 vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
367 {
368         int ret;
369         char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
370         struct vfio_irq_set *irq_set;
371
372         irq_set = (struct vfio_irq_set *)irq_set_buf;
373         irq_set->argsz = sizeof(irq_set_buf);
374         irq_set->count = 0;
375         irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
376         irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
377         irq_set->start = 0;
378
379         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
380         if (ret) {
381                 DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
382                                 strerror(errno));
383                 return -1;
384         }
385
386         return 0;
387 }
388
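    /* Relay thread: epoll on vring kickfds and forward guest kicks to the VF. */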
389 static void *
390 notify_relay(void *arg)
391 {
392         int i, kickfd, epfd, nfds = 0;
393         uint32_t qid, q_num;
394         struct epoll_event events[IFCVF_MAX_QUEUES * 2];
395         struct epoll_event ev;
396         uint64_t buf;
397         int nbytes;
398         struct rte_vhost_vring vring;
399         struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
400         struct ifcvf_hw *hw = &internal->hw;
401
402         q_num = rte_vhost_get_vring_num(internal->vid);
403
404         epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
405         if (epfd < 0) {
406                 DRV_LOG(ERR, "failed to create epoll instance.");
407                 return NULL;
408         }
409         internal->epfd = epfd;
410
411         for (qid = 0; qid < q_num; qid++) {
412                 ev.events = EPOLLIN | EPOLLPRI;
413                 rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
414                 ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
415                 if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
416                         DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
417                         return NULL;
418                 }
419         }
420
421         for (;;) {
422                 nfds = epoll_wait(epfd, events, q_num, -1);
423                 if (nfds < 0) {
424                         if (errno == EINTR)
425                                 continue;
426                         DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
427                         return NULL;
428                 }
429
430                 for (i = 0; i < nfds; i++) {
431                         qid = events[i].data.u32;
432                         kickfd = (uint32_t)(events[i].data.u64 >> 32);
433                         do {
434                                 nbytes = read(kickfd, &buf, 8);
435                                 if (nbytes < 0) {
436                                         if (errno == EINTR ||
437                                             errno == EWOULDBLOCK ||
438                                             errno == EAGAIN)
439                                                 continue;
440                                         DRV_LOG(ERR, "Error reading "
441                                                 "kickfd: %s",
442                                                 strerror(errno));
443                                 }
444                                 break;
445                         } while (1);
446
447                         ifcvf_notify_queue(hw, qid);
448                 }
449         }
450
451         return NULL;
452 }
453
454 static int
455 setup_notify_relay(struct ifcvf_internal *internal)
456 {
457         int ret;
458
459         ret = pthread_create(&internal->tid, NULL, notify_relay,
460                         (void *)internal);
461         if (ret) {
462                 DRV_LOG(ERR, "failed to create notify relay pthread.");
463                 return -1;
464         }
465         return 0;
466 }
467
468 static int
469 unset_notify_relay(struct ifcvf_internal *internal)
470 {
471         void *status;
472
473         if (internal->tid) {
474                 pthread_cancel(internal->tid);
475                 pthread_join(internal->tid, &status);
476         }
477         internal->tid = 0;
478
479         if (internal->epfd >= 0)
480                 close(internal->epfd);
481         internal->epfd = -1;
482
483         return 0;
484 }
485
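    /*
     * Bring the datapath up or down according to the started/dev_attached
     * flags: DMA maps, MSI-X routing, HW rings and the notify relay thread.
     */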
486 static int
487 update_datapath(struct ifcvf_internal *internal)
488 {
489         int ret;
490
491         rte_spinlock_lock(&internal->lock);
492
493         if (!rte_atomic32_read(&internal->running) &&
494             (rte_atomic32_read(&internal->started) &&
495              rte_atomic32_read(&internal->dev_attached))) {
496                 ret = ifcvf_dma_map(internal, 1);
497                 if (ret)
498                         goto err;
499
500                 ret = vdpa_enable_vfio_intr(internal);
501                 if (ret)
502                         goto err;
503
504                 ret = vdpa_ifcvf_start(internal);
505                 if (ret)
506                         goto err;
507
508                 ret = setup_notify_relay(internal);
509                 if (ret)
510                         goto err;
511
512                 rte_atomic32_set(&internal->running, 1);
513         } else if (rte_atomic32_read(&internal->running) &&
514                    (!rte_atomic32_read(&internal->started) ||
515                     !rte_atomic32_read(&internal->dev_attached))) {
516                 ret = unset_notify_relay(internal);
517                 if (ret)
518                         goto err;
519
520                 vdpa_ifcvf_stop(internal);
521
522                 ret = vdpa_disable_vfio_intr(internal);
523                 if (ret)
524                         goto err;
525
526                 ret = ifcvf_dma_map(internal, 0);
527                 if (ret)
528                         goto err;
529
530                 rte_atomic32_set(&internal->running, 0);
531         }
532
533         rte_spinlock_unlock(&internal->lock);
534         return 0;
535 err:
536         rte_spinlock_unlock(&internal->lock);
537         return ret;
538 }
539
540 static int
541 ifcvf_dev_config(int vid)
542 {
543         int did;
544         struct internal_list *list;
545         struct ifcvf_internal *internal;
546
547         did = rte_vhost_get_vdpa_device_id(vid);
548         list = find_internal_resource_by_did(did);
549         if (list == NULL) {
550                 DRV_LOG(ERR, "Invalid device id: %d", did);
551                 return -1;
552         }
553
554         internal = list->internal;
555         internal->vid = vid;
556         rte_atomic32_set(&internal->dev_attached, 1);
557         update_datapath(internal);
558
559         if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
560                 DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);
561
562         return 0;
563 }
564
565 static int
566 ifcvf_dev_close(int vid)
567 {
568         int did;
569         struct internal_list *list;
570         struct ifcvf_internal *internal;
571
572         did = rte_vhost_get_vdpa_device_id(vid);
573         list = find_internal_resource_by_did(did);
574         if (list == NULL) {
575                 DRV_LOG(ERR, "Invalid device id: %d", did);
576                 return -1;
577         }
578
579         internal = list->internal;
580         rte_atomic32_set(&internal->dev_attached, 0);
581         update_datapath(internal);
582
583         return 0;
584 }
585
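    /*
     * Feature negotiation hook: when dirty logging is requested, map the log
     * buffer at IFCVF_LOG_BASE in the VFIO container and enable HW logging.
     */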
586 static int
587 ifcvf_set_features(int vid)
588 {
589         uint64_t features;
590         int did;
591         struct internal_list *list;
592         struct ifcvf_internal *internal;
593         uint64_t log_base, log_size;
594
595         did = rte_vhost_get_vdpa_device_id(vid);
596         list = find_internal_resource_by_did(did);
597         if (list == NULL) {
598                 DRV_LOG(ERR, "Invalid device id: %d", did);
599                 return -1;
600         }
601
602         internal = list->internal;
603         rte_vhost_get_negotiated_features(vid, &features);
604
605         if (RTE_VHOST_NEED_LOG(features)) {
606                 rte_vhost_get_log_base(vid, &log_base, &log_size);
607                 rte_vfio_container_dma_map(internal->vfio_container_fd,
608                                 log_base, IFCVF_LOG_BASE, log_size);
609                 ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
610         }
611
612         return 0;
613 }
614
615 static int
616 ifcvf_get_vfio_group_fd(int vid)
617 {
618         int did;
619         struct internal_list *list;
620
621         did = rte_vhost_get_vdpa_device_id(vid);
622         list = find_internal_resource_by_did(did);
623         if (list == NULL) {
624                 DRV_LOG(ERR, "Invalid device id: %d", did);
625                 return -1;
626         }
627
628         return list->internal->vfio_group_fd;
629 }
630
631 static int
632 ifcvf_get_vfio_device_fd(int vid)
633 {
634         int did;
635         struct internal_list *list;
636
637         did = rte_vhost_get_vdpa_device_id(vid);
638         list = find_internal_resource_by_did(did);
639         if (list == NULL) {
640                 DRV_LOG(ERR, "Invalid device id: %d", did);
641                 return -1;
642         }
643
644         return list->internal->vfio_dev_fd;
645 }
646
647 static int
648 ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
649 {
650         int did;
651         struct internal_list *list;
652         struct ifcvf_internal *internal;
653         struct vfio_region_info reg = { .argsz = sizeof(reg) };
654         int ret;
655
656         did = rte_vhost_get_vdpa_device_id(vid);
657         list = find_internal_resource_by_did(did);
658         if (list == NULL) {
659                 DRV_LOG(ERR, "Invalid device id: %d", did);
660                 return -1;
661         }
662
663         internal = list->internal;
664
665         reg.index = ifcvf_get_notify_region(&internal->hw);
666         ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
667         if (ret) {
668                 DRV_LOG(ERR, "Failed to get device region info: %s",
669                                 strerror(errno));
670                 return -1;
671         }
672
673         *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
674         *size = 0x1000;
675
676         return 0;
677 }
678
679 static int
680 ifcvf_get_queue_num(int did, uint32_t *queue_num)
681 {
682         struct internal_list *list;
683
684         list = find_internal_resource_by_did(did);
685         if (list == NULL) {
686                 DRV_LOG(ERR, "Invalid device id: %d", did);
687                 return -1;
688         }
689
690         *queue_num = list->internal->max_queues;
691
692         return 0;
693 }
694
695 static int
696 ifcvf_get_vdpa_features(int did, uint64_t *features)
697 {
698         struct internal_list *list;
699
700         list = find_internal_resource_by_did(did);
701         if (list == NULL) {
702                 DRV_LOG(ERR, "Invalid device id: %d", did);
703                 return -1;
704         }
705
706         *features = list->internal->features;
707
708         return 0;
709 }
710
711 #define VDPA_SUPPORTED_PROTOCOL_FEATURES \
712                 (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
713                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
714                  1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
715                  1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
716                  1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
717 static int
718 ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
719 {
720         *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
721         return 0;
722 }
723
724 static struct rte_vdpa_dev_ops ifcvf_ops = {
725         .get_queue_num = ifcvf_get_queue_num,
726         .get_features = ifcvf_get_vdpa_features,
727         .get_protocol_features = ifcvf_get_protocol_features,
728         .dev_conf = ifcvf_dev_config,
729         .dev_close = ifcvf_dev_close,
730         .set_vring_state = NULL,
731         .set_features = ifcvf_set_features,
732         .migration_done = NULL,
733         .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
734         .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
735         .get_notify_area = ifcvf_get_notify_area,
736 };
737
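    /*
     * Probe: allocate the driver context, set up VFIO and the HW handle,
     * build the advertised feature set and register the port as a vDPA device.
     */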
738 static int
739 ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
740                 struct rte_pci_device *pci_dev)
741 {
742         uint64_t features;
743         struct ifcvf_internal *internal = NULL;
744         struct internal_list *list = NULL;
745
746         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
747                 return 0;
748
749         list = rte_zmalloc("ifcvf", sizeof(*list), 0);
750         if (list == NULL)
751                 goto error;
752
753         internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
754         if (internal == NULL)
755                 goto error;
756
757         internal->pdev = pci_dev;
758         rte_spinlock_init(&internal->lock);
759
760         if (ifcvf_vfio_setup(internal) < 0) {
761                 DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
762                 goto error;
763         }
764
765         if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
766                 DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
767                 goto error;
768         }
769
770         internal->max_queues = IFCVF_MAX_QUEUES;
771         features = ifcvf_get_features(&internal->hw);
772         internal->features = (features &
773                 ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
774                 (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
775                 (1ULL << VIRTIO_NET_F_CTRL_VQ) |
776                 (1ULL << VIRTIO_NET_F_STATUS) |
777                 (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
778                 (1ULL << VHOST_F_LOG_ALL);
779
780         internal->dev_addr.pci_addr = pci_dev->addr;
781         internal->dev_addr.type = PCI_ADDR;
782         list->internal = internal;
783
784         internal->did = rte_vdpa_register_device(&internal->dev_addr,
785                                 &ifcvf_ops);
786         if (internal->did < 0) {
787                 DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
788                 goto error;
789         }
790
791         pthread_mutex_lock(&internal_list_lock);
792         TAILQ_INSERT_TAIL(&internal_list, list, next);
793         pthread_mutex_unlock(&internal_list_lock);
794
795         rte_atomic32_set(&internal->started, 1);
796         update_datapath(internal);
797
798         return 0;
799
800 error:
801         rte_free(list);
802         rte_free(internal);
803         return -1;
804 }
805
806 static int
807 ifcvf_pci_remove(struct rte_pci_device *pci_dev)
808 {
809         struct ifcvf_internal *internal;
810         struct internal_list *list;
811
812         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
813                 return 0;
814
815         list = find_internal_resource_by_dev(pci_dev);
816         if (list == NULL) {
817                 DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
818                 return -1;
819         }
820
821         internal = list->internal;
822         rte_atomic32_set(&internal->started, 0);
823         update_datapath(internal);
824
825         rte_pci_unmap_device(internal->pdev);
826         rte_vfio_container_destroy(internal->vfio_container_fd);
827         rte_vdpa_unregister_device(internal->did);
828
829         pthread_mutex_lock(&internal_list_lock);
830         TAILQ_REMOVE(&internal_list, list, next);
831         pthread_mutex_unlock(&internal_list_lock);
832
833         rte_free(list);
834         rte_free(internal);
835
836         return 0;
837 }
838
839 /*
840  * IFCVF has the same vendor ID and device ID as virtio net PCI
841  * device, but uses its own subsystem vendor ID and device ID.
842  */
843 static const struct rte_pci_id pci_id_ifcvf_map[] = {
844         { .class_id = RTE_CLASS_ANY_ID,
845           .vendor_id = IFCVF_VENDOR_ID,
846           .device_id = IFCVF_DEVICE_ID,
847           .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
848           .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
849         },
850
851         { .vendor_id = 0, /* sentinel */
852         },
853 };
854
855 static struct rte_pci_driver rte_ifcvf_vdpa = {
856         .id_table = pci_id_ifcvf_map,
857         .drv_flags = 0,
858         .probe = ifcvf_pci_probe,
859         .remove = ifcvf_pci_remove,
860 };
861
862 RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
863 RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
864 RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");
865
866 RTE_INIT(ifcvf_vdpa_init_log)
867 {
868         ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
869         if (ifcvf_vdpa_logtype >= 0)
870                 rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
871 }