net/ifc: detect if VDPA mode is specified
drivers/net/ifc/ifcvf_vdpa.c
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <stdlib.h>
#include <limits.h>
#include <inttypes.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
	rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
		"IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IFCVF_VDPA_MODE		"vdpa"

static const char * const ifcvf_valid_arguments[] = {
	IFCVF_VDPA_MODE,
	NULL
};

static int ifcvf_vdpa_logtype;

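/* Per-device driver context: VFIO handles, HW layer state and datapath flags. */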
struct ifcvf_internal {
	struct rte_vdpa_dev_addr dev_addr;
	struct rte_pci_device *pdev;
	struct ifcvf_hw hw;
	int vfio_container_fd;
	int vfio_group_fd;
	int vfio_dev_fd;
	pthread_t tid;	/* thread for notify relay */
	int epfd;
	int vid;
	int did;
	uint16_t max_queues;
	uint64_t features;
	rte_atomic32_t started;
	rte_atomic32_t dev_attached;
	rte_atomic32_t running;
	rte_spinlock_t lock;
};

struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

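/* Find the list entry that owns the given vDPA device id. */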
static struct internal_list *
find_internal_resource_by_did(int did)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (did == list->internal->did) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

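/* Find the list entry that owns the given PCI device. */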
static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
	int found = 0;
	struct internal_list *list;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		if (pdev == list->internal->pdev) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}

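/*
 * Create a dedicated VFIO container for the device, bind its IOMMU group
 * to that container, map the PCI BARs and cache them in the HW layer.
 */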
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
	struct rte_pci_device *dev = internal->pdev;
	char devname[RTE_DEV_NAME_MAX_LEN] = {0};
	int iommu_group_num;
	int i;

	internal->vfio_dev_fd = -1;
	internal->vfio_group_fd = -1;
	internal->vfio_container_fd = -1;

	rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
	rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
			&iommu_group_num);

	internal->vfio_container_fd = rte_vfio_container_create();
	if (internal->vfio_container_fd < 0)
		return -1;

	internal->vfio_group_fd = rte_vfio_container_group_bind(
			internal->vfio_container_fd, iommu_group_num);
	if (internal->vfio_group_fd < 0)
		goto err;

	if (rte_pci_map_device(dev))
		goto err;

	internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

	for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
			i++) {
		internal->hw.mem_resource[i].addr =
			internal->pdev->mem_resource[i].addr;
		internal->hw.mem_resource[i].phys_addr =
			internal->pdev->mem_resource[i].phys_addr;
		internal->hw.mem_resource[i].len =
			internal->pdev->mem_resource[i].len;
	}

	return 0;

err:
	rte_vfio_container_destroy(internal->vfio_container_fd);
	return -1;
}

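/*
 * Set up (do_map != 0) or tear down (do_map == 0) the IOMMU mappings for
 * all guest memory regions, so the device can DMA with guest physical
 * addresses directly.
 */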
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
	uint32_t i;
	int ret;
	struct rte_vhost_memory *mem = NULL;
	int vfio_container_fd;

	ret = rte_vhost_get_mem_table(internal->vid, &mem);
	if (ret < 0) {
		DRV_LOG(ERR, "failed to get VM memory layout.");
		goto exit;
	}

	vfio_container_fd = internal->vfio_container_fd;

	for (i = 0; i < mem->nregions; i++) {
		struct rte_vhost_mem_region *reg;

		reg = &mem->regions[i];
		DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
			"GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
			do_map ? "DMA map" : "DMA unmap", i,
			reg->host_user_addr, reg->guest_phys_addr, reg->size);

		if (do_map) {
			ret = rte_vfio_container_dma_map(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA map failed.");
				goto exit;
			}
		} else {
			ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				reg->host_user_addr, reg->guest_phys_addr,
				reg->size);
			if (ret < 0) {
				DRV_LOG(ERR, "DMA unmap failed.");
				goto exit;
			}
		}
	}

exit:
	if (mem)
		free(mem);
	return ret;
}

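/*
 * Translate a host virtual address to a guest physical address by walking
 * the vhost memory table; returns 0 if no region covers the address.
 */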
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *reg;
	uint32_t i;
	uint64_t gpa = 0;

	if (rte_vhost_get_mem_table(vid, &mem) < 0)
		goto exit;

	for (i = 0; i < mem->nregions; i++) {
		reg = &mem->regions[i];

		if (hva >= reg->host_user_addr &&
				hva < reg->host_user_addr + reg->size) {
			gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
			break;
		}
	}

exit:
	if (mem)
		free(mem);
	return gpa;
}

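/*
 * Fetch the negotiated features and per-vring addresses from the vhost
 * backend, translate the ring addresses to GPA, then start the hardware.
 */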
static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	int i, nr_vring;
	int vid;
	struct rte_vhost_vring vq;
	uint64_t gpa;

	vid = internal->vid;
	nr_vring = rte_vhost_get_vring_num(vid);
	rte_vhost_get_negotiated_features(vid, &hw->req_features);

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(vid, i, &vq);
		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for descriptor ring.");
			return -1;
		}
		hw->vring[i].desc = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for available ring.");
			return -1;
		}
		hw->vring[i].avail = gpa;

		gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
		if (gpa == 0) {
			DRV_LOG(ERR, "Fail to get GPA for used ring.");
			return -1;
		}
		hw->vring[i].used = gpa;

		hw->vring[i].size = vq.size;
		rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
				&hw->vring[i].last_used_idx);
	}
	hw->nr_vring = i;

	return ifcvf_start_hw(&internal->hw);
}

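/* Set the dirty bitmap bits that cover the pages of one used ring. */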
static void
ifcvf_used_ring_log(struct ifcvf_hw *hw, uint32_t queue, uint8_t *log_buf)
{
	uint32_t i, size;
	uint64_t pfn;

	pfn = hw->vring[queue].used / PAGE_SIZE;
	size = hw->vring[queue].size * sizeof(struct vring_used_elem) +
			sizeof(uint16_t) * 3;

	for (i = 0; i <= size / PAGE_SIZE; i++)
		__sync_fetch_and_or_8(&log_buf[(pfn + i) / 8],
				1 << ((pfn + i) % 8));
}

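/*
 * Stop the hardware, sync the vring indexes back to vhost, and mark the
 * used rings dirty when live migration requires logging.
 */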
static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
	struct ifcvf_hw *hw = &internal->hw;
	uint32_t i;
	int vid;
	uint64_t features;
	uint64_t log_base, log_size;
	uint8_t *log_buf;

	vid = internal->vid;
	ifcvf_stop_hw(hw);

	for (i = 0; i < hw->nr_vring; i++)
		rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
				hw->vring[i].last_used_idx);

	rte_vhost_get_negotiated_features(vid, &features);
	if (RTE_VHOST_NEED_LOG(features)) {
		ifcvf_disable_logging(hw);
		rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
		rte_vfio_container_dma_unmap(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		/*
		 * IFCVF marks dirty memory pages for packet buffers only,
		 * so software marks the used rings as dirty after the
		 * device stops.
		 */
		log_buf = (uint8_t *)(uintptr_t)log_base;
		for (i = 0; i < hw->nr_vring; i++)
			ifcvf_used_ring_log(hw, i, log_buf);
	}
}

#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
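/*
 * Route the device's MSI-X vectors to the vrings' callfds via VFIO
 * (vector 0 goes to the PCI interrupt handle fd), so device interrupts
 * can be injected into the guest directly.
 */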
static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	uint32_t i, nr_vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;
	int *fd_ptr;
	struct rte_vhost_vring vring;

	nr_vring = rte_vhost_get_vring_num(internal->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
			 VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;
	fd_ptr = (int *)&irq_set->data;
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(internal->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
	}

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

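/* Detach all eventfds from the device's MSI-X vectors. */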
static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
	int ret;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
	struct vfio_irq_set *irq_set;

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = 0;
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret) {
		DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
				strerror(errno));
		return -1;
	}

	return 0;
}

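/*
 * Notify-relay thread: wait on all vring kickfds with epoll and turn each
 * guest kick into a write to the device's queue notify register.
 */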
static void *
notify_relay(void *arg)
{
	int i, kickfd, epfd, nfds = 0;
	uint32_t qid, q_num;
	struct epoll_event events[IFCVF_MAX_QUEUES * 2];
	struct epoll_event ev;
	uint64_t buf;
	int nbytes;
	struct rte_vhost_vring vring;
	struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
	struct ifcvf_hw *hw = &internal->hw;

	q_num = rte_vhost_get_vring_num(internal->vid);

	epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
	if (epfd < 0) {
		DRV_LOG(ERR, "failed to create epoll instance.");
		return NULL;
	}
	internal->epfd = epfd;

	for (qid = 0; qid < q_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
			DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
			return NULL;
		}
	}

	for (;;) {
		nfds = epoll_wait(epfd, events, q_num, -1);
		if (nfds < 0) {
			if (errno == EINTR)
				continue;
			DRV_LOG(ERR, "epoll_wait returned failure.");
			return NULL;
		}

		for (i = 0; i < nfds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);
			do {
				nbytes = read(kickfd, &buf, 8);
				if (nbytes < 0) {
					if (errno == EINTR ||
					    errno == EWOULDBLOCK ||
					    errno == EAGAIN)
						continue;
					DRV_LOG(INFO, "Error reading "
						"kickfd: %s",
						strerror(errno));
				}
				break;
			} while (1);

			ifcvf_notify_queue(hw, qid);
		}
	}

	return NULL;
}

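/* Spawn the notify-relay thread for this device. */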
static int
setup_notify_relay(struct ifcvf_internal *internal)
{
	int ret;

	ret = pthread_create(&internal->tid, NULL, notify_relay,
			(void *)internal);
	if (ret) {
		DRV_LOG(ERR, "failed to create notify relay pthread.");
		return -1;
	}
	return 0;
}

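/* Stop the notify-relay thread and close its epoll instance. */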
static int
unset_notify_relay(struct ifcvf_internal *internal)
{
	void *status;

	if (internal->tid) {
		pthread_cancel(internal->tid);
		pthread_join(internal->tid, &status);
	}
	internal->tid = 0;

	if (internal->epfd >= 0)
		close(internal->epfd);
	internal->epfd = -1;

	return 0;
}

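/*
 * Reconcile the datapath with the state flags: bring it up (DMA map,
 * interrupts, HW start, notify relay) once both 'started' and
 * 'dev_attached' are set, and tear it down in reverse order as soon as
 * either flag is cleared.
 */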
static int
update_datapath(struct ifcvf_internal *internal)
{
	int ret;

	rte_spinlock_lock(&internal->lock);

	if (!rte_atomic32_read(&internal->running) &&
	    (rte_atomic32_read(&internal->started) &&
	     rte_atomic32_read(&internal->dev_attached))) {
		ret = ifcvf_dma_map(internal, 1);
		if (ret)
			goto err;

		ret = vdpa_enable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = vdpa_ifcvf_start(internal);
		if (ret)
			goto err;

		ret = setup_notify_relay(internal);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 1);
	} else if (rte_atomic32_read(&internal->running) &&
		   (!rte_atomic32_read(&internal->started) ||
		    !rte_atomic32_read(&internal->dev_attached))) {
		ret = unset_notify_relay(internal);
		if (ret)
			goto err;

		vdpa_ifcvf_stop(internal);

		ret = vdpa_disable_vfio_intr(internal);
		if (ret)
			goto err;

		ret = ifcvf_dma_map(internal, 0);
		if (ret)
			goto err;

		rte_atomic32_set(&internal->running, 0);
	}

	rte_spinlock_unlock(&internal->lock);
	return 0;
err:
	rte_spinlock_unlock(&internal->lock);
	return ret;
}

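/* vhost dev_conf callback: attach the vhost device and bring up the datapath. */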
static int
ifcvf_dev_config(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	internal->vid = vid;
	rte_atomic32_set(&internal->dev_attached, 1);
	update_datapath(internal);

	if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
		DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

	return 0;
}

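/* vhost dev_close callback: detach the vhost device and stop the datapath. */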
static int
ifcvf_dev_close(int vid)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->dev_attached, 0);
	update_datapath(internal);

	return 0;
}

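/*
 * vhost set_features callback: when the negotiated features require dirty
 * page logging, map the log buffer for device DMA and enable logging.
 */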
static int
ifcvf_set_features(int vid)
{
	uint64_t features;
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	uint64_t log_base, log_size;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;
	rte_vhost_get_negotiated_features(vid, &features);

	if (RTE_VHOST_NEED_LOG(features)) {
		rte_vhost_get_log_base(vid, &log_base, &log_size);
		rte_vfio_container_dma_map(internal->vfio_container_fd,
				log_base, IFCVF_LOG_BASE, log_size);
		ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
	}

	return 0;
}

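/* Expose the device's VFIO group and device fds to the vhost layer. */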
static int
ifcvf_get_vfio_group_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
	int did;
	struct internal_list *list;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	return list->internal->vfio_dev_fd;
}

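/*
 * Report the offset and size of the queue notify area within the device's
 * VFIO region, so that vhost can mmap it as a host notifier.
 */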
static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
	int did;
	struct internal_list *list;
	struct ifcvf_internal *internal;
	struct vfio_region_info reg = { .argsz = sizeof(reg) };
	int ret;

	did = rte_vhost_get_vdpa_device_id(vid);
	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	internal = list->internal;

	reg.index = ifcvf_get_notify_region(&internal->hw);
	ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
	if (ret) {
		DRV_LOG(ERR, "Can not get device region info: %s",
				strerror(errno));
		return -1;
	}

	*offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
	*size = 0x1000;

	return 0;
}

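/* Report device capabilities (queue count, features) to the vhost library. */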
static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*queue_num = list->internal->max_queues;

	return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
	struct internal_list *list;

	list = find_internal_resource_by_did(did);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device id: %d", did);
		return -1;
	}

	*features = list->internal->features;

	return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
		(1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
		 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
		 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
		 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
	*features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
	return 0;
}

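/* vDPA device ops registered with the vDPA framework. */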
static struct rte_vdpa_dev_ops ifcvf_ops = {
	.get_queue_num = ifcvf_get_queue_num,
	.get_features = ifcvf_get_vdpa_features,
	.get_protocol_features = ifcvf_get_protocol_features,
	.dev_conf = ifcvf_dev_config,
	.dev_close = ifcvf_dev_close,
	.set_vring_state = NULL,
	.set_features = ifcvf_set_features,
	.migration_done = NULL,
	.get_vfio_group_fd = ifcvf_get_vfio_group_fd,
	.get_vfio_device_fd = ifcvf_get_vfio_device_fd,
	.get_notify_area = ifcvf_get_notify_area,
};

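/* Kvargs handler: parse an unsigned integer devarg value. */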
static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
	uint16_t *n = extra_args;

	if (value == NULL || extra_args == NULL)
		return -EINVAL;

	*n = (uint16_t)strtoul(value, NULL, 0);
	if (*n == USHRT_MAX && errno == ERANGE)
		return -1;

	return 0;
}

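/*
 * PCI probe: take the device only when vdpa mode is requested in the
 * devargs (e.g. a whitelist entry like "<BDF>,vdpa=1"); otherwise return
 * a positive value so the device is left to other drivers.
 */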
static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
		struct rte_pci_device *pci_dev)
{
	uint64_t features;
	struct ifcvf_internal *internal = NULL;
	struct internal_list *list = NULL;
	uint16_t vdpa_mode = 0;
	struct rte_kvargs *kvlist = NULL;
	int ret = 0;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	if (pci_dev->device.devargs == NULL)
		return 1;

	kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
			ifcvf_valid_arguments);
	if (kvlist == NULL)
		return 1;

	/* probe only when vdpa mode is specified */
	if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
			&vdpa_mode);
	if (ret < 0 || vdpa_mode == 0) {
		rte_kvargs_free(kvlist);
		return 1;
	}

	list = rte_zmalloc("ifcvf", sizeof(*list), 0);
	if (list == NULL)
		goto error;

	internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
	if (internal == NULL)
		goto error;

	internal->pdev = pci_dev;
	rte_spinlock_init(&internal->lock);

	if (ifcvf_vfio_setup(internal) < 0) {
		DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
		goto error;
	}

	if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
		DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
		goto error;
	}

	internal->max_queues = IFCVF_MAX_QUEUES;
	features = ifcvf_get_features(&internal->hw);
	internal->features = (features &
		~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
		(1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
		(1ULL << VIRTIO_NET_F_CTRL_VQ) |
		(1ULL << VIRTIO_NET_F_STATUS) |
		(1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
		(1ULL << VHOST_F_LOG_ALL);

	internal->dev_addr.pci_addr = pci_dev->addr;
	internal->dev_addr.type = PCI_ADDR;
	list->internal = internal;

	internal->did = rte_vdpa_register_device(&internal->dev_addr,
				&ifcvf_ops);
	if (internal->did < 0) {
		DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
		goto error;
	}

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_INSERT_TAIL(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_atomic32_set(&internal->started, 1);
	update_datapath(internal);

	rte_kvargs_free(kvlist);
	return 0;

error:
	rte_kvargs_free(kvlist);
	rte_free(list);
	rte_free(internal);
	return -1;
}

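/*
 * PCI remove: stop the datapath, release VFIO resources and unregister
 * the vDPA device.
 */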
static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
	struct ifcvf_internal *internal;
	struct internal_list *list;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	list = find_internal_resource_by_dev(pci_dev);
	if (list == NULL) {
		DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
		return -1;
	}

	internal = list->internal;
	rte_atomic32_set(&internal->started, 0);
	update_datapath(internal);

	rte_pci_unmap_device(internal->pdev);
	rte_vfio_container_destroy(internal->vfio_container_fd);
	rte_vdpa_unregister_device(internal->did);

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_REMOVE(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_free(list);
	rte_free(internal);

	return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as a virtio net PCI device,
 * and is identified by its specific subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
	{ .class_id = RTE_CLASS_ANY_ID,
	  .vendor_id = IFCVF_VENDOR_ID,
	  .device_id = IFCVF_DEVICE_ID,
	  .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
	  .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
	},

	{ .vendor_id = 0, /* sentinel */
	},
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
	.id_table = pci_id_ifcvf_map,
	.drv_flags = 0,
	.probe = ifcvf_pci_probe,
	.remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
	ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
	if (ifcvf_vdpa_logtype >= 0)
		rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}