/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2018 Intel Corporation
 */

#include <unistd.h>
#include <pthread.h>
#include <fcntl.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>
#include <limits.h>
#include <sys/ioctl.h>
#include <sys/epoll.h>
#include <linux/virtio_net.h>
#include <stdbool.h>

#include <rte_malloc.h>
#include <rte_memory.h>
#include <rte_bus_pci.h>
#include <rte_vhost.h>
#include <rte_vdpa.h>
#include <rte_vfio.h>
#include <rte_spinlock.h>
#include <rte_log.h>
#include <rte_kvargs.h>
#include <rte_devargs.h>

#include "base/ifcvf.h"

#define DRV_LOG(level, fmt, args...) \
        rte_log(RTE_LOG_ ## level, ifcvf_vdpa_logtype, \
                "IFCVF %s(): " fmt "\n", __func__, ##args)

#ifndef PAGE_SIZE
#define PAGE_SIZE 4096
#endif

#define IFCVF_VDPA_MODE         "vdpa"
#define IFCVF_SW_FALLBACK_LM    "sw-live-migration"

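/*
 * Devargs accepted by this driver. As an illustrative (not authoritative)
 * example, a vDPA-mode PCI whitelist entry with the SW live-migration
 * fallback enabled could look like:
 *   -w 0000:06:00.3,vdpa=1,sw-live-migration=1
 */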
static const char * const ifcvf_valid_arguments[] = {
        IFCVF_VDPA_MODE,
        IFCVF_SW_FALLBACK_LM,
        NULL
};

static int ifcvf_vdpa_logtype;

struct ifcvf_internal {
        struct rte_vdpa_dev_addr dev_addr;
        struct rte_pci_device *pdev;
        struct ifcvf_hw hw;
        int vfio_container_fd;
        int vfio_group_fd;
        int vfio_dev_fd;
        pthread_t tid;  /* thread for notify relay */
        int epfd;
        int vid;
        int did;
        uint16_t max_queues;
        uint64_t features;
        rte_atomic32_t started;
        rte_atomic32_t dev_attached;
        rte_atomic32_t running;
        rte_spinlock_t lock;
        bool sw_lm;
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct ifcvf_internal *internal;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct internal_list *
find_internal_resource_by_did(int did)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (did == list->internal->did) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

static struct internal_list *
find_internal_resource_by_dev(struct rte_pci_device *pdev)
{
        int found = 0;
        struct internal_list *list;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                if (pdev == list->internal->pdev) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

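/*
 * Set up a dedicated VFIO container for the device: resolve its IOMMU
 * group, bind the group to a freshly created container, map the PCI
 * BARs, and cache the device fd and BAR addresses for later use by the
 * vDPA datapath.
 */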
static int
ifcvf_vfio_setup(struct ifcvf_internal *internal)
{
        struct rte_pci_device *dev = internal->pdev;
        char devname[RTE_DEV_NAME_MAX_LEN] = {0};
        int iommu_group_num;
        int i;

        internal->vfio_dev_fd = -1;
        internal->vfio_group_fd = -1;
        internal->vfio_container_fd = -1;

        rte_pci_device_name(&dev->addr, devname, RTE_DEV_NAME_MAX_LEN);
        rte_vfio_get_group_num(rte_pci_get_sysfs_path(), devname,
                        &iommu_group_num);

        internal->vfio_container_fd = rte_vfio_container_create();
        if (internal->vfio_container_fd < 0)
                return -1;

        internal->vfio_group_fd = rte_vfio_container_group_bind(
                        internal->vfio_container_fd, iommu_group_num);
        if (internal->vfio_group_fd < 0)
                goto err;

        if (rte_pci_map_device(dev))
                goto err;

        internal->vfio_dev_fd = dev->intr_handle.vfio_dev_fd;

        for (i = 0; i < RTE_MIN(PCI_MAX_RESOURCE, IFCVF_PCI_MAX_RESOURCE);
                        i++) {
                internal->hw.mem_resource[i].addr =
                        internal->pdev->mem_resource[i].addr;
                internal->hw.mem_resource[i].phys_addr =
                        internal->pdev->mem_resource[i].phys_addr;
                internal->hw.mem_resource[i].len =
                        internal->pdev->mem_resource[i].len;
        }

        return 0;

err:
        rte_vfio_container_destroy(internal->vfio_container_fd);
        return -1;
}

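/*
 * Map (do_map == 1) or unmap (do_map == 0) the guest's memory regions
 * in the device's VFIO container, using each region's guest physical
 * address as the IOVA. This identity GPA->IOVA layout lets the device
 * DMA directly on the guest physical addresses programmed into the
 * vrings.
 */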
static int
ifcvf_dma_map(struct ifcvf_internal *internal, int do_map)
{
        uint32_t i;
        int ret;
        struct rte_vhost_memory *mem = NULL;
        int vfio_container_fd;

        ret = rte_vhost_get_mem_table(internal->vid, &mem);
        if (ret < 0) {
                DRV_LOG(ERR, "failed to get VM memory layout.");
                goto exit;
        }

        vfio_container_fd = internal->vfio_container_fd;

        for (i = 0; i < mem->nregions; i++) {
                struct rte_vhost_mem_region *reg;

                reg = &mem->regions[i];
                DRV_LOG(INFO, "%s, region %u: HVA 0x%" PRIx64 ", "
                        "GPA 0x%" PRIx64 ", size 0x%" PRIx64 ".",
                        do_map ? "DMA map" : "DMA unmap", i,
                        reg->host_user_addr, reg->guest_phys_addr, reg->size);

                if (do_map) {
                        ret = rte_vfio_container_dma_map(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA map failed.");
                                goto exit;
                        }
                } else {
                        ret = rte_vfio_container_dma_unmap(vfio_container_fd,
                                reg->host_user_addr, reg->guest_phys_addr,
                                reg->size);
                        if (ret < 0) {
                                DRV_LOG(ERR, "DMA unmap failed.");
                                goto exit;
                        }
                }
        }

exit:
        if (mem)
                free(mem);
        return ret;
}

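/*
 * Translate a host virtual address to a guest physical address by
 * walking the vhost memory table. For example, a region with
 * host_user_addr = 0x7f0000000000, guest_phys_addr = 0x40000000 and
 * size = 0x10000000 maps HVA 0x7f0000001000 to GPA 0x40001000.
 * Returns 0 if the address belongs to no region; callers treat 0 as
 * failure.
 */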
static uint64_t
hva_to_gpa(int vid, uint64_t hva)
{
        struct rte_vhost_memory *mem = NULL;
        struct rte_vhost_mem_region *reg;
        uint32_t i;
        uint64_t gpa = 0;

        if (rte_vhost_get_mem_table(vid, &mem) < 0)
                goto exit;

        for (i = 0; i < mem->nregions; i++) {
                reg = &mem->regions[i];

                if (hva >= reg->host_user_addr &&
                                hva < reg->host_user_addr + reg->size) {
                        gpa = hva - reg->host_user_addr + reg->guest_phys_addr;
                        break;
                }
        }

exit:
        if (mem)
                free(mem);
        return gpa;
}

static int
vdpa_ifcvf_start(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        int i, nr_vring;
        int vid;
        struct rte_vhost_vring vq;
        uint64_t gpa;

        vid = internal->vid;
        nr_vring = rte_vhost_get_vring_num(vid);
        rte_vhost_get_negotiated_features(vid, &hw->req_features);

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(vid, i, &vq);
                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.desc);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for descriptor ring.");
                        return -1;
                }
                hw->vring[i].desc = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.avail);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for available ring.");
                        return -1;
                }
                hw->vring[i].avail = gpa;

                gpa = hva_to_gpa(vid, (uint64_t)(uintptr_t)vq.used);
                if (gpa == 0) {
                        DRV_LOG(ERR, "Failed to get GPA for used ring.");
                        return -1;
                }
                hw->vring[i].used = gpa;

                hw->vring[i].size = vq.size;
                rte_vhost_get_vring_base(vid, i, &hw->vring[i].last_avail_idx,
                                &hw->vring[i].last_used_idx);
        }
        hw->nr_vring = i;

        return ifcvf_start_hw(&internal->hw);
}

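/*
 * Mark every page touched by a used ring as dirty in the vhost log.
 * The log is a bitmap with one bit per PAGE_SIZE page of guest memory.
 * A ring of "size" entries occupies size * sizeof(struct
 * vring_used_elem) bytes plus three uint16_t fields (flags, idx and
 * avail event), e.g. 256 * 8 + 6 = 2054 bytes for a 256-entry ring.
 */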
static void
ifcvf_used_ring_log(struct ifcvf_hw *hw, uint32_t queue, uint8_t *log_buf)
{
        uint32_t i, size;
        uint64_t pfn;

        pfn = hw->vring[queue].used / PAGE_SIZE;
        size = hw->vring[queue].size * sizeof(struct vring_used_elem) +
                        sizeof(uint16_t) * 3;

        for (i = 0; i <= size / PAGE_SIZE; i++)
                __sync_fetch_and_or_8(&log_buf[(pfn + i) / 8],
                                1 << ((pfn + i) % 8));
}

static void
vdpa_ifcvf_stop(struct ifcvf_internal *internal)
{
        struct ifcvf_hw *hw = &internal->hw;
        uint32_t i;
        int vid;
        uint64_t features;
        uint64_t log_base, log_size;
        uint8_t *log_buf;

        vid = internal->vid;
        ifcvf_stop_hw(hw);

        for (i = 0; i < hw->nr_vring; i++)
                rte_vhost_set_vring_base(vid, i, hw->vring[i].last_avail_idx,
                                hw->vring[i].last_used_idx);

        rte_vhost_get_negotiated_features(vid, &features);
        if (RTE_VHOST_NEED_LOG(features)) {
                ifcvf_disable_logging(hw);
                rte_vhost_get_log_base(internal->vid, &log_base, &log_size);
                rte_vfio_container_dma_unmap(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                /*
                 * IFCVF marks dirty pages only for packet buffers, so
                 * SW marks the pages of the used rings as dirty after
                 * the device stops.
                 */
                log_buf = (uint8_t *)(uintptr_t)log_base;
                for (i = 0; i < hw->nr_vring; i++)
                        ifcvf_used_ring_log(hw, i, log_buf);
        }
}

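/*
 * The VFIO_DEVICE_SET_IRQS buffer is a struct vfio_irq_set header
 * followed by an eventfd array: slot 0 carries the device's
 * config/misc interrupt fd, and slots 1..nr_vring carry one callfd
 * per vring so that queue interrupts reach the vhost frontend
 * directly.
 */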
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
                sizeof(int) * (IFCVF_MAX_QUEUES * 2 + 1))
static int
vdpa_enable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        uint32_t i, nr_vring;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;
        int *fd_ptr;
        struct rte_vhost_vring vring;

        nr_vring = rte_vhost_get_vring_num(internal->vid);

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = nr_vring + 1;
        irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD |
                         VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;
        fd_ptr = (int *)&irq_set->data;
        fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = internal->pdev->intr_handle.fd;

        for (i = 0; i < nr_vring; i++) {
                rte_vhost_get_vhost_vring(internal->vid, i, &vring);
                fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
        }

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error enabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

static int
vdpa_disable_vfio_intr(struct ifcvf_internal *internal)
{
        int ret;
        char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];
        struct vfio_irq_set *irq_set;

        irq_set = (struct vfio_irq_set *)irq_set_buf;
        irq_set->argsz = sizeof(irq_set_buf);
        irq_set->count = 0;
        irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
        irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
        irq_set->start = 0;

        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
        if (ret) {
                DRV_LOG(ERR, "Error disabling MSI-X interrupts: %s",
                                strerror(errno));
                return -1;
        }

        return 0;
}

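/*
 * Kick relay thread: wait on every vring's kickfd with epoll and
 * forward each guest doorbell to the device. The epoll user data
 * packs the queue id into the low 32 bits and the kickfd into the
 * high 32 bits of a single u64, so one event carries both.
 */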
static void *
notify_relay(void *arg)
{
        int i, kickfd, epfd, nfds = 0;
        uint32_t qid, q_num;
        struct epoll_event events[IFCVF_MAX_QUEUES * 2];
        struct epoll_event ev;
        uint64_t buf;
        int nbytes;
        struct rte_vhost_vring vring;
        struct ifcvf_internal *internal = (struct ifcvf_internal *)arg;
        struct ifcvf_hw *hw = &internal->hw;

        q_num = rte_vhost_get_vring_num(internal->vid);

        epfd = epoll_create(IFCVF_MAX_QUEUES * 2);
        if (epfd < 0) {
                DRV_LOG(ERR, "failed to create epoll instance.");
                return NULL;
        }
        internal->epfd = epfd;

        for (qid = 0; qid < q_num; qid++) {
                ev.events = EPOLLIN | EPOLLPRI;
                rte_vhost_get_vhost_vring(internal->vid, qid, &vring);
                ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
                if (epoll_ctl(epfd, EPOLL_CTL_ADD, vring.kickfd, &ev) < 0) {
                        DRV_LOG(ERR, "epoll add error: %s", strerror(errno));
                        return NULL;
                }
        }

        for (;;) {
                nfds = epoll_wait(epfd, events, q_num, -1);
                if (nfds < 0) {
                        if (errno == EINTR)
                                continue;
                        DRV_LOG(ERR, "epoll_wait failed: %s", strerror(errno));
                        return NULL;
                }

                for (i = 0; i < nfds; i++) {
                        qid = events[i].data.u32;
                        kickfd = (uint32_t)(events[i].data.u64 >> 32);
                        do {
                                nbytes = read(kickfd, &buf, 8);
                                if (nbytes < 0) {
                                        if (errno == EINTR ||
                                            errno == EWOULDBLOCK ||
                                            errno == EAGAIN)
                                                continue;
                                        DRV_LOG(INFO, "Error reading "
                                                "kickfd: %s",
                                                strerror(errno));
                                }
                                break;
                        } while (1);

                        ifcvf_notify_queue(hw, qid);
                }
        }

        return NULL;
}

static int
setup_notify_relay(struct ifcvf_internal *internal)
{
        int ret;

        ret = pthread_create(&internal->tid, NULL, notify_relay,
                        (void *)internal);
        if (ret) {
                DRV_LOG(ERR, "failed to create notify relay pthread.");
                return -1;
        }
        return 0;
}

static int
unset_notify_relay(struct ifcvf_internal *internal)
{
        void *status;

        if (internal->tid) {
                pthread_cancel(internal->tid);
                pthread_join(internal->tid, &status);
        }
        internal->tid = 0;

        if (internal->epfd >= 0)
                close(internal->epfd);
        internal->epfd = -1;

        return 0;
}

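/*
 * Reconcile the datapath with the driver state: bring the HW datapath
 * up when the device is both started (probed) and attached (vhost
 * configured) but not yet running, and tear it down when it is running
 * but either flag has been cleared. Serialized by internal->lock.
 */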
static int
update_datapath(struct ifcvf_internal *internal)
{
        int ret;

        rte_spinlock_lock(&internal->lock);

        if (!rte_atomic32_read(&internal->running) &&
            (rte_atomic32_read(&internal->started) &&
             rte_atomic32_read(&internal->dev_attached))) {
                ret = ifcvf_dma_map(internal, 1);
                if (ret)
                        goto err;

                ret = vdpa_enable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = vdpa_ifcvf_start(internal);
                if (ret)
                        goto err;

                ret = setup_notify_relay(internal);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 1);
        } else if (rte_atomic32_read(&internal->running) &&
                   (!rte_atomic32_read(&internal->started) ||
                    !rte_atomic32_read(&internal->dev_attached))) {
                ret = unset_notify_relay(internal);
                if (ret)
                        goto err;

                vdpa_ifcvf_stop(internal);

                ret = vdpa_disable_vfio_intr(internal);
                if (ret)
                        goto err;

                ret = ifcvf_dma_map(internal, 0);
                if (ret)
                        goto err;

                rte_atomic32_set(&internal->running, 0);
        }

        rte_spinlock_unlock(&internal->lock);
        return 0;
err:
        rte_spinlock_unlock(&internal->lock);
        return ret;
}

static int
ifcvf_dev_config(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        internal->vid = vid;
        rte_atomic32_set(&internal->dev_attached, 1);
        update_datapath(internal);

        if (rte_vhost_host_notifier_ctrl(vid, true) != 0)
                DRV_LOG(NOTICE, "vDPA (%d): software relay is used.", did);

        return 0;
}

static int
ifcvf_dev_close(int vid)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->dev_attached, 0);
        update_datapath(internal);

        return 0;
}

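/*
 * When the negotiated features require dirty logging, map the vhost
 * log buffer into the VFIO container at the fixed IOVA IFCVF_LOG_BASE
 * and enable HW dirty-page logging, so the device can report written
 * pages during live migration.
 */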
static int
ifcvf_set_features(int vid)
{
        uint64_t features;
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        uint64_t log_base, log_size;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;
        rte_vhost_get_negotiated_features(vid, &features);

        if (RTE_VHOST_NEED_LOG(features)) {
                rte_vhost_get_log_base(vid, &log_base, &log_size);
                rte_vfio_container_dma_map(internal->vfio_container_fd,
                                log_base, IFCVF_LOG_BASE, log_size);
                ifcvf_enable_logging(&internal->hw, IFCVF_LOG_BASE, log_size);
        }

        return 0;
}

static int
ifcvf_get_vfio_group_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_group_fd;
}

static int
ifcvf_get_vfio_device_fd(int vid)
{
        int did;
        struct internal_list *list;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        return list->internal->vfio_dev_fd;
}

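/*
 * Report where a queue's doorbell can be mmap'ed from the VFIO device
 * fd: the file offset of the notify BAR region plus the per-queue
 * notify offset, with a size of one 4K page.
 */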
static int
ifcvf_get_notify_area(int vid, int qid, uint64_t *offset, uint64_t *size)
{
        int did;
        struct internal_list *list;
        struct ifcvf_internal *internal;
        struct vfio_region_info reg = { .argsz = sizeof(reg) };
        int ret;

        did = rte_vhost_get_vdpa_device_id(vid);
        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        internal = list->internal;

        reg.index = ifcvf_get_notify_region(&internal->hw);
        ret = ioctl(internal->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);
        if (ret) {
                DRV_LOG(ERR, "Failed to get device region info: %s",
                                strerror(errno));
                return -1;
        }

        *offset = ifcvf_get_queue_notify_off(&internal->hw, qid) + reg.offset;
        *size = 0x1000;

        return 0;
}

static int
ifcvf_get_queue_num(int did, uint32_t *queue_num)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *queue_num = list->internal->max_queues;

        return 0;
}

static int
ifcvf_get_vdpa_features(int did, uint64_t *features)
{
        struct internal_list *list;

        list = find_internal_resource_by_did(did);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device id: %d", did);
                return -1;
        }

        *features = list->internal->features;

        return 0;
}

#define VDPA_SUPPORTED_PROTOCOL_FEATURES \
                (1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ | \
                 1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD | \
                 1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER | \
                 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD)
static int
ifcvf_get_protocol_features(int did __rte_unused, uint64_t *features)
{
        *features = VDPA_SUPPORTED_PROTOCOL_FEATURES;
        return 0;
}

static struct rte_vdpa_dev_ops ifcvf_ops = {
        .get_queue_num = ifcvf_get_queue_num,
        .get_features = ifcvf_get_vdpa_features,
        .get_protocol_features = ifcvf_get_protocol_features,
        .dev_conf = ifcvf_dev_config,
        .dev_close = ifcvf_dev_close,
        .set_vring_state = NULL,
        .set_features = ifcvf_set_features,
        .migration_done = NULL,
        .get_vfio_group_fd = ifcvf_get_vfio_group_fd,
        .get_vfio_device_fd = ifcvf_get_vfio_device_fd,
        .get_notify_area = ifcvf_get_notify_area,
};

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

static int
ifcvf_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
                struct rte_pci_device *pci_dev)
{
        uint64_t features;
        struct ifcvf_internal *internal = NULL;
        struct internal_list *list = NULL;
        int vdpa_mode = 0;
        int sw_fallback_lm = 0;
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        kvlist = rte_kvargs_parse(pci_dev->device.devargs->args,
                        ifcvf_valid_arguments);
        if (kvlist == NULL)
                return 1;

        /* probe only when vdpa mode is specified */
        if (rte_kvargs_count(kvlist, IFCVF_VDPA_MODE) == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        ret = rte_kvargs_process(kvlist, IFCVF_VDPA_MODE, &open_int,
                        &vdpa_mode);
        if (ret < 0 || vdpa_mode == 0) {
                rte_kvargs_free(kvlist);
                return 1;
        }

        list = rte_zmalloc("ifcvf", sizeof(*list), 0);
        if (list == NULL)
                goto error;

        internal = rte_zmalloc("ifcvf", sizeof(*internal), 0);
        if (internal == NULL)
                goto error;

        internal->pdev = pci_dev;
        rte_spinlock_init(&internal->lock);

        if (ifcvf_vfio_setup(internal) < 0) {
                DRV_LOG(ERR, "failed to setup device %s", pci_dev->name);
                goto error;
        }

        if (ifcvf_init_hw(&internal->hw, internal->pdev) < 0) {
                DRV_LOG(ERR, "failed to init device %s", pci_dev->name);
                goto error;
        }

        internal->max_queues = IFCVF_MAX_QUEUES;
        features = ifcvf_get_features(&internal->hw);
        internal->features = (features &
                ~(1ULL << VIRTIO_F_IOMMU_PLATFORM)) |
                (1ULL << VIRTIO_NET_F_GUEST_ANNOUNCE) |
                (1ULL << VIRTIO_NET_F_CTRL_VQ) |
                (1ULL << VIRTIO_NET_F_STATUS) |
                (1ULL << VHOST_USER_F_PROTOCOL_FEATURES) |
                (1ULL << VHOST_F_LOG_ALL);

        internal->dev_addr.pci_addr = pci_dev->addr;
        internal->dev_addr.type = PCI_ADDR;
        list->internal = internal;

        if (rte_kvargs_count(kvlist, IFCVF_SW_FALLBACK_LM)) {
                ret = rte_kvargs_process(kvlist, IFCVF_SW_FALLBACK_LM,
                                &open_int, &sw_fallback_lm);
                if (ret < 0)
                        goto error;
        }
        internal->sw_lm = sw_fallback_lm;

        internal->did = rte_vdpa_register_device(&internal->dev_addr,
                                &ifcvf_ops);
        if (internal->did < 0) {
                DRV_LOG(ERR, "failed to register device %s", pci_dev->name);
                goto error;
        }

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_atomic32_set(&internal->started, 1);
        update_datapath(internal);

        rte_kvargs_free(kvlist);
        return 0;

error:
        rte_kvargs_free(kvlist);
        rte_free(list);
        rte_free(internal);
        return -1;
}

static int
ifcvf_pci_remove(struct rte_pci_device *pci_dev)
{
        struct ifcvf_internal *internal;
        struct internal_list *list;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        list = find_internal_resource_by_dev(pci_dev);
        if (list == NULL) {
                DRV_LOG(ERR, "Invalid device: %s", pci_dev->name);
                return -1;
        }

        internal = list->internal;
        rte_atomic32_set(&internal->started, 0);
        update_datapath(internal);

        rte_pci_unmap_device(internal->pdev);
        rte_vfio_container_destroy(internal->vfio_container_fd);
        rte_vdpa_unregister_device(internal->did);

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_free(list);
        rte_free(internal);

        return 0;
}

/*
 * IFCVF has the same vendor ID and device ID as the virtio net PCI
 * device, but its own subsystem vendor ID and device ID.
 */
static const struct rte_pci_id pci_id_ifcvf_map[] = {
        { .class_id = RTE_CLASS_ANY_ID,
          .vendor_id = IFCVF_VENDOR_ID,
          .device_id = IFCVF_DEVICE_ID,
          .subsystem_vendor_id = IFCVF_SUBSYS_VENDOR_ID,
          .subsystem_device_id = IFCVF_SUBSYS_DEVICE_ID,
        },

        { .vendor_id = 0, /* sentinel */
        },
};

static struct rte_pci_driver rte_ifcvf_vdpa = {
        .id_table = pci_id_ifcvf_map,
        .drv_flags = 0,
        .probe = ifcvf_pci_probe,
        .remove = ifcvf_pci_remove,
};

RTE_PMD_REGISTER_PCI(net_ifcvf, rte_ifcvf_vdpa);
RTE_PMD_REGISTER_PCI_TABLE(net_ifcvf, pci_id_ifcvf_map);
RTE_PMD_REGISTER_KMOD_DEP(net_ifcvf, "* vfio-pci");

RTE_INIT(ifcvf_vdpa_init_log)
{
        ifcvf_vdpa_logtype = rte_log_register("pmd.net.ifcvf_vdpa");
        if (ifcvf_vdpa_logtype >= 0)
                rte_log_set_level(ifcvf_vdpa_logtype, RTE_LOG_NOTICE);
}