net/vhost: update license to SPDX format
[dpdk.git] drivers/net/vhost/rte_eth_vhost.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_memcpy.h>
14 #include <rte_bus_vdev.h>
15 #include <rte_kvargs.h>
16 #include <rte_vhost.h>
17 #include <rte_spinlock.h>
18
19 #include "rte_eth_vhost.h"
20
21 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
22
23 #define ETH_VHOST_IFACE_ARG             "iface"
24 #define ETH_VHOST_QUEUES_ARG            "queues"
25 #define ETH_VHOST_CLIENT_ARG            "client"
26 #define ETH_VHOST_DEQUEUE_ZERO_COPY     "dequeue-zero-copy"
27 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
28 #define VHOST_MAX_PKT_BURST 32
29
30 static const char *valid_arguments[] = {
31         ETH_VHOST_IFACE_ARG,
32         ETH_VHOST_QUEUES_ARG,
33         ETH_VHOST_CLIENT_ARG,
34         ETH_VHOST_DEQUEUE_ZERO_COPY,
35         ETH_VHOST_IOMMU_SUPPORT,
36         NULL
37 };
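/*
 * Illustrative invocation (an assumed example, not part of this file):
 * a port is created from EAL vdev arguments matching the keys above, e.g.
 *	--vdev 'net_vhost0,iface=/tmp/sock0,queues=2,client=1'
 * "iface" names the vhost-user socket, "queues" the number of queue pairs,
 * and the remaining keys toggle the corresponding RTE_VHOST_USER_*
 * registration flags (see rte_pmd_vhost_probe()).
 */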
38
39 static struct ether_addr base_eth_addr = {
40         .addr_bytes = {
41                 0x56 /* V */,
42                 0x48 /* H */,
43                 0x4F /* O */,
44                 0x53 /* S */,
45                 0x54 /* T */,
46                 0x00
47         }
48 };
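/*
 * The base MAC address spells "VHOST" in ASCII; the last byte is
 * overwritten with the ethdev port id in eth_dev_vhost_create(), so each
 * vhost port gets a distinct address.
 */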
49
50 enum vhost_xstats_pkts {
51         VHOST_UNDERSIZE_PKT = 0,
52         VHOST_64_PKT,
53         VHOST_65_TO_127_PKT,
54         VHOST_128_TO_255_PKT,
55         VHOST_256_TO_511_PKT,
56         VHOST_512_TO_1023_PKT,
57         VHOST_1024_TO_1522_PKT,
58         VHOST_1523_TO_MAX_PKT,
59         VHOST_BROADCAST_PKT,
60         VHOST_MULTICAST_PKT,
61         VHOST_UNICAST_PKT,
62         VHOST_ERRORS_PKT,
63         VHOST_ERRORS_FRAGMENTED,
64         VHOST_ERRORS_JABBER,
65         VHOST_UNKNOWN_PROTOCOL,
66         VHOST_XSTATS_MAX,
67 };
68
69 struct vhost_stats {
70         uint64_t pkts;
71         uint64_t bytes;
72         uint64_t missed_pkts;
73         uint64_t xstats[VHOST_XSTATS_MAX];
74 };
75
76 struct vhost_queue {
77         int vid;
78         rte_atomic32_t allow_queuing;
79         rte_atomic32_t while_queuing;
80         struct pmd_internal *internal;
81         struct rte_mempool *mb_pool;
82         uint16_t port;
83         uint16_t virtqueue_id;
84         struct vhost_stats stats;
85 };
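/*
 * allow_queuing/while_queuing implement a lightweight stop handshake:
 * the control path clears allow_queuing and then spins until
 * while_queuing drops back to zero (see update_queuing_status()), which
 * guarantees no rx/tx burst is still using the vhost device when the
 * port is stopped or the device is detached.
 */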
86
87 struct pmd_internal {
88         rte_atomic32_t dev_attached;
89         char *dev_name;
90         char *iface_name;
91         uint16_t max_queues;
92         uint16_t vid;
93         rte_atomic32_t started;
94         uint8_t vlan_strip;
95 };
96
97 struct internal_list {
98         TAILQ_ENTRY(internal_list) next;
99         struct rte_eth_dev *eth_dev;
100 };
101
102 TAILQ_HEAD(internal_list_head, internal_list);
103 static struct internal_list_head internal_list =
104         TAILQ_HEAD_INITIALIZER(internal_list);
105
106 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
107
108 static struct rte_eth_link pmd_link = {
109                 .link_speed = 10000,
110                 .link_duplex = ETH_LINK_FULL_DUPLEX,
111                 .link_status = ETH_LINK_DOWN
112 };
113
114 struct rte_vhost_vring_state {
115         rte_spinlock_t lock;
116
117         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
118         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
119         unsigned int index;
120         unsigned int max_vring;
121 };
122
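/*
 * Per-port vring state consumed by rte_eth_vhost_get_queue_event():
 * "cur" holds the latest enable/disable state reported through
 * vring_state_changed(), "seen" what the application has already been
 * told.  Vrings use virtio numbering, so even indexes are guest RX rings
 * (this PMD's TX queues) and odd indexes are guest TX rings (this PMD's
 * RX queues); hence queue_id = idx / 2 and rx = idx & 1 in the event path.
 */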
123 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
124
125 #define VHOST_XSTATS_NAME_SIZE 64
126
127 struct vhost_xstats_name_off {
128         char name[VHOST_XSTATS_NAME_SIZE];
129         uint64_t offset;
130 };
131
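/*
 * Each xstat is described by its display name and the byte offset of the
 * backing counter inside struct vhost_queue; vhost_dev_xstats_get() adds
 * that offset to the queue pointer and reads a uint64_t, so every entry
 * below must reference a 64-bit counter of struct vhost_queue.
 */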
132 /* [rx]_ is prepended to the name string here */
133 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
134         {"good_packets",
135          offsetof(struct vhost_queue, stats.pkts)},
136         {"total_bytes",
137          offsetof(struct vhost_queue, stats.bytes)},
138         {"missed_pkts",
139          offsetof(struct vhost_queue, stats.missed_pkts)},
140         {"broadcast_packets",
141          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
142         {"multicast_packets",
143          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
144         {"unicast_packets",
145          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
146         {"undersize_packets",
147          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
148         {"size_64_packets",
149          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
150         {"size_65_to_127_packets",
151          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
152         {"size_128_to_255_packets",
153          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
154         {"size_256_to_511_packets",
155          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
156         {"size_512_to_1023_packets",
157          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
158         {"size_1024_to_1522_packets",
159          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
160         {"size_1523_to_max_packets",
161          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
162         {"errors_with_bad_CRC",
163          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
164         {"fragmented_errors",
165          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
166         {"jabber_errors",
167          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
168         {"unknown_protos_packets",
169          offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
170 };
171
172 /* [tx]_ is prepended to the name string here */
173 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
174         {"good_packets",
175          offsetof(struct vhost_queue, stats.pkts)},
176         {"total_bytes",
177          offsetof(struct vhost_queue, stats.bytes)},
178         {"missed_pkts",
179          offsetof(struct vhost_queue, stats.missed_pkts)},
180         {"broadcast_packets",
181          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
182         {"multicast_packets",
183          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
184         {"unicast_packets",
185          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
186         {"undersize_packets",
187          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
188         {"size_64_packets",
189          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
190         {"size_65_to_127_packets",
191          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
192         {"size_128_to_255_packets",
193          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
194         {"size_256_to_511_packets",
195          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
196         {"size_512_to_1023_packets",
197          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
198         {"size_1024_to_1522_packets",
199          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
200         {"size_1523_to_max_packets",
201          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
202         {"errors_with_bad_CRC",
203          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
204 };
205
206 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
207                                 sizeof(vhost_rxport_stat_strings[0]))
208
209 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
210                                 sizeof(vhost_txport_stat_strings[0]))
211
212 static void
213 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
214 {
215         struct vhost_queue *vq = NULL;
216         unsigned int i = 0;
217
218         for (i = 0; i < dev->data->nb_rx_queues; i++) {
219                 vq = dev->data->rx_queues[i];
220                 if (!vq)
221                         continue;
222                 memset(&vq->stats, 0, sizeof(vq->stats));
223         }
224         for (i = 0; i < dev->data->nb_tx_queues; i++) {
225                 vq = dev->data->tx_queues[i];
226                 if (!vq)
227                         continue;
228                 memset(&vq->stats, 0, sizeof(vq->stats));
229         }
230 }
231
232 static int
233 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
234                            struct rte_eth_xstat_name *xstats_names,
235                            unsigned int limit __rte_unused)
236 {
237         unsigned int t = 0;
238         int count = 0;
239         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
240
241         if (!xstats_names)
242                 return nstats;
243         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
244                 snprintf(xstats_names[count].name,
245                          sizeof(xstats_names[count].name),
246                          "rx_%s", vhost_rxport_stat_strings[t].name);
247                 count++;
248         }
249         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
250                 snprintf(xstats_names[count].name,
251                          sizeof(xstats_names[count].name),
252                          "tx_%s", vhost_txport_stat_strings[t].name);
253                 count++;
254         }
255         return count;
256 }
257
258 static int
259 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
260                      unsigned int n)
261 {
262         unsigned int i;
263         unsigned int t;
264         unsigned int count = 0;
265         struct vhost_queue *vq = NULL;
266         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
267
268         if (n < nxstats)
269                 return nxstats;
270
271         for (i = 0; i < dev->data->nb_rx_queues; i++) {
272                 vq = dev->data->rx_queues[i];
273                 if (!vq)
274                         continue;
275                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
276                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
277                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
278         }
279         for (i = 0; i < dev->data->nb_tx_queues; i++) {
280                 vq = dev->data->tx_queues[i];
281                 if (!vq)
282                         continue;
283                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
284                                 + vq->stats.missed_pkts
285                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
286                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
287         }
288         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
289                 xstats[count].value = 0;
290                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
291                         vq = dev->data->rx_queues[i];
292                         if (!vq)
293                                 continue;
294                         xstats[count].value +=
295                                 *(uint64_t *)(((char *)vq)
296                                 + vhost_rxport_stat_strings[t].offset);
297                 }
298                 xstats[count].id = count;
299                 count++;
300         }
301         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
302                 xstats[count].value = 0;
303                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
304                         vq = dev->data->tx_queues[i];
305                         if (!vq)
306                                 continue;
307                         xstats[count].value +=
308                                 *(uint64_t *)(((char *)vq)
309                                 + vhost_txport_stat_strings[t].offset);
310                 }
311                 xstats[count].id = count;
312                 count++;
313         }
314         return count;
315 }
316
317 static inline void
318 vhost_count_multicast_broadcast(struct vhost_queue *vq,
319                                 struct rte_mbuf *mbuf)
320 {
321         struct ether_addr *ea = NULL;
322         struct vhost_stats *pstats = &vq->stats;
323
324         ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
325         if (is_multicast_ether_addr(ea)) {
326                 if (is_broadcast_ether_addr(ea))
327                         pstats->xstats[VHOST_BROADCAST_PKT]++;
328                 else
329                         pstats->xstats[VHOST_MULTICAST_PKT]++;
330         }
331 }
332
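/*
 * For 65..1023 byte packets the size bucket is derived from the position
 * of the most significant bit of pkt_len: 32 - clz(len) equals
 * floor(log2(len)) + 1, so subtracting 5 maps 65..127 to
 * VHOST_65_TO_127_PKT (index 2), 128..255 to index 3, 256..511 to
 * index 4 and 512..1023 to index 5.
 */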
333 static void
334 vhost_update_packet_xstats(struct vhost_queue *vq,
335                            struct rte_mbuf **bufs,
336                            uint16_t count)
337 {
338         uint32_t pkt_len = 0;
339         uint64_t i = 0;
340         uint64_t index;
341         struct vhost_stats *pstats = &vq->stats;
342
343         for (i = 0; i < count ; i++) {
344                 pkt_len = bufs[i]->pkt_len;
345                 if (pkt_len == 64) {
346                         pstats->xstats[VHOST_64_PKT]++;
347                 } else if (pkt_len > 64 && pkt_len < 1024) {
348                         index = (sizeof(pkt_len) * 8)
349                                 - __builtin_clz(pkt_len) - 5;
350                         pstats->xstats[index]++;
351                 } else {
352                         if (pkt_len < 64)
353                                 pstats->xstats[VHOST_UNDERSIZE_PKT]++;
354                         else if (pkt_len <= 1522)
355                                 pstats->xstats[VHOST_1024_TO_1522_PKT]++;
356                         else if (pkt_len > 1522)
357                                 pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
358                 }
359                 vhost_count_multicast_broadcast(vq, bufs[i]);
360         }
361 }
362
363 static uint16_t
364 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
365 {
366         struct vhost_queue *r = q;
367         uint16_t i, nb_rx = 0;
368         uint16_t nb_receive = nb_bufs;
369
370         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
371                 return 0;
372
373         rte_atomic32_set(&r->while_queuing, 1);
374
375         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
376                 goto out;
377
378         /* Dequeue packets from guest TX queue */
379         while (nb_receive) {
380                 uint16_t nb_pkts;
381                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
382                                                  VHOST_MAX_PKT_BURST);
383
384                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
385                                                   r->mb_pool, &bufs[nb_rx],
386                                                   num);
387
388                 nb_rx += nb_pkts;
389                 nb_receive -= nb_pkts;
390                 if (nb_pkts < num)
391                         break;
392         }
393
394         r->stats.pkts += nb_rx;
395
396         for (i = 0; likely(i < nb_rx); i++) {
397                 bufs[i]->port = r->port;
398                 bufs[i]->ol_flags = 0;
399                 bufs[i]->vlan_tci = 0;
400
401                 if (r->internal->vlan_strip)
402                         rte_vlan_strip(bufs[i]);
403
404                 r->stats.bytes += bufs[i]->pkt_len;
405         }
406
407         vhost_update_packet_xstats(r, bufs, nb_rx);
408
409 out:
410         rte_atomic32_set(&r->while_queuing, 0);
411
412         return nb_rx;
413 }
414
415 static uint16_t
416 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
417 {
418         struct vhost_queue *r = q;
419         uint16_t i, nb_tx = 0;
420         uint16_t nb_send = 0;
421
422         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
423                 return 0;
424
425         rte_atomic32_set(&r->while_queuing, 1);
426
427         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
428                 goto out;
429
430         for (i = 0; i < nb_bufs; i++) {
431                 struct rte_mbuf *m = bufs[i];
432
433                 /* Do VLAN tag insertion */
434                 if (m->ol_flags & PKT_TX_VLAN_PKT) {
435                         int error = rte_vlan_insert(&m);
436                         if (unlikely(error)) {
437                                 rte_pktmbuf_free(m);
438                                 continue;
439                         }
440                 }
441
442                 bufs[nb_send] = m;
443                 ++nb_send;
444         }
445
446         /* Enqueue packets to guest RX queue */
447         while (nb_send) {
448                 uint16_t nb_pkts;
449                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
450                                                  VHOST_MAX_PKT_BURST);
451
452                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
453                                                   &bufs[nb_tx], num);
454
455                 nb_tx += nb_pkts;
456                 nb_send -= nb_pkts;
457                 if (nb_pkts < num)
458                         break;
459         }
460
461         r->stats.pkts += nb_tx;
462         r->stats.missed_pkts += nb_bufs - nb_tx;
463
464         for (i = 0; likely(i < nb_tx); i++)
465                 r->stats.bytes += bufs[i]->pkt_len;
466
467         vhost_update_packet_xstats(r, bufs, nb_tx);
468
469         /* According to RFC 2863 (ifHCOutMulticastPkts and
470          * ifHCOutBroadcastPkts), the "multicast" and "broadcast" counters
471          * must also count packets that were not transmitted successfully.
472          */
473         for (i = nb_tx; i < nb_bufs; i++)
474                 vhost_count_multicast_broadcast(r, bufs[i]);
475
476         for (i = 0; likely(i < nb_tx); i++)
477                 rte_pktmbuf_free(bufs[i]);
478 out:
479         rte_atomic32_set(&r->while_queuing, 0);
480
481         return nb_tx;
482 }
483
484 static int
485 eth_dev_configure(struct rte_eth_dev *dev)
486 {
487         struct pmd_internal *internal = dev->data->dev_private;
488         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
489
490         internal->vlan_strip = rxmode->hw_vlan_strip;
491
492         if (rxmode->hw_vlan_filter)
493                 RTE_LOG(WARNING, PMD,
494                         "vhost(%s): vlan filtering not available\n",
495                         internal->dev_name);
496
497         return 0;
498 }
499
500 static inline struct internal_list *
501 find_internal_resource(char *ifname)
502 {
503         int found = 0;
504         struct internal_list *list;
505         struct pmd_internal *internal;
506
507         if (ifname == NULL)
508                 return NULL;
509
510         pthread_mutex_lock(&internal_list_lock);
511
512         TAILQ_FOREACH(list, &internal_list, next) {
513                 internal = list->eth_dev->data->dev_private;
514                 if (!strcmp(internal->iface_name, ifname)) {
515                         found = 1;
516                         break;
517                 }
518         }
519
520         pthread_mutex_unlock(&internal_list_lock);
521
522         if (!found)
523                 return NULL;
524
525         return list;
526 }
527
528 static int
529 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
530 {
531         struct rte_vhost_vring vring;
532         struct vhost_queue *vq;
533         int ret = 0;
534
535         vq = dev->data->rx_queues[qid];
536         if (!vq) {
537                 RTE_LOG(ERR, PMD, "rxq%d is not set up yet\n", qid);
538                 return -1;
539         }
540
541         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
542         if (ret < 0) {
543                 RTE_LOG(ERR, PMD, "Failed to get rxq%d's vring\n", qid);
544                 return ret;
545         }
546         RTE_LOG(INFO, PMD, "Enable interrupt for rxq%d\n", qid);
547         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
548         rte_wmb();
549
550         return ret;
551 }
552
553 static int
554 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
555 {
556         struct rte_vhost_vring vring;
557         struct vhost_queue *vq;
558         int ret = 0;
559
560         vq = dev->data->rx_queues[qid];
561         if (!vq) {
562                 RTE_LOG(ERR, PMD, "rxq%d is not set up yet\n", qid);
563                 return -1;
564         }
565
566         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
567         if (ret < 0) {
568                 RTE_LOG(ERR, PMD, "Failed to get rxq%d's vring\n", qid);
569                 return ret;
570         }
571         RTE_LOG(INFO, PMD, "Disable interrupt for rxq%d\n", qid);
572         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
573         rte_wmb();
574
575         return 0;
576 }
577
578 static void
579 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
580 {
581         struct rte_intr_handle *intr_handle = dev->intr_handle;
582
583         if (intr_handle) {
584                 if (intr_handle->intr_vec)
585                         free(intr_handle->intr_vec);
586                 free(intr_handle);
587         }
588
589         dev->intr_handle = NULL;
590 }
591
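/*
 * Rx interrupt support: each RX queue's vring kickfd is exported as an
 * eventfd in dev->intr_handle, so waiting on the ethdev Rx interrupt
 * vector wakes the application up whenever the guest kicks that vring.
 */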
592 static int
593 eth_vhost_install_intr(struct rte_eth_dev *dev)
594 {
595         struct rte_vhost_vring vring;
596         struct vhost_queue *vq;
597         int count = 0;
598         int nb_rxq = dev->data->nb_rx_queues;
599         int i;
600         int ret;
601
602         /* uninstall first if we are reconnecting */
603         if (dev->intr_handle)
604                 eth_vhost_uninstall_intr(dev);
605
606         dev->intr_handle = malloc(sizeof(*dev->intr_handle));
607         if (!dev->intr_handle) {
608                 RTE_LOG(ERR, PMD, "Failed to allocate intr_handle\n");
609                 return -ENOMEM;
610         }
611         memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
612
613         dev->intr_handle->efd_counter_size = sizeof(uint64_t);
614
615         dev->intr_handle->intr_vec =
616                 malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
617
618         if (!dev->intr_handle->intr_vec) {
619                 RTE_LOG(ERR, PMD,
620                         "Failed to allocate memory for interrupt vector\n");
621                 free(dev->intr_handle);
622                 return -ENOMEM;
623         }
624
625         RTE_LOG(INFO, PMD, "Prepare intr vec\n");
626         for (i = 0; i < nb_rxq; i++) {
627                 vq = dev->data->rx_queues[i];
628                 if (!vq) {
629                         RTE_LOG(INFO, PMD, "rxq-%d not set up yet, skip!\n", i);
630                         continue;
631                 }
632
633                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
634                 if (ret < 0) {
635                         RTE_LOG(INFO, PMD,
636                                 "Failed to get rxq-%d's vring, skip!\n", i);
637                         continue;
638                 }
639
640                 if (vring.kickfd < 0) {
641                         RTE_LOG(INFO, PMD,
642                                 "rxq-%d's kickfd is invalid, skip!\n", i);
643                         continue;
644                 }
645                 dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
646                 dev->intr_handle->efds[i] = vring.kickfd;
647                 count++;
648                 RTE_LOG(INFO, PMD, "Installed intr vec for rxq-%d\n", i);
649         }
650
651         dev->intr_handle->nb_efd = count;
652         dev->intr_handle->max_intr = count + 1;
653         dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
654
655         return 0;
656 }
657
658 static void
659 update_queuing_status(struct rte_eth_dev *dev)
660 {
661         struct pmd_internal *internal = dev->data->dev_private;
662         struct vhost_queue *vq;
663         unsigned int i;
664         int allow_queuing = 1;
665
666         if (!dev->data->rx_queues || !dev->data->tx_queues)
667                 return;
668
669         if (rte_atomic32_read(&internal->started) == 0 ||
670             rte_atomic32_read(&internal->dev_attached) == 0)
671                 allow_queuing = 0;
672
673         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
674         for (i = 0; i < dev->data->nb_rx_queues; i++) {
675                 vq = dev->data->rx_queues[i];
676                 if (vq == NULL)
677                         continue;
678                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
679                 while (rte_atomic32_read(&vq->while_queuing))
680                         rte_pause();
681         }
682
683         for (i = 0; i < dev->data->nb_tx_queues; i++) {
684                 vq = dev->data->tx_queues[i];
685                 if (vq == NULL)
686                         continue;
687                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
688                 while (rte_atomic32_read(&vq->while_queuing))
689                         rte_pause();
690         }
691 }
692
693 static void
694 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
695 {
696         struct vhost_queue *vq;
697         int i;
698
699         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
700                 vq = eth_dev->data->rx_queues[i];
701                 if (!vq)
702                         continue;
703                 vq->vid = internal->vid;
704                 vq->internal = internal;
705                 vq->port = eth_dev->data->port_id;
706         }
707         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
708                 vq = eth_dev->data->tx_queues[i];
709                 if (!vq)
710                         continue;
711                 vq->vid = internal->vid;
712                 vq->internal = internal;
713                 vq->port = eth_dev->data->port_id;
714         }
715 }
716
717 static int
718 new_device(int vid)
719 {
720         struct rte_eth_dev *eth_dev;
721         struct internal_list *list;
722         struct pmd_internal *internal;
723         struct rte_eth_conf *dev_conf;
724         unsigned i;
725         char ifname[PATH_MAX];
726 #ifdef RTE_LIBRTE_VHOST_NUMA
727         int newnode;
728 #endif
729
730         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
731         list = find_internal_resource(ifname);
732         if (list == NULL) {
733                 RTE_LOG(INFO, PMD, "Invalid device name: %s\n", ifname);
734                 return -1;
735         }
736
737         eth_dev = list->eth_dev;
738         internal = eth_dev->data->dev_private;
739         dev_conf = &eth_dev->data->dev_conf;
740
741 #ifdef RTE_LIBRTE_VHOST_NUMA
742         newnode = rte_vhost_get_numa_node(vid);
743         if (newnode >= 0)
744                 eth_dev->data->numa_node = newnode;
745 #endif
746
747         internal->vid = vid;
748         if (rte_atomic32_read(&internal->started) == 1) {
749                 queue_setup(eth_dev, internal);
750
751                 if (dev_conf->intr_conf.rxq) {
752                         if (eth_vhost_install_intr(eth_dev) < 0) {
753                                 RTE_LOG(INFO, PMD,
754                                         "Failed to install interrupt handler.\n");
755                                 return -1;
756                         }
757                 }
758         } else {
759                 RTE_LOG(INFO, PMD, "RX/TX queues do not exist yet\n");
760         }
761
762         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
763                 rte_vhost_enable_guest_notification(vid, i, 0);
764
765         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
766
767         eth_dev->data->dev_link.link_status = ETH_LINK_UP;
768
769         rte_atomic32_set(&internal->dev_attached, 1);
770         update_queuing_status(eth_dev);
771
772         RTE_LOG(INFO, PMD, "Vhost device %d created\n", vid);
773
774         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
775
776         return 0;
777 }
778
779 static void
780 destroy_device(int vid)
781 {
782         struct rte_eth_dev *eth_dev;
783         struct pmd_internal *internal;
784         struct vhost_queue *vq;
785         struct internal_list *list;
786         char ifname[PATH_MAX];
787         unsigned i;
788         struct rte_vhost_vring_state *state;
789
790         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
791         list = find_internal_resource(ifname);
792         if (list == NULL) {
793                 RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
794                 return;
795         }
796         eth_dev = list->eth_dev;
797         internal = eth_dev->data->dev_private;
798
799         rte_atomic32_set(&internal->dev_attached, 0);
800         update_queuing_status(eth_dev);
801
802         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
803
804         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
805                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
806                         vq = eth_dev->data->rx_queues[i];
807                         if (!vq)
808                                 continue;
809                         vq->vid = -1;
810                 }
811                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
812                         vq = eth_dev->data->tx_queues[i];
813                         if (!vq)
814                                 continue;
815                         vq->vid = -1;
816                 }
817         }
818
819         state = vring_states[eth_dev->data->port_id];
820         rte_spinlock_lock(&state->lock);
821         for (i = 0; i <= state->max_vring; i++) {
822                 state->cur[i] = false;
823                 state->seen[i] = false;
824         }
825         state->max_vring = 0;
826         rte_spinlock_unlock(&state->lock);
827
828         RTE_LOG(INFO, PMD, "Vhost device %d destroyed\n", vid);
829         eth_vhost_uninstall_intr(eth_dev);
830
831         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
832 }
833
834 static int
835 vring_state_changed(int vid, uint16_t vring, int enable)
836 {
837         struct rte_vhost_vring_state *state;
838         struct rte_eth_dev *eth_dev;
839         struct internal_list *list;
840         char ifname[PATH_MAX];
841
842         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
843         list = find_internal_resource(ifname);
844         if (list == NULL) {
845                 RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
846                 return -1;
847         }
848
849         eth_dev = list->eth_dev;
850         /* won't be NULL */
851         state = vring_states[eth_dev->data->port_id];
852         rte_spinlock_lock(&state->lock);
853         state->cur[vring] = enable;
854         state->max_vring = RTE_MAX(vring, state->max_vring);
855         rte_spinlock_unlock(&state->lock);
856
857         RTE_LOG(INFO, PMD, "vring%u is %s\n",
858                         vring, enable ? "enabled" : "disabled");
859
860         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
861
862         return 0;
863 }
864
865 static struct vhost_device_ops vhost_ops = {
866         .new_device          = new_device,
867         .destroy_device      = destroy_device,
868         .vring_state_changed = vring_state_changed,
869 };
870
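/*
 * Illustrative polling loop (assumed application code, not part of this
 * driver), typically run from a RTE_ETH_EVENT_QUEUE_STATE callback:
 *
 *	struct rte_eth_vhost_queue_event ev;
 *
 *	while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *		printf("%s queue %u %s\n", ev.rx ? "rx" : "tx",
 *		       ev.queue_id, ev.enable ? "enabled" : "disabled");
 */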
871 int
872 rte_eth_vhost_get_queue_event(uint16_t port_id,
873                 struct rte_eth_vhost_queue_event *event)
874 {
875         struct rte_vhost_vring_state *state;
876         unsigned int i;
877         int idx;
878
879         if (port_id >= RTE_MAX_ETHPORTS) {
880                 RTE_LOG(ERR, PMD, "Invalid port id\n");
881                 return -1;
882         }
883
884         state = vring_states[port_id];
885         if (!state) {
886                 RTE_LOG(ERR, PMD, "Unused port\n");
887                 return -1;
888         }
889
890         rte_spinlock_lock(&state->lock);
891         for (i = 0; i <= state->max_vring; i++) {
892                 idx = state->index++ % (state->max_vring + 1);
893
894                 if (state->cur[idx] != state->seen[idx]) {
895                         state->seen[idx] = state->cur[idx];
896                         event->queue_id = idx / 2;
897                         event->rx = idx & 1;
898                         event->enable = state->cur[idx];
899                         rte_spinlock_unlock(&state->lock);
900                         return 0;
901                 }
902         }
903         rte_spinlock_unlock(&state->lock);
904
905         return -1;
906 }
907
908 int
909 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
910 {
911         struct internal_list *list;
912         struct rte_eth_dev *eth_dev;
913         struct vhost_queue *vq;
914         int vid = -1;
915
916         if (!rte_eth_dev_is_valid_port(port_id))
917                 return -1;
918
919         pthread_mutex_lock(&internal_list_lock);
920
921         TAILQ_FOREACH(list, &internal_list, next) {
922                 eth_dev = list->eth_dev;
923                 if (eth_dev->data->port_id == port_id) {
924                         vq = eth_dev->data->rx_queues[0];
925                         if (vq) {
926                                 vid = vq->vid;
927                         }
928                         break;
929                 }
930         }
931
932         pthread_mutex_unlock(&internal_list_lock);
933
934         return vid;
935 }
936
937 static int
938 eth_dev_start(struct rte_eth_dev *eth_dev)
939 {
940         struct pmd_internal *internal = eth_dev->data->dev_private;
941         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
942
943         queue_setup(eth_dev, internal);
944
945         if (rte_atomic32_read(&internal->dev_attached) == 1) {
946                 if (dev_conf->intr_conf.rxq) {
947                         if (eth_vhost_install_intr(eth_dev) < 0) {
948                                 RTE_LOG(INFO, PMD,
949                                         "Failed to install interrupt handler.\n");
950                                 return -1;
951                         }
952                 }
953         }
954
955         rte_atomic32_set(&internal->started, 1);
956         update_queuing_status(eth_dev);
957
958         return 0;
959 }
960
961 static void
962 eth_dev_stop(struct rte_eth_dev *dev)
963 {
964         struct pmd_internal *internal = dev->data->dev_private;
965
966         rte_atomic32_set(&internal->started, 0);
967         update_queuing_status(dev);
968 }
969
970 static void
971 eth_dev_close(struct rte_eth_dev *dev)
972 {
973         struct pmd_internal *internal;
974         struct internal_list *list;
975         unsigned int i;
976
977         internal = dev->data->dev_private;
978         if (!internal)
979                 return;
980
981         eth_dev_stop(dev);
982
983         rte_vhost_driver_unregister(internal->iface_name);
984
985         list = find_internal_resource(internal->iface_name);
986         if (!list)
987                 return;
988
989         pthread_mutex_lock(&internal_list_lock);
990         TAILQ_REMOVE(&internal_list, list, next);
991         pthread_mutex_unlock(&internal_list_lock);
992         rte_free(list);
993
994         if (dev->data->rx_queues)
995                 for (i = 0; i < dev->data->nb_rx_queues; i++)
996                         rte_free(dev->data->rx_queues[i]);
997
998         if (dev->data->tx_queues)
999                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1000                         rte_free(dev->data->tx_queues[i]);
1001
1002         rte_free(dev->data->mac_addrs);
1003         free(internal->dev_name);
1004         free(internal->iface_name);
1005         rte_free(internal);
1006
1007         dev->data->dev_private = NULL;
1008 }
1009
1010 static int
1011 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1012                    uint16_t nb_rx_desc __rte_unused,
1013                    unsigned int socket_id,
1014                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1015                    struct rte_mempool *mb_pool)
1016 {
1017         struct vhost_queue *vq;
1018
1019         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1020                         RTE_CACHE_LINE_SIZE, socket_id);
1021         if (vq == NULL) {
1022                 RTE_LOG(ERR, PMD, "Failed to allocate memory for rx queue\n");
1023                 return -ENOMEM;
1024         }
1025
1026         vq->mb_pool = mb_pool;
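        /* ethdev RX queue N reads from the guest's TX virtqueue (2N + 1) */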
1027         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1028         dev->data->rx_queues[rx_queue_id] = vq;
1029
1030         return 0;
1031 }
1032
1033 static int
1034 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1035                    uint16_t nb_tx_desc __rte_unused,
1036                    unsigned int socket_id,
1037                    const struct rte_eth_txconf *tx_conf __rte_unused)
1038 {
1039         struct vhost_queue *vq;
1040
1041         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1042                         RTE_CACHE_LINE_SIZE, socket_id);
1043         if (vq == NULL) {
1044                 RTE_LOG(ERR, PMD, "Failed to allocate memory for tx queue\n");
1045                 return -ENOMEM;
1046         }
1047
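        /* ethdev TX queue N writes to the guest's RX virtqueue (2N) */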
1048         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1049         dev->data->tx_queues[tx_queue_id] = vq;
1050
1051         return 0;
1052 }
1053
1054 static void
1055 eth_dev_info(struct rte_eth_dev *dev,
1056              struct rte_eth_dev_info *dev_info)
1057 {
1058         struct pmd_internal *internal;
1059
1060         internal = dev->data->dev_private;
1061         if (internal == NULL) {
1062                 RTE_LOG(ERR, PMD, "Invalid device specified\n");
1063                 return;
1064         }
1065
1066         dev_info->max_mac_addrs = 1;
1067         dev_info->max_rx_pktlen = (uint32_t)-1;
1068         dev_info->max_rx_queues = internal->max_queues;
1069         dev_info->max_tx_queues = internal->max_queues;
1070         dev_info->min_rx_bufsize = 0;
1071 }
1072
1073 static int
1074 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1075 {
1076         unsigned i;
1077         unsigned long rx_total = 0, tx_total = 0, tx_missed_total = 0;
1078         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1079         struct vhost_queue *vq;
1080
1081         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1082                         i < dev->data->nb_rx_queues; i++) {
1083                 if (dev->data->rx_queues[i] == NULL)
1084                         continue;
1085                 vq = dev->data->rx_queues[i];
1086                 stats->q_ipackets[i] = vq->stats.pkts;
1087                 rx_total += stats->q_ipackets[i];
1088
1089                 stats->q_ibytes[i] = vq->stats.bytes;
1090                 rx_total_bytes += stats->q_ibytes[i];
1091         }
1092
1093         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1094                         i < dev->data->nb_tx_queues; i++) {
1095                 if (dev->data->tx_queues[i] == NULL)
1096                         continue;
1097                 vq = dev->data->tx_queues[i];
1098                 stats->q_opackets[i] = vq->stats.pkts;
1099                 tx_missed_total += vq->stats.missed_pkts;
1100                 tx_total += stats->q_opackets[i];
1101
1102                 stats->q_obytes[i] = vq->stats.bytes;
1103                 tx_total_bytes += stats->q_obytes[i];
1104         }
1105
1106         stats->ipackets = rx_total;
1107         stats->opackets = tx_total;
1108         stats->oerrors = tx_missed_total;
1109         stats->ibytes = rx_total_bytes;
1110         stats->obytes = tx_total_bytes;
1111
1112         return 0;
1113 }
1114
1115 static void
1116 eth_stats_reset(struct rte_eth_dev *dev)
1117 {
1118         struct vhost_queue *vq;
1119         unsigned i;
1120
1121         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1122                 if (dev->data->rx_queues[i] == NULL)
1123                         continue;
1124                 vq = dev->data->rx_queues[i];
1125                 vq->stats.pkts = 0;
1126                 vq->stats.bytes = 0;
1127         }
1128         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1129                 if (dev->data->tx_queues[i] == NULL)
1130                         continue;
1131                 vq = dev->data->tx_queues[i];
1132                 vq->stats.pkts = 0;
1133                 vq->stats.bytes = 0;
1134                 vq->stats.missed_pkts = 0;
1135         }
1136 }
1137
1138 static void
1139 eth_queue_release(void *q)
1140 {
1141         rte_free(q);
1142 }
1143
1144 static int
1145 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1146 {
1147         /*
1148          * vHost does not hang onto mbufs: eth_vhost_tx() copies the packet
1149          * data and frees the mbuf, so there is nothing to clean up.
1150          */
1151         return 0;
1152 }
1153
1154 static int
1155 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1156                 int wait_to_complete __rte_unused)
1157 {
1158         return 0;
1159 }
1160
1161 static uint32_t
1162 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1163 {
1164         struct vhost_queue *vq;
1165
1166         vq = dev->data->rx_queues[rx_queue_id];
1167         if (vq == NULL)
1168                 return 0;
1169
1170         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1171 }
1172
1173 static const struct eth_dev_ops ops = {
1174         .dev_start = eth_dev_start,
1175         .dev_stop = eth_dev_stop,
1176         .dev_close = eth_dev_close,
1177         .dev_configure = eth_dev_configure,
1178         .dev_infos_get = eth_dev_info,
1179         .rx_queue_setup = eth_rx_queue_setup,
1180         .tx_queue_setup = eth_tx_queue_setup,
1181         .rx_queue_release = eth_queue_release,
1182         .tx_queue_release = eth_queue_release,
1183         .tx_done_cleanup = eth_tx_done_cleanup,
1184         .rx_queue_count = eth_rx_queue_count,
1185         .link_update = eth_link_update,
1186         .stats_get = eth_stats_get,
1187         .stats_reset = eth_stats_reset,
1188         .xstats_reset = vhost_dev_xstats_reset,
1189         .xstats_get = vhost_dev_xstats_get,
1190         .xstats_get_names = vhost_dev_xstats_get_names,
1191         .rx_queue_intr_enable = eth_rxq_intr_enable,
1192         .rx_queue_intr_disable = eth_rxq_intr_disable,
1193 };
1194
1195 static struct rte_vdev_driver pmd_vhost_drv;
1196
1197 static int
1198 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1199         int16_t queues, const unsigned int numa_node, uint64_t flags)
1200 {
1201         const char *name = rte_vdev_device_name(dev);
1202         struct rte_eth_dev_data *data;
1203         struct pmd_internal *internal = NULL;
1204         struct rte_eth_dev *eth_dev = NULL;
1205         struct ether_addr *eth_addr = NULL;
1206         struct rte_vhost_vring_state *vring_state = NULL;
1207         struct internal_list *list = NULL;
1208
1209         RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa socket %u\n",
1210                 numa_node);
1211
1212         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
1213         if (list == NULL)
1214                 goto error;
1215
1216         /* reserve an ethdev entry */
1217         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1218         if (eth_dev == NULL)
1219                 goto error;
1220
1221         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1222         if (eth_addr == NULL)
1223                 goto error;
1224         *eth_addr = base_eth_addr;
1225         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1226
1227         vring_state = rte_zmalloc_socket(name,
1228                         sizeof(*vring_state), 0, numa_node);
1229         if (vring_state == NULL)
1230                 goto error;
1231
1232         /* now put it all together:
1233          * - store the device and interface names in internal,
1234          * - add the new ethdev to the internal device list,
1235          * - and set up the per-port vring state
1236          */
1237         internal = eth_dev->data->dev_private;
1238         internal->dev_name = strdup(name);
1239         if (internal->dev_name == NULL)
1240                 goto error;
1241         internal->iface_name = strdup(iface_name);
1242         if (internal->iface_name == NULL)
1243                 goto error;
1244
1245         list->eth_dev = eth_dev;
1246         pthread_mutex_lock(&internal_list_lock);
1247         TAILQ_INSERT_TAIL(&internal_list, list, next);
1248         pthread_mutex_unlock(&internal_list_lock);
1249
1250         rte_spinlock_init(&vring_state->lock);
1251         vring_states[eth_dev->data->port_id] = vring_state;
1252
1253         data = eth_dev->data;
1254         data->nb_rx_queues = queues;
1255         data->nb_tx_queues = queues;
1256         internal->max_queues = queues;
1257         data->dev_link = pmd_link;
1258         data->mac_addrs = eth_addr;
1259         data->dev_flags = RTE_ETH_DEV_INTR_LSC;
1260
1261         eth_dev->dev_ops = &ops;
1262
1263         /* finally assign rx and tx ops */
1264         eth_dev->rx_pkt_burst = eth_vhost_rx;
1265         eth_dev->tx_pkt_burst = eth_vhost_tx;
1266
1267         if (rte_vhost_driver_register(iface_name, flags))
1268                 goto error;
1269
1270         if (rte_vhost_driver_callback_register(iface_name, &vhost_ops) < 0) {
1271                 RTE_LOG(ERR, PMD, "Can't register callbacks\n");
1272                 goto error;
1273         }
1274
1275         if (rte_vhost_driver_start(iface_name) < 0) {
1276                 RTE_LOG(ERR, PMD, "Failed to start driver for %s\n",
1277                         iface_name);
1278                 goto error;
1279         }
1280
1281         return data->port_id;
1282
1283 error:
1284         if (internal) {
1285                 free(internal->iface_name);
1286                 free(internal->dev_name);
1287         }
1288         rte_free(vring_state);
1289         rte_free(eth_addr);
1290         if (eth_dev)
1291                 rte_eth_dev_release_port(eth_dev);
1292         rte_free(internal);
1293         rte_free(list);
1294
1295         return -1;
1296 }
1297
1298 static inline int
1299 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1300 {
1301         const char **iface_name = extra_args;
1302
1303         if (value == NULL)
1304                 return -1;
1305
1306         *iface_name = value;
1307
1308         return 0;
1309 }
1310
1311 static inline int
1312 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1313 {
1314         uint16_t *n = extra_args;
1315
1316         if (value == NULL || extra_args == NULL)
1317                 return -EINVAL;
1318
1319         *n = (uint16_t)strtoul(value, NULL, 0);
1320         if (*n == USHRT_MAX && errno == ERANGE)
1321                 return -1;
1322
1323         return 0;
1324 }
1325
1326 static int
1327 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1328 {
1329         struct rte_kvargs *kvlist = NULL;
1330         int ret = 0;
1331         char *iface_name;
1332         uint16_t queues;
1333         uint64_t flags = 0;
1334         int client_mode = 0;
1335         int dequeue_zero_copy = 0;
1336         int iommu_support = 0;
1337         struct rte_eth_dev *eth_dev;
1338         const char *name = rte_vdev_device_name(dev);
1339
1340         RTE_LOG(INFO, PMD, "Initializing pmd_vhost for %s\n", name);
1341
1342         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1343             strlen(rte_vdev_device_args(dev)) == 0) {
1344                 eth_dev = rte_eth_dev_attach_secondary(name);
1345                 if (!eth_dev) {
1346                         RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
1347                         return -1;
1348                 }
1349                 /* TODO: request info from primary to set up Rx and Tx */
1350                 eth_dev->dev_ops = &ops;
1351                 return 0;
1352         }
1353
1354         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1355         if (kvlist == NULL)
1356                 return -1;
1357
1358         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1359                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1360                                          &open_iface, &iface_name);
1361                 if (ret < 0)
1362                         goto out_free;
1363         } else {
1364                 ret = -1;
1365                 goto out_free;
1366         }
1367
1368         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1369                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1370                                          &open_int, &queues);
1371                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1372                         goto out_free;
1373
1374         } else
1375                 queues = 1;
1376
1377         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1378                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1379                                          &open_int, &client_mode);
1380                 if (ret < 0)
1381                         goto out_free;
1382
1383                 if (client_mode)
1384                         flags |= RTE_VHOST_USER_CLIENT;
1385         }
1386
1387         if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1388                 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1389                                          &open_int, &dequeue_zero_copy);
1390                 if (ret < 0)
1391                         goto out_free;
1392
1393                 if (dequeue_zero_copy)
1394                         flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1395         }
1396
1397         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1398                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1399                                          &open_int, &iommu_support);
1400                 if (ret < 0)
1401                         goto out_free;
1402
1403                 if (iommu_support)
1404                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1405         }
1406
1407         if (dev->device.numa_node == SOCKET_ID_ANY)
1408                 dev->device.numa_node = rte_socket_id();
1409
1410         if (eth_dev_vhost_create(dev, iface_name, queues,
1411                 dev->device.numa_node, flags) < 0)
1412                 ret = -1;
1412
1413 out_free:
1414         rte_kvargs_free(kvlist);
1415         return ret;
1416 }
1417
1418 static int
1419 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1420 {
1421         const char *name;
1422         struct rte_eth_dev *eth_dev = NULL;
1423
1424         name = rte_vdev_device_name(dev);
1425         RTE_LOG(INFO, PMD, "Un-initializing pmd_vhost for %s\n", name);
1426
1427         /* find an ethdev entry */
1428         eth_dev = rte_eth_dev_allocated(name);
1429         if (eth_dev == NULL)
1430                 return -ENODEV;
1431
1432         eth_dev_close(eth_dev);
1433
1434         rte_free(vring_states[eth_dev->data->port_id]);
1435         vring_states[eth_dev->data->port_id] = NULL;
1436
1437         rte_eth_dev_release_port(eth_dev);
1438
1439         return 0;
1440 }
1441
1442 static struct rte_vdev_driver pmd_vhost_drv = {
1443         .probe = rte_pmd_vhost_probe,
1444         .remove = rte_pmd_vhost_remove,
1445 };
1446
1447 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1448 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1449 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1450         "iface=<ifc> "
1451         "queues=<int>");