/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016 IGEL Co., Ltd.
 * Copyright(c) 2016-2018 Intel Corporation
 */
#include <unistd.h>
#include <pthread.h>
#include <stdbool.h>

#include <rte_mbuf.h>
#include <rte_ethdev_driver.h>
#include <rte_ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_memcpy.h>
#include <rte_bus_vdev.h>
#include <rte_kvargs.h>
#include <rte_vhost.h>
#include <rte_spinlock.h>

#include "rte_eth_vhost.h"

static int vhost_logtype;

#define VHOST_LOG(level, ...) \
        rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)

enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

#define ETH_VHOST_IFACE_ARG             "iface"
#define ETH_VHOST_QUEUES_ARG            "queues"
#define ETH_VHOST_CLIENT_ARG            "client"
#define ETH_VHOST_DEQUEUE_ZERO_COPY     "dequeue-zero-copy"
#define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
#define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
#define VHOST_MAX_PKT_BURST 32

static const char *valid_arguments[] = {
        ETH_VHOST_IFACE_ARG,
        ETH_VHOST_QUEUES_ARG,
        ETH_VHOST_CLIENT_ARG,
        ETH_VHOST_DEQUEUE_ZERO_COPY,
        ETH_VHOST_IOMMU_SUPPORT,
        ETH_VHOST_POSTCOPY_SUPPORT,
        NULL
};

static struct rte_ether_addr base_eth_addr = {
        .addr_bytes = {
                0x56 /* V */,
                0x48 /* H */,
                0x4F /* O */,
                0x53 /* S */,
                0x54 /* T */,
                0x00
        }
};

enum vhost_xstats_pkts {
        VHOST_UNDERSIZE_PKT = 0,
        VHOST_64_PKT,
        VHOST_65_TO_127_PKT,
        VHOST_128_TO_255_PKT,
        VHOST_256_TO_511_PKT,
        VHOST_512_TO_1023_PKT,
        VHOST_1024_TO_1522_PKT,
        VHOST_1523_TO_MAX_PKT,
        VHOST_BROADCAST_PKT,
        VHOST_MULTICAST_PKT,
        VHOST_UNICAST_PKT,
        VHOST_ERRORS_PKT,
        VHOST_ERRORS_FRAGMENTED,
        VHOST_ERRORS_JABBER,
        VHOST_UNKNOWN_PROTOCOL,
        VHOST_XSTATS_MAX,
};

struct vhost_stats {
        uint64_t pkts;
        uint64_t bytes;
        uint64_t missed_pkts;
        uint64_t xstats[VHOST_XSTATS_MAX];
};

struct vhost_queue {
        int vid;
        rte_atomic32_t allow_queuing;
        rte_atomic32_t while_queuing;
        struct pmd_internal *internal;
        struct rte_mempool *mb_pool;
        uint16_t port;
        uint16_t virtqueue_id;
        struct vhost_stats stats;
};

struct pmd_internal {
        rte_atomic32_t dev_attached;
        char *dev_name;
        char *iface_name;
        uint16_t max_queues;
        int vid;
        rte_atomic32_t started;
        uint8_t vlan_strip;
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct rte_eth_link pmd_link = {
                .link_speed = 10000,
                .link_duplex = ETH_LINK_FULL_DUPLEX,
                .link_status = ETH_LINK_DOWN
};

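/*
 * Per-port vring enable/disable state, consumed by
 * rte_eth_vhost_get_queue_event(): cur[] holds the latest state reported
 * by the vhost library, seen[] the last state delivered to the
 * application.
 */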
struct rte_vhost_vring_state {
        rte_spinlock_t lock;

        bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
        bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
        unsigned int index;
        unsigned int max_vring;
};

static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];

#define VHOST_XSTATS_NAME_SIZE 64

struct vhost_xstats_name_off {
        char name[VHOST_XSTATS_NAME_SIZE];
        uint64_t offset;
};

/* [rx]_ is prepended to the name string here */
static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
        {"good_packets",
         offsetof(struct vhost_queue, stats.pkts)},
        {"total_bytes",
         offsetof(struct vhost_queue, stats.bytes)},
        {"missed_pkts",
         offsetof(struct vhost_queue, stats.missed_pkts)},
        {"broadcast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
        {"multicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
        {"unicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
        {"undersize_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
        {"size_64_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
        {"size_65_to_127_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
        {"size_128_to_255_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
        {"size_256_to_511_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
        {"size_512_to_1023_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
        {"size_1024_to_1522_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
        {"size_1523_to_max_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
        {"errors_with_bad_CRC",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
        {"fragmented_errors",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
        {"jabber_errors",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
        {"unknown_protos_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
};

/* [tx]_ is prepended to the name string here */
static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
        {"good_packets",
         offsetof(struct vhost_queue, stats.pkts)},
        {"total_bytes",
         offsetof(struct vhost_queue, stats.bytes)},
        {"missed_pkts",
         offsetof(struct vhost_queue, stats.missed_pkts)},
        {"broadcast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
        {"multicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
        {"unicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
        {"undersize_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
        {"size_64_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
        {"size_65_to_127_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
        {"size_128_to_255_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
        {"size_256_to_511_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
        {"size_512_to_1023_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
        {"size_1024_to_1522_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
        {"size_1523_to_max_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
        {"errors_with_bad_CRC",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
};

#define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
                                sizeof(vhost_rxport_stat_strings[0]))

#define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
                                sizeof(vhost_txport_stat_strings[0]))

static int
vhost_dev_xstats_reset(struct rte_eth_dev *dev)
{
        struct vhost_queue *vq = NULL;
        unsigned int i = 0;

        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                vq = dev->data->rx_queues[i];
                if (!vq)
                        continue;
                memset(&vq->stats, 0, sizeof(vq->stats));
        }
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                vq = dev->data->tx_queues[i];
                if (!vq)
                        continue;
                memset(&vq->stats, 0, sizeof(vq->stats));
        }

        return 0;
}

static int
vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
                           struct rte_eth_xstat_name *xstats_names,
                           unsigned int limit __rte_unused)
{
        unsigned int t = 0;
        int count = 0;
        int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;

        if (!xstats_names)
                return nstats;
        for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
                snprintf(xstats_names[count].name,
                         sizeof(xstats_names[count].name),
                         "rx_%s", vhost_rxport_stat_strings[t].name);
                count++;
        }
        for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
                snprintf(xstats_names[count].name,
                         sizeof(xstats_names[count].name),
                         "tx_%s", vhost_txport_stat_strings[t].name);
                count++;
        }
        return count;
}

static int
vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
                     unsigned int n)
{
        unsigned int i;
        unsigned int t;
        unsigned int count = 0;
        struct vhost_queue *vq = NULL;
        unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;

        if (n < nxstats)
                return nxstats;

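        /*
         * Unicast counters are not updated on the datapath; derive them
         * here as total packets minus broadcast and multicast. On the TX
         * side, missed packets are included in the total as well.
         */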
        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                vq = dev->data->rx_queues[i];
                if (!vq)
                        continue;
                vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
                                - (vq->stats.xstats[VHOST_BROADCAST_PKT]
                                + vq->stats.xstats[VHOST_MULTICAST_PKT]);
        }
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                vq = dev->data->tx_queues[i];
                if (!vq)
                        continue;
                vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
                                + vq->stats.missed_pkts
                                - (vq->stats.xstats[VHOST_BROADCAST_PKT]
                                + vq->stats.xstats[VHOST_MULTICAST_PKT]);
        }
        for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
                xstats[count].value = 0;
                for (i = 0; i < dev->data->nb_rx_queues; i++) {
                        vq = dev->data->rx_queues[i];
                        if (!vq)
                                continue;
                        xstats[count].value +=
                                *(uint64_t *)(((char *)vq)
                                + vhost_rxport_stat_strings[t].offset);
                }
                xstats[count].id = count;
                count++;
        }
        for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
                xstats[count].value = 0;
                for (i = 0; i < dev->data->nb_tx_queues; i++) {
                        vq = dev->data->tx_queues[i];
                        if (!vq)
                                continue;
                        xstats[count].value +=
                                *(uint64_t *)(((char *)vq)
                                + vhost_txport_stat_strings[t].offset);
                }
                xstats[count].id = count;
                count++;
        }
        return count;
}

static inline void
vhost_count_multicast_broadcast(struct vhost_queue *vq,
                                struct rte_mbuf *mbuf)
{
        struct rte_ether_addr *ea = NULL;
        struct vhost_stats *pstats = &vq->stats;

        ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
        if (rte_is_multicast_ether_addr(ea)) {
                if (rte_is_broadcast_ether_addr(ea))
                        pstats->xstats[VHOST_BROADCAST_PKT]++;
                else
                        pstats->xstats[VHOST_MULTICAST_PKT]++;
        }
}

static void
vhost_update_packet_xstats(struct vhost_queue *vq,
                           struct rte_mbuf **bufs,
                           uint16_t count)
{
        uint32_t pkt_len = 0;
        uint64_t i = 0;
        uint64_t index;
        struct vhost_stats *pstats = &vq->stats;

        for (i = 0; i < count; i++) {
                pkt_len = bufs[i]->pkt_len;
                if (pkt_len == 64) {
                        pstats->xstats[VHOST_64_PKT]++;
                } else if (pkt_len > 64 && pkt_len < 1024) {
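                        /*
                         * Derive the size bucket from the MSB position:
                         * 32 - clz(pkt_len) - 5 maps 65..127 to
                         * VHOST_65_TO_127_PKT, 128..255 to
                         * VHOST_128_TO_255_PKT, and so on, relying on the
                         * ordering of enum vhost_xstats_pkts.
                         */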
                        index = (sizeof(pkt_len) * 8)
                                - __builtin_clz(pkt_len) - 5;
                        pstats->xstats[index]++;
                } else {
                        if (pkt_len < 64)
                                pstats->xstats[VHOST_UNDERSIZE_PKT]++;
                        else if (pkt_len <= 1522)
                                pstats->xstats[VHOST_1024_TO_1522_PKT]++;
                        else if (pkt_len > 1522)
                                pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
                }
                vhost_count_multicast_broadcast(vq, bufs[i]);
        }
}

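/*
 * Burst receive: drains the guest TX virtqueue in chunks of
 * VHOST_MAX_PKT_BURST. The allow_queuing/while_queuing flags form a
 * handshake with update_queuing_status() so that the vhost device is not
 * accessed while it is being attached or detached.
 */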
static uint16_t
eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
        struct vhost_queue *r = q;
        uint16_t i, nb_rx = 0;
        uint16_t nb_receive = nb_bufs;

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                return 0;

        rte_atomic32_set(&r->while_queuing, 1);

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                goto out;

        /* Dequeue packets from guest TX queue */
        while (nb_receive) {
                uint16_t nb_pkts;
                uint16_t num = (uint16_t)RTE_MIN(nb_receive,
                                                 VHOST_MAX_PKT_BURST);

                nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
                                                  r->mb_pool, &bufs[nb_rx],
                                                  num);

                nb_rx += nb_pkts;
                nb_receive -= nb_pkts;
                if (nb_pkts < num)
                        break;
        }

        r->stats.pkts += nb_rx;

        for (i = 0; likely(i < nb_rx); i++) {
                bufs[i]->port = r->port;
                bufs[i]->vlan_tci = 0;

                if (r->internal->vlan_strip)
                        rte_vlan_strip(bufs[i]);

                r->stats.bytes += bufs[i]->pkt_len;
        }

        vhost_update_packet_xstats(r, bufs, nb_rx);

out:
        rte_atomic32_set(&r->while_queuing, 0);

        return nb_rx;
}

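/*
 * Burst transmit: optionally inserts VLAN tags, then enqueues to the
 * guest RX virtqueue. rte_vhost_enqueue_burst() copies the packet data
 * into guest buffers, so transmitted mbufs are freed here; packets the
 * guest had no room for are counted as missed.
 */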
static uint16_t
eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
        struct vhost_queue *r = q;
        uint16_t i, nb_tx = 0;
        uint16_t nb_send = 0;

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                return 0;

        rte_atomic32_set(&r->while_queuing, 1);

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                goto out;

        for (i = 0; i < nb_bufs; i++) {
                struct rte_mbuf *m = bufs[i];

                /* Do VLAN tag insertion */
                if (m->ol_flags & PKT_TX_VLAN_PKT) {
                        int error = rte_vlan_insert(&m);
                        if (unlikely(error)) {
                                rte_pktmbuf_free(m);
                                continue;
                        }
                }

                bufs[nb_send] = m;
                ++nb_send;
        }

        /* Enqueue packets to guest RX queue */
        while (nb_send) {
                uint16_t nb_pkts;
                uint16_t num = (uint16_t)RTE_MIN(nb_send,
                                                 VHOST_MAX_PKT_BURST);

                nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
                                                  &bufs[nb_tx], num);

                nb_tx += nb_pkts;
                nb_send -= nb_pkts;
                if (nb_pkts < num)
                        break;
        }

        r->stats.pkts += nb_tx;
        r->stats.missed_pkts += nb_bufs - nb_tx;

        for (i = 0; likely(i < nb_tx); i++)
                r->stats.bytes += bufs[i]->pkt_len;

        vhost_update_packet_xstats(r, bufs, nb_tx);

        /* According to RFC 2863 (ifHCOutMulticastPkts and
         * ifHCOutBroadcastPkts), the "multicast" and "broadcast" counters
         * are increased even when packets are not transmitted successfully.
         */
        for (i = nb_tx; i < nb_bufs; i++)
                vhost_count_multicast_broadcast(r, bufs[i]);

        for (i = 0; likely(i < nb_tx); i++)
                rte_pktmbuf_free(bufs[i]);
out:
        rte_atomic32_set(&r->while_queuing, 0);

        return nb_tx;
}

static int
eth_dev_configure(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal = dev->data->dev_private;
        const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;

        internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);

        return 0;
}

502
503 static inline struct internal_list *
504 find_internal_resource(char *ifname)
505 {
506         int found = 0;
507         struct internal_list *list;
508         struct pmd_internal *internal;
509
510         if (ifname == NULL)
511                 return NULL;
512
513         pthread_mutex_lock(&internal_list_lock);
514
515         TAILQ_FOREACH(list, &internal_list, next) {
516                 internal = list->eth_dev->data->dev_private;
517                 if (!strcmp(internal->iface_name, ifname)) {
518                         found = 1;
519                         break;
520                 }
521         }
522
523         pthread_mutex_unlock(&internal_list_lock);
524
525         if (!found)
526                 return NULL;
527
528         return list;
529 }
530
531 static int
532 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
533 {
534         struct rte_vhost_vring vring;
535         struct vhost_queue *vq;
536         int ret = 0;
537
538         vq = dev->data->rx_queues[qid];
539         if (!vq) {
540                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
541                 return -1;
542         }
543
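        /* ethdev rxq N maps to guest TX virtqueue 2N + 1 (VIRTIO_TXQ) */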
        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
        if (ret < 0) {
                VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
                return ret;
        }
        VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
        rte_wmb();

        return ret;
}

static int
eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
{
        struct rte_vhost_vring vring;
        struct vhost_queue *vq;
        int ret = 0;

        vq = dev->data->rx_queues[qid];
        if (!vq) {
                VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
                return -1;
        }

        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
        if (ret < 0) {
                VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
                return ret;
        }
        VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
        rte_wmb();

        return 0;
}

static void
eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
{
        struct rte_intr_handle *intr_handle = dev->intr_handle;

        if (intr_handle) {
                if (intr_handle->intr_vec)
                        free(intr_handle->intr_vec);
                free(intr_handle);
        }

        dev->intr_handle = NULL;
}

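/*
 * Build a vdev interrupt handle backed by the vrings' kickfds so that
 * applications can wait on RX interrupts through the standard ethdev
 * rx_queue_intr API.
 */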
static int
eth_vhost_install_intr(struct rte_eth_dev *dev)
{
        struct rte_vhost_vring vring;
        struct vhost_queue *vq;
        int count = 0;
        int nb_rxq = dev->data->nb_rx_queues;
        int i;
        int ret;

        /* uninstall firstly if we are reconnecting */
        if (dev->intr_handle)
                eth_vhost_uninstall_intr(dev);

        dev->intr_handle = malloc(sizeof(*dev->intr_handle));
        if (!dev->intr_handle) {
                VHOST_LOG(ERR, "Fail to allocate intr_handle\n");
                return -ENOMEM;
        }
        memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));

        dev->intr_handle->efd_counter_size = sizeof(uint64_t);

        dev->intr_handle->intr_vec =
                malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));

        if (!dev->intr_handle->intr_vec) {
                VHOST_LOG(ERR,
                        "Failed to allocate memory for interrupt vector\n");
                free(dev->intr_handle);
                dev->intr_handle = NULL;
                return -ENOMEM;
        }

        VHOST_LOG(INFO, "Prepare intr vec\n");
        for (i = 0; i < nb_rxq; i++) {
                vq = dev->data->rx_queues[i];
                if (!vq) {
                        VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
                        continue;
                }

                ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
                if (ret < 0) {
                        VHOST_LOG(INFO,
                                "Failed to get rxq-%d's vring, skip!\n", i);
                        continue;
                }

                if (vring.kickfd < 0) {
                        VHOST_LOG(INFO,
                                "rxq-%d's kickfd is invalid, skip!\n", i);
                        continue;
                }
                dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
                dev->intr_handle->efds[i] = vring.kickfd;
                count++;
                VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
        }

        dev->intr_handle->nb_efd = count;
        dev->intr_handle->max_intr = count + 1;
        dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;

        return 0;
}

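/*
 * Flip the per-queue allow_queuing gates according to whether the port is
 * started and a vhost device is attached, then spin until any in-flight
 * rx/tx burst (while_queuing) has drained.
 */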
static void
update_queuing_status(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal = dev->data->dev_private;
        struct vhost_queue *vq;
        unsigned int i;
        int allow_queuing = 1;

        if (!dev->data->rx_queues || !dev->data->tx_queues)
                return;

        if (rte_atomic32_read(&internal->started) == 0 ||
            rte_atomic32_read(&internal->dev_attached) == 0)
                allow_queuing = 0;

        /* Wait until rx/tx_pkt_burst stops accessing vhost device */
        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                vq = dev->data->rx_queues[i];
                if (vq == NULL)
                        continue;
                rte_atomic32_set(&vq->allow_queuing, allow_queuing);
                while (rte_atomic32_read(&vq->while_queuing))
                        rte_pause();
        }

        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                vq = dev->data->tx_queues[i];
                if (vq == NULL)
                        continue;
                rte_atomic32_set(&vq->allow_queuing, allow_queuing);
                while (rte_atomic32_read(&vq->while_queuing))
                        rte_pause();
        }
}

static void
queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
{
        struct vhost_queue *vq;
        int i;

        for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
                vq = eth_dev->data->rx_queues[i];
                if (!vq)
                        continue;
                vq->vid = internal->vid;
                vq->internal = internal;
                vq->port = eth_dev->data->port_id;
        }
        for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
                vq = eth_dev->data->tx_queues[i];
                if (!vq)
                        continue;
                vq->vid = internal->vid;
                vq->internal = internal;
                vq->port = eth_dev->data->port_id;
        }
}

static int
new_device(int vid)
{
        struct rte_eth_dev *eth_dev;
        struct internal_list *list;
        struct pmd_internal *internal;
        struct rte_eth_conf *dev_conf;
        unsigned int i;
        char ifname[PATH_MAX];
#ifdef RTE_LIBRTE_VHOST_NUMA
        int newnode;
#endif

        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
        list = find_internal_resource(ifname);
        if (list == NULL) {
                VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
                return -1;
        }

        eth_dev = list->eth_dev;
        internal = eth_dev->data->dev_private;
        dev_conf = &eth_dev->data->dev_conf;

#ifdef RTE_LIBRTE_VHOST_NUMA
        newnode = rte_vhost_get_numa_node(vid);
        if (newnode >= 0)
                eth_dev->data->numa_node = newnode;
#endif

        internal->vid = vid;
        if (rte_atomic32_read(&internal->started) == 1) {
                queue_setup(eth_dev, internal);

                if (dev_conf->intr_conf.rxq) {
                        if (eth_vhost_install_intr(eth_dev) < 0) {
                                VHOST_LOG(INFO,
                                        "Failed to install interrupt handler.\n");
                                return -1;
                        }
                }
        } else {
                VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
        }

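        /*
         * Guest notifications start out disabled; eth_rxq_intr_enable()
         * re-enables them per queue when RX interrupt mode is used.
         */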
        for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
                rte_vhost_enable_guest_notification(vid, i, 0);

        rte_vhost_get_mtu(vid, &eth_dev->data->mtu);

        eth_dev->data->dev_link.link_status = ETH_LINK_UP;

        rte_atomic32_set(&internal->dev_attached, 1);
        update_queuing_status(eth_dev);

        VHOST_LOG(INFO, "Vhost device %d created\n", vid);

        _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);

        return 0;
}

static void
destroy_device(int vid)
{
        struct rte_eth_dev *eth_dev;
        struct pmd_internal *internal;
        struct vhost_queue *vq;
        struct internal_list *list;
        char ifname[PATH_MAX];
        unsigned int i;
        struct rte_vhost_vring_state *state;

        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
        list = find_internal_resource(ifname);
        if (list == NULL) {
                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
                return;
        }
        eth_dev = list->eth_dev;
        internal = eth_dev->data->dev_private;

        rte_atomic32_set(&internal->dev_attached, 0);
        update_queuing_status(eth_dev);

        eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;

        if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
                for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
                        vq = eth_dev->data->rx_queues[i];
                        if (!vq)
                                continue;
                        vq->vid = -1;
                }
                for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
                        vq = eth_dev->data->tx_queues[i];
                        if (!vq)
                                continue;
                        vq->vid = -1;
                }
        }

        state = vring_states[eth_dev->data->port_id];
        rte_spinlock_lock(&state->lock);
        for (i = 0; i <= state->max_vring; i++) {
                state->cur[i] = false;
                state->seen[i] = false;
        }
        state->max_vring = 0;
        rte_spinlock_unlock(&state->lock);

        VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
        eth_vhost_uninstall_intr(eth_dev);

        _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
}

static int
vring_state_changed(int vid, uint16_t vring, int enable)
{
        struct rte_vhost_vring_state *state;
        struct rte_eth_dev *eth_dev;
        struct internal_list *list;
        char ifname[PATH_MAX];

        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
        list = find_internal_resource(ifname);
        if (list == NULL) {
                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
                return -1;
        }

        eth_dev = list->eth_dev;
        /* won't be NULL */
        state = vring_states[eth_dev->data->port_id];
        rte_spinlock_lock(&state->lock);
        state->cur[vring] = enable;
        state->max_vring = RTE_MAX(vring, state->max_vring);
        rte_spinlock_unlock(&state->lock);

        VHOST_LOG(INFO, "vring%u is %s\n",
                        vring, enable ? "enabled" : "disabled");

        _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);

        return 0;
}

static struct vhost_device_ops vhost_ops = {
        .new_device          = new_device,
        .destroy_device      = destroy_device,
        .vring_state_changed = vring_state_changed,
};

int
rte_eth_vhost_get_queue_event(uint16_t port_id,
                struct rte_eth_vhost_queue_event *event)
{
        struct rte_vhost_vring_state *state;
        unsigned int i;
        int idx;

        if (port_id >= RTE_MAX_ETHPORTS) {
                VHOST_LOG(ERR, "Invalid port id\n");
                return -1;
        }

        state = vring_states[port_id];
        if (!state) {
                VHOST_LOG(ERR, "Unused port\n");
                return -1;
        }

        rte_spinlock_lock(&state->lock);
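        /*
         * Round-robin over all vrings, starting after the last one
         * reported, and return the first vring whose enabled state changed
         * since the previous call. Odd vring indexes are guest TX rings
         * (ethdev RX), even indexes are guest RX rings (ethdev TX).
         */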
        for (i = 0; i <= state->max_vring; i++) {
                idx = state->index++ % (state->max_vring + 1);

                if (state->cur[idx] != state->seen[idx]) {
                        state->seen[idx] = state->cur[idx];
                        event->queue_id = idx / 2;
                        event->rx = idx & 1;
                        event->enable = state->cur[idx];
                        rte_spinlock_unlock(&state->lock);
                        return 0;
                }
        }
        rte_spinlock_unlock(&state->lock);

        return -1;
}

int
rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
{
        struct internal_list *list;
        struct rte_eth_dev *eth_dev;
        struct vhost_queue *vq;
        int vid = -1;

        if (!rte_eth_dev_is_valid_port(port_id))
                return -1;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                eth_dev = list->eth_dev;
                if (eth_dev->data->port_id == port_id) {
                        vq = eth_dev->data->rx_queues[0];
                        if (vq)
                                vid = vq->vid;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        return vid;
}

static int
eth_dev_start(struct rte_eth_dev *eth_dev)
{
        struct pmd_internal *internal = eth_dev->data->dev_private;
        struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;

        queue_setup(eth_dev, internal);

        if (rte_atomic32_read(&internal->dev_attached) == 1) {
                if (dev_conf->intr_conf.rxq) {
                        if (eth_vhost_install_intr(eth_dev) < 0) {
                                VHOST_LOG(INFO,
                                        "Failed to install interrupt handler.\n");
                                return -1;
                        }
                }
        }

        rte_atomic32_set(&internal->started, 1);
        update_queuing_status(eth_dev);

        return 0;
}

static void
eth_dev_stop(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal = dev->data->dev_private;

        rte_atomic32_set(&internal->started, 0);
        update_queuing_status(dev);
}

static void
eth_dev_close(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal;
        struct internal_list *list;
        unsigned int i;

        internal = dev->data->dev_private;
        if (!internal)
                return;

        eth_dev_stop(dev);

        rte_vhost_driver_unregister(internal->iface_name);

        list = find_internal_resource(internal->iface_name);
        if (!list)
                return;

        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);
        rte_free(list);

        if (dev->data->rx_queues)
                for (i = 0; i < dev->data->nb_rx_queues; i++)
                        rte_free(dev->data->rx_queues[i]);

        if (dev->data->tx_queues)
                for (i = 0; i < dev->data->nb_tx_queues; i++)
                        rte_free(dev->data->tx_queues[i]);

        free(internal->dev_name);
        free(internal->iface_name);
        rte_free(internal);

        dev->data->dev_private = NULL;

        rte_free(vring_states[dev->data->port_id]);
        vring_states[dev->data->port_id] = NULL;
}

static int
eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
                   uint16_t nb_rx_desc __rte_unused,
                   unsigned int socket_id,
                   const struct rte_eth_rxconf *rx_conf __rte_unused,
                   struct rte_mempool *mb_pool)
{
        struct vhost_queue *vq;

        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
                        RTE_CACHE_LINE_SIZE, socket_id);
        if (vq == NULL) {
                VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
                return -ENOMEM;
        }

        vq->mb_pool = mb_pool;
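        /* ethdev RX queue N reads from guest TX virtqueue 2N + 1 */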
        vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
        dev->data->rx_queues[rx_queue_id] = vq;

        return 0;
}

static int
eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
                   uint16_t nb_tx_desc __rte_unused,
                   unsigned int socket_id,
                   const struct rte_eth_txconf *tx_conf __rte_unused)
{
        struct vhost_queue *vq;

        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
                        RTE_CACHE_LINE_SIZE, socket_id);
        if (vq == NULL) {
                VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
                return -ENOMEM;
        }

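        /* ethdev TX queue N writes to guest RX virtqueue 2N */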
        vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
        dev->data->tx_queues[tx_queue_id] = vq;

        return 0;
}

static int
eth_dev_info(struct rte_eth_dev *dev,
             struct rte_eth_dev_info *dev_info)
{
        struct pmd_internal *internal;

        internal = dev->data->dev_private;
        if (internal == NULL) {
                VHOST_LOG(ERR, "Invalid device specified\n");
                return -ENODEV;
        }

        dev_info->max_mac_addrs = 1;
        dev_info->max_rx_pktlen = (uint32_t)-1;
        dev_info->max_rx_queues = internal->max_queues;
        dev_info->max_tx_queues = internal->max_queues;
        dev_info->min_rx_bufsize = 0;

        dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
                                DEV_TX_OFFLOAD_VLAN_INSERT;
        dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;

        return 0;
}

static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
        unsigned int i;
        unsigned long rx_total = 0, tx_total = 0;
        unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
        struct vhost_queue *vq;

        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
                        i < dev->data->nb_rx_queues; i++) {
                if (dev->data->rx_queues[i] == NULL)
                        continue;
                vq = dev->data->rx_queues[i];
                stats->q_ipackets[i] = vq->stats.pkts;
                rx_total += stats->q_ipackets[i];

                stats->q_ibytes[i] = vq->stats.bytes;
                rx_total_bytes += stats->q_ibytes[i];
        }

        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
                        i < dev->data->nb_tx_queues; i++) {
                if (dev->data->tx_queues[i] == NULL)
                        continue;
                vq = dev->data->tx_queues[i];
                stats->q_opackets[i] = vq->stats.pkts;
                tx_total += stats->q_opackets[i];

                stats->q_obytes[i] = vq->stats.bytes;
                tx_total_bytes += stats->q_obytes[i];
        }

        stats->ipackets = rx_total;
        stats->opackets = tx_total;
        stats->ibytes = rx_total_bytes;
        stats->obytes = tx_total_bytes;

        return 0;
}

static int
eth_stats_reset(struct rte_eth_dev *dev)
{
        struct vhost_queue *vq;
        unsigned int i;

        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                if (dev->data->rx_queues[i] == NULL)
                        continue;
                vq = dev->data->rx_queues[i];
                vq->stats.pkts = 0;
                vq->stats.bytes = 0;
        }
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                if (dev->data->tx_queues[i] == NULL)
                        continue;
                vq = dev->data->tx_queues[i];
                vq->stats.pkts = 0;
                vq->stats.bytes = 0;
                vq->stats.missed_pkts = 0;
        }

        return 0;
}

static void
eth_queue_release(void *q)
{
        rte_free(q);
}

static int
eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
{
        /*
         * The vhost PMD does not hang on to mbufs: eth_vhost_tx() copies
         * the packet data into guest buffers and frees the mbufs itself,
         * so there is nothing to clean up here.
         */
        return 0;
}

static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
                int wait_to_complete __rte_unused)
{
        return 0;
}

static uint32_t
eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
        struct vhost_queue *vq;

        vq = dev->data->rx_queues[rx_queue_id];
        if (vq == NULL)
                return 0;

        return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
}

static const struct eth_dev_ops ops = {
        .dev_start = eth_dev_start,
        .dev_stop = eth_dev_stop,
        .dev_close = eth_dev_close,
        .dev_configure = eth_dev_configure,
        .dev_infos_get = eth_dev_info,
        .rx_queue_setup = eth_rx_queue_setup,
        .tx_queue_setup = eth_tx_queue_setup,
        .rx_queue_release = eth_queue_release,
        .tx_queue_release = eth_queue_release,
        .tx_done_cleanup = eth_tx_done_cleanup,
        .rx_queue_count = eth_rx_queue_count,
        .link_update = eth_link_update,
        .stats_get = eth_stats_get,
        .stats_reset = eth_stats_reset,
        .xstats_reset = vhost_dev_xstats_reset,
        .xstats_get = vhost_dev_xstats_get,
        .xstats_get_names = vhost_dev_xstats_get_names,
        .rx_queue_intr_enable = eth_rxq_intr_enable,
        .rx_queue_intr_disable = eth_rxq_intr_disable,
};

static int
eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
        int16_t queues, const unsigned int numa_node, uint64_t flags)
{
        const char *name = rte_vdev_device_name(dev);
        struct rte_eth_dev_data *data;
        struct pmd_internal *internal = NULL;
        struct rte_eth_dev *eth_dev = NULL;
        struct rte_ether_addr *eth_addr = NULL;
        struct rte_vhost_vring_state *vring_state = NULL;
        struct internal_list *list = NULL;

        VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
                numa_node);

        list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
        if (list == NULL)
                goto error;

        /* reserve an ethdev entry */
        eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
        if (eth_dev == NULL)
                goto error;
        data = eth_dev->data;

        eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
        if (eth_addr == NULL)
                goto error;
        data->mac_addrs = eth_addr;
        *eth_addr = base_eth_addr;
        eth_addr->addr_bytes[5] = eth_dev->data->port_id;

        vring_state = rte_zmalloc_socket(name,
                        sizeof(*vring_state), 0, numa_node);
        if (vring_state == NULL)
                goto error;

        /* now put it all together
         * - store queue data in internal,
         * - point eth_dev_data to internals
         * - and point eth_dev structure to new eth_dev_data structure
         */
        internal = eth_dev->data->dev_private;
        internal->dev_name = strdup(name);
        if (internal->dev_name == NULL)
                goto error;
        internal->iface_name = strdup(iface_name);
        if (internal->iface_name == NULL)
                goto error;

        list->eth_dev = eth_dev;
        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_spinlock_init(&vring_state->lock);
        vring_states[eth_dev->data->port_id] = vring_state;

        data->nb_rx_queues = queues;
        data->nb_tx_queues = queues;
        internal->max_queues = queues;
        internal->vid = -1;
        data->dev_link = pmd_link;
        data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;

        eth_dev->dev_ops = &ops;

        /* finally assign rx and tx ops */
        eth_dev->rx_pkt_burst = eth_vhost_rx;
        eth_dev->tx_pkt_burst = eth_vhost_tx;

        if (rte_vhost_driver_register(iface_name, flags))
                goto error;

        if (rte_vhost_driver_callback_register(iface_name, &vhost_ops) < 0) {
                VHOST_LOG(ERR, "Can't register callbacks\n");
                goto error;
        }

        if (rte_vhost_driver_start(iface_name) < 0) {
                VHOST_LOG(ERR, "Failed to start driver for %s\n",
                        iface_name);
                goto error;
        }

        rte_eth_dev_probing_finish(eth_dev);
        return data->port_id;

error:
        if (internal) {
                free(internal->iface_name);
                free(internal->dev_name);
        }
        rte_free(vring_state);
        rte_eth_dev_release_port(eth_dev);
        rte_free(list);

        return -1;
}

static inline int
open_iface(const char *key __rte_unused, const char *value, void *extra_args)
{
        const char **iface_name = extra_args;

        if (value == NULL)
                return -1;

        *iface_name = value;

        return 0;
}

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

static int
rte_pmd_vhost_probe(struct rte_vdev_device *dev)
{
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;
        char *iface_name;
        uint16_t queues;
        uint64_t flags = 0;
        int client_mode = 0;
        int dequeue_zero_copy = 0;
        int iommu_support = 0;
        int postcopy_support = 0;
        struct rte_eth_dev *eth_dev;
        const char *name = rte_vdev_device_name(dev);

        VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);

        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                eth_dev = rte_eth_dev_attach_secondary(name);
                if (!eth_dev) {
                        VHOST_LOG(ERR, "Failed to probe %s\n", name);
                        return -1;
                }
                /* TODO: request info from primary to set up Rx and Tx */
                eth_dev->dev_ops = &ops;
                eth_dev->device = &dev->device;
                rte_eth_dev_probing_finish(eth_dev);
                return 0;
        }

        kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
        if (kvlist == NULL)
                return -1;

        if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
                                         &open_iface, &iface_name);
                if (ret < 0)
                        goto out_free;
        } else {
                ret = -1;
                goto out_free;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
                                         &open_int, &queues);
                if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
                        goto out_free;

        } else
                queues = 1;

        if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
                                         &open_int, &client_mode);
                if (ret < 0)
                        goto out_free;

                if (client_mode)
                        flags |= RTE_VHOST_USER_CLIENT;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
                                         &open_int, &dequeue_zero_copy);
                if (ret < 0)
                        goto out_free;

                if (dequeue_zero_copy)
                        flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
                                         &open_int, &iommu_support);
                if (ret < 0)
                        goto out_free;

                if (iommu_support)
                        flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
                                         &open_int, &postcopy_support);
                if (ret < 0)
                        goto out_free;

                if (postcopy_support)
                        flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
        }

        if (dev->device.numa_node == SOCKET_ID_ANY)
                dev->device.numa_node = rte_socket_id();

        if (eth_dev_vhost_create(dev, iface_name, queues,
                        dev->device.numa_node, flags) < 0)
                ret = -1;

out_free:
        rte_kvargs_free(kvlist);
        return ret;
}

static int
rte_pmd_vhost_remove(struct rte_vdev_device *dev)
{
        const char *name;
        struct rte_eth_dev *eth_dev = NULL;

        name = rte_vdev_device_name(dev);
        VHOST_LOG(INFO, "Un-initializing pmd_vhost for %s\n", name);

        /* find an ethdev entry */
        eth_dev = rte_eth_dev_allocated(name);
        if (eth_dev == NULL)
                return 0;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return rte_eth_dev_release_port(eth_dev);

        eth_dev_close(eth_dev);

        rte_eth_dev_release_port(eth_dev);

        return 0;
}

static struct rte_vdev_driver pmd_vhost_drv = {
        .probe = rte_pmd_vhost_probe,
        .remove = rte_pmd_vhost_remove,
};

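/*
 * Illustrative usage: create one vhost-user port with a single queue
 * pair, for example:
 *   testpmd -l 0-1 --vdev 'net_vhost0,iface=/tmp/sock0,queues=1' -- -i
 */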
RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
        "iface=<ifc> "
        "queues=<int> "
        "client=<0|1> "
        "dequeue-zero-copy=<0|1> "
        "iommu-support=<0|1> "
        "postcopy-support=<0|1>");

RTE_INIT(vhost_init_log)
{
        vhost_logtype = rte_log_register("pmd.net.vhost");
        if (vhost_logtype >= 0)
                rte_log_set_level(vhost_logtype, RTE_LOG_NOTICE);
}