dpdk.git: drivers/net/vhost/rte_eth_vhost.c (commit 90263ae77c04bdcfe297a7fb12be6d2c418949a7)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_memcpy.h>
14 #include <rte_bus_vdev.h>
15 #include <rte_kvargs.h>
16 #include <rte_vhost.h>
17 #include <rte_spinlock.h>
18
19 #include "rte_eth_vhost.h"
20
21 static int vhost_logtype;
22
23 #define VHOST_LOG(level, ...) \
24         rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
25
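/*
 * Virtqueue numbering within each queue pair: the even ring (VIRTIO_RXQ)
 * is the guest's RX ring, which this PMD transmits into; the odd ring
 * (VIRTIO_TXQ) is the guest's TX ring, which this PMD receives from.
 */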
26 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
27
28 #define ETH_VHOST_IFACE_ARG             "iface"
29 #define ETH_VHOST_QUEUES_ARG            "queues"
30 #define ETH_VHOST_CLIENT_ARG            "client"
31 #define ETH_VHOST_DEQUEUE_ZERO_COPY     "dequeue-zero-copy"
32 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define VHOST_MAX_PKT_BURST 32
36
37 static const char *valid_arguments[] = {
38         ETH_VHOST_IFACE_ARG,
39         ETH_VHOST_QUEUES_ARG,
40         ETH_VHOST_CLIENT_ARG,
41         ETH_VHOST_DEQUEUE_ZERO_COPY,
42         ETH_VHOST_IOMMU_SUPPORT,
43         ETH_VHOST_POSTCOPY_SUPPORT,
44         ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
45         NULL
46 };
47
48 static struct rte_ether_addr base_eth_addr = {
49         .addr_bytes = {
50                 0x56 /* V */,
51                 0x48 /* H */,
52                 0x4F /* O */,
53                 0x53 /* S */,
54                 0x54 /* T */,
55                 0x00
56         }
57 };
58
59 enum vhost_xstats_pkts {
60         VHOST_UNDERSIZE_PKT = 0,
61         VHOST_64_PKT,
62         VHOST_65_TO_127_PKT,
63         VHOST_128_TO_255_PKT,
64         VHOST_256_TO_511_PKT,
65         VHOST_512_TO_1023_PKT,
66         VHOST_1024_TO_1522_PKT,
67         VHOST_1523_TO_MAX_PKT,
68         VHOST_BROADCAST_PKT,
69         VHOST_MULTICAST_PKT,
70         VHOST_UNICAST_PKT,
71         VHOST_ERRORS_PKT,
72         VHOST_ERRORS_FRAGMENTED,
73         VHOST_ERRORS_JABBER,
74         VHOST_UNKNOWN_PROTOCOL,
75         VHOST_XSTATS_MAX,
76 };
77
78 struct vhost_stats {
79         uint64_t pkts;
80         uint64_t bytes;
81         uint64_t missed_pkts;
82         uint64_t xstats[VHOST_XSTATS_MAX];
83 };
84
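/*
 * Per-queue context. allow_queuing/while_queuing implement a lock-free
 * handshake with update_queuing_status(): the control path clears
 * allow_queuing and then waits for while_queuing to drop to zero, so no
 * burst function can still be touching the vhost device.
 */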
85 struct vhost_queue {
86         int vid;
87         rte_atomic32_t allow_queuing;
88         rte_atomic32_t while_queuing;
89         struct pmd_internal *internal;
90         struct rte_mempool *mb_pool;
91         uint16_t port;
92         uint16_t virtqueue_id;
93         struct vhost_stats stats;
94 };
95
96 struct pmd_internal {
97         rte_atomic32_t dev_attached;
98         char *iface_name;
99         uint64_t flags;
100         uint64_t disable_flags;
101         uint16_t max_queues;
102         int vid;
103         rte_atomic32_t started;
104         uint8_t vlan_strip;
105 };
106
107 struct internal_list {
108         TAILQ_ENTRY(internal_list) next;
109         struct rte_eth_dev *eth_dev;
110 };
111
112 TAILQ_HEAD(internal_list_head, internal_list);
113 static struct internal_list_head internal_list =
114         TAILQ_HEAD_INITIALIZER(internal_list);
115
116 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
117
118 static struct rte_eth_link pmd_link = {
119                 .link_speed = 10000,
120                 .link_duplex = ETH_LINK_FULL_DUPLEX,
121                 .link_status = ETH_LINK_DOWN
122 };
123
124 struct rte_vhost_vring_state {
125         rte_spinlock_t lock;
126
127         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
128         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
129         unsigned int index;
130         unsigned int max_vring;
131 };
132
133 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
134
135 #define VHOST_XSTATS_NAME_SIZE 64
136
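/*
 * xstats are exported through name/offset tables: each entry maps an
 * xstats name to the byte offset of the corresponding counter inside
 * struct vhost_queue, letting vhost_dev_xstats_get() read them generically.
 */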
137 struct vhost_xstats_name_off {
138         char name[VHOST_XSTATS_NAME_SIZE];
139         uint64_t offset;
140 };
141
142 /* [rx]_ is prepended to the name string here */
143 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
144         {"good_packets",
145          offsetof(struct vhost_queue, stats.pkts)},
146         {"total_bytes",
147          offsetof(struct vhost_queue, stats.bytes)},
148         {"missed_pkts",
149          offsetof(struct vhost_queue, stats.missed_pkts)},
150         {"broadcast_packets",
151          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
152         {"multicast_packets",
153          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
154         {"unicast_packets",
155          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
156         {"undersize_packets",
157          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
158         {"size_64_packets",
159          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
160         {"size_65_to_127_packets",
161          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
162         {"size_128_to_255_packets",
163          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
164         {"size_256_to_511_packets",
165          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
166         {"size_512_to_1023_packets",
167          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
168         {"size_1024_to_1522_packets",
169          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
170         {"size_1523_to_max_packets",
171          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
172         {"errors_with_bad_CRC",
173          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
174         {"fragmented_errors",
175          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
176         {"jabber_errors",
177          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
178         {"unknown_protos_packets",
179          offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
180 };
181
182 /* [tx]_ is prepended to the name string here */
183 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
184         {"good_packets",
185          offsetof(struct vhost_queue, stats.pkts)},
186         {"total_bytes",
187          offsetof(struct vhost_queue, stats.bytes)},
188         {"missed_pkts",
189          offsetof(struct vhost_queue, stats.missed_pkts)},
190         {"broadcast_packets",
191          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
192         {"multicast_packets",
193          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
194         {"unicast_packets",
195          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
196         {"undersize_packets",
197          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
198         {"size_64_packets",
199          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
200         {"size_65_to_127_packets",
201          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
202         {"size_128_to_255_packets",
203          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
204         {"size_256_to_511_packets",
205          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
206         {"size_512_to_1023_packets",
207          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
208         {"size_1024_to_1522_packets",
209          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
210         {"size_1523_to_max_packets",
211          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
212         {"errors_with_bad_CRC",
213          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
214 };
215
216 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
217                                 sizeof(vhost_rxport_stat_strings[0]))
218
219 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
220                                 sizeof(vhost_txport_stat_strings[0]))
221
222 static int
223 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
224 {
225         struct vhost_queue *vq = NULL;
226         unsigned int i = 0;
227
228         for (i = 0; i < dev->data->nb_rx_queues; i++) {
229                 vq = dev->data->rx_queues[i];
230                 if (!vq)
231                         continue;
232                 memset(&vq->stats, 0, sizeof(vq->stats));
233         }
234         for (i = 0; i < dev->data->nb_tx_queues; i++) {
235                 vq = dev->data->tx_queues[i];
236                 if (!vq)
237                         continue;
238                 memset(&vq->stats, 0, sizeof(vq->stats));
239         }
240
241         return 0;
242 }
243
244 static int
245 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
246                            struct rte_eth_xstat_name *xstats_names,
247                            unsigned int limit __rte_unused)
248 {
249         unsigned int t = 0;
250         int count = 0;
251         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
252
253         if (!xstats_names)
254                 return nstats;
255         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
256                 snprintf(xstats_names[count].name,
257                          sizeof(xstats_names[count].name),
258                          "rx_%s", vhost_rxport_stat_strings[t].name);
259                 count++;
260         }
261         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
262                 snprintf(xstats_names[count].name,
263                          sizeof(xstats_names[count].name),
264                          "tx_%s", vhost_txport_stat_strings[t].name);
265                 count++;
266         }
267         return count;
268 }
269
270 static int
271 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
272                      unsigned int n)
273 {
274         unsigned int i;
275         unsigned int t;
276         unsigned int count = 0;
277         struct vhost_queue *vq = NULL;
278         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
279
280         if (n < nxstats)
281                 return nxstats;
282
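        /*
         * Unicast counters are derived rather than counted per packet:
         * RX unicast = pkts - (broadcast + multicast). On TX, missed
         * packets are added back because the broadcast/multicast counters
         * also include packets that were not transmitted (see the RFC 2863
         * note in eth_vhost_tx()), while stats.pkts does not.
         */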
283         for (i = 0; i < dev->data->nb_rx_queues; i++) {
284                 vq = dev->data->rx_queues[i];
285                 if (!vq)
286                         continue;
287                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
288                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
289                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
290         }
291         for (i = 0; i < dev->data->nb_tx_queues; i++) {
292                 vq = dev->data->tx_queues[i];
293                 if (!vq)
294                         continue;
295                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
296                                 + vq->stats.missed_pkts
297                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
298                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
299         }
300         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
301                 xstats[count].value = 0;
302                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
303                         vq = dev->data->rx_queues[i];
304                         if (!vq)
305                                 continue;
306                         xstats[count].value +=
307                                 *(uint64_t *)(((char *)vq)
308                                 + vhost_rxport_stat_strings[t].offset);
309                 }
310                 xstats[count].id = count;
311                 count++;
312         }
313         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
314                 xstats[count].value = 0;
315                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
316                         vq = dev->data->tx_queues[i];
317                         if (!vq)
318                                 continue;
319                         xstats[count].value +=
320                                 *(uint64_t *)(((char *)vq)
321                                 + vhost_txport_stat_strings[t].offset);
322                 }
323                 xstats[count].id = count;
324                 count++;
325         }
326         return count;
327 }
328
329 static inline void
330 vhost_count_multicast_broadcast(struct vhost_queue *vq,
331                                 struct rte_mbuf *mbuf)
332 {
333         struct rte_ether_addr *ea = NULL;
334         struct vhost_stats *pstats = &vq->stats;
335
336         ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
337         if (rte_is_multicast_ether_addr(ea)) {
338                 if (rte_is_broadcast_ether_addr(ea))
339                         pstats->xstats[VHOST_BROADCAST_PKT]++;
340                 else
341                         pstats->xstats[VHOST_MULTICAST_PKT]++;
342         }
343 }
344
345 static void
346 vhost_update_packet_xstats(struct vhost_queue *vq,
347                            struct rte_mbuf **bufs,
348                            uint16_t count)
349 {
350         uint32_t pkt_len = 0;
351         uint64_t i = 0;
352         uint64_t index;
353         struct vhost_stats *pstats = &vq->stats;
354
355         for (i = 0; i < count ; i++) {
356                 pkt_len = bufs[i]->pkt_len;
357                 if (pkt_len == 64) {
358                         pstats->xstats[VHOST_64_PKT]++;
359                 } else if (pkt_len > 64 && pkt_len < 1024) {
360                         index = (sizeof(pkt_len) * 8)
361                                 - __builtin_clz(pkt_len) - 5;
362                         pstats->xstats[index]++;
363                 } else {
364                         if (pkt_len < 64)
365                                 pstats->xstats[VHOST_UNDERSIZE_PKT]++;
366                         else if (pkt_len <= 1522)
367                                 pstats->xstats[VHOST_1024_TO_1522_PKT]++;
368                         else if (pkt_len > 1522)
369                                 pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
370                 }
371                 vhost_count_multicast_broadcast(vq, bufs[i]);
372         }
373 }
374
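/*
 * RX burst: dequeue from the guest's TX virtqueue in chunks of at most
 * VHOST_MAX_PKT_BURST packets, stopping early once the ring runs dry.
 */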
375 static uint16_t
376 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
377 {
378         struct vhost_queue *r = q;
379         uint16_t i, nb_rx = 0;
380         uint16_t nb_receive = nb_bufs;
381
382         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
383                 return 0;
384
385         rte_atomic32_set(&r->while_queuing, 1);
386
387         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
388                 goto out;
389
390         /* Dequeue packets from guest TX queue */
391         while (nb_receive) {
392                 uint16_t nb_pkts;
393                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
394                                                  VHOST_MAX_PKT_BURST);
395
396                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
397                                                   r->mb_pool, &bufs[nb_rx],
398                                                   num);
399
400                 nb_rx += nb_pkts;
401                 nb_receive -= nb_pkts;
402                 if (nb_pkts < num)
403                         break;
404         }
405
406         r->stats.pkts += nb_rx;
407
408         for (i = 0; likely(i < nb_rx); i++) {
409                 bufs[i]->port = r->port;
410                 bufs[i]->vlan_tci = 0;
411
412                 if (r->internal->vlan_strip)
413                         rte_vlan_strip(bufs[i]);
414
415                 r->stats.bytes += bufs[i]->pkt_len;
416         }
417
418         vhost_update_packet_xstats(r, bufs, nb_rx);
419
420 out:
421         rte_atomic32_set(&r->while_queuing, 0);
422
423         return nb_rx;
424 }
425
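/*
 * TX burst: perform software VLAN insertion where requested, then enqueue
 * into the guest's RX virtqueue. Enqueued mbufs are freed here because
 * vhost copies the packet data; mbufs that could not be enqueued are
 * counted as missed and remain owned by the caller.
 */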
426 static uint16_t
427 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
428 {
429         struct vhost_queue *r = q;
430         uint16_t i, nb_tx = 0;
431         uint16_t nb_send = 0;
432
433         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
434                 return 0;
435
436         rte_atomic32_set(&r->while_queuing, 1);
437
438         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
439                 goto out;
440
441         for (i = 0; i < nb_bufs; i++) {
442                 struct rte_mbuf *m = bufs[i];
443
444                 /* Do VLAN tag insertion */
445                 if (m->ol_flags & PKT_TX_VLAN_PKT) {
446                         int error = rte_vlan_insert(&m);
447                         if (unlikely(error)) {
448                                 rte_pktmbuf_free(m);
449                                 continue;
450                         }
451                 }
452
453                 bufs[nb_send] = m;
454                 ++nb_send;
455         }
456
457         /* Enqueue packets to guest RX queue */
458         while (nb_send) {
459                 uint16_t nb_pkts;
460                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
461                                                  VHOST_MAX_PKT_BURST);
462
463                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
464                                                   &bufs[nb_tx], num);
465
466                 nb_tx += nb_pkts;
467                 nb_send -= nb_pkts;
468                 if (nb_pkts < num)
469                         break;
470         }
471
472         r->stats.pkts += nb_tx;
473         r->stats.missed_pkts += nb_bufs - nb_tx;
474
475         for (i = 0; likely(i < nb_tx); i++)
476                 r->stats.bytes += bufs[i]->pkt_len;
477
478         vhost_update_packet_xstats(r, bufs, nb_tx);
479
480         /* Per RFC 2863 (ifHCOutMulticastPkts, ifHCOutBroadcastPkts), the
481          * "multicast" and "broadcast" counters are incremented even for
482          * packets that were not transmitted successfully.
483          */
484         for (i = nb_tx; i < nb_bufs; i++)
485                 vhost_count_multicast_broadcast(r, bufs[i]);
486
487         for (i = 0; likely(i < nb_tx); i++)
488                 rte_pktmbuf_free(bufs[i]);
489 out:
490         rte_atomic32_set(&r->while_queuing, 0);
491
492         return nb_tx;
493 }
494
495 static inline struct internal_list *
496 find_internal_resource(char *ifname)
497 {
498         int found = 0;
499         struct internal_list *list;
500         struct pmd_internal *internal;
501
502         if (ifname == NULL)
503                 return NULL;
504
505         pthread_mutex_lock(&internal_list_lock);
506
507         TAILQ_FOREACH(list, &internal_list, next) {
508                 internal = list->eth_dev->data->dev_private;
509                 if (!strcmp(internal->iface_name, ifname)) {
510                         found = 1;
511                         break;
512                 }
513         }
514
515         pthread_mutex_unlock(&internal_list_lock);
516
517         if (!found)
518                 return NULL;
519
520         return list;
521 }
522
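/*
 * RX interrupt control: ethdev RX queue N corresponds to vring (N << 1) + 1
 * (the guest TX ring we receive from), so enabling/disabling the interrupt
 * just toggles guest notifications on that ring.
 */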
523 static int
524 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
525 {
526         struct rte_vhost_vring vring;
527         struct vhost_queue *vq;
528         int ret = 0;
529
530         vq = dev->data->rx_queues[qid];
531         if (!vq) {
532                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
533                 return -1;
534         }
535
536         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
537         if (ret < 0) {
538                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
539                 return ret;
540         }
541         VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
542         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
543         rte_wmb();
544
545         return ret;
546 }
547
548 static int
549 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
550 {
551         struct rte_vhost_vring vring;
552         struct vhost_queue *vq;
553         int ret = 0;
554
555         vq = dev->data->rx_queues[qid];
556         if (!vq) {
557                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
558                 return -1;
559         }
560
561         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
562         if (ret < 0) {
563                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
564                 return ret;
565         }
566         VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
567         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
568         rte_wmb();
569
570         return 0;
571 }
572
573 static void
574 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
575 {
576         struct rte_intr_handle *intr_handle = dev->intr_handle;
577
578         if (intr_handle) {
579                 if (intr_handle->intr_vec)
580                         free(intr_handle->intr_vec);
581                 free(intr_handle);
582         }
583
584         dev->intr_handle = NULL;
585 }
586
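/*
 * Expose each RX vring's kickfd as an ethdev RX interrupt efd, so the
 * application can wait for guest kicks through the standard
 * rte_eth_dev_rx_intr_* calls.
 */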
587 static int
588 eth_vhost_install_intr(struct rte_eth_dev *dev)
589 {
590         struct rte_vhost_vring vring;
591         struct vhost_queue *vq;
592         int count = 0;
593         int nb_rxq = dev->data->nb_rx_queues;
594         int i;
595         int ret;
596
597         /* uninstall first if we are reconnecting */
598         if (dev->intr_handle)
599                 eth_vhost_uninstall_intr(dev);
600
601         dev->intr_handle = malloc(sizeof(*dev->intr_handle));
602         if (!dev->intr_handle) {
603                 VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
604                 return -ENOMEM;
605         }
606         memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
607
608         dev->intr_handle->efd_counter_size = sizeof(uint64_t);
609
610         dev->intr_handle->intr_vec =
611                 malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
612
613         if (!dev->intr_handle->intr_vec) {
614                 VHOST_LOG(ERR,
615                         "Failed to allocate memory for interrupt vector\n");
616                 free(dev->intr_handle);
617                 return -ENOMEM;
618         }
619
620         VHOST_LOG(INFO, "Prepare intr vec\n");
621         for (i = 0; i < nb_rxq; i++) {
622                 vq = dev->data->rx_queues[i];
623                 if (!vq) {
624                         VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
625                         continue;
626                 }
627
628                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
629                 if (ret < 0) {
630                         VHOST_LOG(INFO,
631                                 "Failed to get rxq-%d's vring, skip!\n", i);
632                         continue;
633                 }
634
635                 if (vring.kickfd < 0) {
636                         VHOST_LOG(INFO,
637                                 "rxq-%d's kickfd is invalid, skip!\n", i);
638                         continue;
639                 }
640                 dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
641                 dev->intr_handle->efds[i] = vring.kickfd;
642                 count++;
643                 VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
644         }
645
646         dev->intr_handle->nb_efd = count;
647         dev->intr_handle->max_intr = count + 1;
648         dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
649
650         return 0;
651 }
652
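/*
 * Queuing is allowed only while the port is started and a vhost device is
 * attached; otherwise every queue is told to stop and we wait for any
 * in-flight burst to leave its critical section.
 */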
653 static void
654 update_queuing_status(struct rte_eth_dev *dev)
655 {
656         struct pmd_internal *internal = dev->data->dev_private;
657         struct vhost_queue *vq;
658         unsigned int i;
659         int allow_queuing = 1;
660
661         if (!dev->data->rx_queues || !dev->data->tx_queues)
662                 return;
663
664         if (rte_atomic32_read(&internal->started) == 0 ||
665             rte_atomic32_read(&internal->dev_attached) == 0)
666                 allow_queuing = 0;
667
668         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
669         for (i = 0; i < dev->data->nb_rx_queues; i++) {
670                 vq = dev->data->rx_queues[i];
671                 if (vq == NULL)
672                         continue;
673                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
674                 while (rte_atomic32_read(&vq->while_queuing))
675                         rte_pause();
676         }
677
678         for (i = 0; i < dev->data->nb_tx_queues; i++) {
679                 vq = dev->data->tx_queues[i];
680                 if (vq == NULL)
681                         continue;
682                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
683                 while (rte_atomic32_read(&vq->while_queuing))
684                         rte_pause();
685         }
686 }
687
688 static void
689 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
690 {
691         struct vhost_queue *vq;
692         int i;
693
694         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
695                 vq = eth_dev->data->rx_queues[i];
696                 if (!vq)
697                         continue;
698                 vq->vid = internal->vid;
699                 vq->internal = internal;
700                 vq->port = eth_dev->data->port_id;
701         }
702         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
703                 vq = eth_dev->data->tx_queues[i];
704                 if (!vq)
705                         continue;
706                 vq->vid = internal->vid;
707                 vq->internal = internal;
708                 vq->port = eth_dev->data->port_id;
709         }
710 }
711
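/*
 * vhost-user session callbacks (registered in vhost_driver_setup()):
 * new_device() runs when a guest connects, destroy_device() when it goes
 * away, and vring_state_changed() when a ring is enabled or disabled.
 */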
712 static int
713 new_device(int vid)
714 {
715         struct rte_eth_dev *eth_dev;
716         struct internal_list *list;
717         struct pmd_internal *internal;
718         struct rte_eth_conf *dev_conf;
719         unsigned i;
720         char ifname[PATH_MAX];
721 #ifdef RTE_LIBRTE_VHOST_NUMA
722         int newnode;
723 #endif
724
725         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
726         list = find_internal_resource(ifname);
727         if (list == NULL) {
728                 VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
729                 return -1;
730         }
731
732         eth_dev = list->eth_dev;
733         internal = eth_dev->data->dev_private;
734         dev_conf = &eth_dev->data->dev_conf;
735
736 #ifdef RTE_LIBRTE_VHOST_NUMA
737         newnode = rte_vhost_get_numa_node(vid);
738         if (newnode >= 0)
739                 eth_dev->data->numa_node = newnode;
740 #endif
741
742         internal->vid = vid;
743         if (rte_atomic32_read(&internal->started) == 1) {
744                 queue_setup(eth_dev, internal);
745
746                 if (dev_conf->intr_conf.rxq) {
747                         if (eth_vhost_install_intr(eth_dev) < 0) {
748                                 VHOST_LOG(INFO,
749                                         "Failed to install interrupt handler.\n");
750                                 return -1;
751                         }
752                 }
753         } else {
754                 VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
755         }
756
757         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
758                 rte_vhost_enable_guest_notification(vid, i, 0);
759
760         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
761
762         eth_dev->data->dev_link.link_status = ETH_LINK_UP;
763
764         rte_atomic32_set(&internal->dev_attached, 1);
765         update_queuing_status(eth_dev);
766
767         VHOST_LOG(INFO, "Vhost device %d created\n", vid);
768
769         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
770
771         return 0;
772 }
773
774 static void
775 destroy_device(int vid)
776 {
777         struct rte_eth_dev *eth_dev;
778         struct pmd_internal *internal;
779         struct vhost_queue *vq;
780         struct internal_list *list;
781         char ifname[PATH_MAX];
782         unsigned i;
783         struct rte_vhost_vring_state *state;
784
785         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
786         list = find_internal_resource(ifname);
787         if (list == NULL) {
788                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
789                 return;
790         }
791         eth_dev = list->eth_dev;
792         internal = eth_dev->data->dev_private;
793
794         rte_atomic32_set(&internal->dev_attached, 0);
795         update_queuing_status(eth_dev);
796
797         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
798
799         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
800                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
801                         vq = eth_dev->data->rx_queues[i];
802                         if (!vq)
803                                 continue;
804                         vq->vid = -1;
805                 }
806                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
807                         vq = eth_dev->data->tx_queues[i];
808                         if (!vq)
809                                 continue;
810                         vq->vid = -1;
811                 }
812         }
813
814         state = vring_states[eth_dev->data->port_id];
815         rte_spinlock_lock(&state->lock);
816         for (i = 0; i <= state->max_vring; i++) {
817                 state->cur[i] = false;
818                 state->seen[i] = false;
819         }
820         state->max_vring = 0;
821         rte_spinlock_unlock(&state->lock);
822
823         VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
824         eth_vhost_uninstall_intr(eth_dev);
825
826         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
827 }
828
829 static int
830 vring_state_changed(int vid, uint16_t vring, int enable)
831 {
832         struct rte_vhost_vring_state *state;
833         struct rte_eth_dev *eth_dev;
834         struct internal_list *list;
835         char ifname[PATH_MAX];
836
837         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
838         list = find_internal_resource(ifname);
839         if (list == NULL) {
840                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
841                 return -1;
842         }
843
844         eth_dev = list->eth_dev;
845         /* won't be NULL */
846         state = vring_states[eth_dev->data->port_id];
847         rte_spinlock_lock(&state->lock);
848         if (state->cur[vring] == enable) {
849                 rte_spinlock_unlock(&state->lock);
850                 return 0;
851         }
852         state->cur[vring] = enable;
853         state->max_vring = RTE_MAX(vring, state->max_vring);
854         rte_spinlock_unlock(&state->lock);
855
856         VHOST_LOG(INFO, "vring%u is %s\n",
857                         vring, enable ? "enabled" : "disabled");
858
859         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
860
861         return 0;
862 }
863
864 static struct vhost_device_ops vhost_ops = {
865         .new_device          = new_device,
866         .destroy_device      = destroy_device,
867         .vring_state_changed = vring_state_changed,
868 };
869
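/*
 * Per-port vhost setup: track the port in the internal list, allocate the
 * vring state, then register, configure and start the vhost-user driver on
 * the configured socket path.
 */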
870 static int
871 vhost_driver_setup(struct rte_eth_dev *eth_dev)
872 {
873         struct pmd_internal *internal = eth_dev->data->dev_private;
874         struct internal_list *list = NULL;
875         struct rte_vhost_vring_state *vring_state = NULL;
876         unsigned int numa_node = eth_dev->device->numa_node;
877         const char *name = eth_dev->device->name;
878
879         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
880         if (list == NULL)
881                 goto error;
882
883         vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
884                                          0, numa_node);
885         if (vring_state == NULL)
886                 goto error;
887
888         list->eth_dev = eth_dev;
889         pthread_mutex_lock(&internal_list_lock);
890         TAILQ_INSERT_TAIL(&internal_list, list, next);
891         pthread_mutex_unlock(&internal_list_lock);
892
893         rte_spinlock_init(&vring_state->lock);
894         vring_states[eth_dev->data->port_id] = vring_state;
895
896         if (rte_vhost_driver_register(internal->iface_name, internal->flags))
897                 goto error;
898
899         if (internal->disable_flags) {
900                 if (rte_vhost_driver_disable_features(internal->iface_name,
901                                                       internal->disable_flags))
902                         goto error;
903         }
904
905         if (rte_vhost_driver_callback_register(internal->iface_name,
906                                                &vhost_ops) < 0) {
907                 VHOST_LOG(ERR, "Can't register callbacks\n");
908                 goto error;
909         }
910
911         if (rte_vhost_driver_start(internal->iface_name) < 0) {
912                 VHOST_LOG(ERR, "Failed to start driver for %s\n",
913                           internal->iface_name);
914                 goto error;
915         }
916
917         return 0;
918
919 error:
920         rte_free(vring_state);
921         rte_free(list);
922
923         return -1;
924 }
925
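/*
 * Illustrative use (hypothetical application snippet): after an
 * RTE_ETH_EVENT_QUEUE_STATE callback, drain the pending events:
 *
 *        struct rte_eth_vhost_queue_event ev;
 *
 *        while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *                printf("queue %u %s %s\n", (unsigned int)ev.queue_id,
 *                       ev.rx ? "rx" : "tx",
 *                       ev.enable ? "enabled" : "disabled");
 */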
926 int
927 rte_eth_vhost_get_queue_event(uint16_t port_id,
928                 struct rte_eth_vhost_queue_event *event)
929 {
930         struct rte_vhost_vring_state *state;
931         unsigned int i;
932         int idx;
933
934         if (port_id >= RTE_MAX_ETHPORTS) {
935                 VHOST_LOG(ERR, "Invalid port id\n");
936                 return -1;
937         }
938
939         state = vring_states[port_id];
940         if (!state) {
941                 VHOST_LOG(ERR, "Unused port\n");
942                 return -1;
943         }
944
945         rte_spinlock_lock(&state->lock);
946         for (i = 0; i <= state->max_vring; i++) {
947                 idx = state->index++ % (state->max_vring + 1);
948
949                 if (state->cur[idx] != state->seen[idx]) {
950                         state->seen[idx] = state->cur[idx];
951                         event->queue_id = idx / 2;
952                         event->rx = idx & 1;
953                         event->enable = state->cur[idx];
954                         rte_spinlock_unlock(&state->lock);
955                         return 0;
956                 }
957         }
958         rte_spinlock_unlock(&state->lock);
959
960         return -1;
961 }
962
963 int
964 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
965 {
966         struct internal_list *list;
967         struct rte_eth_dev *eth_dev;
968         struct vhost_queue *vq;
969         int vid = -1;
970
971         if (!rte_eth_dev_is_valid_port(port_id))
972                 return -1;
973
974         pthread_mutex_lock(&internal_list_lock);
975
976         TAILQ_FOREACH(list, &internal_list, next) {
977                 eth_dev = list->eth_dev;
978                 if (eth_dev->data->port_id == port_id) {
979                         vq = eth_dev->data->rx_queues[0];
980                         if (vq) {
981                                 vid = vq->vid;
982                         }
983                         break;
984                 }
985         }
986
987         pthread_mutex_unlock(&internal_list_lock);
988
989         return vid;
990 }
991
992 static int
993 eth_dev_configure(struct rte_eth_dev *dev)
994 {
995         struct pmd_internal *internal = dev->data->dev_private;
996         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
997
998         /* NOTE: the same process has to operate a vhost interface
999          * from beginning to end (from eth_dev configure to eth_dev close).
1000          * This is the user's responsibility at the moment.
1001          */
1002         if (vhost_driver_setup(dev) < 0)
1003                 return -1;
1004
1005         internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1006
1007         return 0;
1008 }
1009
1010 static int
1011 eth_dev_start(struct rte_eth_dev *eth_dev)
1012 {
1013         struct pmd_internal *internal = eth_dev->data->dev_private;
1014         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1015
1016         queue_setup(eth_dev, internal);
1017
1018         if (rte_atomic32_read(&internal->dev_attached) == 1) {
1019                 if (dev_conf->intr_conf.rxq) {
1020                         if (eth_vhost_install_intr(eth_dev) < 0) {
1021                                 VHOST_LOG(INFO,
1022                                         "Failed to install interrupt handler.\n");
1023                                 return -1;
1024                         }
1025                 }
1026         }
1027
1028         rte_atomic32_set(&internal->started, 1);
1029         update_queuing_status(eth_dev);
1030
1031         return 0;
1032 }
1033
1034 static void
1035 eth_dev_stop(struct rte_eth_dev *dev)
1036 {
1037         struct pmd_internal *internal = dev->data->dev_private;
1038
1039         rte_atomic32_set(&internal->started, 0);
1040         update_queuing_status(dev);
1041 }
1042
1043 static void
1044 eth_dev_close(struct rte_eth_dev *dev)
1045 {
1046         struct pmd_internal *internal;
1047         struct internal_list *list;
1048         unsigned int i;
1049
1050         internal = dev->data->dev_private;
1051         if (!internal)
1052                 return;
1053
1054         eth_dev_stop(dev);
1055
1056         rte_vhost_driver_unregister(internal->iface_name);
1057
1058         list = find_internal_resource(internal->iface_name);
1059         if (!list)
1060                 return;
1061
1062         pthread_mutex_lock(&internal_list_lock);
1063         TAILQ_REMOVE(&internal_list, list, next);
1064         pthread_mutex_unlock(&internal_list_lock);
1065         rte_free(list);
1066
1067         if (dev->data->rx_queues)
1068                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1069                         rte_free(dev->data->rx_queues[i]);
1070
1071         if (dev->data->tx_queues)
1072                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1073                         rte_free(dev->data->tx_queues[i]);
1074
1075         rte_free(internal->iface_name);
1076         rte_free(internal);
1077
1078         dev->data->dev_private = NULL;
1079
1080         rte_free(vring_states[dev->data->port_id]);
1081         vring_states[dev->data->port_id] = NULL;
1082 }
1083
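/*
 * Queue setup only records the mapping to the matching virtqueue: RX queue
 * N uses vring N * VIRTIO_QNUM + VIRTIO_TXQ and TX queue N uses vring
 * N * VIRTIO_QNUM + VIRTIO_RXQ. The vid is filled in later by queue_setup()
 * once a vhost device attaches.
 */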
1084 static int
1085 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1086                    uint16_t nb_rx_desc __rte_unused,
1087                    unsigned int socket_id,
1088                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1089                    struct rte_mempool *mb_pool)
1090 {
1091         struct vhost_queue *vq;
1092
1093         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1094                         RTE_CACHE_LINE_SIZE, socket_id);
1095         if (vq == NULL) {
1096                 VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1097                 return -ENOMEM;
1098         }
1099
1100         vq->mb_pool = mb_pool;
1101         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1102         dev->data->rx_queues[rx_queue_id] = vq;
1103
1104         return 0;
1105 }
1106
1107 static int
1108 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1109                    uint16_t nb_tx_desc __rte_unused,
1110                    unsigned int socket_id,
1111                    const struct rte_eth_txconf *tx_conf __rte_unused)
1112 {
1113         struct vhost_queue *vq;
1114
1115         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1116                         RTE_CACHE_LINE_SIZE, socket_id);
1117         if (vq == NULL) {
1118                 VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1119                 return -ENOMEM;
1120         }
1121
1122         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1123         dev->data->tx_queues[tx_queue_id] = vq;
1124
1125         return 0;
1126 }
1127
1128 static int
1129 eth_dev_info(struct rte_eth_dev *dev,
1130              struct rte_eth_dev_info *dev_info)
1131 {
1132         struct pmd_internal *internal;
1133
1134         internal = dev->data->dev_private;
1135         if (internal == NULL) {
1136                 VHOST_LOG(ERR, "Invalid device specified\n");
1137                 return -ENODEV;
1138         }
1139
1140         dev_info->max_mac_addrs = 1;
1141         dev_info->max_rx_pktlen = (uint32_t)-1;
1142         dev_info->max_rx_queues = internal->max_queues;
1143         dev_info->max_tx_queues = internal->max_queues;
1144         dev_info->min_rx_bufsize = 0;
1145
1146         dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1147                                 DEV_TX_OFFLOAD_VLAN_INSERT;
1148         dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1149
1150         return 0;
1151 }
1152
1153 static int
1154 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1155 {
1156         unsigned i;
1157         unsigned long rx_total = 0, tx_total = 0;
1158         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1159         struct vhost_queue *vq;
1160
1161         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1162                         i < dev->data->nb_rx_queues; i++) {
1163                 if (dev->data->rx_queues[i] == NULL)
1164                         continue;
1165                 vq = dev->data->rx_queues[i];
1166                 stats->q_ipackets[i] = vq->stats.pkts;
1167                 rx_total += stats->q_ipackets[i];
1168
1169                 stats->q_ibytes[i] = vq->stats.bytes;
1170                 rx_total_bytes += stats->q_ibytes[i];
1171         }
1172
1173         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1174                         i < dev->data->nb_tx_queues; i++) {
1175                 if (dev->data->tx_queues[i] == NULL)
1176                         continue;
1177                 vq = dev->data->tx_queues[i];
1178                 stats->q_opackets[i] = vq->stats.pkts;
1179                 tx_total += stats->q_opackets[i];
1180
1181                 stats->q_obytes[i] = vq->stats.bytes;
1182                 tx_total_bytes += stats->q_obytes[i];
1183         }
1184
1185         stats->ipackets = rx_total;
1186         stats->opackets = tx_total;
1187         stats->ibytes = rx_total_bytes;
1188         stats->obytes = tx_total_bytes;
1189
1190         return 0;
1191 }
1192
1193 static int
1194 eth_stats_reset(struct rte_eth_dev *dev)
1195 {
1196         struct vhost_queue *vq;
1197         unsigned i;
1198
1199         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1200                 if (dev->data->rx_queues[i] == NULL)
1201                         continue;
1202                 vq = dev->data->rx_queues[i];
1203                 vq->stats.pkts = 0;
1204                 vq->stats.bytes = 0;
1205         }
1206         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1207                 if (dev->data->tx_queues[i] == NULL)
1208                         continue;
1209                 vq = dev->data->tx_queues[i];
1210                 vq->stats.pkts = 0;
1211                 vq->stats.bytes = 0;
1212                 vq->stats.missed_pkts = 0;
1213         }
1214
1215         return 0;
1216 }
1217
1218 static void
1219 eth_queue_release(void *q)
1220 {
1221         rte_free(q);
1222 }
1223
1224 static int
1225 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1226 {
1227         /*
1228          * vhost does not hang onto mbufs: eth_vhost_tx() copies the packet
1229          * data and releases the mbuf, so there is nothing to clean up.
1230          */
1231         return 0;
1232 }
1233
1234 static int
1235 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1236                 int wait_to_complete __rte_unused)
1237 {
1238         return 0;
1239 }
1240
1241 static uint32_t
1242 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1243 {
1244         struct vhost_queue *vq;
1245
1246         vq = dev->data->rx_queues[rx_queue_id];
1247         if (vq == NULL)
1248                 return 0;
1249
1250         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1251 }
1252
1253 static const struct eth_dev_ops ops = {
1254         .dev_start = eth_dev_start,
1255         .dev_stop = eth_dev_stop,
1256         .dev_close = eth_dev_close,
1257         .dev_configure = eth_dev_configure,
1258         .dev_infos_get = eth_dev_info,
1259         .rx_queue_setup = eth_rx_queue_setup,
1260         .tx_queue_setup = eth_tx_queue_setup,
1261         .rx_queue_release = eth_queue_release,
1262         .tx_queue_release = eth_queue_release,
1263         .tx_done_cleanup = eth_tx_done_cleanup,
1264         .rx_queue_count = eth_rx_queue_count,
1265         .link_update = eth_link_update,
1266         .stats_get = eth_stats_get,
1267         .stats_reset = eth_stats_reset,
1268         .xstats_reset = vhost_dev_xstats_reset,
1269         .xstats_get = vhost_dev_xstats_get,
1270         .xstats_get_names = vhost_dev_xstats_get_names,
1271         .rx_queue_intr_enable = eth_rxq_intr_enable,
1272         .rx_queue_intr_disable = eth_rxq_intr_disable,
1273 };
1274
1275 static int
1276 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1277         int16_t queues, const unsigned int numa_node, uint64_t flags,
1278         uint64_t disable_flags)
1279 {
1280         const char *name = rte_vdev_device_name(dev);
1281         struct rte_eth_dev_data *data;
1282         struct pmd_internal *internal = NULL;
1283         struct rte_eth_dev *eth_dev = NULL;
1284         struct rte_ether_addr *eth_addr = NULL;
1285
1286         VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1287                 numa_node);
1288
1289         /* reserve an ethdev entry */
1290         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1291         if (eth_dev == NULL)
1292                 goto error;
1293         data = eth_dev->data;
1294
1295         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1296         if (eth_addr == NULL)
1297                 goto error;
1298         data->mac_addrs = eth_addr;
1299         *eth_addr = base_eth_addr;
1300         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1301
1302         /* now put it all together
1303          * - store queue data in internal,
1304          * - point eth_dev_data to internals
1305          * - and point eth_dev structure to new eth_dev_data structure
1306          */
1307         internal = eth_dev->data->dev_private;
1308         internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1309                                                  0, numa_node);
1310         if (internal->iface_name == NULL)
1311                 goto error;
1312         strcpy(internal->iface_name, iface_name);
1313
1314         data->nb_rx_queues = queues;
1315         data->nb_tx_queues = queues;
1316         internal->max_queues = queues;
1317         internal->vid = -1;
1318         internal->flags = flags;
1319         internal->disable_flags = disable_flags;
1320         data->dev_link = pmd_link;
1321         data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1322
1323         eth_dev->dev_ops = &ops;
1324
1325         /* finally assign rx and tx ops */
1326         eth_dev->rx_pkt_burst = eth_vhost_rx;
1327         eth_dev->tx_pkt_burst = eth_vhost_tx;
1328
1329         rte_eth_dev_probing_finish(eth_dev);
1330         return 0;
1331
1332 error:
1333         if (internal)
1334                 rte_free(internal->iface_name);
1335         rte_eth_dev_release_port(eth_dev);
1336
1337         return -1;
1338 }
1339
1340 static inline int
1341 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1342 {
1343         const char **iface_name = extra_args;
1344
1345         if (value == NULL)
1346                 return -1;
1347
1348         *iface_name = value;
1349
1350         return 0;
1351 }
1352
1353 static inline int
1354 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1355 {
1356         uint16_t *n = extra_args;
1357
1358         if (value == NULL || extra_args == NULL)
1359                 return -EINVAL;
1360
1361         *n = (uint16_t)strtoul(value, NULL, 0);
1362         if (*n == USHRT_MAX && errno == ERANGE)
1363                 return -1;
1364
1365         return 0;
1366 }
1367
1368 static int
1369 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1370 {
1371         struct rte_kvargs *kvlist = NULL;
1372         int ret = 0;
1373         char *iface_name;
1374         uint16_t queues;
1375         uint64_t flags = 0;
1376         uint64_t disable_flags = 0;
1377         int client_mode = 0;
1378         int dequeue_zero_copy = 0;
1379         int iommu_support = 0;
1380         int postcopy_support = 0;
1381         int tso = 0;
1382         struct rte_eth_dev *eth_dev;
1383         const char *name = rte_vdev_device_name(dev);
1384
1385         VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1386
1387         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1388                 eth_dev = rte_eth_dev_attach_secondary(name);
1389                 if (!eth_dev) {
1390                         VHOST_LOG(ERR, "Failed to probe %s\n", name);
1391                         return -1;
1392                 }
1393                 eth_dev->rx_pkt_burst = eth_vhost_rx;
1394                 eth_dev->tx_pkt_burst = eth_vhost_tx;
1395                 eth_dev->dev_ops = &ops;
1396                 if (dev->device.numa_node == SOCKET_ID_ANY)
1397                         dev->device.numa_node = rte_socket_id();
1398                 eth_dev->device = &dev->device;
1399                 rte_eth_dev_probing_finish(eth_dev);
1400                 return 0;
1401         }
1402
1403         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1404         if (kvlist == NULL)
1405                 return -1;
1406
1407         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1408                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1409                                          &open_iface, &iface_name);
1410                 if (ret < 0)
1411                         goto out_free;
1412         } else {
1413                 ret = -1;
1414                 goto out_free;
1415         }
1416
1417         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1418                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1419                                          &open_int, &queues);
1420                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1421                         goto out_free;
1422
1423         } else
1424                 queues = 1;
1425
1426         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1427                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1428                                          &open_int, &client_mode);
1429                 if (ret < 0)
1430                         goto out_free;
1431
1432                 if (client_mode)
1433                         flags |= RTE_VHOST_USER_CLIENT;
1434         }
1435
1436         if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1437                 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1438                                          &open_int, &dequeue_zero_copy);
1439                 if (ret < 0)
1440                         goto out_free;
1441
1442                 if (dequeue_zero_copy)
1443                         flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1444         }
1445
1446         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1447                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1448                                          &open_int, &iommu_support);
1449                 if (ret < 0)
1450                         goto out_free;
1451
1452                 if (iommu_support)
1453                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1454         }
1455
1456         if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1457                 ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1458                                          &open_int, &postcopy_support);
1459                 if (ret < 0)
1460                         goto out_free;
1461
1462                 if (postcopy_support)
1463                         flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1464         }
1465
1466         if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1467                 ret = rte_kvargs_process(kvlist,
1468                                 ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1469                                 &open_int, &tso);
1470                 if (ret < 0)
1471                         goto out_free;
1472
1473                 if (tso == 0) {
1474                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1475                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1476                 }
1477         }
1478
1479         if (dev->device.numa_node == SOCKET_ID_ANY)
1480                 dev->device.numa_node = rte_socket_id();
1481
1482         ret = eth_dev_vhost_create(dev, iface_name, queues,
1483                                    dev->device.numa_node, flags, disable_flags);
1484         if (ret == -1)
1485                 VHOST_LOG(ERR, "Failed to create %s\n", name);
1486
1487 out_free:
1488         rte_kvargs_free(kvlist);
1489         return ret;
1490 }
1491
1492 static int
1493 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1494 {
1495         const char *name;
1496         struct rte_eth_dev *eth_dev = NULL;
1497
1498         name = rte_vdev_device_name(dev);
1499         VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1500
1501         /* find an ethdev entry */
1502         eth_dev = rte_eth_dev_allocated(name);
1503         if (eth_dev == NULL)
1504                 return 0;
1505
1506         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1507                 return rte_eth_dev_release_port(eth_dev);
1508
1509         eth_dev_close(eth_dev);
1510
1511         rte_eth_dev_release_port(eth_dev);
1512
1513         return 0;
1514 }
1515
1516 static struct rte_vdev_driver pmd_vhost_drv = {
1517         .probe = rte_pmd_vhost_probe,
1518         .remove = rte_pmd_vhost_remove,
1519 };
1520
1521 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1522 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1523 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1524         "iface=<ifc> "
1525         "queues=<int> "
1526         "client=<0|1> "
1527         "dequeue-zero-copy=<0|1> "
1528         "iommu-support=<0|1> "
1529         "postcopy-support=<0|1> "
1530         "tso=<0|1>");
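
/*
 * Illustrative devargs (the socket path below is only an example):
 *
 *        --vdev 'net_vhost0,iface=/tmp/vhost-user0.sock,queues=2,client=1'
 *
 * creates a vhost-user port bound to that unix socket with two queue pairs,
 * acting as the client side of the connection.
 */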
1531
1532 RTE_INIT(vhost_init_log)
1533 {
1534         vhost_logtype = rte_log_register("pmd.net.vhost");
1535         if (vhost_logtype >= 0)
1536                 rte_log_set_level(vhost_logtype, RTE_LOG_NOTICE);
1537 }