net/vhost: fix setup error path
dpdk.git: drivers/net/vhost/rte_eth_vhost.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_memcpy.h>
14 #include <rte_bus_vdev.h>
15 #include <rte_kvargs.h>
16 #include <rte_vhost.h>
17 #include <rte_spinlock.h>
18
19 #include "rte_eth_vhost.h"
20
21 static int vhost_logtype;
22
23 #define VHOST_LOG(level, ...) \
24         rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
25
26 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
27
28 #define ETH_VHOST_IFACE_ARG             "iface"
29 #define ETH_VHOST_QUEUES_ARG            "queues"
30 #define ETH_VHOST_CLIENT_ARG            "client"
31 #define ETH_VHOST_DEQUEUE_ZERO_COPY     "dequeue-zero-copy"
32 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
33 #define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
34 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
35 #define VHOST_MAX_PKT_BURST 32
36
37 static const char *valid_arguments[] = {
38         ETH_VHOST_IFACE_ARG,
39         ETH_VHOST_QUEUES_ARG,
40         ETH_VHOST_CLIENT_ARG,
41         ETH_VHOST_DEQUEUE_ZERO_COPY,
42         ETH_VHOST_IOMMU_SUPPORT,
43         ETH_VHOST_POSTCOPY_SUPPORT,
44         ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
45         NULL
46 };
47
48 static struct rte_ether_addr base_eth_addr = {
49         .addr_bytes = {
50                 0x56 /* V */,
51                 0x48 /* H */,
52                 0x4F /* O */,
53                 0x53 /* S */,
54                 0x54 /* T */,
55                 0x00
56         }
57 };
58
59 enum vhost_xstats_pkts {
60         VHOST_UNDERSIZE_PKT = 0,
61         VHOST_64_PKT,
62         VHOST_65_TO_127_PKT,
63         VHOST_128_TO_255_PKT,
64         VHOST_256_TO_511_PKT,
65         VHOST_512_TO_1023_PKT,
66         VHOST_1024_TO_1522_PKT,
67         VHOST_1523_TO_MAX_PKT,
68         VHOST_BROADCAST_PKT,
69         VHOST_MULTICAST_PKT,
70         VHOST_UNICAST_PKT,
71         VHOST_ERRORS_PKT,
72         VHOST_ERRORS_FRAGMENTED,
73         VHOST_ERRORS_JABBER,
74         VHOST_UNKNOWN_PROTOCOL,
75         VHOST_XSTATS_MAX,
76 };
77
78 struct vhost_stats {
79         uint64_t pkts;
80         uint64_t bytes;
81         uint64_t missed_pkts;
82         uint64_t xstats[VHOST_XSTATS_MAX];
83 };
84
85 struct vhost_queue {
86         int vid;
87         rte_atomic32_t allow_queuing;
88         rte_atomic32_t while_queuing;
89         struct pmd_internal *internal;
90         struct rte_mempool *mb_pool;
91         uint16_t port;
92         uint16_t virtqueue_id;
93         struct vhost_stats stats;
94 };
95
96 struct pmd_internal {
97         rte_atomic32_t dev_attached;
98         char *iface_name;
99         uint64_t flags;
100         uint64_t disable_flags;
101         uint16_t max_queues;
102         int vid;
103         rte_atomic32_t started;
104         uint8_t vlan_strip;
105 };
106
107 struct internal_list {
108         TAILQ_ENTRY(internal_list) next;
109         struct rte_eth_dev *eth_dev;
110 };
111
112 TAILQ_HEAD(internal_list_head, internal_list);
113 static struct internal_list_head internal_list =
114         TAILQ_HEAD_INITIALIZER(internal_list);
115
116 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
117
118 static struct rte_eth_link pmd_link = {
119                 .link_speed = 10000,
120                 .link_duplex = ETH_LINK_FULL_DUPLEX,
121                 .link_status = ETH_LINK_DOWN
122 };
123
124 struct rte_vhost_vring_state {
125         rte_spinlock_t lock;
126
127         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
128         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
129         unsigned int index;
130         unsigned int max_vring;
131 };
132
133 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
134
135 #define VHOST_XSTATS_NAME_SIZE 64
136
137 struct vhost_xstats_name_off {
138         char name[VHOST_XSTATS_NAME_SIZE];
139         uint64_t offset;
140 };
141
142 /* [rx]_ is prepended to the name string here */
143 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
144         {"good_packets",
145          offsetof(struct vhost_queue, stats.pkts)},
146         {"total_bytes",
147          offsetof(struct vhost_queue, stats.bytes)},
148         {"missed_pkts",
149          offsetof(struct vhost_queue, stats.missed_pkts)},
150         {"broadcast_packets",
151          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
152         {"multicast_packets",
153          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
154         {"unicast_packets",
155          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
156          {"undersize_packets",
157          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
158         {"size_64_packets",
159          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
160         {"size_65_to_127_packets",
161          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
162         {"size_128_to_255_packets",
163          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
164         {"size_256_to_511_packets",
165          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
166         {"size_512_to_1023_packets",
167          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
168         {"size_1024_to_1522_packets",
169          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
170         {"size_1523_to_max_packets",
171          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
172         {"errors_with_bad_CRC",
173          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
174         {"fragmented_errors",
175          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
176         {"jabber_errors",
177          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
178         {"unknown_protos_packets",
179          offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
180 };
181
182 /* [tx]_ is prepended to the name string here */
183 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
184         {"good_packets",
185          offsetof(struct vhost_queue, stats.pkts)},
186         {"total_bytes",
187          offsetof(struct vhost_queue, stats.bytes)},
188         {"missed_pkts",
189          offsetof(struct vhost_queue, stats.missed_pkts)},
190         {"broadcast_packets",
191          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
192         {"multicast_packets",
193          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
194         {"unicast_packets",
195          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
196         {"undersize_packets",
197          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
198         {"size_64_packets",
199          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
200         {"size_65_to_127_packets",
201          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
202         {"size_128_to_255_packets",
203          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
204         {"size_256_to_511_packets",
205          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
206         {"size_512_to_1023_packets",
207          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
208         {"size_1024_to_1522_packets",
209          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
210         {"size_1523_to_max_packets",
211          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
212         {"errors_with_bad_CRC",
213          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
214 };
215
216 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
217                                 sizeof(vhost_rxport_stat_strings[0]))
218
219 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
220                                 sizeof(vhost_txport_stat_strings[0]))
221
222 static int
223 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
224 {
225         struct vhost_queue *vq = NULL;
226         unsigned int i = 0;
227
228         for (i = 0; i < dev->data->nb_rx_queues; i++) {
229                 vq = dev->data->rx_queues[i];
230                 if (!vq)
231                         continue;
232                 memset(&vq->stats, 0, sizeof(vq->stats));
233         }
234         for (i = 0; i < dev->data->nb_tx_queues; i++) {
235                 vq = dev->data->tx_queues[i];
236                 if (!vq)
237                         continue;
238                 memset(&vq->stats, 0, sizeof(vq->stats));
239         }
240
241         return 0;
242 }
243
244 static int
245 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
246                            struct rte_eth_xstat_name *xstats_names,
247                            unsigned int limit __rte_unused)
248 {
249         unsigned int t = 0;
250         int count = 0;
251         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
252
253         if (!xstats_names)
254                 return nstats;
255         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
256                 snprintf(xstats_names[count].name,
257                          sizeof(xstats_names[count].name),
258                          "rx_%s", vhost_rxport_stat_strings[t].name);
259                 count++;
260         }
261         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
262                 snprintf(xstats_names[count].name,
263                          sizeof(xstats_names[count].name),
264                          "tx_%s", vhost_txport_stat_strings[t].name);
265                 count++;
266         }
267         return count;
268 }
269
270 static int
271 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
272                      unsigned int n)
273 {
274         unsigned int i;
275         unsigned int t;
276         unsigned int count = 0;
277         struct vhost_queue *vq = NULL;
278         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
279
280         if (n < nxstats)
281                 return nxstats;
282
283         for (i = 0; i < dev->data->nb_rx_queues; i++) {
284                 vq = dev->data->rx_queues[i];
285                 if (!vq)
286                         continue;
287                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
288                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
289                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
290         }
291         for (i = 0; i < dev->data->nb_tx_queues; i++) {
292                 vq = dev->data->tx_queues[i];
293                 if (!vq)
294                         continue;
295                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
296                                 + vq->stats.missed_pkts
297                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
298                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
299         }
300         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
301                 xstats[count].value = 0;
302                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
303                         vq = dev->data->rx_queues[i];
304                         if (!vq)
305                                 continue;
306                         xstats[count].value +=
307                                 *(uint64_t *)(((char *)vq)
308                                 + vhost_rxport_stat_strings[t].offset);
309                 }
310                 xstats[count].id = count;
311                 count++;
312         }
313         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
314                 xstats[count].value = 0;
315                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
316                         vq = dev->data->tx_queues[i];
317                         if (!vq)
318                                 continue;
319                         xstats[count].value +=
320                                 *(uint64_t *)(((char *)vq)
321                                 + vhost_txport_stat_strings[t].offset);
322                 }
323                 xstats[count].id = count;
324                 count++;
325         }
326         return count;
327 }
328
329 static inline void
330 vhost_count_multicast_broadcast(struct vhost_queue *vq,
331                                 struct rte_mbuf *mbuf)
332 {
333         struct rte_ether_addr *ea = NULL;
334         struct vhost_stats *pstats = &vq->stats;
335
336         ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
337         if (rte_is_multicast_ether_addr(ea)) {
338                 if (rte_is_broadcast_ether_addr(ea))
339                         pstats->xstats[VHOST_BROADCAST_PKT]++;
340                 else
341                         pstats->xstats[VHOST_MULTICAST_PKT]++;
342         }
343 }
344
345 static void
346 vhost_update_packet_xstats(struct vhost_queue *vq,
347                            struct rte_mbuf **bufs,
348                            uint16_t count)
349 {
350         uint32_t pkt_len = 0;
351         uint64_t i = 0;
352         uint64_t index;
353         struct vhost_stats *pstats = &vq->stats;
354
355         for (i = 0; i < count ; i++) {
356                 pkt_len = bufs[i]->pkt_len;
357                 if (pkt_len == 64) {
358                         pstats->xstats[VHOST_64_PKT]++;
359                 } else if (pkt_len > 64 && pkt_len < 1024) {
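                        /* 65..1023 byte frames: 32 - clz(len) - 5 gives the
                         * matching bucket index (2 for 65-127, 3 for 128-255,
                         * and so on, up to 5 for 512-1023).
                         */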
360                         index = (sizeof(pkt_len) * 8)
361                                 - __builtin_clz(pkt_len) - 5;
362                         pstats->xstats[index]++;
363                 } else {
364                         if (pkt_len < 64)
365                                 pstats->xstats[VHOST_UNDERSIZE_PKT]++;
366                         else if (pkt_len <= 1522)
367                                 pstats->xstats[VHOST_1024_TO_1522_PKT]++;
368                         else if (pkt_len > 1522)
369                                 pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
370                 }
371                 vhost_count_multicast_broadcast(vq, bufs[i]);
372         }
373 }
374
375 static uint16_t
376 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
377 {
378         struct vhost_queue *r = q;
379         uint16_t i, nb_rx = 0;
380         uint16_t nb_receive = nb_bufs;
381
382         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
383                 return 0;
384
385         rte_atomic32_set(&r->while_queuing, 1);
386
387         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
388                 goto out;
389
390         /* Dequeue packets from guest TX queue */
391         while (nb_receive) {
392                 uint16_t nb_pkts;
393                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
394                                                  VHOST_MAX_PKT_BURST);
395
396                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
397                                                   r->mb_pool, &bufs[nb_rx],
398                                                   num);
399
400                 nb_rx += nb_pkts;
401                 nb_receive -= nb_pkts;
402                 if (nb_pkts < num)
403                         break;
404         }
405
406         r->stats.pkts += nb_rx;
407
408         for (i = 0; likely(i < nb_rx); i++) {
409                 bufs[i]->port = r->port;
410                 bufs[i]->vlan_tci = 0;
411
412                 if (r->internal->vlan_strip)
413                         rte_vlan_strip(bufs[i]);
414
415                 r->stats.bytes += bufs[i]->pkt_len;
416         }
417
418         vhost_update_packet_xstats(r, bufs, nb_rx);
419
420 out:
421         rte_atomic32_set(&r->while_queuing, 0);
422
423         return nb_rx;
424 }
425
426 static uint16_t
427 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
428 {
429         struct vhost_queue *r = q;
430         uint16_t i, nb_tx = 0;
431         uint16_t nb_send = 0;
432
433         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
434                 return 0;
435
436         rte_atomic32_set(&r->while_queuing, 1);
437
438         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
439                 goto out;
440
441         for (i = 0; i < nb_bufs; i++) {
442                 struct rte_mbuf *m = bufs[i];
443
444                 /* Do VLAN tag insertion */
445                 if (m->ol_flags & PKT_TX_VLAN_PKT) {
446                         int error = rte_vlan_insert(&m);
447                         if (unlikely(error)) {
448                                 rte_pktmbuf_free(m);
449                                 continue;
450                         }
451                 }
452
453                 bufs[nb_send] = m;
454                 ++nb_send;
455         }
456
457         /* Enqueue packets to guest RX queue */
458         while (nb_send) {
459                 uint16_t nb_pkts;
460                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
461                                                  VHOST_MAX_PKT_BURST);
462
463                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
464                                                   &bufs[nb_tx], num);
465
466                 nb_tx += nb_pkts;
467                 nb_send -= nb_pkts;
468                 if (nb_pkts < num)
469                         break;
470         }
471
472         r->stats.pkts += nb_tx;
473         r->stats.missed_pkts += nb_bufs - nb_tx;
474
475         for (i = 0; likely(i < nb_tx); i++)
476                 r->stats.bytes += bufs[i]->pkt_len;
477
478         vhost_update_packet_xstats(r, bufs, nb_tx);
479
480         /* According to RFC 2863 (ifHCOutMulticastPkts and
481          * ifHCOutBroadcastPkts), the "multicast" and "broadcast" counters
482          * also include packets that were not transmitted successfully.
483          */
484         for (i = nb_tx; i < nb_bufs; i++)
485                 vhost_count_multicast_broadcast(r, bufs[i]);
486
487         for (i = 0; likely(i < nb_tx); i++)
488                 rte_pktmbuf_free(bufs[i]);
489 out:
490         rte_atomic32_set(&r->while_queuing, 0);
491
492         return nb_tx;
493 }
494
495 static inline struct internal_list *
496 find_internal_resource(char *ifname)
497 {
498         int found = 0;
499         struct internal_list *list;
500         struct pmd_internal *internal;
501
502         if (ifname == NULL)
503                 return NULL;
504
505         pthread_mutex_lock(&internal_list_lock);
506
507         TAILQ_FOREACH(list, &internal_list, next) {
508                 internal = list->eth_dev->data->dev_private;
509                 if (!strcmp(internal->iface_name, ifname)) {
510                         found = 1;
511                         break;
512                 }
513         }
514
515         pthread_mutex_unlock(&internal_list_lock);
516
517         if (!found)
518                 return NULL;
519
520         return list;
521 }
522
523 static int
524 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
525 {
526         struct rte_vhost_vring vring;
527         struct vhost_queue *vq;
528         int ret = 0;
529
530         vq = dev->data->rx_queues[qid];
531         if (!vq) {
532                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
533                 return -1;
534         }
535
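        /* rxq qid is backed by the guest TX virtqueue at index (qid << 1) + 1 */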
536         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
537         if (ret < 0) {
538                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
539                 return ret;
540         }
541         VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
542         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
543         rte_wmb();
544
545         return ret;
546 }
547
548 static int
549 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
550 {
551         struct rte_vhost_vring vring;
552         struct vhost_queue *vq;
553         int ret = 0;
554
555         vq = dev->data->rx_queues[qid];
556         if (!vq) {
557                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
558                 return -1;
559         }
560
561         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
562         if (ret < 0) {
563                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
564                 return ret;
565         }
566         VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
567         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
568         rte_wmb();
569
570         return 0;
571 }
572
573 static void
574 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
575 {
576         struct rte_intr_handle *intr_handle = dev->intr_handle;
577
578         if (intr_handle) {
579                 if (intr_handle->intr_vec)
580                         free(intr_handle->intr_vec);
581                 free(intr_handle);
582         }
583
584         dev->intr_handle = NULL;
585 }
586
587 static int
588 eth_vhost_install_intr(struct rte_eth_dev *dev)
589 {
590         struct rte_vhost_vring vring;
591         struct vhost_queue *vq;
592         int count = 0;
593         int nb_rxq = dev->data->nb_rx_queues;
594         int i;
595         int ret;
596
597         /* uninstall first if we are reconnecting */
598         if (dev->intr_handle)
599                 eth_vhost_uninstall_intr(dev);
600
601         dev->intr_handle = malloc(sizeof(*dev->intr_handle));
602         if (!dev->intr_handle) {
603                 VHOST_LOG(ERR, "Failed to allocate intr_handle\n");
604                 return -ENOMEM;
605         }
606         memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
607
608         dev->intr_handle->efd_counter_size = sizeof(uint64_t);
609
610         dev->intr_handle->intr_vec =
611                 malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
612
613         if (!dev->intr_handle->intr_vec) {
614                 VHOST_LOG(ERR,
615                         "Failed to allocate memory for interrupt vector\n");
616                 free(dev->intr_handle);
617                 return -ENOMEM;
618         }
619
620         VHOST_LOG(INFO, "Prepare intr vec\n");
621         for (i = 0; i < nb_rxq; i++) {
622                 vq = dev->data->rx_queues[i];
623                 if (!vq) {
624                         VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
625                         continue;
626                 }
627
628                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
629                 if (ret < 0) {
630                         VHOST_LOG(INFO,
631                                 "Failed to get rxq-%d's vring, skip!\n", i);
632                         continue;
633                 }
634
635                 if (vring.kickfd < 0) {
636                         VHOST_LOG(INFO,
637                                 "rxq-%d's kickfd is invalid, skip!\n", i);
638                         continue;
639                 }
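                /* reuse the vring's kick eventfd as the Rx interrupt eventfd */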
640                 dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
641                 dev->intr_handle->efds[i] = vring.kickfd;
642                 count++;
643                 VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
644         }
645
646         dev->intr_handle->nb_efd = count;
647         dev->intr_handle->max_intr = count + 1;
648         dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
649
650         return 0;
651 }
652
653 static void
654 update_queuing_status(struct rte_eth_dev *dev)
655 {
656         struct pmd_internal *internal = dev->data->dev_private;
657         struct vhost_queue *vq;
658         unsigned int i;
659         int allow_queuing = 1;
660
661         if (!dev->data->rx_queues || !dev->data->tx_queues)
662                 return;
663
664         if (rte_atomic32_read(&internal->started) == 0 ||
665             rte_atomic32_read(&internal->dev_attached) == 0)
666                 allow_queuing = 0;
667
668         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
669         for (i = 0; i < dev->data->nb_rx_queues; i++) {
670                 vq = dev->data->rx_queues[i];
671                 if (vq == NULL)
672                         continue;
673                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
674                 while (rte_atomic32_read(&vq->while_queuing))
675                         rte_pause();
676         }
677
678         for (i = 0; i < dev->data->nb_tx_queues; i++) {
679                 vq = dev->data->tx_queues[i];
680                 if (vq == NULL)
681                         continue;
682                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
683                 while (rte_atomic32_read(&vq->while_queuing))
684                         rte_pause();
685         }
686 }
687
688 static void
689 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
690 {
691         struct vhost_queue *vq;
692         int i;
693
694         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
695                 vq = eth_dev->data->rx_queues[i];
696                 if (!vq)
697                         continue;
698                 vq->vid = internal->vid;
699                 vq->internal = internal;
700                 vq->port = eth_dev->data->port_id;
701         }
702         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
703                 vq = eth_dev->data->tx_queues[i];
704                 if (!vq)
705                         continue;
706                 vq->vid = internal->vid;
707                 vq->internal = internal;
708                 vq->port = eth_dev->data->port_id;
709         }
710 }
711
712 static int
713 new_device(int vid)
714 {
715         struct rte_eth_dev *eth_dev;
716         struct internal_list *list;
717         struct pmd_internal *internal;
718         struct rte_eth_conf *dev_conf;
719         unsigned i;
720         char ifname[PATH_MAX];
721 #ifdef RTE_LIBRTE_VHOST_NUMA
722         int newnode;
723 #endif
724
725         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
726         list = find_internal_resource(ifname);
727         if (list == NULL) {
728                 VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
729                 return -1;
730         }
731
732         eth_dev = list->eth_dev;
733         internal = eth_dev->data->dev_private;
734         dev_conf = &eth_dev->data->dev_conf;
735
736 #ifdef RTE_LIBRTE_VHOST_NUMA
737         newnode = rte_vhost_get_numa_node(vid);
738         if (newnode >= 0)
739                 eth_dev->data->numa_node = newnode;
740 #endif
741
742         internal->vid = vid;
743         if (rte_atomic32_read(&internal->started) == 1) {
744                 queue_setup(eth_dev, internal);
745
746                 if (dev_conf->intr_conf.rxq) {
747                         if (eth_vhost_install_intr(eth_dev) < 0) {
748                                 VHOST_LOG(INFO,
749                                         "Failed to install interrupt handler.\n");
750                                 return -1;
751                         }
752                 }
753         } else {
754                 VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
755         }
756
757         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
758                 rte_vhost_enable_guest_notification(vid, i, 0);
759
760         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
761
762         eth_dev->data->dev_link.link_status = ETH_LINK_UP;
763
764         rte_atomic32_set(&internal->dev_attached, 1);
765         update_queuing_status(eth_dev);
766
767         VHOST_LOG(INFO, "Vhost device %d created\n", vid);
768
769         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
770
771         return 0;
772 }
773
774 static void
775 destroy_device(int vid)
776 {
777         struct rte_eth_dev *eth_dev;
778         struct pmd_internal *internal;
779         struct vhost_queue *vq;
780         struct internal_list *list;
781         char ifname[PATH_MAX];
782         unsigned i;
783         struct rte_vhost_vring_state *state;
784
785         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
786         list = find_internal_resource(ifname);
787         if (list == NULL) {
788                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
789                 return;
790         }
791         eth_dev = list->eth_dev;
792         internal = eth_dev->data->dev_private;
793
794         rte_atomic32_set(&internal->dev_attached, 0);
795         update_queuing_status(eth_dev);
796
797         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
798
799         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
800                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
801                         vq = eth_dev->data->rx_queues[i];
802                         if (!vq)
803                                 continue;
804                         vq->vid = -1;
805                 }
806                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
807                         vq = eth_dev->data->tx_queues[i];
808                         if (!vq)
809                                 continue;
810                         vq->vid = -1;
811                 }
812         }
813
814         state = vring_states[eth_dev->data->port_id];
815         rte_spinlock_lock(&state->lock);
816         for (i = 0; i <= state->max_vring; i++) {
817                 state->cur[i] = false;
818                 state->seen[i] = false;
819         }
820         state->max_vring = 0;
821         rte_spinlock_unlock(&state->lock);
822
823         VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
824         eth_vhost_uninstall_intr(eth_dev);
825
826         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
827 }
828
829 static int
830 vring_state_changed(int vid, uint16_t vring, int enable)
831 {
832         struct rte_vhost_vring_state *state;
833         struct rte_eth_dev *eth_dev;
834         struct internal_list *list;
835         char ifname[PATH_MAX];
836
837         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
838         list = find_internal_resource(ifname);
839         if (list == NULL) {
840                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
841                 return -1;
842         }
843
844         eth_dev = list->eth_dev;
845         /* won't be NULL */
846         state = vring_states[eth_dev->data->port_id];
847         rte_spinlock_lock(&state->lock);
848         if (state->cur[vring] == enable) {
849                 rte_spinlock_unlock(&state->lock);
850                 return 0;
851         }
852         state->cur[vring] = enable;
853         state->max_vring = RTE_MAX(vring, state->max_vring);
854         rte_spinlock_unlock(&state->lock);
855
856         VHOST_LOG(INFO, "vring%u is %s\n",
857                         vring, enable ? "enabled" : "disabled");
858
859         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
860
861         return 0;
862 }
863
864 static struct vhost_device_ops vhost_ops = {
865         .new_device          = new_device,
866         .destroy_device      = destroy_device,
867         .vring_state_changed = vring_state_changed,
868 };
869
870 static int
871 vhost_driver_setup(struct rte_eth_dev *eth_dev)
872 {
873         struct pmd_internal *internal = eth_dev->data->dev_private;
874         struct internal_list *list = NULL;
875         struct rte_vhost_vring_state *vring_state = NULL;
876         unsigned int numa_node = eth_dev->device->numa_node;
877         const char *name = eth_dev->device->name;
878
879         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
880         if (list == NULL)
881                 return -1;
882
883         vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
884                                          0, numa_node);
885         if (vring_state == NULL)
886                 goto free_list;
887
888         list->eth_dev = eth_dev;
889         pthread_mutex_lock(&internal_list_lock);
890         TAILQ_INSERT_TAIL(&internal_list, list, next);
891         pthread_mutex_unlock(&internal_list_lock);
892
893         rte_spinlock_init(&vring_state->lock);
894         vring_states[eth_dev->data->port_id] = vring_state;
895
896         if (rte_vhost_driver_register(internal->iface_name, internal->flags))
897                 goto list_remove;
898
899         if (internal->disable_flags) {
900                 if (rte_vhost_driver_disable_features(internal->iface_name,
901                                                       internal->disable_flags))
902                         goto drv_unreg;
903         }
904
905         if (rte_vhost_driver_callback_register(internal->iface_name,
906                                                &vhost_ops) < 0) {
907                 VHOST_LOG(ERR, "Can't register callbacks\n");
908                 goto drv_unreg;
909         }
910
911         if (rte_vhost_driver_start(internal->iface_name) < 0) {
912                 VHOST_LOG(ERR, "Failed to start driver for %s\n",
913                           internal->iface_name);
914                 goto drv_unreg;
915         }
916
917         return 0;
918
919 drv_unreg:
920         rte_vhost_driver_unregister(internal->iface_name);
921 list_remove:
922         vring_states[eth_dev->data->port_id] = NULL;
923         pthread_mutex_lock(&internal_list_lock);
924         TAILQ_REMOVE(&internal_list, list, next);
925         pthread_mutex_unlock(&internal_list_lock);
926         rte_free(vring_state);
927 free_list:
928         rte_free(list);
929
930         return -1;
931 }
932
933 int
934 rte_eth_vhost_get_queue_event(uint16_t port_id,
935                 struct rte_eth_vhost_queue_event *event)
936 {
937         struct rte_vhost_vring_state *state;
938         unsigned int i;
939         int idx;
940
941         if (port_id >= RTE_MAX_ETHPORTS) {
942                 VHOST_LOG(ERR, "Invalid port id\n");
943                 return -1;
944         }
945
946         state = vring_states[port_id];
947         if (!state) {
948                 VHOST_LOG(ERR, "Unused port\n");
949                 return -1;
950         }
951
952         rte_spinlock_lock(&state->lock);
953         for (i = 0; i <= state->max_vring; i++) {
954                 idx = state->index++ % (state->max_vring + 1);
955
956                 if (state->cur[idx] != state->seen[idx]) {
957                         state->seen[idx] = state->cur[idx];
958                         event->queue_id = idx / 2;
959                         event->rx = idx & 1;
960                         event->enable = state->cur[idx];
961                         rte_spinlock_unlock(&state->lock);
962                         return 0;
963                 }
964         }
965         rte_spinlock_unlock(&state->lock);
966
967         return -1;
968 }
969
970 int
971 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
972 {
973         struct internal_list *list;
974         struct rte_eth_dev *eth_dev;
975         struct vhost_queue *vq;
976         int vid = -1;
977
978         if (!rte_eth_dev_is_valid_port(port_id))
979                 return -1;
980
981         pthread_mutex_lock(&internal_list_lock);
982
983         TAILQ_FOREACH(list, &internal_list, next) {
984                 eth_dev = list->eth_dev;
985                 if (eth_dev->data->port_id == port_id) {
986                         vq = eth_dev->data->rx_queues[0];
987                         if (vq) {
988                                 vid = vq->vid;
989                         }
990                         break;
991                 }
992         }
993
994         pthread_mutex_unlock(&internal_list_lock);
995
996         return vid;
997 }
998
999 static int
1000 eth_dev_configure(struct rte_eth_dev *dev)
1001 {
1002         struct pmd_internal *internal = dev->data->dev_private;
1003         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1004
1005         /* NOTE: the same process has to operate a vhost interface
1006          * from beginning to end (from eth_dev configure to eth_dev close).
1007          * It is the user's responsibility at the moment.
1008          */
1009         if (vhost_driver_setup(dev) < 0)
1010                 return -1;
1011
1012         internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1013
1014         return 0;
1015 }
1016
1017 static int
1018 eth_dev_start(struct rte_eth_dev *eth_dev)
1019 {
1020         struct pmd_internal *internal = eth_dev->data->dev_private;
1021         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1022
1023         queue_setup(eth_dev, internal);
1024
1025         if (rte_atomic32_read(&internal->dev_attached) == 1) {
1026                 if (dev_conf->intr_conf.rxq) {
1027                         if (eth_vhost_install_intr(eth_dev) < 0) {
1028                                 VHOST_LOG(INFO,
1029                                         "Failed to install interrupt handler.\n");
1030                                 return -1;
1031                         }
1032                 }
1033         }
1034
1035         rte_atomic32_set(&internal->started, 1);
1036         update_queuing_status(eth_dev);
1037
1038         return 0;
1039 }
1040
1041 static void
1042 eth_dev_stop(struct rte_eth_dev *dev)
1043 {
1044         struct pmd_internal *internal = dev->data->dev_private;
1045
1046         rte_atomic32_set(&internal->started, 0);
1047         update_queuing_status(dev);
1048 }
1049
1050 static void
1051 eth_dev_close(struct rte_eth_dev *dev)
1052 {
1053         struct pmd_internal *internal;
1054         struct internal_list *list;
1055         unsigned int i;
1056
1057         internal = dev->data->dev_private;
1058         if (!internal)
1059                 return;
1060
1061         eth_dev_stop(dev);
1062
1063         rte_vhost_driver_unregister(internal->iface_name);
1064
1065         list = find_internal_resource(internal->iface_name);
1066         if (!list)
1067                 return;
1068
1069         pthread_mutex_lock(&internal_list_lock);
1070         TAILQ_REMOVE(&internal_list, list, next);
1071         pthread_mutex_unlock(&internal_list_lock);
1072         rte_free(list);
1073
1074         if (dev->data->rx_queues)
1075                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1076                         rte_free(dev->data->rx_queues[i]);
1077
1078         if (dev->data->tx_queues)
1079                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1080                         rte_free(dev->data->tx_queues[i]);
1081
1082         rte_free(internal->iface_name);
1083         rte_free(internal);
1084
1085         dev->data->dev_private = NULL;
1086
1087         rte_free(vring_states[dev->data->port_id]);
1088         vring_states[dev->data->port_id] = NULL;
1089 }
1090
1091 static int
1092 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1093                    uint16_t nb_rx_desc __rte_unused,
1094                    unsigned int socket_id,
1095                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1096                    struct rte_mempool *mb_pool)
1097 {
1098         struct vhost_queue *vq;
1099
1100         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1101                         RTE_CACHE_LINE_SIZE, socket_id);
1102         if (vq == NULL) {
1103                 VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1104                 return -ENOMEM;
1105         }
1106
1107         vq->mb_pool = mb_pool;
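        /* an ethdev Rx queue drains the guest TX virtqueue (2 * qid + VIRTIO_TXQ) */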
1108         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1109         dev->data->rx_queues[rx_queue_id] = vq;
1110
1111         return 0;
1112 }
1113
1114 static int
1115 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1116                    uint16_t nb_tx_desc __rte_unused,
1117                    unsigned int socket_id,
1118                    const struct rte_eth_txconf *tx_conf __rte_unused)
1119 {
1120         struct vhost_queue *vq;
1121
1122         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1123                         RTE_CACHE_LINE_SIZE, socket_id);
1124         if (vq == NULL) {
1125                 VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1126                 return -ENOMEM;
1127         }
1128
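        /* an ethdev Tx queue feeds the guest RX virtqueue (2 * qid + VIRTIO_RXQ) */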
1129         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1130         dev->data->tx_queues[tx_queue_id] = vq;
1131
1132         return 0;
1133 }
1134
1135 static int
1136 eth_dev_info(struct rte_eth_dev *dev,
1137              struct rte_eth_dev_info *dev_info)
1138 {
1139         struct pmd_internal *internal;
1140
1141         internal = dev->data->dev_private;
1142         if (internal == NULL) {
1143                 VHOST_LOG(ERR, "Invalid device specified\n");
1144                 return -ENODEV;
1145         }
1146
1147         dev_info->max_mac_addrs = 1;
1148         dev_info->max_rx_pktlen = (uint32_t)-1;
1149         dev_info->max_rx_queues = internal->max_queues;
1150         dev_info->max_tx_queues = internal->max_queues;
1151         dev_info->min_rx_bufsize = 0;
1152
1153         dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1154                                 DEV_TX_OFFLOAD_VLAN_INSERT;
1155         dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1156
1157         return 0;
1158 }
1159
1160 static int
1161 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1162 {
1163         unsigned i;
1164         unsigned long rx_total = 0, tx_total = 0;
1165         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1166         struct vhost_queue *vq;
1167
1168         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1169                         i < dev->data->nb_rx_queues; i++) {
1170                 if (dev->data->rx_queues[i] == NULL)
1171                         continue;
1172                 vq = dev->data->rx_queues[i];
1173                 stats->q_ipackets[i] = vq->stats.pkts;
1174                 rx_total += stats->q_ipackets[i];
1175
1176                 stats->q_ibytes[i] = vq->stats.bytes;
1177                 rx_total_bytes += stats->q_ibytes[i];
1178         }
1179
1180         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1181                         i < dev->data->nb_tx_queues; i++) {
1182                 if (dev->data->tx_queues[i] == NULL)
1183                         continue;
1184                 vq = dev->data->tx_queues[i];
1185                 stats->q_opackets[i] = vq->stats.pkts;
1186                 tx_total += stats->q_opackets[i];
1187
1188                 stats->q_obytes[i] = vq->stats.bytes;
1189                 tx_total_bytes += stats->q_obytes[i];
1190         }
1191
1192         stats->ipackets = rx_total;
1193         stats->opackets = tx_total;
1194         stats->ibytes = rx_total_bytes;
1195         stats->obytes = tx_total_bytes;
1196
1197         return 0;
1198 }
1199
1200 static int
1201 eth_stats_reset(struct rte_eth_dev *dev)
1202 {
1203         struct vhost_queue *vq;
1204         unsigned i;
1205
1206         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1207                 if (dev->data->rx_queues[i] == NULL)
1208                         continue;
1209                 vq = dev->data->rx_queues[i];
1210                 vq->stats.pkts = 0;
1211                 vq->stats.bytes = 0;
1212         }
1213         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1214                 if (dev->data->tx_queues[i] == NULL)
1215                         continue;
1216                 vq = dev->data->tx_queues[i];
1217                 vq->stats.pkts = 0;
1218                 vq->stats.bytes = 0;
1219                 vq->stats.missed_pkts = 0;
1220         }
1221
1222         return 0;
1223 }
1224
1225 static void
1226 eth_queue_release(void *q)
1227 {
1228         rte_free(q);
1229 }
1230
1231 static int
1232 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1233 {
1234         /*
1235          * vHost does not hang onto mbufs: eth_vhost_tx() copies packet data
1236          * and releases the mbufs, so there is nothing to clean up.
1237          */
1238         return 0;
1239 }
1240
1241 static int
1242 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1243                 int wait_to_complete __rte_unused)
1244 {
1245         return 0;
1246 }
1247
1248 static uint32_t
1249 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1250 {
1251         struct vhost_queue *vq;
1252
1253         vq = dev->data->rx_queues[rx_queue_id];
1254         if (vq == NULL)
1255                 return 0;
1256
1257         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1258 }
1259
1260 static const struct eth_dev_ops ops = {
1261         .dev_start = eth_dev_start,
1262         .dev_stop = eth_dev_stop,
1263         .dev_close = eth_dev_close,
1264         .dev_configure = eth_dev_configure,
1265         .dev_infos_get = eth_dev_info,
1266         .rx_queue_setup = eth_rx_queue_setup,
1267         .tx_queue_setup = eth_tx_queue_setup,
1268         .rx_queue_release = eth_queue_release,
1269         .tx_queue_release = eth_queue_release,
1270         .tx_done_cleanup = eth_tx_done_cleanup,
1271         .rx_queue_count = eth_rx_queue_count,
1272         .link_update = eth_link_update,
1273         .stats_get = eth_stats_get,
1274         .stats_reset = eth_stats_reset,
1275         .xstats_reset = vhost_dev_xstats_reset,
1276         .xstats_get = vhost_dev_xstats_get,
1277         .xstats_get_names = vhost_dev_xstats_get_names,
1278         .rx_queue_intr_enable = eth_rxq_intr_enable,
1279         .rx_queue_intr_disable = eth_rxq_intr_disable,
1280 };
1281
1282 static int
1283 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1284         int16_t queues, const unsigned int numa_node, uint64_t flags,
1285         uint64_t disable_flags)
1286 {
1287         const char *name = rte_vdev_device_name(dev);
1288         struct rte_eth_dev_data *data;
1289         struct pmd_internal *internal = NULL;
1290         struct rte_eth_dev *eth_dev = NULL;
1291         struct rte_ether_addr *eth_addr = NULL;
1292
1293         VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1294                 numa_node);
1295
1296         /* reserve an ethdev entry */
1297         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1298         if (eth_dev == NULL)
1299                 goto error;
1300         data = eth_dev->data;
1301
1302         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1303         if (eth_addr == NULL)
1304                 goto error;
1305         data->mac_addrs = eth_addr;
1306         *eth_addr = base_eth_addr;
1307         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1308
1309         /* now put it all together
1310          * - store queue data in internal,
1311          * - point eth_dev_data to internals
1312          * - and point eth_dev structure to new eth_dev_data structure
1313          */
1314         internal = eth_dev->data->dev_private;
1315         internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1316                                                  0, numa_node);
1317         if (internal->iface_name == NULL)
1318                 goto error;
1319         strcpy(internal->iface_name, iface_name);
1320
1321         data->nb_rx_queues = queues;
1322         data->nb_tx_queues = queues;
1323         internal->max_queues = queues;
1324         internal->vid = -1;
1325         internal->flags = flags;
1326         internal->disable_flags = disable_flags;
1327         data->dev_link = pmd_link;
1328         data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1329
1330         eth_dev->dev_ops = &ops;
1331
1332         /* finally assign rx and tx ops */
1333         eth_dev->rx_pkt_burst = eth_vhost_rx;
1334         eth_dev->tx_pkt_burst = eth_vhost_tx;
1335
1336         rte_eth_dev_probing_finish(eth_dev);
1337         return 0;
1338
1339 error:
1340         if (internal)
1341                 rte_free(internal->iface_name);
1342         rte_eth_dev_release_port(eth_dev);
1343
1344         return -1;
1345 }
1346
1347 static inline int
1348 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1349 {
1350         const char **iface_name = extra_args;
1351
1352         if (value == NULL)
1353                 return -1;
1354
1355         *iface_name = value;
1356
1357         return 0;
1358 }
1359
1360 static inline int
1361 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1362 {
1363         uint16_t *n = extra_args;
1364
1365         if (value == NULL || extra_args == NULL)
1366                 return -EINVAL;
1367
1368         *n = (uint16_t)strtoul(value, NULL, 0);
1369         if (*n == USHRT_MAX && errno == ERANGE)
1370                 return -1;
1371
1372         return 0;
1373 }
1374
1375 static int
1376 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1377 {
1378         struct rte_kvargs *kvlist = NULL;
1379         int ret = 0;
1380         char *iface_name;
1381         uint16_t queues;
1382         uint64_t flags = 0;
1383         uint64_t disable_flags = 0;
1384         int client_mode = 0;
1385         int dequeue_zero_copy = 0;
1386         int iommu_support = 0;
1387         int postcopy_support = 0;
1388         int tso = 0;
1389         struct rte_eth_dev *eth_dev;
1390         const char *name = rte_vdev_device_name(dev);
1391
1392         VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1393
1394         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1395                 eth_dev = rte_eth_dev_attach_secondary(name);
1396                 if (!eth_dev) {
1397                         VHOST_LOG(ERR, "Failed to probe %s\n", name);
1398                         return -1;
1399                 }
1400                 eth_dev->rx_pkt_burst = eth_vhost_rx;
1401                 eth_dev->tx_pkt_burst = eth_vhost_tx;
1402                 eth_dev->dev_ops = &ops;
1403                 if (dev->device.numa_node == SOCKET_ID_ANY)
1404                         dev->device.numa_node = rte_socket_id();
1405                 eth_dev->device = &dev->device;
1406                 rte_eth_dev_probing_finish(eth_dev);
1407                 return 0;
1408         }
1409
1410         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1411         if (kvlist == NULL)
1412                 return -1;
1413
1414         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1415                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1416                                          &open_iface, &iface_name);
1417                 if (ret < 0)
1418                         goto out_free;
1419         } else {
1420                 ret = -1;
1421                 goto out_free;
1422         }
1423
1424         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1425                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1426                                          &open_int, &queues);
1427                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1428                         goto out_free;
1429
1430         } else
1431                 queues = 1;
1432
1433         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1434                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1435                                          &open_int, &client_mode);
1436                 if (ret < 0)
1437                         goto out_free;
1438
1439                 if (client_mode)
1440                         flags |= RTE_VHOST_USER_CLIENT;
1441         }
1442
1443         if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1444                 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1445                                          &open_int, &dequeue_zero_copy);
1446                 if (ret < 0)
1447                         goto out_free;
1448
1449                 if (dequeue_zero_copy)
1450                         flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1451         }
1452
1453         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1454                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1455                                          &open_int, &iommu_support);
1456                 if (ret < 0)
1457                         goto out_free;
1458
1459                 if (iommu_support)
1460                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1461         }
1462
1463         if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1464                 ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1465                                          &open_int, &postcopy_support);
1466                 if (ret < 0)
1467                         goto out_free;
1468
1469                 if (postcopy_support)
1470                         flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1471         }
1472
1473         if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1474                 ret = rte_kvargs_process(kvlist,
1475                                 ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1476                                 &open_int, &tso);
1477                 if (ret < 0)
1478                         goto out_free;
1479
1480                 if (tso == 0) {
1481                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1482                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1483                 }
1484         }
1485
1486         if (dev->device.numa_node == SOCKET_ID_ANY)
1487                 dev->device.numa_node = rte_socket_id();
1488
1489         ret = eth_dev_vhost_create(dev, iface_name, queues,
1490                                    dev->device.numa_node, flags, disable_flags);
1491         if (ret == -1)
1492                 VHOST_LOG(ERR, "Failed to create %s\n", name);
1493
1494 out_free:
1495         rte_kvargs_free(kvlist);
1496         return ret;
1497 }
1498
1499 static int
1500 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1501 {
1502         const char *name;
1503         struct rte_eth_dev *eth_dev = NULL;
1504
1505         name = rte_vdev_device_name(dev);
1506         VHOST_LOG(INFO, "Uninitializing pmd_vhost for %s\n", name);
1507
1508         /* find an ethdev entry */
1509         eth_dev = rte_eth_dev_allocated(name);
1510         if (eth_dev == NULL)
1511                 return 0;
1512
1513         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1514                 return rte_eth_dev_release_port(eth_dev);
1515
1516         eth_dev_close(eth_dev);
1517
1518         rte_eth_dev_release_port(eth_dev);
1519
1520         return 0;
1521 }
1522
1523 static struct rte_vdev_driver pmd_vhost_drv = {
1524         .probe = rte_pmd_vhost_probe,
1525         .remove = rte_pmd_vhost_remove,
1526 };
1527
1528 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1529 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1530 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1531         "iface=<ifc> "
1532         "queues=<int> "
1533         "client=<0|1> "
1534         "dequeue-zero-copy=<0|1> "
1535         "iommu-support=<0|1> "
1536         "postcopy-support=<0|1> "
1537         "tso=<0|1>");
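/* Example (illustrative): --vdev 'net_vhost0,iface=/tmp/vhost.sock,queues=2,client=1' */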
1538
1539 RTE_INIT(vhost_init_log)
1540 {
1541         vhost_logtype = rte_log_register("pmd.net.vhost");
1542         if (vhost_logtype >= 0)
1543                 rte_log_set_level(vhost_logtype, RTE_LOG_NOTICE);
1544 }