drivers/net: share vdev data to secondary process
[dpdk.git] drivers/net/vhost/rte_eth_vhost.c
1 /*-
2  *   BSD LICENSE
3  *
4  *   Copyright (c) 2016 IGEL Co., Ltd.
5  *   All rights reserved.
6  *
7  *   Redistribution and use in source and binary forms, with or without
8  *   modification, are permitted provided that the following conditions
9  *   are met:
10  *
11  *     * Redistributions of source code must retain the above copyright
12  *       notice, this list of conditions and the following disclaimer.
13  *     * Redistributions in binary form must reproduce the above copyright
14  *       notice, this list of conditions and the following disclaimer in
15  *       the documentation and/or other materials provided with the
16  *       distribution.
17  *     * Neither the name of IGEL Co.,Ltd. nor the names of its
18  *       contributors may be used to endorse or promote products derived
19  *       from this software without specific prior written permission.
20  *
21  *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24  *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25  *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 #include <unistd.h>
34 #include <pthread.h>
35 #include <stdbool.h>
36
37 #include <rte_mbuf.h>
38 #include <rte_ethdev_driver.h>
39 #include <rte_ethdev_vdev.h>
40 #include <rte_malloc.h>
41 #include <rte_memcpy.h>
42 #include <rte_bus_vdev.h>
43 #include <rte_kvargs.h>
44 #include <rte_vhost.h>
45 #include <rte_spinlock.h>
46
47 #include "rte_eth_vhost.h"
48
49 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
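/*
 * Virtqueue layout per queue pair q: index 2*q + VIRTIO_RXQ is the guest RX
 * ring (the host Tx path enqueues into it) and 2*q + VIRTIO_TXQ is the guest
 * TX ring (the host Rx path dequeues from it); see eth_rx/tx_queue_setup().
 */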
50
51 #define ETH_VHOST_IFACE_ARG             "iface"
52 #define ETH_VHOST_QUEUES_ARG            "queues"
53 #define ETH_VHOST_CLIENT_ARG            "client"
54 #define ETH_VHOST_DEQUEUE_ZERO_COPY     "dequeue-zero-copy"
55 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
56 #define VHOST_MAX_PKT_BURST 32
57
58 static const char *valid_arguments[] = {
59         ETH_VHOST_IFACE_ARG,
60         ETH_VHOST_QUEUES_ARG,
61         ETH_VHOST_CLIENT_ARG,
62         ETH_VHOST_DEQUEUE_ZERO_COPY,
63         ETH_VHOST_IOMMU_SUPPORT,
64         NULL
65 };
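/*
 * The kvargs above are taken from the --vdev device string. An illustrative
 * invocation (socket path, core list and memory channels are placeholders):
 *
 *   testpmd -l 0-1 -n 4 \
 *       --vdev 'net_vhost0,iface=/tmp/sock0,queues=2,client=1'
 */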
66
67 static struct ether_addr base_eth_addr = {
68         .addr_bytes = {
69                 0x56 /* V */,
70                 0x48 /* H */,
71                 0x4F /* O */,
72                 0x53 /* S */,
73                 0x54 /* T */,
74                 0x00
75         }
76 };
77
78 enum vhost_xstats_pkts {
79         VHOST_UNDERSIZE_PKT = 0,
80         VHOST_64_PKT,
81         VHOST_65_TO_127_PKT,
82         VHOST_128_TO_255_PKT,
83         VHOST_256_TO_511_PKT,
84         VHOST_512_TO_1023_PKT,
85         VHOST_1024_TO_1522_PKT,
86         VHOST_1523_TO_MAX_PKT,
87         VHOST_BROADCAST_PKT,
88         VHOST_MULTICAST_PKT,
89         VHOST_UNICAST_PKT,
90         VHOST_ERRORS_PKT,
91         VHOST_ERRORS_FRAGMENTED,
92         VHOST_ERRORS_JABBER,
93         VHOST_UNKNOWN_PROTOCOL,
94         VHOST_XSTATS_MAX,
95 };
96
97 struct vhost_stats {
98         uint64_t pkts;
99         uint64_t bytes;
100         uint64_t missed_pkts;
101         uint64_t xstats[VHOST_XSTATS_MAX];
102 };
103
104 struct vhost_queue {
105         int vid;
106         rte_atomic32_t allow_queuing;
107         rte_atomic32_t while_queuing;
108         struct pmd_internal *internal;
109         struct rte_mempool *mb_pool;
110         uint16_t port;
111         uint16_t virtqueue_id;
112         struct vhost_stats stats;
113 };
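/*
 * allow_queuing and while_queuing form a lock-free handshake between the
 * control path and the datapath: eth_vhost_rx/tx() raise while_queuing for
 * the duration of a burst and bail out early when allow_queuing is 0, while
 * update_queuing_status() clears allow_queuing and then spins until
 * while_queuing drops back to 0 before the device is attached or detached.
 */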
114
115 struct pmd_internal {
116         rte_atomic32_t dev_attached;
117         char *dev_name;
118         char *iface_name;
119         uint16_t max_queues;
120         uint16_t vid;
121         rte_atomic32_t started;
122         uint8_t vlan_strip;
123 };
124
125 struct internal_list {
126         TAILQ_ENTRY(internal_list) next;
127         struct rte_eth_dev *eth_dev;
128 };
129
130 TAILQ_HEAD(internal_list_head, internal_list);
131 static struct internal_list_head internal_list =
132         TAILQ_HEAD_INITIALIZER(internal_list);
133
134 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
135
136 static struct rte_eth_link pmd_link = {
137                 .link_speed = 10000,
138                 .link_duplex = ETH_LINK_FULL_DUPLEX,
139                 .link_status = ETH_LINK_DOWN
140 };
141
142 struct rte_vhost_vring_state {
143         rte_spinlock_t lock;
144
145         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
146         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
147         unsigned int index;
148         unsigned int max_vring;
149 };
150
151 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
152
153 #define VHOST_XSTATS_NAME_SIZE 64
154
155 struct vhost_xstats_name_off {
156         char name[VHOST_XSTATS_NAME_SIZE];
157         uint64_t offset;
158 };
159
160 /* [rx]_ is prepended to the name string here */
161 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
162         {"good_packets",
163          offsetof(struct vhost_queue, stats.pkts)},
164         {"total_bytes",
165          offsetof(struct vhost_queue, stats.bytes)},
166         {"missed_pkts",
167          offsetof(struct vhost_queue, stats.missed_pkts)},
168         {"broadcast_packets",
169          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
170         {"multicast_packets",
171          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
172         {"unicast_packets",
173          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
174         {"undersize_packets",
175          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
176         {"size_64_packets",
177          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
178         {"size_65_to_127_packets",
179          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
180         {"size_128_to_255_packets",
181          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
182         {"size_256_to_511_packets",
183          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
184         {"size_512_to_1023_packets",
185          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
186         {"size_1024_to_1522_packets",
187          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
188         {"size_1523_to_max_packets",
189          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
190         {"errors_with_bad_CRC",
191          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
192         {"fragmented_errors",
193          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
194         {"jabber_errors",
195          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
196         {"unknown_protos_packets",
197          offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
198 };
199
200 /* [tx]_ is prepended to the name string here */
201 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
202         {"good_packets",
203          offsetof(struct vhost_queue, stats.pkts)},
204         {"total_bytes",
205          offsetof(struct vhost_queue, stats.bytes)},
206         {"missed_pkts",
207          offsetof(struct vhost_queue, stats.missed_pkts)},
208         {"broadcast_packets",
209          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
210         {"multicast_packets",
211          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
212         {"unicast_packets",
213          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
214         {"undersize_packets",
215          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
216         {"size_64_packets",
217          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
218         {"size_65_to_127_packets",
219          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
220         {"size_128_to_255_packets",
221          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
222         {"size_256_to_511_packets",
223          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
224         {"size_512_to_1023_packets",
225          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
226         {"size_1024_to_1522_packets",
227          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
228         {"size_1523_to_max_packets",
229          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
230         {"errors_with_bad_CRC",
231          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
232 };
233
234 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
235                                 sizeof(vhost_rxport_stat_strings[0]))
236
237 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
238                                 sizeof(vhost_txport_stat_strings[0]))
239
240 static void
241 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
242 {
243         struct vhost_queue *vq = NULL;
244         unsigned int i = 0;
245
246         for (i = 0; i < dev->data->nb_rx_queues; i++) {
247                 vq = dev->data->rx_queues[i];
248                 if (!vq)
249                         continue;
250                 memset(&vq->stats, 0, sizeof(vq->stats));
251         }
252         for (i = 0; i < dev->data->nb_tx_queues; i++) {
253                 vq = dev->data->tx_queues[i];
254                 if (!vq)
255                         continue;
256                 memset(&vq->stats, 0, sizeof(vq->stats));
257         }
258 }
259
260 static int
261 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
262                            struct rte_eth_xstat_name *xstats_names,
263                            unsigned int limit __rte_unused)
264 {
265         unsigned int t = 0;
266         int count = 0;
267         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
268
269         if (!xstats_names)
270                 return nstats;
271         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
272                 snprintf(xstats_names[count].name,
273                          sizeof(xstats_names[count].name),
274                          "rx_%s", vhost_rxport_stat_strings[t].name);
275                 count++;
276         }
277         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
278                 snprintf(xstats_names[count].name,
279                          sizeof(xstats_names[count].name),
280                          "tx_%s", vhost_txport_stat_strings[t].name);
281                 count++;
282         }
283         return count;
284 }
285
286 static int
287 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
288                      unsigned int n)
289 {
290         unsigned int i;
291         unsigned int t;
292         unsigned int count = 0;
293         struct vhost_queue *vq = NULL;
294         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
295
296         if (n < nxstats)
297                 return nxstats;
298
299         for (i = 0; i < dev->data->nb_rx_queues; i++) {
300                 vq = dev->data->rx_queues[i];
301                 if (!vq)
302                         continue;
303                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
304                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
305                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
306         }
307         for (i = 0; i < dev->data->nb_tx_queues; i++) {
308                 vq = dev->data->tx_queues[i];
309                 if (!vq)
310                         continue;
311                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
312                                 + vq->stats.missed_pkts
313                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
314                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
315         }
316         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
317                 xstats[count].value = 0;
318                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
319                         vq = dev->data->rx_queues[i];
320                         if (!vq)
321                                 continue;
322                         xstats[count].value +=
323                                 *(uint64_t *)(((char *)vq)
324                                 + vhost_rxport_stat_strings[t].offset);
325                 }
326                 xstats[count].id = count;
327                 count++;
328         }
329         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
330                 xstats[count].value = 0;
331                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
332                         vq = dev->data->tx_queues[i];
333                         if (!vq)
334                                 continue;
335                         xstats[count].value +=
336                                 *(uint64_t *)(((char *)vq)
337                                 + vhost_txport_stat_strings[t].offset);
338                 }
339                 xstats[count].id = count;
340                 count++;
341         }
342         return count;
343 }
344
345 static inline void
346 vhost_count_multicast_broadcast(struct vhost_queue *vq,
347                                 struct rte_mbuf *mbuf)
348 {
349         struct ether_addr *ea = NULL;
350         struct vhost_stats *pstats = &vq->stats;
351
352         ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
353         if (is_multicast_ether_addr(ea)) {
354                 if (is_broadcast_ether_addr(ea))
355                         pstats->xstats[VHOST_BROADCAST_PKT]++;
356                 else
357                         pstats->xstats[VHOST_MULTICAST_PKT]++;
358         }
359 }
360
361 static void
362 vhost_update_packet_xstats(struct vhost_queue *vq,
363                            struct rte_mbuf **bufs,
364                            uint16_t count)
365 {
366         uint32_t pkt_len = 0;
367         uint64_t i = 0;
368         uint64_t index;
369         struct vhost_stats *pstats = &vq->stats;
370
371         for (i = 0; i < count ; i++) {
372                 pkt_len = bufs[i]->pkt_len;
373                 if (pkt_len == 64) {
374                         pstats->xstats[VHOST_64_PKT]++;
375                 } else if (pkt_len > 64 && pkt_len < 1024) {
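                        /*
                         * Bucket index from the highest set bit of pkt_len:
                         * 32 - clz(pkt_len) - 5, e.g. pkt_len = 200 gives
                         * 32 - 24 - 5 = 3 = VHOST_128_TO_255_PKT.
                         */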
376                         index = (sizeof(pkt_len) * 8)
377                                 - __builtin_clz(pkt_len) - 5;
378                         pstats->xstats[index]++;
379                 } else {
380                         if (pkt_len < 64)
381                                 pstats->xstats[VHOST_UNDERSIZE_PKT]++;
382                         else if (pkt_len <= 1522)
383                                 pstats->xstats[VHOST_1024_TO_1522_PKT]++;
384                         else if (pkt_len > 1522)
385                                 pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
386                 }
387                 vhost_count_multicast_broadcast(vq, bufs[i]);
388         }
389 }
390
391 static uint16_t
392 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
393 {
394         struct vhost_queue *r = q;
395         uint16_t i, nb_rx = 0;
396         uint16_t nb_receive = nb_bufs;
397
398         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
399                 return 0;
400
401         rte_atomic32_set(&r->while_queuing, 1);
402
403         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
404                 goto out;
405
406         /* Dequeue packets from guest TX queue */
407         while (nb_receive) {
408                 uint16_t nb_pkts;
409                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
410                                                  VHOST_MAX_PKT_BURST);
411
412                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
413                                                   r->mb_pool, &bufs[nb_rx],
414                                                   num);
415
416                 nb_rx += nb_pkts;
417                 nb_receive -= nb_pkts;
418                 if (nb_pkts < num)
419                         break;
420         }
421
422         r->stats.pkts += nb_rx;
423
424         for (i = 0; likely(i < nb_rx); i++) {
425                 bufs[i]->port = r->port;
426                 bufs[i]->ol_flags = 0;
427                 bufs[i]->vlan_tci = 0;
428
429                 if (r->internal->vlan_strip)
430                         rte_vlan_strip(bufs[i]);
431
432                 r->stats.bytes += bufs[i]->pkt_len;
433         }
434
435         vhost_update_packet_xstats(r, bufs, nb_rx);
436
437 out:
438         rte_atomic32_set(&r->while_queuing, 0);
439
440         return nb_rx;
441 }
442
443 static uint16_t
444 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
445 {
446         struct vhost_queue *r = q;
447         uint16_t i, nb_tx = 0;
448         uint16_t nb_send = 0;
449
450         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
451                 return 0;
452
453         rte_atomic32_set(&r->while_queuing, 1);
454
455         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
456                 goto out;
457
458         for (i = 0; i < nb_bufs; i++) {
459                 struct rte_mbuf *m = bufs[i];
460
461                 /* Do VLAN tag insertion */
462                 if (m->ol_flags & PKT_TX_VLAN_PKT) {
463                         int error = rte_vlan_insert(&m);
464                         if (unlikely(error)) {
465                                 rte_pktmbuf_free(m);
466                                 continue;
467                         }
468                 }
469
470                 bufs[nb_send] = m;
471                 ++nb_send;
472         }
473
474         /* Enqueue packets to guest RX queue */
475         while (nb_send) {
476                 uint16_t nb_pkts;
477                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
478                                                  VHOST_MAX_PKT_BURST);
479
480                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
481                                                   &bufs[nb_tx], num);
482
483                 nb_tx += nb_pkts;
484                 nb_send -= nb_pkts;
485                 if (nb_pkts < num)
486                         break;
487         }
488
489         r->stats.pkts += nb_tx;
490         r->stats.missed_pkts += nb_bufs - nb_tx;
491
492         for (i = 0; likely(i < nb_tx); i++)
493                 r->stats.bytes += bufs[i]->pkt_len;
494
495         vhost_update_packet_xstats(r, bufs, nb_tx);
496
497         /* According to RFC 2863 (ifHCOutMulticastPkts, ifHCOutBroadcastPkts),
498          * the "multicast" and "broadcast" counters are incremented even for
499          * packets that were not transmitted successfully.
500          */
501         for (i = nb_tx; i < nb_bufs; i++)
502                 vhost_count_multicast_broadcast(r, bufs[i]);
503
504         for (i = 0; likely(i < nb_tx); i++)
505                 rte_pktmbuf_free(bufs[i]);
506 out:
507         rte_atomic32_set(&r->while_queuing, 0);
508
509         return nb_tx;
510 }
511
512 static int
513 eth_dev_configure(struct rte_eth_dev *dev)
514 {
515         struct pmd_internal *internal = dev->data->dev_private;
516         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
517
518         internal->vlan_strip = rxmode->hw_vlan_strip;
519
520         if (rxmode->hw_vlan_filter)
521                 RTE_LOG(WARNING, PMD,
522                         "vhost(%s): vlan filtering not available\n",
523                         internal->dev_name);
524
525         return 0;
526 }
527
528 static inline struct internal_list *
529 find_internal_resource(char *ifname)
530 {
531         int found = 0;
532         struct internal_list *list;
533         struct pmd_internal *internal;
534
535         if (ifname == NULL)
536                 return NULL;
537
538         pthread_mutex_lock(&internal_list_lock);
539
540         TAILQ_FOREACH(list, &internal_list, next) {
541                 internal = list->eth_dev->data->dev_private;
542                 if (!strcmp(internal->iface_name, ifname)) {
543                         found = 1;
544                         break;
545                 }
546         }
547
548         pthread_mutex_unlock(&internal_list_lock);
549
550         if (!found)
551                 return NULL;
552
553         return list;
554 }
555
556 static int
557 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
558 {
559         struct rte_vhost_vring vring;
560         struct vhost_queue *vq;
561         int ret = 0;
562
563         vq = dev->data->rx_queues[qid];
564         if (!vq) {
565                 RTE_LOG(ERR, PMD, "rxq%d is not set up yet\n", qid);
566                 return -1;
567         }
568
569         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
570         if (ret < 0) {
571                 RTE_LOG(ERR, PMD, "Failed to get rxq%d's vring\n", qid);
572                 return ret;
573         }
574         RTE_LOG(INFO, PMD, "Enable interrupt for rxq%d\n", qid);
575         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
576         rte_wmb();
577
578         return ret;
579 }
580
581 static int
582 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
583 {
584         struct rte_vhost_vring vring;
585         struct vhost_queue *vq;
586         int ret = 0;
587
588         vq = dev->data->rx_queues[qid];
589         if (!vq) {
590                 RTE_LOG(ERR, PMD, "rxq%d is not set up yet\n", qid);
591                 return -1;
592         }
593
594         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
595         if (ret < 0) {
596                 RTE_LOG(ERR, PMD, "Failed to get rxq%d's vring\n", qid);
597                 return ret;
598         }
599         RTE_LOG(INFO, PMD, "Disable interrupt for rxq%d\n", qid);
600         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
601         rte_wmb();
602
603         return 0;
604 }
605
606 static void
607 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
608 {
609         struct rte_intr_handle *intr_handle = dev->intr_handle;
610
611         if (intr_handle) {
612                 if (intr_handle->intr_vec)
613                         free(intr_handle->intr_vec);
614                 free(intr_handle);
615         }
616
617         dev->intr_handle = NULL;
618 }
619
620 static int
621 eth_vhost_install_intr(struct rte_eth_dev *dev)
622 {
623         struct rte_vhost_vring vring;
624         struct vhost_queue *vq;
625         int count = 0;
626         int nb_rxq = dev->data->nb_rx_queues;
627         int i;
628         int ret;
629
630         /* uninstall first if we are reconnecting */
631         if (dev->intr_handle)
632                 eth_vhost_uninstall_intr(dev);
633
634         dev->intr_handle = malloc(sizeof(*dev->intr_handle));
635         if (!dev->intr_handle) {
636                 RTE_LOG(ERR, PMD, "Fail to allocate intr_handle\n");
637                 return -ENOMEM;
638         }
639         memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
640
641         dev->intr_handle->efd_counter_size = sizeof(uint64_t);
642
643         dev->intr_handle->intr_vec =
644                 malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
645
646         if (!dev->intr_handle->intr_vec) {
647                 RTE_LOG(ERR, PMD,
648                         "Failed to allocate memory for interrupt vector\n");
649                 free(dev->intr_handle);
650                 return -ENOMEM;
651         }
652
653         RTE_LOG(INFO, PMD, "Prepare intr vec\n");
654         for (i = 0; i < nb_rxq; i++) {
655                 vq = dev->data->rx_queues[i];
656                 if (!vq) {
657                         RTE_LOG(INFO, PMD, "rxq-%d not set up yet, skip!\n", i);
658                         continue;
659                 }
660
661                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
662                 if (ret < 0) {
663                         RTE_LOG(INFO, PMD,
664                                 "Failed to get rxq-%d's vring, skip!\n", i);
665                         continue;
666                 }
667
668                 if (vring.kickfd < 0) {
669                         RTE_LOG(INFO, PMD,
670                                 "rxq-%d's kickfd is invalid, skip!\n", i);
671                         continue;
672                 }
673                 dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
674                 dev->intr_handle->efds[i] = vring.kickfd;
675                 count++;
676                 RTE_LOG(INFO, PMD, "Installed intr vec for rxq-%d\n", i);
677         }
678
679         dev->intr_handle->nb_efd = count;
680         dev->intr_handle->max_intr = count + 1;
681         dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
682
683         return 0;
684 }
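/*
 * Each guest TX ring's kickfd doubles as the Rx interrupt eventfd: the guest
 * writes to it when it adds buffers, so an application that arms Rx
 * interrupts on this port sleeps until the guest transmits.
 */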
685
686 static void
687 update_queuing_status(struct rte_eth_dev *dev)
688 {
689         struct pmd_internal *internal = dev->data->dev_private;
690         struct vhost_queue *vq;
691         unsigned int i;
692         int allow_queuing = 1;
693
694         if (!dev->data->rx_queues || !dev->data->tx_queues)
695                 return;
696
697         if (rte_atomic32_read(&internal->started) == 0 ||
698             rte_atomic32_read(&internal->dev_attached) == 0)
699                 allow_queuing = 0;
700
701         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
702         for (i = 0; i < dev->data->nb_rx_queues; i++) {
703                 vq = dev->data->rx_queues[i];
704                 if (vq == NULL)
705                         continue;
706                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
707                 while (rte_atomic32_read(&vq->while_queuing))
708                         rte_pause();
709         }
710
711         for (i = 0; i < dev->data->nb_tx_queues; i++) {
712                 vq = dev->data->tx_queues[i];
713                 if (vq == NULL)
714                         continue;
715                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
716                 while (rte_atomic32_read(&vq->while_queuing))
717                         rte_pause();
718         }
719 }
720
721 static void
722 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
723 {
724         struct vhost_queue *vq;
725         int i;
726
727         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
728                 vq = eth_dev->data->rx_queues[i];
729                 if (!vq)
730                         continue;
731                 vq->vid = internal->vid;
732                 vq->internal = internal;
733                 vq->port = eth_dev->data->port_id;
734         }
735         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
736                 vq = eth_dev->data->tx_queues[i];
737                 if (!vq)
738                         continue;
739                 vq->vid = internal->vid;
740                 vq->internal = internal;
741                 vq->port = eth_dev->data->port_id;
742         }
743 }
744
745 static int
746 new_device(int vid)
747 {
748         struct rte_eth_dev *eth_dev;
749         struct internal_list *list;
750         struct pmd_internal *internal;
751         struct rte_eth_conf *dev_conf;
752         unsigned i;
753         char ifname[PATH_MAX];
754 #ifdef RTE_LIBRTE_VHOST_NUMA
755         int newnode;
756 #endif
757
758         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
759         list = find_internal_resource(ifname);
760         if (list == NULL) {
761                 RTE_LOG(INFO, PMD, "Invalid device name: %s\n", ifname);
762                 return -1;
763         }
764
765         eth_dev = list->eth_dev;
766         internal = eth_dev->data->dev_private;
767         dev_conf = &eth_dev->data->dev_conf;
768
769 #ifdef RTE_LIBRTE_VHOST_NUMA
770         newnode = rte_vhost_get_numa_node(vid);
771         if (newnode >= 0)
772                 eth_dev->data->numa_node = newnode;
773 #endif
774
775         internal->vid = vid;
776         if (rte_atomic32_read(&internal->started) == 1) {
777                 queue_setup(eth_dev, internal);
778
779                 if (dev_conf->intr_conf.rxq) {
780                         if (eth_vhost_install_intr(eth_dev) < 0) {
781                                 RTE_LOG(INFO, PMD,
782                                         "Failed to install interrupt handler.\n");
783                                 return -1;
784                         }
785                 }
786         } else {
787                 RTE_LOG(INFO, PMD, "RX/TX queues do not exist yet\n");
788         }
789
790         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
791                 rte_vhost_enable_guest_notification(vid, i, 0);
792
793         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
794
795         eth_dev->data->dev_link.link_status = ETH_LINK_UP;
796
797         rte_atomic32_set(&internal->dev_attached, 1);
798         update_queuing_status(eth_dev);
799
800         RTE_LOG(INFO, PMD, "Vhost device %d created\n", vid);
801
802         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
803
804         return 0;
805 }
806
807 static void
808 destroy_device(int vid)
809 {
810         struct rte_eth_dev *eth_dev;
811         struct pmd_internal *internal;
812         struct vhost_queue *vq;
813         struct internal_list *list;
814         char ifname[PATH_MAX];
815         unsigned i;
816         struct rte_vhost_vring_state *state;
817
818         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
819         list = find_internal_resource(ifname);
820         if (list == NULL) {
821                 RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
822                 return;
823         }
824         eth_dev = list->eth_dev;
825         internal = eth_dev->data->dev_private;
826
827         rte_atomic32_set(&internal->dev_attached, 0);
828         update_queuing_status(eth_dev);
829
830         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
831
832         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
833                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
834                         vq = eth_dev->data->rx_queues[i];
835                         if (!vq)
836                                 continue;
837                         vq->vid = -1;
838                 }
839                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
840                         vq = eth_dev->data->tx_queues[i];
841                         if (!vq)
842                                 continue;
843                         vq->vid = -1;
844                 }
845         }
846
847         state = vring_states[eth_dev->data->port_id];
848         rte_spinlock_lock(&state->lock);
849         for (i = 0; i <= state->max_vring; i++) {
850                 state->cur[i] = false;
851                 state->seen[i] = false;
852         }
853         state->max_vring = 0;
854         rte_spinlock_unlock(&state->lock);
855
856         RTE_LOG(INFO, PMD, "Vhost device %d destroyed\n", vid);
857         eth_vhost_uninstall_intr(eth_dev);
858
859         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
860 }
861
862 static int
863 vring_state_changed(int vid, uint16_t vring, int enable)
864 {
865         struct rte_vhost_vring_state *state;
866         struct rte_eth_dev *eth_dev;
867         struct internal_list *list;
868         char ifname[PATH_MAX];
869
870         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
871         list = find_internal_resource(ifname);
872         if (list == NULL) {
873                 RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
874                 return -1;
875         }
876
877         eth_dev = list->eth_dev;
878         /* won't be NULL */
879         state = vring_states[eth_dev->data->port_id];
880         rte_spinlock_lock(&state->lock);
881         state->cur[vring] = enable;
882         state->max_vring = RTE_MAX(vring, state->max_vring);
883         rte_spinlock_unlock(&state->lock);
884
885         RTE_LOG(INFO, PMD, "vring%u is %s\n",
886                         vring, enable ? "enabled" : "disabled");
887
888         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
889
890         return 0;
891 }
892
893 static struct vhost_device_ops vhost_ops = {
894         .new_device          = new_device,
895         .destroy_device      = destroy_device,
896         .vring_state_changed = vring_state_changed,
897 };
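/*
 * These callbacks are registered with the vhost-user library below (see
 * rte_vhost_driver_callback_register()) and are invoked when a frontend such
 * as QEMU or virtio-user connects, disconnects, or toggles a vring.
 */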
898
899 int
900 rte_eth_vhost_get_queue_event(uint16_t port_id,
901                 struct rte_eth_vhost_queue_event *event)
902 {
903         struct rte_vhost_vring_state *state;
904         unsigned int i;
905         int idx;
906
907         if (port_id >= RTE_MAX_ETHPORTS) {
908                 RTE_LOG(ERR, PMD, "Invalid port id\n");
909                 return -1;
910         }
911
912         state = vring_states[port_id];
913         if (!state) {
914                 RTE_LOG(ERR, PMD, "Unused port\n");
915                 return -1;
916         }
917
918         rte_spinlock_lock(&state->lock);
919         for (i = 0; i <= state->max_vring; i++) {
920                 idx = state->index++ % (state->max_vring + 1);
921
922                 if (state->cur[idx] != state->seen[idx]) {
923                         state->seen[idx] = state->cur[idx];
924                         event->queue_id = idx / 2;
925                         event->rx = idx & 1;
926                         event->enable = state->cur[idx];
927                         rte_spinlock_unlock(&state->lock);
928                         return 0;
929                 }
930         }
931         rte_spinlock_unlock(&state->lock);
932
933         return -1;
934 }
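/*
 * Illustrative use only (not part of this driver): an application would
 * typically drain queue-state events from a handler registered for
 * RTE_ETH_EVENT_QUEUE_STATE, e.g.:
 *
 *	struct rte_eth_vhost_queue_event ev;
 *
 *	while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *		printf("queue %u (%s) %s\n", ev.queue_id,
 *		       ev.rx ? "rx" : "tx",
 *		       ev.enable ? "enabled" : "disabled");
 */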
935
936 int
937 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
938 {
939         struct internal_list *list;
940         struct rte_eth_dev *eth_dev;
941         struct vhost_queue *vq;
942         int vid = -1;
943
944         if (!rte_eth_dev_is_valid_port(port_id))
945                 return -1;
946
947         pthread_mutex_lock(&internal_list_lock);
948
949         TAILQ_FOREACH(list, &internal_list, next) {
950                 eth_dev = list->eth_dev;
951                 if (eth_dev->data->port_id == port_id) {
952                         vq = eth_dev->data->rx_queues[0];
953                         if (vq) {
954                                 vid = vq->vid;
955                         }
956                         break;
957                 }
958         }
959
960         pthread_mutex_unlock(&internal_list_lock);
961
962         return vid;
963 }
964
965 static int
966 eth_dev_start(struct rte_eth_dev *eth_dev)
967 {
968         struct pmd_internal *internal = eth_dev->data->dev_private;
969         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
970
971         queue_setup(eth_dev, internal);
972
973         if (rte_atomic32_read(&internal->dev_attached) == 1) {
974                 if (dev_conf->intr_conf.rxq) {
975                         if (eth_vhost_install_intr(eth_dev) < 0) {
976                                 RTE_LOG(INFO, PMD,
977                                         "Failed to install interrupt handler.\n");
978                                 return -1;
979                         }
980                 }
981         }
982
983         rte_atomic32_set(&internal->started, 1);
984         update_queuing_status(eth_dev);
985
986         return 0;
987 }
988
989 static void
990 eth_dev_stop(struct rte_eth_dev *dev)
991 {
992         struct pmd_internal *internal = dev->data->dev_private;
993
994         rte_atomic32_set(&internal->started, 0);
995         update_queuing_status(dev);
996 }
997
998 static void
999 eth_dev_close(struct rte_eth_dev *dev)
1000 {
1001         struct pmd_internal *internal;
1002         struct internal_list *list;
1003         unsigned int i;
1004
1005         internal = dev->data->dev_private;
1006         if (!internal)
1007                 return;
1008
1009         eth_dev_stop(dev);
1010
1011         rte_vhost_driver_unregister(internal->iface_name);
1012
1013         list = find_internal_resource(internal->iface_name);
1014         if (!list)
1015                 return;
1016
1017         pthread_mutex_lock(&internal_list_lock);
1018         TAILQ_REMOVE(&internal_list, list, next);
1019         pthread_mutex_unlock(&internal_list_lock);
1020         rte_free(list);
1021
1022         if (dev->data->rx_queues)
1023                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1024                         rte_free(dev->data->rx_queues[i]);
1025
1026         if (dev->data->tx_queues)
1027                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1028                         rte_free(dev->data->tx_queues[i]);
1029
1030         rte_free(dev->data->mac_addrs);
1031         free(internal->dev_name);
1032         free(internal->iface_name);
1033         rte_free(internal);
1034
1035         dev->data->dev_private = NULL;
1036 }
1037
1038 static int
1039 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1040                    uint16_t nb_rx_desc __rte_unused,
1041                    unsigned int socket_id,
1042                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1043                    struct rte_mempool *mb_pool)
1044 {
1045         struct vhost_queue *vq;
1046
1047         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1048                         RTE_CACHE_LINE_SIZE, socket_id);
1049         if (vq == NULL) {
1050                 RTE_LOG(ERR, PMD, "Failed to allocate memory for rx queue\n");
1051                 return -ENOMEM;
1052         }
1053
1054         vq->mb_pool = mb_pool;
1055         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1056         dev->data->rx_queues[rx_queue_id] = vq;
1057
1058         return 0;
1059 }
1060
1061 static int
1062 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1063                    uint16_t nb_tx_desc __rte_unused,
1064                    unsigned int socket_id,
1065                    const struct rte_eth_txconf *tx_conf __rte_unused)
1066 {
1067         struct vhost_queue *vq;
1068
1069         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1070                         RTE_CACHE_LINE_SIZE, socket_id);
1071         if (vq == NULL) {
1072                 RTE_LOG(ERR, PMD, "Failed to allocate memory for tx queue\n");
1073                 return -ENOMEM;
1074         }
1075
1076         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1077         dev->data->tx_queues[tx_queue_id] = vq;
1078
1079         return 0;
1080 }
1081
1082 static void
1083 eth_dev_info(struct rte_eth_dev *dev,
1084              struct rte_eth_dev_info *dev_info)
1085 {
1086         struct pmd_internal *internal;
1087
1088         internal = dev->data->dev_private;
1089         if (internal == NULL) {
1090                 RTE_LOG(ERR, PMD, "Invalid device specified\n");
1091                 return;
1092         }
1093
1094         dev_info->max_mac_addrs = 1;
1095         dev_info->max_rx_pktlen = (uint32_t)-1;
1096         dev_info->max_rx_queues = internal->max_queues;
1097         dev_info->max_tx_queues = internal->max_queues;
1098         dev_info->min_rx_bufsize = 0;
1099 }
1100
1101 static int
1102 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1103 {
1104         unsigned i;
1105         unsigned long rx_total = 0, tx_total = 0, tx_missed_total = 0;
1106         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1107         struct vhost_queue *vq;
1108
1109         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1110                         i < dev->data->nb_rx_queues; i++) {
1111                 if (dev->data->rx_queues[i] == NULL)
1112                         continue;
1113                 vq = dev->data->rx_queues[i];
1114                 stats->q_ipackets[i] = vq->stats.pkts;
1115                 rx_total += stats->q_ipackets[i];
1116
1117                 stats->q_ibytes[i] = vq->stats.bytes;
1118                 rx_total_bytes += stats->q_ibytes[i];
1119         }
1120
1121         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1122                         i < dev->data->nb_tx_queues; i++) {
1123                 if (dev->data->tx_queues[i] == NULL)
1124                         continue;
1125                 vq = dev->data->tx_queues[i];
1126                 stats->q_opackets[i] = vq->stats.pkts;
1127                 tx_missed_total += vq->stats.missed_pkts;
1128                 tx_total += stats->q_opackets[i];
1129
1130                 stats->q_obytes[i] = vq->stats.bytes;
1131                 tx_total_bytes += stats->q_obytes[i];
1132         }
1133
1134         stats->ipackets = rx_total;
1135         stats->opackets = tx_total;
1136         stats->oerrors = tx_missed_total;
1137         stats->ibytes = rx_total_bytes;
1138         stats->obytes = tx_total_bytes;
1139
1140         return 0;
1141 }
1142
1143 static void
1144 eth_stats_reset(struct rte_eth_dev *dev)
1145 {
1146         struct vhost_queue *vq;
1147         unsigned i;
1148
1149         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1150                 if (dev->data->rx_queues[i] == NULL)
1151                         continue;
1152                 vq = dev->data->rx_queues[i];
1153                 vq->stats.pkts = 0;
1154                 vq->stats.bytes = 0;
1155         }
1156         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1157                 if (dev->data->tx_queues[i] == NULL)
1158                         continue;
1159                 vq = dev->data->tx_queues[i];
1160                 vq->stats.pkts = 0;
1161                 vq->stats.bytes = 0;
1162                 vq->stats.missed_pkts = 0;
1163         }
1164 }
1165
1166 static void
1167 eth_queue_release(void *q)
1168 {
1169         rte_free(q);
1170 }
1171
1172 static int
1173 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1174 {
1175         /*
1176          * vhost does not hang onto mbufs: eth_vhost_tx() copies the packet
1177          * data and frees the mbuf, so there is nothing to clean up here.
1178          */
1179         return 0;
1180 }
1181
1182 static int
1183 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1184                 int wait_to_complete __rte_unused)
1185 {
1186         return 0;
1187 }
1188
1189 static uint32_t
1190 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1191 {
1192         struct vhost_queue *vq;
1193
1194         vq = dev->data->rx_queues[rx_queue_id];
1195         if (vq == NULL)
1196                 return 0;
1197
1198         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1199 }
1200
1201 static const struct eth_dev_ops ops = {
1202         .dev_start = eth_dev_start,
1203         .dev_stop = eth_dev_stop,
1204         .dev_close = eth_dev_close,
1205         .dev_configure = eth_dev_configure,
1206         .dev_infos_get = eth_dev_info,
1207         .rx_queue_setup = eth_rx_queue_setup,
1208         .tx_queue_setup = eth_tx_queue_setup,
1209         .rx_queue_release = eth_queue_release,
1210         .tx_queue_release = eth_queue_release,
1211         .tx_done_cleanup = eth_tx_done_cleanup,
1212         .rx_queue_count = eth_rx_queue_count,
1213         .link_update = eth_link_update,
1214         .stats_get = eth_stats_get,
1215         .stats_reset = eth_stats_reset,
1216         .xstats_reset = vhost_dev_xstats_reset,
1217         .xstats_get = vhost_dev_xstats_get,
1218         .xstats_get_names = vhost_dev_xstats_get_names,
1219         .rx_queue_intr_enable = eth_rxq_intr_enable,
1220         .rx_queue_intr_disable = eth_rxq_intr_disable,
1221 };
1222
1223 static struct rte_vdev_driver pmd_vhost_drv;
1224
1225 static int
1226 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1227         int16_t queues, const unsigned int numa_node, uint64_t flags)
1228 {
1229         const char *name = rte_vdev_device_name(dev);
1230         struct rte_eth_dev_data *data;
1231         struct pmd_internal *internal = NULL;
1232         struct rte_eth_dev *eth_dev = NULL;
1233         struct ether_addr *eth_addr = NULL;
1234         struct rte_vhost_vring_state *vring_state = NULL;
1235         struct internal_list *list = NULL;
1236
1237         RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa socket %u\n",
1238                 numa_node);
1239
1240         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
1241         if (list == NULL)
1242                 goto error;
1243
1244         /* reserve an ethdev entry */
1245         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1246         if (eth_dev == NULL)
1247                 goto error;
1248
1249         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1250         if (eth_addr == NULL)
1251                 goto error;
1252         *eth_addr = base_eth_addr;
1253         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1254
1255         vring_state = rte_zmalloc_socket(name,
1256                         sizeof(*vring_state), 0, numa_node);
1257         if (vring_state == NULL)
1258                 goto error;
1259
1260         /* now put it all together:
1261          * - store the device and interface names in internal,
1262          * - add the list entry and per-port vring state,
1263          * - and hook up dev_ops plus the rx/tx burst functions
1264          */
1265         internal = eth_dev->data->dev_private;
1266         internal->dev_name = strdup(name);
1267         if (internal->dev_name == NULL)
1268                 goto error;
1269         internal->iface_name = strdup(iface_name);
1270         if (internal->iface_name == NULL)
1271                 goto error;
1272
1273         list->eth_dev = eth_dev;
1274         pthread_mutex_lock(&internal_list_lock);
1275         TAILQ_INSERT_TAIL(&internal_list, list, next);
1276         pthread_mutex_unlock(&internal_list_lock);
1277
1278         rte_spinlock_init(&vring_state->lock);
1279         vring_states[eth_dev->data->port_id] = vring_state;
1280
1281         data = eth_dev->data;
1282         data->nb_rx_queues = queues;
1283         data->nb_tx_queues = queues;
1284         internal->max_queues = queues;
1285         data->dev_link = pmd_link;
1286         data->mac_addrs = eth_addr;
1287         data->dev_flags = RTE_ETH_DEV_INTR_LSC;
1288
1289         eth_dev->dev_ops = &ops;
1290
1291         /* finally assign rx and tx ops */
1292         eth_dev->rx_pkt_burst = eth_vhost_rx;
1293         eth_dev->tx_pkt_burst = eth_vhost_tx;
1294
1295         if (rte_vhost_driver_register(iface_name, flags))
1296                 goto error;
1297
1298         if (rte_vhost_driver_callback_register(iface_name, &vhost_ops) < 0) {
1299                 RTE_LOG(ERR, PMD, "Can't register callbacks\n");
1300                 goto error;
1301         }
1302
1303         if (rte_vhost_driver_start(iface_name) < 0) {
1304                 RTE_LOG(ERR, PMD, "Failed to start driver for %s\n",
1305                         iface_name);
1306                 goto error;
1307         }
1308
1309         return data->port_id;
1310
1311 error:
1312         if (internal) {
1313                 free(internal->iface_name);
1314                 free(internal->dev_name);
1315         }
1316         rte_free(vring_state);
1317         rte_free(eth_addr);
1318         if (eth_dev)
1319                 rte_eth_dev_release_port(eth_dev);
1320         rte_free(internal);
1321         rte_free(list);
1322
1323         return -1;
1324 }
1325
1326 static inline int
1327 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1328 {
1329         const char **iface_name = extra_args;
1330
1331         if (value == NULL)
1332                 return -1;
1333
1334         *iface_name = value;
1335
1336         return 0;
1337 }
1338
1339 static inline int
1340 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1341 {
1342         uint16_t *n = extra_args;
1343
1344         if (value == NULL || extra_args == NULL)
1345                 return -EINVAL;
1346
1347         *n = (uint16_t)strtoul(value, NULL, 0);
1348         if (*n == USHRT_MAX && errno == ERANGE)
1349                 return -1;
1350
1351         return 0;
1352 }
1353
1354 static int
1355 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1356 {
1357         struct rte_kvargs *kvlist = NULL;
1358         int ret = 0;
1359         char *iface_name;
1360         uint16_t queues;
1361         uint64_t flags = 0;
1362         int client_mode = 0;
1363         int dequeue_zero_copy = 0;
1364         int iommu_support = 0;
1365         struct rte_eth_dev *eth_dev;
1366         const char *name = rte_vdev_device_name(dev);
1367
1368         RTE_LOG(INFO, PMD, "Initializing pmd_vhost for %s\n", name);
1369
1370         if (rte_eal_process_type() == RTE_PROC_SECONDARY &&
1371             strlen(rte_vdev_device_args(dev)) == 0) {
1372                 eth_dev = rte_eth_dev_attach_secondary(name);
1373                 if (!eth_dev) {
1374                         RTE_LOG(ERR, PMD, "Failed to probe %s\n", name);
1375                         return -1;
1376                 }
1377                 /* TODO: request info from primary to set up Rx and Tx */
1378                 eth_dev->dev_ops = &ops;
1379                 return 0;
1380         }
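        /*
         * rte_eth_dev_attach_secondary() looks up the rte_eth_dev_data that
         * the primary process placed in shared memory for this device name;
         * only dev_ops is re-pointed locally here, and the rx/tx burst
         * functions are not yet set up (see the TODO above).
         */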
1381
1382         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1383         if (kvlist == NULL)
1384                 return -1;
1385
1386         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1387                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1388                                          &open_iface, &iface_name);
1389                 if (ret < 0)
1390                         goto out_free;
1391         } else {
1392                 ret = -1;
1393                 goto out_free;
1394         }
1395
1396         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1397                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1398                                          &open_int, &queues);
1399                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1400                         goto out_free;
1401
1402         } else
1403                 queues = 1;
1404
1405         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1406                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1407                                          &open_int, &client_mode);
1408                 if (ret < 0)
1409                         goto out_free;
1410
1411                 if (client_mode)
1412                         flags |= RTE_VHOST_USER_CLIENT;
1413         }
1414
1415         if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1416                 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1417                                          &open_int, &dequeue_zero_copy);
1418                 if (ret < 0)
1419                         goto out_free;
1420
1421                 if (dequeue_zero_copy)
1422                         flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1423         }
1424
1425         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1426                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1427                                          &open_int, &iommu_support);
1428                 if (ret < 0)
1429                         goto out_free;
1430
1431                 if (iommu_support)
1432                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1433         }
1434
1435         if (dev->device.numa_node == SOCKET_ID_ANY)
1436                 dev->device.numa_node = rte_socket_id();
1437
1438         ret = eth_dev_vhost_create(dev, iface_name, queues,
1439                         dev->device.numa_node, flags) < 0 ? -1 : 0;
1440
1441 out_free:
1442         rte_kvargs_free(kvlist);
1443         return ret;
1444 }
1445
1446 static int
1447 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1448 {
1449         const char *name;
1450         struct rte_eth_dev *eth_dev = NULL;
1451
1452         name = rte_vdev_device_name(dev);
1453         RTE_LOG(INFO, PMD, "Un-initializing pmd_vhost for %s\n", name);
1454
1455         /* find an ethdev entry */
1456         eth_dev = rte_eth_dev_allocated(name);
1457         if (eth_dev == NULL)
1458                 return -ENODEV;
1459
1460         eth_dev_close(eth_dev);
1461
1462         rte_free(vring_states[eth_dev->data->port_id]);
1463         vring_states[eth_dev->data->port_id] = NULL;
1464
1465         rte_eth_dev_release_port(eth_dev);
1466
1467         return 0;
1468 }
1469
1470 static struct rte_vdev_driver pmd_vhost_drv = {
1471         .probe = rte_pmd_vhost_probe,
1472         .remove = rte_pmd_vhost_remove,
1473 };
1474
1475 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1476 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1477 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1478         "iface=<ifc> queues=<int> client=<0|1> "
1479         "dequeue-zero-copy=<0|1> iommu-support=<0|1>");