net/vhost: fix probing in secondary process
[dpdk.git] / drivers / net / vhost / rte_eth_vhost.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8
9 #include <rte_mbuf.h>
10 #include <rte_ethdev_driver.h>
11 #include <rte_ethdev_vdev.h>
12 #include <rte_malloc.h>
13 #include <rte_memcpy.h>
14 #include <rte_bus_vdev.h>
15 #include <rte_kvargs.h>
16 #include <rte_vhost.h>
17 #include <rte_spinlock.h>
18
19 #include "rte_eth_vhost.h"
20
/* Log type id handed to rte_log() by every VHOST_LOG() call in this file. */
static int vhost_logtype;

/*
 * Emit a message at RTE_LOG_<level> under vhost_logtype. The arguments after
 * the level are a printf-style format string followed by its values.
 */
#define VHOST_LOG(level, ...) \
	rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
25
/* Virtqueue direction indices; VIRTIO_QNUM is the number of directions. */
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

/* Option key strings; all of them are collected in valid_arguments[] below. */
#define ETH_VHOST_IFACE_ARG		"iface"
#define ETH_VHOST_QUEUES_ARG		"queues"
#define ETH_VHOST_CLIENT_ARG		"client"
#define ETH_VHOST_DEQUEUE_ZERO_COPY	"dequeue-zero-copy"
#define ETH_VHOST_IOMMU_SUPPORT		"iommu-support"
#define ETH_VHOST_POSTCOPY_SUPPORT	"postcopy-support"
#define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
/* Largest chunk passed to a single rte_vhost_{dequeue,enqueue}_burst() call. */
#define VHOST_MAX_PKT_BURST 32

/* NULL-terminated list of the option keys defined above. */
static const char *valid_arguments[] = {
	ETH_VHOST_IFACE_ARG,
	ETH_VHOST_QUEUES_ARG,
	ETH_VHOST_CLIENT_ARG,
	ETH_VHOST_DEQUEUE_ZERO_COPY,
	ETH_VHOST_IOMMU_SUPPORT,
	ETH_VHOST_POSTCOPY_SUPPORT,
	ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
	NULL
};
47
/*
 * Base Ethernet address: the first five bytes are the ASCII codes of
 * "VHOST" and the last byte is zero.
 * NOTE(review): how the last byte is assigned per port is not visible in
 * this part of the file.
 */
static struct rte_ether_addr base_eth_addr = {
	.addr_bytes = {
		0x56 /* V */,
		0x48 /* H */,
		0x4F /* O */,
		0x53 /* S */,
		0x54 /* T */,
		0x00
	}
};
58
/*
 * Indices into vhost_stats.xstats[].
 *
 * The order of the size buckets is significant: for packet lengths between
 * 65 and 1023 bytes vhost_update_packet_xstats() computes the index
 * arithmetically from the position of the highest set bit of the length, so
 * VHOST_65_TO_127_PKT .. VHOST_512_TO_1023_PKT must stay at values 2..5.
 */
enum vhost_xstats_pkts {
	VHOST_UNDERSIZE_PKT = 0,
	VHOST_64_PKT,
	VHOST_65_TO_127_PKT,
	VHOST_128_TO_255_PKT,
	VHOST_256_TO_511_PKT,
	VHOST_512_TO_1023_PKT,
	VHOST_1024_TO_1522_PKT,
	VHOST_1523_TO_MAX_PKT,
	VHOST_BROADCAST_PKT,
	VHOST_MULTICAST_PKT,
	VHOST_UNICAST_PKT,
	VHOST_ERRORS_PKT,
	VHOST_ERRORS_FRAGMENTED,
	VHOST_ERRORS_JABBER,
	VHOST_UNKNOWN_PROTOCOL,
	VHOST_XSTATS_MAX,
};

/* Counters kept per queue and updated by the RX/TX burst functions. */
struct vhost_stats {
	uint64_t pkts;		/* packets successfully received or sent */
	uint64_t bytes;		/* sum of pkt_len over those packets */
	uint64_t missed_pkts;	/* TX only: packets offered but not enqueued */
	uint64_t xstats[VHOST_XSTATS_MAX]; /* indexed by enum vhost_xstats_pkts */
};
84
/*
 * Per-queue state. The same structure backs both the RX and the TX queue
 * pointers stored in the ethdev data.
 */
struct vhost_queue {
	/* vhost device id: copied from pmd_internal.vid by queue_setup(),
	 * reset to -1 by destroy_device().
	 */
	int vid;
	/* Written by update_queuing_status(); burst functions bail out while 0. */
	rte_atomic32_t allow_queuing;
	/* Set to 1 by a burst function for as long as it uses the queue. */
	rte_atomic32_t while_queuing;
	struct pmd_internal *internal;
	/* Mempool handed to rte_vhost_dequeue_burst(). */
	struct rte_mempool *mb_pool;
	/* Ethdev port id stamped into received mbufs. */
	uint16_t port;
	/* Vring index passed to the vhost enqueue/dequeue calls. */
	uint16_t virtqueue_id;
	struct vhost_stats stats;
};
95
/* Driver-private data of one vhost ethdev (eth_dev->data->dev_private). */
struct pmd_internal {
	/* 1 between new_device() and destroy_device(), 0 otherwise. */
	rte_atomic32_t dev_attached;
	char *dev_name;
	/* Name the vhost driver is registered under; also the lookup key in
	 * find_internal_resource().
	 */
	char *iface_name;
	/* Flags passed to rte_vhost_driver_register(). */
	uint64_t flags;
	/* Feature bits passed to rte_vhost_driver_disable_features(), if any. */
	uint64_t disable_flags;
	uint16_t max_queues;
	/* vhost device id recorded by new_device(). */
	int vid;
	/* Queuing is only allowed while both started and dev_attached are set. */
	rte_atomic32_t started;
	/* When non-zero, eth_vhost_rx() calls rte_vlan_strip() on each packet. */
	uint8_t vlan_strip;
};
107
/* Node of the global list of ethdevs created by this driver. */
struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
/* Global list of vhost ethdevs, searched by interface name. */
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

/* Protects insertions into and traversals of internal_list. */
static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
118
/*
 * Link description with status "down" and full duplex.
 * NOTE(review): where this template is applied is not visible in this part
 * of the file.
 */
static struct rte_eth_link pmd_link = {
		.link_speed = 10000,
		.link_duplex = ETH_LINK_FULL_DUPLEX,
		.link_status = ETH_LINK_DOWN
};
124
/* Per-port record of vring enable/disable state. */
struct rte_vhost_vring_state {
	/* Guards every field below. */
	rte_spinlock_t lock;

	/* Latest state reported through vring_state_changed(), per vring. */
	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
	/* Cleared together with cur[] in destroy_device(). */
	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
	unsigned int index;
	/* Highest vring index seen so far; reset to 0 in destroy_device(). */
	unsigned int max_vring;
};

/* Vring state of each port, indexed by ethdev port id. */
static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
135
/* Size of the name buffer in struct vhost_xstats_name_off. */
#define VHOST_XSTATS_NAME_SIZE 64

/* Maps an extended-statistic name to the byte offset of its counter. */
struct vhost_xstats_name_off {
	char name[VHOST_XSTATS_NAME_SIZE];
	/* Offset of a uint64_t counter, measured from a struct vhost_queue. */
	uint64_t offset;
};
142
/*
 * RX extended statistics. "rx_" is prepended to each name when the names are
 * reported; the values are summed over all RX queues.
 */
static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
	{"good_packets",
	 offsetof(struct vhost_queue, stats.pkts)},
	{"total_bytes",
	 offsetof(struct vhost_queue, stats.bytes)},
	{"missed_pkts",
	 offsetof(struct vhost_queue, stats.missed_pkts)},
	{"broadcast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
	{"multicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
	{"unicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
	{"undersize_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
	{"size_64_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
	{"size_65_to_127_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
	{"size_128_to_255_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
	{"size_256_to_511_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
	{"size_512_to_1023_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
	{"size_1024_to_1522_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
	{"size_1523_to_max_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
	{"errors_with_bad_CRC",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
	{"fragmented_errors",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
	{"jabber_errors",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
	{"unknown_protos_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
};
182
/*
 * TX extended statistics. "tx_" is prepended to each name when the names are
 * reported; the values are summed over all TX queues. Unlike the RX table,
 * this one has no fragmented, jabber or unknown-protocol entries.
 */
static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
	{"good_packets",
	 offsetof(struct vhost_queue, stats.pkts)},
	{"total_bytes",
	 offsetof(struct vhost_queue, stats.bytes)},
	{"missed_pkts",
	 offsetof(struct vhost_queue, stats.missed_pkts)},
	{"broadcast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
	{"multicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
	{"unicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
	{"undersize_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
	{"size_64_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
	{"size_65_to_127_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
	{"size_128_to_255_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
	{"size_256_to_511_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
	{"size_512_to_1023_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
	{"size_1024_to_1522_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
	{"size_1523_to_max_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
	{"errors_with_bad_CRC",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
};
216
/* Number of entries in vhost_rxport_stat_strings[]. */
#define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
				sizeof(vhost_rxport_stat_strings[0]))

/* Number of entries in vhost_txport_stat_strings[]. */
#define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
				sizeof(vhost_txport_stat_strings[0]))
222
223 static int
224 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
225 {
226         struct vhost_queue *vq = NULL;
227         unsigned int i = 0;
228
229         for (i = 0; i < dev->data->nb_rx_queues; i++) {
230                 vq = dev->data->rx_queues[i];
231                 if (!vq)
232                         continue;
233                 memset(&vq->stats, 0, sizeof(vq->stats));
234         }
235         for (i = 0; i < dev->data->nb_tx_queues; i++) {
236                 vq = dev->data->tx_queues[i];
237                 if (!vq)
238                         continue;
239                 memset(&vq->stats, 0, sizeof(vq->stats));
240         }
241
242         return 0;
243 }
244
245 static int
246 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
247                            struct rte_eth_xstat_name *xstats_names,
248                            unsigned int limit __rte_unused)
249 {
250         unsigned int t = 0;
251         int count = 0;
252         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
253
254         if (!xstats_names)
255                 return nstats;
256         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
257                 snprintf(xstats_names[count].name,
258                          sizeof(xstats_names[count].name),
259                          "rx_%s", vhost_rxport_stat_strings[t].name);
260                 count++;
261         }
262         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
263                 snprintf(xstats_names[count].name,
264                          sizeof(xstats_names[count].name),
265                          "tx_%s", vhost_txport_stat_strings[t].name);
266                 count++;
267         }
268         return count;
269 }
270
271 static int
272 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
273                      unsigned int n)
274 {
275         unsigned int i;
276         unsigned int t;
277         unsigned int count = 0;
278         struct vhost_queue *vq = NULL;
279         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
280
281         if (n < nxstats)
282                 return nxstats;
283
284         for (i = 0; i < dev->data->nb_rx_queues; i++) {
285                 vq = dev->data->rx_queues[i];
286                 if (!vq)
287                         continue;
288                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
289                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
290                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
291         }
292         for (i = 0; i < dev->data->nb_tx_queues; i++) {
293                 vq = dev->data->tx_queues[i];
294                 if (!vq)
295                         continue;
296                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
297                                 + vq->stats.missed_pkts
298                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
299                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
300         }
301         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
302                 xstats[count].value = 0;
303                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
304                         vq = dev->data->rx_queues[i];
305                         if (!vq)
306                                 continue;
307                         xstats[count].value +=
308                                 *(uint64_t *)(((char *)vq)
309                                 + vhost_rxport_stat_strings[t].offset);
310                 }
311                 xstats[count].id = count;
312                 count++;
313         }
314         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
315                 xstats[count].value = 0;
316                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
317                         vq = dev->data->tx_queues[i];
318                         if (!vq)
319                                 continue;
320                         xstats[count].value +=
321                                 *(uint64_t *)(((char *)vq)
322                                 + vhost_txport_stat_strings[t].offset);
323                 }
324                 xstats[count].id = count;
325                 count++;
326         }
327         return count;
328 }
329
330 static inline void
331 vhost_count_multicast_broadcast(struct vhost_queue *vq,
332                                 struct rte_mbuf *mbuf)
333 {
334         struct rte_ether_addr *ea = NULL;
335         struct vhost_stats *pstats = &vq->stats;
336
337         ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
338         if (rte_is_multicast_ether_addr(ea)) {
339                 if (rte_is_broadcast_ether_addr(ea))
340                         pstats->xstats[VHOST_BROADCAST_PKT]++;
341                 else
342                         pstats->xstats[VHOST_MULTICAST_PKT]++;
343         }
344 }
345
346 static void
347 vhost_update_packet_xstats(struct vhost_queue *vq,
348                            struct rte_mbuf **bufs,
349                            uint16_t count)
350 {
351         uint32_t pkt_len = 0;
352         uint64_t i = 0;
353         uint64_t index;
354         struct vhost_stats *pstats = &vq->stats;
355
356         for (i = 0; i < count ; i++) {
357                 pkt_len = bufs[i]->pkt_len;
358                 if (pkt_len == 64) {
359                         pstats->xstats[VHOST_64_PKT]++;
360                 } else if (pkt_len > 64 && pkt_len < 1024) {
361                         index = (sizeof(pkt_len) * 8)
362                                 - __builtin_clz(pkt_len) - 5;
363                         pstats->xstats[index]++;
364                 } else {
365                         if (pkt_len < 64)
366                                 pstats->xstats[VHOST_UNDERSIZE_PKT]++;
367                         else if (pkt_len <= 1522)
368                                 pstats->xstats[VHOST_1024_TO_1522_PKT]++;
369                         else if (pkt_len > 1522)
370                                 pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
371                 }
372                 vhost_count_multicast_broadcast(vq, bufs[i]);
373         }
374 }
375
376 static uint16_t
377 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
378 {
379         struct vhost_queue *r = q;
380         uint16_t i, nb_rx = 0;
381         uint16_t nb_receive = nb_bufs;
382
383         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
384                 return 0;
385
386         rte_atomic32_set(&r->while_queuing, 1);
387
388         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
389                 goto out;
390
391         /* Dequeue packets from guest TX queue */
392         while (nb_receive) {
393                 uint16_t nb_pkts;
394                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
395                                                  VHOST_MAX_PKT_BURST);
396
397                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
398                                                   r->mb_pool, &bufs[nb_rx],
399                                                   num);
400
401                 nb_rx += nb_pkts;
402                 nb_receive -= nb_pkts;
403                 if (nb_pkts < num)
404                         break;
405         }
406
407         r->stats.pkts += nb_rx;
408
409         for (i = 0; likely(i < nb_rx); i++) {
410                 bufs[i]->port = r->port;
411                 bufs[i]->vlan_tci = 0;
412
413                 if (r->internal->vlan_strip)
414                         rte_vlan_strip(bufs[i]);
415
416                 r->stats.bytes += bufs[i]->pkt_len;
417         }
418
419         vhost_update_packet_xstats(r, bufs, nb_rx);
420
421 out:
422         rte_atomic32_set(&r->while_queuing, 0);
423
424         return nb_rx;
425 }
426
427 static uint16_t
428 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
429 {
430         struct vhost_queue *r = q;
431         uint16_t i, nb_tx = 0;
432         uint16_t nb_send = 0;
433
434         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
435                 return 0;
436
437         rte_atomic32_set(&r->while_queuing, 1);
438
439         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
440                 goto out;
441
442         for (i = 0; i < nb_bufs; i++) {
443                 struct rte_mbuf *m = bufs[i];
444
445                 /* Do VLAN tag insertion */
446                 if (m->ol_flags & PKT_TX_VLAN_PKT) {
447                         int error = rte_vlan_insert(&m);
448                         if (unlikely(error)) {
449                                 rte_pktmbuf_free(m);
450                                 continue;
451                         }
452                 }
453
454                 bufs[nb_send] = m;
455                 ++nb_send;
456         }
457
458         /* Enqueue packets to guest RX queue */
459         while (nb_send) {
460                 uint16_t nb_pkts;
461                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
462                                                  VHOST_MAX_PKT_BURST);
463
464                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
465                                                   &bufs[nb_tx], num);
466
467                 nb_tx += nb_pkts;
468                 nb_send -= nb_pkts;
469                 if (nb_pkts < num)
470                         break;
471         }
472
473         r->stats.pkts += nb_tx;
474         r->stats.missed_pkts += nb_bufs - nb_tx;
475
476         for (i = 0; likely(i < nb_tx); i++)
477                 r->stats.bytes += bufs[i]->pkt_len;
478
479         vhost_update_packet_xstats(r, bufs, nb_tx);
480
481         /* According to RFC2863 page42 section ifHCOutMulticastPkts and
482          * ifHCOutBroadcastPkts, the counters "multicast" and "broadcast"
483          * are increased when packets are not transmitted successfully.
484          */
485         for (i = nb_tx; i < nb_bufs; i++)
486                 vhost_count_multicast_broadcast(r, bufs[i]);
487
488         for (i = 0; likely(i < nb_tx); i++)
489                 rte_pktmbuf_free(bufs[i]);
490 out:
491         rte_atomic32_set(&r->while_queuing, 0);
492
493         return nb_tx;
494 }
495
496 static inline struct internal_list *
497 find_internal_resource(char *ifname)
498 {
499         int found = 0;
500         struct internal_list *list;
501         struct pmd_internal *internal;
502
503         if (ifname == NULL)
504                 return NULL;
505
506         pthread_mutex_lock(&internal_list_lock);
507
508         TAILQ_FOREACH(list, &internal_list, next) {
509                 internal = list->eth_dev->data->dev_private;
510                 if (!strcmp(internal->iface_name, ifname)) {
511                         found = 1;
512                         break;
513                 }
514         }
515
516         pthread_mutex_unlock(&internal_list_lock);
517
518         if (!found)
519                 return NULL;
520
521         return list;
522 }
523
524 static int
525 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
526 {
527         struct rte_vhost_vring vring;
528         struct vhost_queue *vq;
529         int ret = 0;
530
531         vq = dev->data->rx_queues[qid];
532         if (!vq) {
533                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
534                 return -1;
535         }
536
537         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
538         if (ret < 0) {
539                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
540                 return ret;
541         }
542         VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
543         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
544         rte_wmb();
545
546         return ret;
547 }
548
549 static int
550 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
551 {
552         struct rte_vhost_vring vring;
553         struct vhost_queue *vq;
554         int ret = 0;
555
556         vq = dev->data->rx_queues[qid];
557         if (!vq) {
558                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
559                 return -1;
560         }
561
562         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
563         if (ret < 0) {
564                 VHOST_LOG(ERR, "Failed to get rxq%d's vring", qid);
565                 return ret;
566         }
567         VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
568         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
569         rte_wmb();
570
571         return 0;
572 }
573
574 static void
575 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
576 {
577         struct rte_intr_handle *intr_handle = dev->intr_handle;
578
579         if (intr_handle) {
580                 if (intr_handle->intr_vec)
581                         free(intr_handle->intr_vec);
582                 free(intr_handle);
583         }
584
585         dev->intr_handle = NULL;
586 }
587
588 static int
589 eth_vhost_install_intr(struct rte_eth_dev *dev)
590 {
591         struct rte_vhost_vring vring;
592         struct vhost_queue *vq;
593         int count = 0;
594         int nb_rxq = dev->data->nb_rx_queues;
595         int i;
596         int ret;
597
598         /* uninstall firstly if we are reconnecting */
599         if (dev->intr_handle)
600                 eth_vhost_uninstall_intr(dev);
601
602         dev->intr_handle = malloc(sizeof(*dev->intr_handle));
603         if (!dev->intr_handle) {
604                 VHOST_LOG(ERR, "Fail to allocate intr_handle\n");
605                 return -ENOMEM;
606         }
607         memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
608
609         dev->intr_handle->efd_counter_size = sizeof(uint64_t);
610
611         dev->intr_handle->intr_vec =
612                 malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
613
614         if (!dev->intr_handle->intr_vec) {
615                 VHOST_LOG(ERR,
616                         "Failed to allocate memory for interrupt vector\n");
617                 free(dev->intr_handle);
618                 return -ENOMEM;
619         }
620
621         VHOST_LOG(INFO, "Prepare intr vec\n");
622         for (i = 0; i < nb_rxq; i++) {
623                 vq = dev->data->rx_queues[i];
624                 if (!vq) {
625                         VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
626                         continue;
627                 }
628
629                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
630                 if (ret < 0) {
631                         VHOST_LOG(INFO,
632                                 "Failed to get rxq-%d's vring, skip!\n", i);
633                         continue;
634                 }
635
636                 if (vring.kickfd < 0) {
637                         VHOST_LOG(INFO,
638                                 "rxq-%d's kickfd is invalid, skip!\n", i);
639                         continue;
640                 }
641                 dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
642                 dev->intr_handle->efds[i] = vring.kickfd;
643                 count++;
644                 VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
645         }
646
647         dev->intr_handle->nb_efd = count;
648         dev->intr_handle->max_intr = count + 1;
649         dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
650
651         return 0;
652 }
653
654 static void
655 update_queuing_status(struct rte_eth_dev *dev)
656 {
657         struct pmd_internal *internal = dev->data->dev_private;
658         struct vhost_queue *vq;
659         unsigned int i;
660         int allow_queuing = 1;
661
662         if (!dev->data->rx_queues || !dev->data->tx_queues)
663                 return;
664
665         if (rte_atomic32_read(&internal->started) == 0 ||
666             rte_atomic32_read(&internal->dev_attached) == 0)
667                 allow_queuing = 0;
668
669         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
670         for (i = 0; i < dev->data->nb_rx_queues; i++) {
671                 vq = dev->data->rx_queues[i];
672                 if (vq == NULL)
673                         continue;
674                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
675                 while (rte_atomic32_read(&vq->while_queuing))
676                         rte_pause();
677         }
678
679         for (i = 0; i < dev->data->nb_tx_queues; i++) {
680                 vq = dev->data->tx_queues[i];
681                 if (vq == NULL)
682                         continue;
683                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
684                 while (rte_atomic32_read(&vq->while_queuing))
685                         rte_pause();
686         }
687 }
688
689 static void
690 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
691 {
692         struct vhost_queue *vq;
693         int i;
694
695         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
696                 vq = eth_dev->data->rx_queues[i];
697                 if (!vq)
698                         continue;
699                 vq->vid = internal->vid;
700                 vq->internal = internal;
701                 vq->port = eth_dev->data->port_id;
702         }
703         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
704                 vq = eth_dev->data->tx_queues[i];
705                 if (!vq)
706                         continue;
707                 vq->vid = internal->vid;
708                 vq->internal = internal;
709                 vq->port = eth_dev->data->port_id;
710         }
711 }
712
713 static int
714 new_device(int vid)
715 {
716         struct rte_eth_dev *eth_dev;
717         struct internal_list *list;
718         struct pmd_internal *internal;
719         struct rte_eth_conf *dev_conf;
720         unsigned i;
721         char ifname[PATH_MAX];
722 #ifdef RTE_LIBRTE_VHOST_NUMA
723         int newnode;
724 #endif
725
726         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
727         list = find_internal_resource(ifname);
728         if (list == NULL) {
729                 VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
730                 return -1;
731         }
732
733         eth_dev = list->eth_dev;
734         internal = eth_dev->data->dev_private;
735         dev_conf = &eth_dev->data->dev_conf;
736
737 #ifdef RTE_LIBRTE_VHOST_NUMA
738         newnode = rte_vhost_get_numa_node(vid);
739         if (newnode >= 0)
740                 eth_dev->data->numa_node = newnode;
741 #endif
742
743         internal->vid = vid;
744         if (rte_atomic32_read(&internal->started) == 1) {
745                 queue_setup(eth_dev, internal);
746
747                 if (dev_conf->intr_conf.rxq) {
748                         if (eth_vhost_install_intr(eth_dev) < 0) {
749                                 VHOST_LOG(INFO,
750                                         "Failed to install interrupt handler.");
751                                         return -1;
752                         }
753                 }
754         } else {
755                 VHOST_LOG(INFO, "RX/TX queues not exist yet\n");
756         }
757
758         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
759                 rte_vhost_enable_guest_notification(vid, i, 0);
760
761         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
762
763         eth_dev->data->dev_link.link_status = ETH_LINK_UP;
764
765         rte_atomic32_set(&internal->dev_attached, 1);
766         update_queuing_status(eth_dev);
767
768         VHOST_LOG(INFO, "Vhost device %d created\n", vid);
769
770         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
771
772         return 0;
773 }
774
775 static void
776 destroy_device(int vid)
777 {
778         struct rte_eth_dev *eth_dev;
779         struct pmd_internal *internal;
780         struct vhost_queue *vq;
781         struct internal_list *list;
782         char ifname[PATH_MAX];
783         unsigned i;
784         struct rte_vhost_vring_state *state;
785
786         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
787         list = find_internal_resource(ifname);
788         if (list == NULL) {
789                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
790                 return;
791         }
792         eth_dev = list->eth_dev;
793         internal = eth_dev->data->dev_private;
794
795         rte_atomic32_set(&internal->dev_attached, 0);
796         update_queuing_status(eth_dev);
797
798         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
799
800         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
801                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
802                         vq = eth_dev->data->rx_queues[i];
803                         if (!vq)
804                                 continue;
805                         vq->vid = -1;
806                 }
807                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
808                         vq = eth_dev->data->tx_queues[i];
809                         if (!vq)
810                                 continue;
811                         vq->vid = -1;
812                 }
813         }
814
815         state = vring_states[eth_dev->data->port_id];
816         rte_spinlock_lock(&state->lock);
817         for (i = 0; i <= state->max_vring; i++) {
818                 state->cur[i] = false;
819                 state->seen[i] = false;
820         }
821         state->max_vring = 0;
822         rte_spinlock_unlock(&state->lock);
823
824         VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
825         eth_vhost_uninstall_intr(eth_dev);
826
827         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
828 }
829
830 static int
831 vring_state_changed(int vid, uint16_t vring, int enable)
832 {
833         struct rte_vhost_vring_state *state;
834         struct rte_eth_dev *eth_dev;
835         struct internal_list *list;
836         char ifname[PATH_MAX];
837
838         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
839         list = find_internal_resource(ifname);
840         if (list == NULL) {
841                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
842                 return -1;
843         }
844
845         eth_dev = list->eth_dev;
846         /* won't be NULL */
847         state = vring_states[eth_dev->data->port_id];
848         rte_spinlock_lock(&state->lock);
849         if (state->cur[vring] == enable) {
850                 rte_spinlock_unlock(&state->lock);
851                 return 0;
852         }
853         state->cur[vring] = enable;
854         state->max_vring = RTE_MAX(vring, state->max_vring);
855         rte_spinlock_unlock(&state->lock);
856
857         VHOST_LOG(INFO, "vring%u is %s\n",
858                         vring, enable ? "enabled" : "disabled");
859
860         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
861
862         return 0;
863 }
864
/*
 * Callbacks registered with the vhost library for every interface handled by
 * this driver (see rte_vhost_driver_callback_register() in
 * vhost_driver_setup()).
 */
static struct vhost_device_ops vhost_ops = {
	.new_device          = new_device,
	.destroy_device      = destroy_device,
	.vring_state_changed = vring_state_changed,
};
870
871 static int
872 vhost_driver_setup(struct rte_eth_dev *eth_dev)
873 {
874         struct pmd_internal *internal = eth_dev->data->dev_private;
875         struct internal_list *list = NULL;
876         struct rte_vhost_vring_state *vring_state = NULL;
877         unsigned int numa_node = eth_dev->device->numa_node;
878         const char *name = eth_dev->device->name;
879
880         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
881         if (list == NULL)
882                 goto error;
883
884         vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
885                                          0, numa_node);
886         if (vring_state == NULL)
887                 goto error;
888
889         list->eth_dev = eth_dev;
890         pthread_mutex_lock(&internal_list_lock);
891         TAILQ_INSERT_TAIL(&internal_list, list, next);
892         pthread_mutex_unlock(&internal_list_lock);
893
894         rte_spinlock_init(&vring_state->lock);
895         vring_states[eth_dev->data->port_id] = vring_state;
896
897         if (rte_vhost_driver_register(internal->iface_name, internal->flags))
898                 goto error;
899
900         if (internal->disable_flags) {
901                 if (rte_vhost_driver_disable_features(internal->iface_name,
902                                                       internal->disable_flags))
903                         goto error;
904         }
905
906         if (rte_vhost_driver_callback_register(internal->iface_name,
907                                                &vhost_ops) < 0) {
908                 VHOST_LOG(ERR, "Can't register callbacks\n");
909                 goto error;
910         }
911
912         if (rte_vhost_driver_start(internal->iface_name) < 0) {
913                 VHOST_LOG(ERR, "Failed to start driver for %s\n",
914                           internal->iface_name);
915                 goto error;
916         }
917
918         return 0;
919
920 error:
921         rte_free(vring_state);
922         rte_free(list);
923
924         return -1;
925 }
926
927 int
928 rte_eth_vhost_get_queue_event(uint16_t port_id,
929                 struct rte_eth_vhost_queue_event *event)
930 {
931         struct rte_vhost_vring_state *state;
932         unsigned int i;
933         int idx;
934
935         if (port_id >= RTE_MAX_ETHPORTS) {
936                 VHOST_LOG(ERR, "Invalid port id\n");
937                 return -1;
938         }
939
940         state = vring_states[port_id];
941         if (!state) {
942                 VHOST_LOG(ERR, "Unused port\n");
943                 return -1;
944         }
945
946         rte_spinlock_lock(&state->lock);
947         for (i = 0; i <= state->max_vring; i++) {
948                 idx = state->index++ % (state->max_vring + 1);
949
950                 if (state->cur[idx] != state->seen[idx]) {
951                         state->seen[idx] = state->cur[idx];
952                         event->queue_id = idx / 2;
953                         event->rx = idx & 1;
954                         event->enable = state->cur[idx];
955                         rte_spinlock_unlock(&state->lock);
956                         return 0;
957                 }
958         }
959         rte_spinlock_unlock(&state->lock);
960
961         return -1;
962 }
963
964 int
965 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
966 {
967         struct internal_list *list;
968         struct rte_eth_dev *eth_dev;
969         struct vhost_queue *vq;
970         int vid = -1;
971
972         if (!rte_eth_dev_is_valid_port(port_id))
973                 return -1;
974
975         pthread_mutex_lock(&internal_list_lock);
976
977         TAILQ_FOREACH(list, &internal_list, next) {
978                 eth_dev = list->eth_dev;
979                 if (eth_dev->data->port_id == port_id) {
980                         vq = eth_dev->data->rx_queues[0];
981                         if (vq) {
982                                 vid = vq->vid;
983                         }
984                         break;
985                 }
986         }
987
988         pthread_mutex_unlock(&internal_list_lock);
989
990         return vid;
991 }
992
993 static int
994 eth_dev_configure(struct rte_eth_dev *dev)
995 {
996         struct pmd_internal *internal = dev->data->dev_private;
997         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
998
999         /* NOTE: the same process has to operate a vhost interface
1000          * from beginning to end (from eth_dev configure to eth_dev close).
1001          * It is user's responsibility at the moment.
1002          */
1003         if (vhost_driver_setup(dev) < 0)
1004                 return -1;
1005
1006         internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1007
1008         return 0;
1009 }
1010
1011 static int
1012 eth_dev_start(struct rte_eth_dev *eth_dev)
1013 {
1014         struct pmd_internal *internal = eth_dev->data->dev_private;
1015         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1016
1017         queue_setup(eth_dev, internal);
1018
1019         if (rte_atomic32_read(&internal->dev_attached) == 1) {
1020                 if (dev_conf->intr_conf.rxq) {
1021                         if (eth_vhost_install_intr(eth_dev) < 0) {
1022                                 VHOST_LOG(INFO,
1023                                         "Failed to install interrupt handler.");
1024                                         return -1;
1025                         }
1026                 }
1027         }
1028
1029         rte_atomic32_set(&internal->started, 1);
1030         update_queuing_status(eth_dev);
1031
1032         return 0;
1033 }
1034
1035 static void
1036 eth_dev_stop(struct rte_eth_dev *dev)
1037 {
1038         struct pmd_internal *internal = dev->data->dev_private;
1039
1040         rte_atomic32_set(&internal->started, 0);
1041         update_queuing_status(dev);
1042 }
1043
1044 static void
1045 eth_dev_close(struct rte_eth_dev *dev)
1046 {
1047         struct pmd_internal *internal;
1048         struct internal_list *list;
1049         unsigned int i;
1050
1051         internal = dev->data->dev_private;
1052         if (!internal)
1053                 return;
1054
1055         eth_dev_stop(dev);
1056
1057         rte_vhost_driver_unregister(internal->iface_name);
1058
1059         list = find_internal_resource(internal->iface_name);
1060         if (!list)
1061                 return;
1062
1063         pthread_mutex_lock(&internal_list_lock);
1064         TAILQ_REMOVE(&internal_list, list, next);
1065         pthread_mutex_unlock(&internal_list_lock);
1066         rte_free(list);
1067
1068         if (dev->data->rx_queues)
1069                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1070                         rte_free(dev->data->rx_queues[i]);
1071
1072         if (dev->data->tx_queues)
1073                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1074                         rte_free(dev->data->tx_queues[i]);
1075
1076         free(internal->dev_name);
1077         rte_free(internal->iface_name);
1078         rte_free(internal);
1079
1080         dev->data->dev_private = NULL;
1081
1082         rte_free(vring_states[dev->data->port_id]);
1083         vring_states[dev->data->port_id] = NULL;
1084 }
1085
1086 static int
1087 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1088                    uint16_t nb_rx_desc __rte_unused,
1089                    unsigned int socket_id,
1090                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1091                    struct rte_mempool *mb_pool)
1092 {
1093         struct vhost_queue *vq;
1094
1095         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1096                         RTE_CACHE_LINE_SIZE, socket_id);
1097         if (vq == NULL) {
1098                 VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1099                 return -ENOMEM;
1100         }
1101
1102         vq->mb_pool = mb_pool;
1103         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1104         dev->data->rx_queues[rx_queue_id] = vq;
1105
1106         return 0;
1107 }
1108
1109 static int
1110 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1111                    uint16_t nb_tx_desc __rte_unused,
1112                    unsigned int socket_id,
1113                    const struct rte_eth_txconf *tx_conf __rte_unused)
1114 {
1115         struct vhost_queue *vq;
1116
1117         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1118                         RTE_CACHE_LINE_SIZE, socket_id);
1119         if (vq == NULL) {
1120                 VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1121                 return -ENOMEM;
1122         }
1123
1124         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1125         dev->data->tx_queues[tx_queue_id] = vq;
1126
1127         return 0;
1128 }
1129
1130 static int
1131 eth_dev_info(struct rte_eth_dev *dev,
1132              struct rte_eth_dev_info *dev_info)
1133 {
1134         struct pmd_internal *internal;
1135
1136         internal = dev->data->dev_private;
1137         if (internal == NULL) {
1138                 VHOST_LOG(ERR, "Invalid device specified\n");
1139                 return -ENODEV;
1140         }
1141
1142         dev_info->max_mac_addrs = 1;
1143         dev_info->max_rx_pktlen = (uint32_t)-1;
1144         dev_info->max_rx_queues = internal->max_queues;
1145         dev_info->max_tx_queues = internal->max_queues;
1146         dev_info->min_rx_bufsize = 0;
1147
1148         dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1149                                 DEV_TX_OFFLOAD_VLAN_INSERT;
1150         dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1151
1152         return 0;
1153 }
1154
1155 static int
1156 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1157 {
1158         unsigned i;
1159         unsigned long rx_total = 0, tx_total = 0;
1160         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1161         struct vhost_queue *vq;
1162
1163         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1164                         i < dev->data->nb_rx_queues; i++) {
1165                 if (dev->data->rx_queues[i] == NULL)
1166                         continue;
1167                 vq = dev->data->rx_queues[i];
1168                 stats->q_ipackets[i] = vq->stats.pkts;
1169                 rx_total += stats->q_ipackets[i];
1170
1171                 stats->q_ibytes[i] = vq->stats.bytes;
1172                 rx_total_bytes += stats->q_ibytes[i];
1173         }
1174
1175         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1176                         i < dev->data->nb_tx_queues; i++) {
1177                 if (dev->data->tx_queues[i] == NULL)
1178                         continue;
1179                 vq = dev->data->tx_queues[i];
1180                 stats->q_opackets[i] = vq->stats.pkts;
1181                 tx_total += stats->q_opackets[i];
1182
1183                 stats->q_obytes[i] = vq->stats.bytes;
1184                 tx_total_bytes += stats->q_obytes[i];
1185         }
1186
1187         stats->ipackets = rx_total;
1188         stats->opackets = tx_total;
1189         stats->ibytes = rx_total_bytes;
1190         stats->obytes = tx_total_bytes;
1191
1192         return 0;
1193 }
1194
1195 static int
1196 eth_stats_reset(struct rte_eth_dev *dev)
1197 {
1198         struct vhost_queue *vq;
1199         unsigned i;
1200
1201         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1202                 if (dev->data->rx_queues[i] == NULL)
1203                         continue;
1204                 vq = dev->data->rx_queues[i];
1205                 vq->stats.pkts = 0;
1206                 vq->stats.bytes = 0;
1207         }
1208         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1209                 if (dev->data->tx_queues[i] == NULL)
1210                         continue;
1211                 vq = dev->data->tx_queues[i];
1212                 vq->stats.pkts = 0;
1213                 vq->stats.bytes = 0;
1214                 vq->stats.missed_pkts = 0;
1215         }
1216
1217         return 0;
1218 }
1219
/*
 * rx/tx_queue_release handler: hand the queue object back to the DPDK heap.
 */
static void
eth_queue_release(void *q)
{
	void *queue = q;

	rte_free(queue);
}
1225
/*
 * tx_done_cleanup handler.
 *
 * Both arguments are ignored and 0 is always returned: according to the
 * driver's own note, eth_vhost_tx() copies the packet data and releases the
 * mbuf itself, so vhost never holds on to mbufs that would need cleaning up.
 */
static int
eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
{
	const int nothing_to_free = 0;

	return nothing_to_free;
}
1235
1236 static int
1237 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1238                 int wait_to_complete __rte_unused)
1239 {
1240         return 0;
1241 }
1242
1243 static uint32_t
1244 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1245 {
1246         struct vhost_queue *vq;
1247
1248         vq = dev->data->rx_queues[rx_queue_id];
1249         if (vq == NULL)
1250                 return 0;
1251
1252         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1253 }
1254
1255 static const struct eth_dev_ops ops = {
1256         .dev_start = eth_dev_start,
1257         .dev_stop = eth_dev_stop,
1258         .dev_close = eth_dev_close,
1259         .dev_configure = eth_dev_configure,
1260         .dev_infos_get = eth_dev_info,
1261         .rx_queue_setup = eth_rx_queue_setup,
1262         .tx_queue_setup = eth_tx_queue_setup,
1263         .rx_queue_release = eth_queue_release,
1264         .tx_queue_release = eth_queue_release,
1265         .tx_done_cleanup = eth_tx_done_cleanup,
1266         .rx_queue_count = eth_rx_queue_count,
1267         .link_update = eth_link_update,
1268         .stats_get = eth_stats_get,
1269         .stats_reset = eth_stats_reset,
1270         .xstats_reset = vhost_dev_xstats_reset,
1271         .xstats_get = vhost_dev_xstats_get,
1272         .xstats_get_names = vhost_dev_xstats_get_names,
1273         .rx_queue_intr_enable = eth_rxq_intr_enable,
1274         .rx_queue_intr_disable = eth_rxq_intr_disable,
1275 };
1276
1277 static int
1278 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1279         int16_t queues, const unsigned int numa_node, uint64_t flags,
1280         uint64_t disable_flags)
1281 {
1282         const char *name = rte_vdev_device_name(dev);
1283         struct rte_eth_dev_data *data;
1284         struct pmd_internal *internal = NULL;
1285         struct rte_eth_dev *eth_dev = NULL;
1286         struct rte_ether_addr *eth_addr = NULL;
1287
1288         VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1289                 numa_node);
1290
1291         /* reserve an ethdev entry */
1292         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1293         if (eth_dev == NULL)
1294                 goto error;
1295         data = eth_dev->data;
1296
1297         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1298         if (eth_addr == NULL)
1299                 goto error;
1300         data->mac_addrs = eth_addr;
1301         *eth_addr = base_eth_addr;
1302         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1303
1304         /* now put it all together
1305          * - store queue data in internal,
1306          * - point eth_dev_data to internals
1307          * - and point eth_dev structure to new eth_dev_data structure
1308          */
1309         internal = eth_dev->data->dev_private;
1310         internal->dev_name = strdup(name);
1311         if (internal->dev_name == NULL)
1312                 goto error;
1313         internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1314                                                  0, numa_node);
1315         if (internal->iface_name == NULL)
1316                 goto error;
1317         strcpy(internal->iface_name, iface_name);
1318
1319         data->nb_rx_queues = queues;
1320         data->nb_tx_queues = queues;
1321         internal->max_queues = queues;
1322         internal->vid = -1;
1323         internal->flags = flags;
1324         internal->disable_flags = disable_flags;
1325         data->dev_link = pmd_link;
1326         data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1327
1328         eth_dev->dev_ops = &ops;
1329
1330         /* finally assign rx and tx ops */
1331         eth_dev->rx_pkt_burst = eth_vhost_rx;
1332         eth_dev->tx_pkt_burst = eth_vhost_tx;
1333
1334         rte_eth_dev_probing_finish(eth_dev);
1335         return 0;
1336
1337 error:
1338         if (internal) {
1339                 rte_free(internal->iface_name);
1340                 free(internal->dev_name);
1341         }
1342         rte_eth_dev_release_port(eth_dev);
1343
1344         return -1;
1345 }
1346
/*
 * kvargs handler for the "iface" argument: store the value pointer itself
 * (no copy is made) into the const char * that extra_args points to.
 *
 * Returns 0 on success, -1 when the argument has no value.
 */
static inline int
open_iface(const char *key __rte_unused, const char *value, void *extra_args)
{
	const char **out = extra_args;

	if (value != NULL) {
		*out = value;
		return 0;
	}

	return -1;
}
1359
/*
 * kvargs handler for integer arguments: parse value (base auto-detected by
 * strtoul, so decimal, 0x-prefixed hex and 0-prefixed octal are accepted)
 * and store it into the uint16_t that extra_args points to.
 *
 * The destination is written only when the whole string is a number that
 * fits into 16 bits.
 *
 * Returns 0 on success, -EINVAL when value or extra_args is NULL, and -1
 * when the string is empty, is not entirely numeric, or is out of range.
 */
static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
	uint16_t *n = extra_args;
	unsigned long parsed;
	char *end = NULL;

	if (value == NULL || extra_args == NULL)
		return -EINVAL;

	/* strtoul() only sets errno on failure, so clear stale values first. */
	errno = 0;
	parsed = strtoul(value, &end, 0);
	if (errno != 0 || end == value || *end != '\0')
		return -1;

	/* Reject instead of silently truncating to 16 bits. */
	if (parsed > UINT16_MAX)
		return -1;

	*n = (uint16_t)parsed;

	return 0;
}
1374
1375 static int
1376 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1377 {
1378         struct rte_kvargs *kvlist = NULL;
1379         int ret = 0;
1380         char *iface_name;
1381         uint16_t queues;
1382         uint64_t flags = 0;
1383         uint64_t disable_flags = 0;
1384         int client_mode = 0;
1385         int dequeue_zero_copy = 0;
1386         int iommu_support = 0;
1387         int postcopy_support = 0;
1388         int tso = 0;
1389         struct rte_eth_dev *eth_dev;
1390         const char *name = rte_vdev_device_name(dev);
1391
1392         VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1393
1394         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1395                 eth_dev = rte_eth_dev_attach_secondary(name);
1396                 if (!eth_dev) {
1397                         VHOST_LOG(ERR, "Failed to probe %s\n", name);
1398                         return -1;
1399                 }
1400                 eth_dev->rx_pkt_burst = eth_vhost_rx;
1401                 eth_dev->tx_pkt_burst = eth_vhost_tx;
1402                 eth_dev->dev_ops = &ops;
1403                 if (dev->device.numa_node == SOCKET_ID_ANY)
1404                         dev->device.numa_node = rte_socket_id();
1405                 eth_dev->device = &dev->device;
1406                 rte_eth_dev_probing_finish(eth_dev);
1407                 return 0;
1408         }
1409
1410         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1411         if (kvlist == NULL)
1412                 return -1;
1413
1414         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1415                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1416                                          &open_iface, &iface_name);
1417                 if (ret < 0)
1418                         goto out_free;
1419         } else {
1420                 ret = -1;
1421                 goto out_free;
1422         }
1423
1424         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1425                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1426                                          &open_int, &queues);
1427                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
1428                         goto out_free;
1429
1430         } else
1431                 queues = 1;
1432
1433         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1434                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1435                                          &open_int, &client_mode);
1436                 if (ret < 0)
1437                         goto out_free;
1438
1439                 if (client_mode)
1440                         flags |= RTE_VHOST_USER_CLIENT;
1441         }
1442
1443         if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1444                 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1445                                          &open_int, &dequeue_zero_copy);
1446                 if (ret < 0)
1447                         goto out_free;
1448
1449                 if (dequeue_zero_copy)
1450                         flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1451         }
1452
1453         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1454                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1455                                          &open_int, &iommu_support);
1456                 if (ret < 0)
1457                         goto out_free;
1458
1459                 if (iommu_support)
1460                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1461         }
1462
1463         if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1464                 ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1465                                          &open_int, &postcopy_support);
1466                 if (ret < 0)
1467                         goto out_free;
1468
1469                 if (postcopy_support)
1470                         flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1471         }
1472
1473         if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1474                 ret = rte_kvargs_process(kvlist,
1475                                 ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1476                                 &open_int, &tso);
1477                 if (ret < 0)
1478                         goto out_free;
1479
1480                 if (tso == 0) {
1481                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1482                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1483                 }
1484         }
1485
1486         if (dev->device.numa_node == SOCKET_ID_ANY)
1487                 dev->device.numa_node = rte_socket_id();
1488
1489         ret = eth_dev_vhost_create(dev, iface_name, queues,
1490                                    dev->device.numa_node, flags, disable_flags);
1491         if (ret == -1)
1492                 VHOST_LOG(ERR, "Failed to create %s\n", name);
1493
1494 out_free:
1495         rte_kvargs_free(kvlist);
1496         return ret;
1497 }
1498
1499 static int
1500 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1501 {
1502         const char *name;
1503         struct rte_eth_dev *eth_dev = NULL;
1504
1505         name = rte_vdev_device_name(dev);
1506         VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1507
1508         /* find an ethdev entry */
1509         eth_dev = rte_eth_dev_allocated(name);
1510         if (eth_dev == NULL)
1511                 return 0;
1512
1513         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1514                 return rte_eth_dev_release_port(eth_dev);
1515
1516         eth_dev_close(eth_dev);
1517
1518         rte_eth_dev_release_port(eth_dev);
1519
1520         return 0;
1521 }
1522
1523 static struct rte_vdev_driver pmd_vhost_drv = {
1524         .probe = rte_pmd_vhost_probe,
1525         .remove = rte_pmd_vhost_remove,
1526 };
1527
1528 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1529 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1530 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1531         "iface=<ifc> "
1532         "queues=<int> "
1533         "client=<0|1> "
1534         "dequeue-zero-copy=<0|1> "
1535         "iommu-support=<0|1> "
1536         "postcopy-support=<0|1> "
1537         "tso=<0|1>");
1538
1539 RTE_INIT(vhost_init_log)
1540 {
1541         vhost_logtype = rte_log_register("pmd.net.vhost");
1542         if (vhost_logtype >= 0)
1543                 rte_log_set_level(vhost_logtype, RTE_LOG_NOTICE);
1544 }