net/vhost: fix interrupt mode
drivers/net/vhost/rte_eth_vhost.c
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2016 IGEL Co., Ltd.
3  * Copyright(c) 2016-2018 Intel Corporation
4  */
5 #include <unistd.h>
6 #include <pthread.h>
7 #include <stdbool.h>
8 #include <sys/epoll.h>
9
10 #include <rte_mbuf.h>
11 #include <rte_ethdev_driver.h>
12 #include <rte_ethdev_vdev.h>
13 #include <rte_malloc.h>
14 #include <rte_memcpy.h>
15 #include <rte_bus_vdev.h>
16 #include <rte_kvargs.h>
17 #include <rte_vhost.h>
18 #include <rte_spinlock.h>
19
20 #include "rte_eth_vhost.h"
21
22 RTE_LOG_REGISTER(vhost_logtype, pmd.net.vhost, NOTICE);
23
24 #define VHOST_LOG(level, ...) \
25         rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)
26
27 enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
28
29 #define ETH_VHOST_IFACE_ARG             "iface"
30 #define ETH_VHOST_QUEUES_ARG            "queues"
31 #define ETH_VHOST_CLIENT_ARG            "client"
32 #define ETH_VHOST_DEQUEUE_ZERO_COPY     "dequeue-zero-copy"
33 #define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
34 #define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
35 #define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
36 #define ETH_VHOST_LINEAR_BUF            "linear-buffer"
37 #define ETH_VHOST_EXT_BUF               "ext-buffer"
38 #define VHOST_MAX_PKT_BURST 32
39
40 static const char *valid_arguments[] = {
41         ETH_VHOST_IFACE_ARG,
42         ETH_VHOST_QUEUES_ARG,
43         ETH_VHOST_CLIENT_ARG,
44         ETH_VHOST_DEQUEUE_ZERO_COPY,
45         ETH_VHOST_IOMMU_SUPPORT,
46         ETH_VHOST_POSTCOPY_SUPPORT,
47         ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
48         ETH_VHOST_LINEAR_BUF,
49         ETH_VHOST_EXT_BUF,
50         NULL
51 };
52
53 static struct rte_ether_addr base_eth_addr = {
54         .addr_bytes = {
55                 0x56 /* V */,
56                 0x48 /* H */,
57                 0x4F /* O */,
58                 0x53 /* S */,
59                 0x54 /* T */,
60                 0x00
61         }
62 };
63
64 enum vhost_xstats_pkts {
65         VHOST_UNDERSIZE_PKT = 0,
66         VHOST_64_PKT,
67         VHOST_65_TO_127_PKT,
68         VHOST_128_TO_255_PKT,
69         VHOST_256_TO_511_PKT,
70         VHOST_512_TO_1023_PKT,
71         VHOST_1024_TO_1522_PKT,
72         VHOST_1523_TO_MAX_PKT,
73         VHOST_BROADCAST_PKT,
74         VHOST_MULTICAST_PKT,
75         VHOST_UNICAST_PKT,
76         VHOST_ERRORS_PKT,
77         VHOST_ERRORS_FRAGMENTED,
78         VHOST_ERRORS_JABBER,
79         VHOST_UNKNOWN_PROTOCOL,
80         VHOST_XSTATS_MAX,
81 };
82
83 struct vhost_stats {
84         uint64_t pkts;
85         uint64_t bytes;
86         uint64_t missed_pkts;
87         uint64_t xstats[VHOST_XSTATS_MAX];
88 };
89
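/*
 * Per-queue state. allow_queuing/while_queuing implement a lockless
 * handshake between the control path and the rx/tx burst functions:
 * the control path clears allow_queuing and then spins until
 * while_queuing drops to zero, guaranteeing no burst function still
 * dereferences the vhost device (see update_queuing_status()).
 */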
90 struct vhost_queue {
91         int vid;
92         rte_atomic32_t allow_queuing;
93         rte_atomic32_t while_queuing;
94         struct pmd_internal *internal;
95         struct rte_mempool *mb_pool;
96         uint16_t port;
97         uint16_t virtqueue_id;
98         struct vhost_stats stats;
99         int intr_enable;
100         rte_spinlock_t intr_lock;
101 };
102
103 struct pmd_internal {
104         rte_atomic32_t dev_attached;
105         char *iface_name;
106         uint64_t flags;
107         uint64_t disable_flags;
108         uint16_t max_queues;
109         int vid;
110         rte_atomic32_t started;
111         uint8_t vlan_strip;
112 };
113
114 struct internal_list {
115         TAILQ_ENTRY(internal_list) next;
116         struct rte_eth_dev *eth_dev;
117 };
118
119 TAILQ_HEAD(internal_list_head, internal_list);
120 static struct internal_list_head internal_list =
121         TAILQ_HEAD_INITIALIZER(internal_list);
122
123 static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
124
125 static struct rte_eth_link pmd_link = {
126                 .link_speed = 10000,
127                 .link_duplex = ETH_LINK_FULL_DUPLEX,
128                 .link_status = ETH_LINK_DOWN
129 };
130
131 struct rte_vhost_vring_state {
132         rte_spinlock_t lock;
133
134         bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
135         bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
136         unsigned int index;
137         unsigned int max_vring;
138 };
139
140 static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
141
142 #define VHOST_XSTATS_NAME_SIZE 64
143
144 struct vhost_xstats_name_off {
145         char name[VHOST_XSTATS_NAME_SIZE];
146         uint64_t offset;
147 };
148
149 /* [rx]_ is prepended to the name string here */
150 static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
151         {"good_packets",
152          offsetof(struct vhost_queue, stats.pkts)},
153         {"total_bytes",
154          offsetof(struct vhost_queue, stats.bytes)},
155         {"missed_pkts",
156          offsetof(struct vhost_queue, stats.missed_pkts)},
157         {"broadcast_packets",
158          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
159         {"multicast_packets",
160          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
161         {"unicast_packets",
162          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
163          {"undersize_packets",
164          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
165         {"size_64_packets",
166          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
167         {"size_65_to_127_packets",
168          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
169         {"size_128_to_255_packets",
170          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
171         {"size_256_to_511_packets",
172          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
173         {"size_512_to_1023_packets",
174          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
175         {"size_1024_to_1522_packets",
176          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
177         {"size_1523_to_max_packets",
178          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
179         {"errors_with_bad_CRC",
180          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
181         {"fragmented_errors",
182          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
183         {"jabber_errors",
184          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
185         {"unknown_protos_packets",
186          offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
187 };
188
189 /* [tx]_ is prepended to the name string here */
190 static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
191         {"good_packets",
192          offsetof(struct vhost_queue, stats.pkts)},
193         {"total_bytes",
194          offsetof(struct vhost_queue, stats.bytes)},
195         {"missed_pkts",
196          offsetof(struct vhost_queue, stats.missed_pkts)},
197         {"broadcast_packets",
198          offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
199         {"multicast_packets",
200          offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
201         {"unicast_packets",
202          offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
203         {"undersize_packets",
204          offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
205         {"size_64_packets",
206          offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
207         {"size_65_to_127_packets",
208          offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
209         {"size_128_to_255_packets",
210          offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
211         {"size_256_to_511_packets",
212          offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
213         {"size_512_to_1023_packets",
214          offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
215         {"size_1024_to_1522_packets",
216          offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
217         {"size_1523_to_max_packets",
218          offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
219         {"errors_with_bad_CRC",
220          offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
221 };
222
223 #define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
224                                 sizeof(vhost_rxport_stat_strings[0]))
225
226 #define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
227                                 sizeof(vhost_txport_stat_strings[0]))
228
229 static int
230 vhost_dev_xstats_reset(struct rte_eth_dev *dev)
231 {
232         struct vhost_queue *vq = NULL;
233         unsigned int i = 0;
234
235         for (i = 0; i < dev->data->nb_rx_queues; i++) {
236                 vq = dev->data->rx_queues[i];
237                 if (!vq)
238                         continue;
239                 memset(&vq->stats, 0, sizeof(vq->stats));
240         }
241         for (i = 0; i < dev->data->nb_tx_queues; i++) {
242                 vq = dev->data->tx_queues[i];
243                 if (!vq)
244                         continue;
245                 memset(&vq->stats, 0, sizeof(vq->stats));
246         }
247
248         return 0;
249 }
250
251 static int
252 vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
253                            struct rte_eth_xstat_name *xstats_names,
254                            unsigned int limit __rte_unused)
255 {
256         unsigned int t = 0;
257         int count = 0;
258         int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
259
260         if (!xstats_names)
261                 return nstats;
262         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
263                 snprintf(xstats_names[count].name,
264                          sizeof(xstats_names[count].name),
265                          "rx_%s", vhost_rxport_stat_strings[t].name);
266                 count++;
267         }
268         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
269                 snprintf(xstats_names[count].name,
270                          sizeof(xstats_names[count].name),
271                          "tx_%s", vhost_txport_stat_strings[t].name);
272                 count++;
273         }
274         return count;
275 }
276
277 static int
278 vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
279                      unsigned int n)
280 {
281         unsigned int i;
282         unsigned int t;
283         unsigned int count = 0;
284         struct vhost_queue *vq = NULL;
285         unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;
286
287         if (n < nxstats)
288                 return nxstats;
289
290         for (i = 0; i < dev->data->nb_rx_queues; i++) {
291                 vq = dev->data->rx_queues[i];
292                 if (!vq)
293                         continue;
294                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
295                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
296                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
297         }
298         for (i = 0; i < dev->data->nb_tx_queues; i++) {
299                 vq = dev->data->tx_queues[i];
300                 if (!vq)
301                         continue;
302                 vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
303                                 + vq->stats.missed_pkts
304                                 - (vq->stats.xstats[VHOST_BROADCAST_PKT]
305                                 + vq->stats.xstats[VHOST_MULTICAST_PKT]);
306         }
307         for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
308                 xstats[count].value = 0;
309                 for (i = 0; i < dev->data->nb_rx_queues; i++) {
310                         vq = dev->data->rx_queues[i];
311                         if (!vq)
312                                 continue;
313                         xstats[count].value +=
314                                 *(uint64_t *)(((char *)vq)
315                                 + vhost_rxport_stat_strings[t].offset);
316                 }
317                 xstats[count].id = count;
318                 count++;
319         }
320         for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
321                 xstats[count].value = 0;
322                 for (i = 0; i < dev->data->nb_tx_queues; i++) {
323                         vq = dev->data->tx_queues[i];
324                         if (!vq)
325                                 continue;
326                         xstats[count].value +=
327                                 *(uint64_t *)(((char *)vq)
328                                 + vhost_txport_stat_strings[t].offset);
329                 }
330                 xstats[count].id = count;
331                 count++;
332         }
333         return count;
334 }
335
336 static inline void
337 vhost_count_multicast_broadcast(struct vhost_queue *vq,
338                                 struct rte_mbuf *mbuf)
339 {
340         struct rte_ether_addr *ea = NULL;
341         struct vhost_stats *pstats = &vq->stats;
342
343         ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
344         if (rte_is_multicast_ether_addr(ea)) {
345                 if (rte_is_broadcast_ether_addr(ea))
346                         pstats->xstats[VHOST_BROADCAST_PKT]++;
347                 else
348                         pstats->xstats[VHOST_MULTICAST_PKT]++;
349         }
350 }
351
352 static void
353 vhost_update_packet_xstats(struct vhost_queue *vq,
354                            struct rte_mbuf **bufs,
355                            uint16_t count)
356 {
357         uint32_t pkt_len = 0;
358         uint64_t i = 0;
359         uint64_t index;
360         struct vhost_stats *pstats = &vq->stats;
361
362         for (i = 0; i < count; i++) {
363                 pkt_len = bufs[i]->pkt_len;
364                 if (pkt_len == 64) {
365                         pstats->xstats[VHOST_64_PKT]++;
366                 } else if (pkt_len > 64 && pkt_len < 1024) {
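                        /*
                         * 32 - __builtin_clz(pkt_len) is floor(log2(pkt_len)) + 1,
                         * so index = floor(log2(pkt_len)) - 4: 65..127 maps to
                         * VHOST_65_TO_127_PKT (2), 128..255 to 3, and so on up
                         * to 512..1023 -> VHOST_512_TO_1023_PKT (5).
                         */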
367                         index = (sizeof(pkt_len) * 8)
368                                 - __builtin_clz(pkt_len) - 5;
369                         pstats->xstats[index]++;
370                 } else {
371                         if (pkt_len < 64)
372                                 pstats->xstats[VHOST_UNDERSIZE_PKT]++;
373                         else if (pkt_len <= 1522)
374                                 pstats->xstats[VHOST_1024_TO_1522_PKT]++;
375                         else if (pkt_len > 1522)
376                                 pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
377                 }
378                 vhost_count_multicast_broadcast(vq, bufs[i]);
379         }
380 }
381
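/*
 * PMD receive path. From the host side, "RX" means dequeuing packets the
 * guest placed on its TX virtqueue, hence the rte_vhost_dequeue_burst()
 * call below. Requests larger than VHOST_MAX_PKT_BURST are split into
 * chunks of at most 32 packets per dequeue call.
 */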
382 static uint16_t
383 eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
384 {
385         struct vhost_queue *r = q;
386         uint16_t i, nb_rx = 0;
387         uint16_t nb_receive = nb_bufs;
388
389         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
390                 return 0;
391
392         rte_atomic32_set(&r->while_queuing, 1);
393
394         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
395                 goto out;
396
397         /* Dequeue packets from guest TX queue */
398         while (nb_receive) {
399                 uint16_t nb_pkts;
400                 uint16_t num = (uint16_t)RTE_MIN(nb_receive,
401                                                  VHOST_MAX_PKT_BURST);
402
403                 nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
404                                                   r->mb_pool, &bufs[nb_rx],
405                                                   num);
406
407                 nb_rx += nb_pkts;
408                 nb_receive -= nb_pkts;
409                 if (nb_pkts < num)
410                         break;
411         }
412
413         r->stats.pkts += nb_rx;
414
415         for (i = 0; likely(i < nb_rx); i++) {
416                 bufs[i]->port = r->port;
417                 bufs[i]->vlan_tci = 0;
418
419                 if (r->internal->vlan_strip)
420                         rte_vlan_strip(bufs[i]);
421
422                 r->stats.bytes += bufs[i]->pkt_len;
423         }
424
425         vhost_update_packet_xstats(r, bufs, nb_rx);
426
427 out:
428         rte_atomic32_set(&r->while_queuing, 0);
429
430         return nb_rx;
431 }
432
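/*
 * PMD transmit path: enqueue onto the guest's RX virtqueue. VLAN tag
 * insertion is done in software here. Enqueued mbufs are freed below
 * (the data is copied into the ring); mbufs the guest ring cannot
 * absorb are counted as missed_pkts and left to the caller, per the
 * usual tx_burst semantics.
 */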
433 static uint16_t
434 eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
435 {
436         struct vhost_queue *r = q;
437         uint16_t i, nb_tx = 0;
438         uint16_t nb_send = 0;
439
440         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
441                 return 0;
442
443         rte_atomic32_set(&r->while_queuing, 1);
444
445         if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
446                 goto out;
447
448         for (i = 0; i < nb_bufs; i++) {
449                 struct rte_mbuf *m = bufs[i];
450
451                 /* Do VLAN tag insertion */
452                 if (m->ol_flags & PKT_TX_VLAN_PKT) {
453                         int error = rte_vlan_insert(&m);
454                         if (unlikely(error)) {
455                                 rte_pktmbuf_free(m);
456                                 continue;
457                         }
458                 }
459
460                 bufs[nb_send] = m;
461                 ++nb_send;
462         }
463
464         /* Enqueue packets to guest RX queue */
465         while (nb_send) {
466                 uint16_t nb_pkts;
467                 uint16_t num = (uint16_t)RTE_MIN(nb_send,
468                                                  VHOST_MAX_PKT_BURST);
469
470                 nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
471                                                   &bufs[nb_tx], num);
472
473                 nb_tx += nb_pkts;
474                 nb_send -= nb_pkts;
475                 if (nb_pkts < num)
476                         break;
477         }
478
479         r->stats.pkts += nb_tx;
480         r->stats.missed_pkts += nb_bufs - nb_tx;
481
482         for (i = 0; likely(i < nb_tx); i++)
483                 r->stats.bytes += bufs[i]->pkt_len;
484
485         vhost_update_packet_xstats(r, bufs, nb_tx);
486
487         /* According to RFC 2863 page 42, sections ifHCOutMulticastPkts and
488          * ifHCOutBroadcastPkts, the counters "multicast" and "broadcast"
489          * are increased when packets are not transmitted successfully.
490          */
491         for (i = nb_tx; i < nb_bufs; i++)
492                 vhost_count_multicast_broadcast(r, bufs[i]);
493
494         for (i = 0; likely(i < nb_tx); i++)
495                 rte_pktmbuf_free(bufs[i]);
496 out:
497         rte_atomic32_set(&r->while_queuing, 0);
498
499         return nb_tx;
500 }
501
502 static inline struct internal_list *
503 find_internal_resource(char *ifname)
504 {
505         int found = 0;
506         struct internal_list *list;
507         struct pmd_internal *internal;
508
509         if (ifname == NULL)
510                 return NULL;
511
512         pthread_mutex_lock(&internal_list_lock);
513
514         TAILQ_FOREACH(list, &internal_list, next) {
515                 internal = list->eth_dev->data->dev_private;
516                 if (!strcmp(internal->iface_name, ifname)) {
517                         found = 1;
518                         break;
519                 }
520         }
521
522         pthread_mutex_unlock(&internal_list_lock);
523
524         if (!found)
525                 return NULL;
526
527         return list;
528 }
529
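/*
 * Re-arm the epoll entry for an RX queue whose kickfd changed (e.g. after
 * a guest reconnect or a vring state change). Since there is no in-place
 * "modify" operation for rte_epoll events, the stale fd is deleted from
 * the epoll set first and the new kickfd is then added in the same slot.
 */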
530 static int
531 eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
532 {
533         struct rte_intr_handle *handle = eth_dev->intr_handle;
534         struct rte_epoll_event rev;
535         int epfd, ret;
536
537         if (!handle)
538                 return 0;
539
540         if (handle->efds[rxq_idx] == handle->elist[rxq_idx].fd)
541                 return 0;
542
543         VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
544                         rxq_idx);
545
546         if (handle->elist[rxq_idx].fd != -1)
547                 VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
548                                 handle->elist[rxq_idx].fd);
549
550         /*
551          * First remove invalid epoll event, and then install
552          * the new one. May be solved with a proper API in the
553          * future.
554          */
555         epfd = handle->elist[rxq_idx].epfd;
556         rev = handle->elist[rxq_idx];
557         ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
558                         &handle->elist[rxq_idx]);
559         if (ret) {
560                 VHOST_LOG(ERR, "Delete epoll event failed.\n");
561                 return ret;
562         }
563
564         rev.fd = handle->efds[rxq_idx];
565         handle->elist[rxq_idx] = rev;
566         ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd,
567                         &handle->elist[rxq_idx]);
568         if (ret) {
569                 VHOST_LOG(ERR, "Add epoll event failed.\n");
570                 return ret;
571         }
572
573         return 0;
574 }
575
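/*
 * RX interrupt enable: ethdev rxq N is backed by vring (N << 1) + 1 (the
 * guest TX ring), so guest notification is enabled on that vring. The
 * intr_enable flag is flipped under intr_lock so that a concurrent kickfd
 * refresh in vring_conf_update() sees a consistent state.
 */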
576 static int
577 eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
578 {
579         struct rte_vhost_vring vring;
580         struct vhost_queue *vq;
581         int old_intr_enable, ret = 0;
582
583         vq = dev->data->rx_queues[qid];
584         if (!vq) {
585                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
586                 return -1;
587         }
588
589         rte_spinlock_lock(&vq->intr_lock);
590         old_intr_enable = vq->intr_enable;
591         vq->intr_enable = 1;
592         ret = eth_vhost_update_intr(dev, qid);
593         rte_spinlock_unlock(&vq->intr_lock);
594
595         if (ret < 0) {
596                 VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
597                 vq->intr_enable = old_intr_enable;
598                 return ret;
599         }
600
601         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
602         if (ret < 0) {
603                 VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
604                 return ret;
605         }
606         VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
607         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
608         rte_wmb();
609
610         return ret;
611 }
612
613 static int
614 eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
615 {
616         struct rte_vhost_vring vring;
617         struct vhost_queue *vq;
618         int ret = 0;
619
620         vq = dev->data->rx_queues[qid];
621         if (!vq) {
622                 VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
623                 return -1;
624         }
625
626         ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
627         if (ret < 0) {
628                 VHOST_LOG(ERR, "Failed to get rxq%d's vring", qid);
629                 return ret;
630         }
631         VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
632         rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
633         rte_wmb();
634
635         vq->intr_enable = 0;
636
637         return 0;
638 }
639
640 static void
641 eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
642 {
643         struct rte_intr_handle *intr_handle = dev->intr_handle;
644
645         if (intr_handle) {
646                 if (intr_handle->intr_vec)
647                         free(intr_handle->intr_vec);
648                 free(intr_handle);
649         }
650
651         dev->intr_handle = NULL;
652 }
653
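/*
 * Build a vdev interrupt handle whose event fds are the vrings' kickfds,
 * so that an application's epoll wait (e.g. via rte_eth_dev_rx_intr_ctl_q())
 * is woken by guest kicks. Queues whose vring or kickfd is not available
 * yet keep efds[i] == -1; vring_conf_update() patches them up later.
 */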
654 static int
655 eth_vhost_install_intr(struct rte_eth_dev *dev)
656 {
657         struct rte_vhost_vring vring;
658         struct vhost_queue *vq;
659         int nb_rxq = dev->data->nb_rx_queues;
660         int i;
661         int ret;
662
663         /* uninstall first if we are reconnecting */
664         if (dev->intr_handle)
665                 eth_vhost_uninstall_intr(dev);
666
667         dev->intr_handle = malloc(sizeof(*dev->intr_handle));
668         if (!dev->intr_handle) {
669                 VHOST_LOG(ERR, "Fail to allocate intr_handle\n");
670                 return -ENOMEM;
671         }
672         memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
673
674         dev->intr_handle->efd_counter_size = sizeof(uint64_t);
675
676         dev->intr_handle->intr_vec =
677                 malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
678
679         if (!dev->intr_handle->intr_vec) {
680                 VHOST_LOG(ERR,
681                         "Failed to allocate memory for interrupt vector\n");
682                 free(dev->intr_handle);
683                 dev->intr_handle = NULL;
684                 return -ENOMEM;
684         }
685
686         VHOST_LOG(INFO, "Prepare intr vec\n");
687         for (i = 0; i < nb_rxq; i++) {
688                 dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
689                 dev->intr_handle->efds[i] = -1;
690                 vq = dev->data->rx_queues[i];
691                 if (!vq) {
692                         VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
693                         continue;
694                 }
695
696                 ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
697                 if (ret < 0) {
698                         VHOST_LOG(INFO,
699                                 "Failed to get rxq-%d's vring, skip!\n", i);
700                         continue;
701                 }
702
703                 if (vring.kickfd < 0) {
704                         VHOST_LOG(INFO,
705                                 "rxq-%d's kickfd is invalid, skip!\n", i);
706                         continue;
707                 }
708                 dev->intr_handle->efds[i] = vring.kickfd;
709                 VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
710         }
711
712         dev->intr_handle->nb_efd = nb_rxq;
713         dev->intr_handle->max_intr = nb_rxq + 1;
714         dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
715
716         return 0;
717 }
718
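/*
 * Propagate started/dev_attached into per-queue allow_queuing. Queuing is
 * permitted only when the port is started *and* a vhost device is
 * attached; the busy-wait on while_queuing drains any burst already in
 * flight before the caller proceeds (see the handshake in vhost_queue).
 */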
719 static void
720 update_queuing_status(struct rte_eth_dev *dev)
721 {
722         struct pmd_internal *internal = dev->data->dev_private;
723         struct vhost_queue *vq;
724         unsigned int i;
725         int allow_queuing = 1;
726
727         if (!dev->data->rx_queues || !dev->data->tx_queues)
728                 return;
729
730         if (rte_atomic32_read(&internal->started) == 0 ||
731             rte_atomic32_read(&internal->dev_attached) == 0)
732                 allow_queuing = 0;
733
734         /* Wait until rx/tx_pkt_burst stops accessing vhost device */
735         for (i = 0; i < dev->data->nb_rx_queues; i++) {
736                 vq = dev->data->rx_queues[i];
737                 if (vq == NULL)
738                         continue;
739                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
740                 while (rte_atomic32_read(&vq->while_queuing))
741                         rte_pause();
742         }
743
744         for (i = 0; i < dev->data->nb_tx_queues; i++) {
745                 vq = dev->data->tx_queues[i];
746                 if (vq == NULL)
747                         continue;
748                 rte_atomic32_set(&vq->allow_queuing, allow_queuing);
749                 while (rte_atomic32_read(&vq->while_queuing))
750                         rte_pause();
751         }
752 }
753
754 static void
755 queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
756 {
757         struct vhost_queue *vq;
758         int i;
759
760         for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
761                 vq = eth_dev->data->rx_queues[i];
762                 if (!vq)
763                         continue;
764                 vq->vid = internal->vid;
765                 vq->internal = internal;
766                 vq->port = eth_dev->data->port_id;
767         }
768         for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
769                 vq = eth_dev->data->tx_queues[i];
770                 if (!vq)
771                         continue;
772                 vq->vid = internal->vid;
773                 vq->internal = internal;
774                 vq->port = eth_dev->data->port_id;
775         }
776 }
777
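/*
 * vhost-user "new device" callback: runs when a frontend connects to the
 * socket. Binds the new vid to the ethdev found by interface name, sets
 * up the queues and (if configured) RX interrupts, and reports link up.
 */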
778 static int
779 new_device(int vid)
780 {
781         struct rte_eth_dev *eth_dev;
782         struct internal_list *list;
783         struct pmd_internal *internal;
784         struct rte_eth_conf *dev_conf;
785         unsigned i;
786         char ifname[PATH_MAX];
787 #ifdef RTE_LIBRTE_VHOST_NUMA
788         int newnode;
789 #endif
790
791         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
792         list = find_internal_resource(ifname);
793         if (list == NULL) {
794                 VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
795                 return -1;
796         }
797
798         eth_dev = list->eth_dev;
799         internal = eth_dev->data->dev_private;
800         dev_conf = &eth_dev->data->dev_conf;
801
802 #ifdef RTE_LIBRTE_VHOST_NUMA
803         newnode = rte_vhost_get_numa_node(vid);
804         if (newnode >= 0)
805                 eth_dev->data->numa_node = newnode;
806 #endif
807
808         internal->vid = vid;
809         if (rte_atomic32_read(&internal->started) == 1) {
810                 queue_setup(eth_dev, internal);
811
812                 if (dev_conf->intr_conf.rxq) {
813                         if (eth_vhost_install_intr(eth_dev) < 0) {
814                                 VHOST_LOG(ERR,
815                                         "Failed to install interrupt handler.\n");
816                                 return -1;
817                         }
818                 }
819         } else {
820                 VHOST_LOG(INFO, "RX/TX queues not exist yet\n");
821         }
822
823         for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
824                 rte_vhost_enable_guest_notification(vid, i, 0);
825
826         rte_vhost_get_mtu(vid, &eth_dev->data->mtu);
827
828         eth_dev->data->dev_link.link_status = ETH_LINK_UP;
829
830         rte_atomic32_set(&internal->dev_attached, 1);
831         update_queuing_status(eth_dev);
832
833         VHOST_LOG(INFO, "Vhost device %d created\n", vid);
834
835         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
836
837         return 0;
838 }
839
840 static void
841 destroy_device(int vid)
842 {
843         struct rte_eth_dev *eth_dev;
844         struct pmd_internal *internal;
845         struct vhost_queue *vq;
846         struct internal_list *list;
847         char ifname[PATH_MAX];
848         unsigned i;
849         struct rte_vhost_vring_state *state;
850
851         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
852         list = find_internal_resource(ifname);
853         if (list == NULL) {
854                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
855                 return;
856         }
857         eth_dev = list->eth_dev;
858         internal = eth_dev->data->dev_private;
859
860         rte_atomic32_set(&internal->dev_attached, 0);
861         update_queuing_status(eth_dev);
862
863         eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;
864
865         if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
866                 for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
867                         vq = eth_dev->data->rx_queues[i];
868                         if (!vq)
869                                 continue;
870                         vq->vid = -1;
871                 }
872                 for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
873                         vq = eth_dev->data->tx_queues[i];
874                         if (!vq)
875                                 continue;
876                         vq->vid = -1;
877                 }
878         }
879
880         state = vring_states[eth_dev->data->port_id];
881         rte_spinlock_lock(&state->lock);
882         for (i = 0; i <= state->max_vring; i++) {
883                 state->cur[i] = false;
884                 state->seen[i] = false;
885         }
886         state->max_vring = 0;
887         rte_spinlock_unlock(&state->lock);
888
889         VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
890         eth_vhost_uninstall_intr(eth_dev);
891
892         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
893 }
894
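/*
 * The kickfd of a vring can be replaced by the frontend after the device
 * was created (it is resent on every vring state change). For odd vring
 * ids (guest TX == our RX), refresh the stored efd and, if interrupts are
 * enabled on that queue, re-install the epoll event under intr_lock.
 */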
895 static int
896 vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
897 {
898         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
899         struct pmd_internal *internal = eth_dev->data->dev_private;
900         struct vhost_queue *vq;
901         struct rte_vhost_vring vring;
902         int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
903         int ret = 0;
904
905         /*
906          * The vring kickfd may be changed after the new device notification.
907          * Update it when the vring state is updated.
908          */
909         if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
910             rte_atomic32_read(&internal->dev_attached) &&
911             rte_atomic32_read(&internal->started) &&
912             dev_conf->intr_conf.rxq) {
913                 ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
914                 if (ret) {
915                         VHOST_LOG(ERR, "Failed to get vring %d information.\n",
916                                         vring_id);
917                         return ret;
918                 }
919                 eth_dev->intr_handle->efds[rx_idx] = vring.kickfd;
920
921                 vq = eth_dev->data->rx_queues[rx_idx];
922                 if (!vq) {
923                         VHOST_LOG(ERR, "rxq%d is not setup yet\n", rx_idx);
924                         return -1;
925                 }
926
927                 rte_spinlock_lock(&vq->intr_lock);
928                 if (vq->intr_enable)
929                         ret = eth_vhost_update_intr(eth_dev, rx_idx);
930                 rte_spinlock_unlock(&vq->intr_lock);
931         }
932
933         return ret;
934 }
935
936 static int
937 vring_state_changed(int vid, uint16_t vring, int enable)
938 {
939         struct rte_vhost_vring_state *state;
940         struct rte_eth_dev *eth_dev;
941         struct internal_list *list;
942         char ifname[PATH_MAX];
943
944         rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
945         list = find_internal_resource(ifname);
946         if (list == NULL) {
947                 VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
948                 return -1;
949         }
950
951         eth_dev = list->eth_dev;
952         /* won't be NULL */
953         state = vring_states[eth_dev->data->port_id];
954
955         if (enable && vring_conf_update(vid, eth_dev, vring))
956                 VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
957                           (int)vring);
958
959         rte_spinlock_lock(&state->lock);
960         if (state->cur[vring] == enable) {
961                 rte_spinlock_unlock(&state->lock);
962                 return 0;
963         }
964         state->cur[vring] = enable;
965         state->max_vring = RTE_MAX(vring, state->max_vring);
966         rte_spinlock_unlock(&state->lock);
967
968         VHOST_LOG(INFO, "vring%u is %s\n",
969                         vring, enable ? "enabled" : "disabled");
970
971         _rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
972
973         return 0;
974 }
975
976 static struct vhost_device_ops vhost_ops = {
977         .new_device          = new_device,
978         .destroy_device      = destroy_device,
979         .vring_state_changed = vring_state_changed,
980 };
981
982 static int
983 vhost_driver_setup(struct rte_eth_dev *eth_dev)
984 {
985         struct pmd_internal *internal = eth_dev->data->dev_private;
986         struct internal_list *list = NULL;
987         struct rte_vhost_vring_state *vring_state = NULL;
988         unsigned int numa_node = eth_dev->device->numa_node;
989         const char *name = eth_dev->device->name;
990
991         /* Don't try to setup again if it has already been done. */
992         list = find_internal_resource(internal->iface_name);
993         if (list)
994                 return 0;
995
996         list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
997         if (list == NULL)
998                 return -1;
999
1000         vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
1001                                          0, numa_node);
1002         if (vring_state == NULL)
1003                 goto free_list;
1004
1005         list->eth_dev = eth_dev;
1006         pthread_mutex_lock(&internal_list_lock);
1007         TAILQ_INSERT_TAIL(&internal_list, list, next);
1008         pthread_mutex_unlock(&internal_list_lock);
1009
1010         rte_spinlock_init(&vring_state->lock);
1011         vring_states[eth_dev->data->port_id] = vring_state;
1012
1013         if (rte_vhost_driver_register(internal->iface_name, internal->flags))
1014                 goto list_remove;
1015
1016         if (internal->disable_flags) {
1017                 if (rte_vhost_driver_disable_features(internal->iface_name,
1018                                                       internal->disable_flags))
1019                         goto drv_unreg;
1020         }
1021
1022         if (rte_vhost_driver_callback_register(internal->iface_name,
1023                                                &vhost_ops) < 0) {
1024                 VHOST_LOG(ERR, "Can't register callbacks\n");
1025                 goto drv_unreg;
1026         }
1027
1028         if (rte_vhost_driver_start(internal->iface_name) < 0) {
1029                 VHOST_LOG(ERR, "Failed to start driver for %s\n",
1030                           internal->iface_name);
1031                 goto drv_unreg;
1032         }
1033
1034         return 0;
1035
1036 drv_unreg:
1037         rte_vhost_driver_unregister(internal->iface_name);
1038 list_remove:
1039         vring_states[eth_dev->data->port_id] = NULL;
1040         pthread_mutex_lock(&internal_list_lock);
1041         TAILQ_REMOVE(&internal_list, list, next);
1042         pthread_mutex_unlock(&internal_list_lock);
1043         rte_free(vring_state);
1044 free_list:
1045         rte_free(list);
1046
1047         return -1;
1048 }
1049
1050 int
1051 rte_eth_vhost_get_queue_event(uint16_t port_id,
1052                 struct rte_eth_vhost_queue_event *event)
1053 {
1054         struct rte_vhost_vring_state *state;
1055         unsigned int i;
1056         int idx;
1057
1058         if (port_id >= RTE_MAX_ETHPORTS) {
1059                 VHOST_LOG(ERR, "Invalid port id\n");
1060                 return -1;
1061         }
1062
1063         state = vring_states[port_id];
1064         if (!state) {
1065                 VHOST_LOG(ERR, "Unused port\n");
1066                 return -1;
1067         }
1068
1069         rte_spinlock_lock(&state->lock);
1070         for (i = 0; i <= state->max_vring; i++) {
1071                 idx = state->index++ % (state->max_vring + 1);
1072
1073                 if (state->cur[idx] != state->seen[idx]) {
1074                         state->seen[idx] = state->cur[idx];
1075                         event->queue_id = idx / 2;
1076                         event->rx = idx & 1;
1077                         event->enable = state->cur[idx];
1078                         rte_spinlock_unlock(&state->lock);
1079                         return 0;
1080                 }
1081         }
1082         rte_spinlock_unlock(&state->lock);
1083
1084         return -1;
1085 }
1086
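/*
 * A minimal usage sketch (not part of this file): an application that
 * registered for RTE_ETH_EVENT_QUEUE_STATE would typically drain all
 * pending events in its callback, since one notification may cover
 * several vring state changes:
 *
 *      struct rte_eth_vhost_queue_event ev;
 *
 *      while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *              printf("queue %u (%s) %s\n", ev.queue_id,
 *                     ev.rx ? "rx" : "tx",
 *                     ev.enable ? "enabled" : "disabled");
 */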
1087 int
1088 rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
1089 {
1090         struct internal_list *list;
1091         struct rte_eth_dev *eth_dev;
1092         struct vhost_queue *vq;
1093         int vid = -1;
1094
1095         if (!rte_eth_dev_is_valid_port(port_id))
1096                 return -1;
1097
1098         pthread_mutex_lock(&internal_list_lock);
1099
1100         TAILQ_FOREACH(list, &internal_list, next) {
1101                 eth_dev = list->eth_dev;
1102                 if (eth_dev->data->port_id == port_id) {
1103                         vq = eth_dev->data->rx_queues[0];
1104                         if (vq)
1105                                 vid = vq->vid;
1107                         break;
1108                 }
1109         }
1110
1111         pthread_mutex_unlock(&internal_list_lock);
1112
1113         return vid;
1114 }
1115
1116 static int
1117 eth_dev_configure(struct rte_eth_dev *dev)
1118 {
1119         struct pmd_internal *internal = dev->data->dev_private;
1120         const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;
1121
1122         /* NOTE: the same process has to operate a vhost interface
1123          * from beginning to end (from eth_dev configure to eth_dev close).
1124          * It is the user's responsibility at the moment.
1125          */
1126         if (vhost_driver_setup(dev) < 0)
1127                 return -1;
1128
1129         internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);
1130
1131         return 0;
1132 }
1133
1134 static int
1135 eth_dev_start(struct rte_eth_dev *eth_dev)
1136 {
1137         struct pmd_internal *internal = eth_dev->data->dev_private;
1138         struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
1139
1140         queue_setup(eth_dev, internal);
1141
1142         if (rte_atomic32_read(&internal->dev_attached) == 1) {
1143                 if (dev_conf->intr_conf.rxq) {
1144                         if (eth_vhost_install_intr(eth_dev) < 0) {
1145                                 VHOST_LOG(ERR,
1146                                         "Failed to install interrupt handler.\n");
1147                                 return -1;
1148                         }
1149                 }
1150         }
1151
1152         rte_atomic32_set(&internal->started, 1);
1153         update_queuing_status(eth_dev);
1154
1155         return 0;
1156 }
1157
1158 static void
1159 eth_dev_stop(struct rte_eth_dev *dev)
1160 {
1161         struct pmd_internal *internal = dev->data->dev_private;
1162
1163         rte_atomic32_set(&internal->started, 0);
1164         update_queuing_status(dev);
1165 }
1166
1167 static void
1168 eth_dev_close(struct rte_eth_dev *dev)
1169 {
1170         struct pmd_internal *internal;
1171         struct internal_list *list;
1172         unsigned int i;
1173
1174         internal = dev->data->dev_private;
1175         if (!internal)
1176                 return;
1177
1178         eth_dev_stop(dev);
1179
1180         list = find_internal_resource(internal->iface_name);
1181         if (list) {
1182                 rte_vhost_driver_unregister(internal->iface_name);
1183                 pthread_mutex_lock(&internal_list_lock);
1184                 TAILQ_REMOVE(&internal_list, list, next);
1185                 pthread_mutex_unlock(&internal_list_lock);
1186                 rte_free(list);
1187         }
1188
1189         if (dev->data->rx_queues)
1190                 for (i = 0; i < dev->data->nb_rx_queues; i++)
1191                         rte_free(dev->data->rx_queues[i]);
1192
1193         if (dev->data->tx_queues)
1194                 for (i = 0; i < dev->data->nb_tx_queues; i++)
1195                         rte_free(dev->data->tx_queues[i]);
1196
1197         rte_free(internal->iface_name);
1198         rte_free(internal);
1199
1200         dev->data->dev_private = NULL;
1201
1202         rte_free(vring_states[dev->data->port_id]);
1203         vring_states[dev->data->port_id] = NULL;
1204 }
1205
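/*
 * Queue setup. Virtqueue numbering follows the virtio convention of
 * paired rings per queue: an ethdev RX queue maps to the guest TX ring
 * (id * VIRTIO_QNUM + VIRTIO_TXQ) and an ethdev TX queue to the guest
 * RX ring (id * VIRTIO_QNUM + VIRTIO_RXQ).
 */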
1206 static int
1207 eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
1208                    uint16_t nb_rx_desc __rte_unused,
1209                    unsigned int socket_id,
1210                    const struct rte_eth_rxconf *rx_conf __rte_unused,
1211                    struct rte_mempool *mb_pool)
1212 {
1213         struct vhost_queue *vq;
1214
1215         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1216                         RTE_CACHE_LINE_SIZE, socket_id);
1217         if (vq == NULL) {
1218                 VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
1219                 return -ENOMEM;
1220         }
1221
1222         vq->mb_pool = mb_pool;
1223         vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
1224         rte_spinlock_init(&vq->intr_lock);
1225         dev->data->rx_queues[rx_queue_id] = vq;
1226
1227         return 0;
1228 }
1229
1230 static int
1231 eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
1232                    uint16_t nb_tx_desc __rte_unused,
1233                    unsigned int socket_id,
1234                    const struct rte_eth_txconf *tx_conf __rte_unused)
1235 {
1236         struct vhost_queue *vq;
1237
1238         vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
1239                         RTE_CACHE_LINE_SIZE, socket_id);
1240         if (vq == NULL) {
1241                 VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
1242                 return -ENOMEM;
1243         }
1244
1245         vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
1246         rte_spinlock_init(&vq->intr_lock);
1247         dev->data->tx_queues[tx_queue_id] = vq;
1248
1249         return 0;
1250 }
1251
1252 static int
1253 eth_dev_info(struct rte_eth_dev *dev,
1254              struct rte_eth_dev_info *dev_info)
1255 {
1256         struct pmd_internal *internal;
1257
1258         internal = dev->data->dev_private;
1259         if (internal == NULL) {
1260                 VHOST_LOG(ERR, "Invalid device specified\n");
1261                 return -ENODEV;
1262         }
1263
1264         dev_info->max_mac_addrs = 1;
1265         dev_info->max_rx_pktlen = (uint32_t)-1;
1266         dev_info->max_rx_queues = internal->max_queues;
1267         dev_info->max_tx_queues = internal->max_queues;
1268         dev_info->min_rx_bufsize = 0;
1269
1270         dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
1271                                 DEV_TX_OFFLOAD_VLAN_INSERT;
1272         dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;
1273
1274         return 0;
1275 }
1276
1277 static int
1278 eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
1279 {
1280         unsigned i;
1281         unsigned long rx_total = 0, tx_total = 0;
1282         unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
1283         struct vhost_queue *vq;
1284
1285         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1286                         i < dev->data->nb_rx_queues; i++) {
1287                 if (dev->data->rx_queues[i] == NULL)
1288                         continue;
1289                 vq = dev->data->rx_queues[i];
1290                 stats->q_ipackets[i] = vq->stats.pkts;
1291                 rx_total += stats->q_ipackets[i];
1292
1293                 stats->q_ibytes[i] = vq->stats.bytes;
1294                 rx_total_bytes += stats->q_ibytes[i];
1295         }
1296
1297         for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
1298                         i < dev->data->nb_tx_queues; i++) {
1299                 if (dev->data->tx_queues[i] == NULL)
1300                         continue;
1301                 vq = dev->data->tx_queues[i];
1302                 stats->q_opackets[i] = vq->stats.pkts;
1303                 tx_total += stats->q_opackets[i];
1304
1305                 stats->q_obytes[i] = vq->stats.bytes;
1306                 tx_total_bytes += stats->q_obytes[i];
1307         }
1308
1309         stats->ipackets = rx_total;
1310         stats->opackets = tx_total;
1311         stats->ibytes = rx_total_bytes;
1312         stats->obytes = tx_total_bytes;
1313
1314         return 0;
1315 }
1316
1317 static int
1318 eth_stats_reset(struct rte_eth_dev *dev)
1319 {
1320         struct vhost_queue *vq;
1321         unsigned i;
1322
1323         for (i = 0; i < dev->data->nb_rx_queues; i++) {
1324                 if (dev->data->rx_queues[i] == NULL)
1325                         continue;
1326                 vq = dev->data->rx_queues[i];
1327                 vq->stats.pkts = 0;
1328                 vq->stats.bytes = 0;
1329         }
1330         for (i = 0; i < dev->data->nb_tx_queues; i++) {
1331                 if (dev->data->tx_queues[i] == NULL)
1332                         continue;
1333                 vq = dev->data->tx_queues[i];
1334                 vq->stats.pkts = 0;
1335                 vq->stats.bytes = 0;
1336                 vq->stats.missed_pkts = 0;
1337         }
1338
1339         return 0;
1340 }
1341
1342 static void
1343 eth_queue_release(void *q)
1344 {
1345         rte_free(q);
1346 }
1347
1348 static int
1349 eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
1350 {
1351         /*
1352          * vHost does not hang onto mbufs: eth_vhost_tx() copies the packet
1353          * data and releases the mbuf, so there is nothing to clean up.
1354          */
1355         return 0;
1356 }
1357
1358 static int
1359 eth_link_update(struct rte_eth_dev *dev __rte_unused,
1360                 int wait_to_complete __rte_unused)
1361 {
1362         return 0;
1363 }
1364
1365 static uint32_t
1366 eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
1367 {
1368         struct vhost_queue *vq;
1369
1370         vq = dev->data->rx_queues[rx_queue_id];
1371         if (vq == NULL)
1372                 return 0;
1373
1374         return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
1375 }
1376
1377 static const struct eth_dev_ops ops = {
1378         .dev_start = eth_dev_start,
1379         .dev_stop = eth_dev_stop,
1380         .dev_close = eth_dev_close,
1381         .dev_configure = eth_dev_configure,
1382         .dev_infos_get = eth_dev_info,
1383         .rx_queue_setup = eth_rx_queue_setup,
1384         .tx_queue_setup = eth_tx_queue_setup,
1385         .rx_queue_release = eth_queue_release,
1386         .tx_queue_release = eth_queue_release,
1387         .tx_done_cleanup = eth_tx_done_cleanup,
1388         .rx_queue_count = eth_rx_queue_count,
1389         .link_update = eth_link_update,
1390         .stats_get = eth_stats_get,
1391         .stats_reset = eth_stats_reset,
1392         .xstats_reset = vhost_dev_xstats_reset,
1393         .xstats_get = vhost_dev_xstats_get,
1394         .xstats_get_names = vhost_dev_xstats_get_names,
1395         .rx_queue_intr_enable = eth_rxq_intr_enable,
1396         .rx_queue_intr_disable = eth_rxq_intr_disable,
1397 };
1398
1399 static int
1400 eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
1401         int16_t queues, const unsigned int numa_node, uint64_t flags,
1402         uint64_t disable_flags)
1403 {
1404         const char *name = rte_vdev_device_name(dev);
1405         struct rte_eth_dev_data *data;
1406         struct pmd_internal *internal = NULL;
1407         struct rte_eth_dev *eth_dev = NULL;
1408         struct rte_ether_addr *eth_addr = NULL;
1409
1410         VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
1411                 numa_node);
1412
1413         /* reserve an ethdev entry */
1414         eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
1415         if (eth_dev == NULL)
1416                 goto error;
1417         data = eth_dev->data;
1418
1419         eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
1420         if (eth_addr == NULL)
1421                 goto error;
1422         data->mac_addrs = eth_addr;
1423         *eth_addr = base_eth_addr;
1424         eth_addr->addr_bytes[5] = eth_dev->data->port_id;
1425
1426         /* now put it all together
1427          * - store the interface name and queue counts in internal,
1428          * - fill in eth_dev_data (MAC address, link state, flags),
1429          * - and hook up dev_ops and the rx/tx burst functions
1430          */
1431         internal = eth_dev->data->dev_private;
1432         internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
1433                                                  0, numa_node);
1434         if (internal->iface_name == NULL)
1435                 goto error;
1436         strcpy(internal->iface_name, iface_name);
1437
1438         data->nb_rx_queues = queues;
1439         data->nb_tx_queues = queues;
1440         internal->max_queues = queues;
1441         internal->vid = -1;
1442         internal->flags = flags;
1443         internal->disable_flags = disable_flags;
1444         data->dev_link = pmd_link;
1445         data->dev_flags = RTE_ETH_DEV_INTR_LSC | RTE_ETH_DEV_CLOSE_REMOVE;
1446         data->promiscuous = 1;
1447         data->all_multicast = 1;
1448
1449         eth_dev->dev_ops = &ops;
1450
1451         /* finally assign rx and tx ops */
1452         eth_dev->rx_pkt_burst = eth_vhost_rx;
1453         eth_dev->tx_pkt_burst = eth_vhost_tx;
1454
1455         rte_eth_dev_probing_finish(eth_dev);
1456         return 0;
1457
1458 error:
1459         if (internal)
1460                 rte_free(internal->iface_name);
1461         rte_eth_dev_release_port(eth_dev);
1462
1463         return -1;
1464 }
1465
1466 static inline int
1467 open_iface(const char *key __rte_unused, const char *value, void *extra_args)
1468 {
1469         const char **iface_name = extra_args;
1470
1471         if (value == NULL)
1472                 return -1;
1473
1474         *iface_name = value;
1475
1476         return 0;
1477 }
1478
1479 static inline int
1480 open_int(const char *key __rte_unused, const char *value, void *extra_args)
1481 {
1482         uint16_t *n = extra_args;
1483
1484         if (value == NULL || extra_args == NULL)
1485                 return -EINVAL;
1486
1487         *n = (uint16_t)strtoul(value, NULL, 0);
1488         if (*n == USHRT_MAX && errno == ERANGE)
1489                 return -1;
1490
1491         return 0;
1492 }
1493
1494 static int
1495 rte_pmd_vhost_probe(struct rte_vdev_device *dev)
1496 {
1497         struct rte_kvargs *kvlist = NULL;
1498         int ret = 0;
1499         char *iface_name;
1500         uint16_t queues;
1501         uint64_t flags = 0;
1502         uint64_t disable_flags = 0;
1503         int client_mode = 0;
1504         int dequeue_zero_copy = 0;
1505         int iommu_support = 0;
1506         int postcopy_support = 0;
1507         int tso = 0;
1508         int linear_buf = 0;
1509         int ext_buf = 0;
1510         struct rte_eth_dev *eth_dev;
1511         const char *name = rte_vdev_device_name(dev);
1512
1513         VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);
1514
1515         if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1516                 eth_dev = rte_eth_dev_attach_secondary(name);
1517                 if (!eth_dev) {
1518                         VHOST_LOG(ERR, "Failed to probe %s\n", name);
1519                         return -1;
1520                 }
1521                 eth_dev->rx_pkt_burst = eth_vhost_rx;
1522                 eth_dev->tx_pkt_burst = eth_vhost_tx;
1523                 eth_dev->dev_ops = &ops;
1524                 if (dev->device.numa_node == SOCKET_ID_ANY)
1525                         dev->device.numa_node = rte_socket_id();
1526                 eth_dev->device = &dev->device;
1527                 rte_eth_dev_probing_finish(eth_dev);
1528                 return 0;
1529         }
1530
1531         kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
1532         if (kvlist == NULL)
1533                 return -1;
1534
1535         if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
1536                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
1537                                          &open_iface, &iface_name);
1538                 if (ret < 0)
1539                         goto out_free;
1540         } else {
1541                 ret = -1;
1542                 goto out_free;
1543         }
1544
1545         if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
1546                 ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
1547                                          &open_int, &queues);
1548                 if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT) {
1549                         ret = -1;
1550                         goto out_free;
1551                 }
1552
1551         } else
1552                 queues = 1;
1553
1554         if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
1555                 ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
1556                                          &open_int, &client_mode);
1557                 if (ret < 0)
1558                         goto out_free;
1559
1560                 if (client_mode)
1561                         flags |= RTE_VHOST_USER_CLIENT;
1562         }
1563
1564         if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
1565                 ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
1566                                          &open_int, &dequeue_zero_copy);
1567                 if (ret < 0)
1568                         goto out_free;
1569
1570                 if (dequeue_zero_copy)
1571                         flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
1572         }
1573
1574         if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
1575                 ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
1576                                          &open_int, &iommu_support);
1577                 if (ret < 0)
1578                         goto out_free;
1579
1580                 if (iommu_support)
1581                         flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
1582         }
1583
1584         if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
1585                 ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
1586                                          &open_int, &postcopy_support);
1587                 if (ret < 0)
1588                         goto out_free;
1589
1590                 if (postcopy_support)
1591                         flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
1592         }
1593
1594         if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
1595                 ret = rte_kvargs_process(kvlist,
1596                                 ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
1597                                 &open_int, &tso);
1598                 if (ret < 0)
1599                         goto out_free;
1600
1601                 if (tso == 0) {
1602                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
1603                         disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
1604                 }
1605         }
1606
1607         if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
1608                 ret = rte_kvargs_process(kvlist,
1609                                 ETH_VHOST_LINEAR_BUF,
1610                                 &open_int, &linear_buf);
1611                 if (ret < 0)
1612                         goto out_free;
1613
1614                 if (linear_buf == 1)
1615                         flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
1616         }
1617
1618         if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
1619                 ret = rte_kvargs_process(kvlist,
1620                                 ETH_VHOST_EXT_BUF,
1621                                 &open_int, &ext_buf);
1622                 if (ret < 0)
1623                         goto out_free;
1624
1625                 if (ext_buf == 1)
1626                         flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
1627         }
1628
1629         if (dev->device.numa_node == SOCKET_ID_ANY)
1630                 dev->device.numa_node = rte_socket_id();
1631
1632         ret = eth_dev_vhost_create(dev, iface_name, queues,
1633                                    dev->device.numa_node, flags, disable_flags);
1634         if (ret == -1)
1635                 VHOST_LOG(ERR, "Failed to create %s\n", name);
1636
1637 out_free:
1638         rte_kvargs_free(kvlist);
1639         return ret;
1640 }
1641
1642 static int
1643 rte_pmd_vhost_remove(struct rte_vdev_device *dev)
1644 {
1645         const char *name;
1646         struct rte_eth_dev *eth_dev = NULL;
1647
1648         name = rte_vdev_device_name(dev);
1649         VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);
1650
1651         /* find an ethdev entry */
1652         eth_dev = rte_eth_dev_allocated(name);
1653         if (eth_dev == NULL)
1654                 return 0;
1655
1656         if (rte_eal_process_type() != RTE_PROC_PRIMARY)
1657                 return rte_eth_dev_release_port(eth_dev);
1658
1659         eth_dev_close(eth_dev);
1660
1661         rte_eth_dev_release_port(eth_dev);
1662
1663         return 0;
1664 }
1665
1666 static struct rte_vdev_driver pmd_vhost_drv = {
1667         .probe = rte_pmd_vhost_probe,
1668         .remove = rte_pmd_vhost_remove,
1669 };
1670
1671 RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
1672 RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
1673 RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
1674         "iface=<ifc> "
1675         "queues=<int> "
1676         "client=<0|1> "
1677         "dequeue-zero-copy=<0|1> "
1678         "iommu-support=<0|1> "
1679         "postcopy-support=<0|1> "
1680         "tso=<0|1> "
1681         "linear-buffer=<0|1> "
1682         "ext-buffer=<0|1>");