/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016 IGEL Co., Ltd.
 * Copyright(c) 2016-2018 Intel Corporation
 */
#include <errno.h>
#include <limits.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/epoll.h>

#include <rte_mbuf.h>
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_memcpy.h>
#include <rte_bus_vdev.h>
#include <rte_kvargs.h>
#include <rte_vhost.h>
#include <rte_spinlock.h>

#include "rte_eth_vhost.h"

RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);

#define VHOST_LOG(level, ...) \
        rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)

enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

#define ETH_VHOST_IFACE_ARG             "iface"
#define ETH_VHOST_QUEUES_ARG            "queues"
#define ETH_VHOST_CLIENT_ARG            "client"
#define ETH_VHOST_IOMMU_SUPPORT         "iommu-support"
#define ETH_VHOST_POSTCOPY_SUPPORT      "postcopy-support"
#define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
#define ETH_VHOST_LINEAR_BUF            "linear-buffer"
#define ETH_VHOST_EXT_BUF               "ext-buffer"
#define VHOST_MAX_PKT_BURST 32

static const char *valid_arguments[] = {
        ETH_VHOST_IFACE_ARG,
        ETH_VHOST_QUEUES_ARG,
        ETH_VHOST_CLIENT_ARG,
        ETH_VHOST_IOMMU_SUPPORT,
        ETH_VHOST_POSTCOPY_SUPPORT,
        ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
        ETH_VHOST_LINEAR_BUF,
        ETH_VHOST_EXT_BUF,
        NULL
};

static struct rte_ether_addr base_eth_addr = {
        .addr_bytes = {
                0x56 /* V */,
                0x48 /* H */,
                0x4F /* O */,
                0x53 /* S */,
                0x54 /* T */,
                0x00
        }
};

enum vhost_xstats_pkts {
        VHOST_UNDERSIZE_PKT = 0,
        VHOST_64_PKT,
        VHOST_65_TO_127_PKT,
        VHOST_128_TO_255_PKT,
        VHOST_256_TO_511_PKT,
        VHOST_512_TO_1023_PKT,
        VHOST_1024_TO_1522_PKT,
        VHOST_1523_TO_MAX_PKT,
        VHOST_BROADCAST_PKT,
        VHOST_MULTICAST_PKT,
        VHOST_UNICAST_PKT,
        VHOST_PKT,
        VHOST_BYTE,
        VHOST_MISSED_PKT,
        VHOST_ERRORS_PKT,
        VHOST_ERRORS_FRAGMENTED,
        VHOST_ERRORS_JABBER,
        VHOST_UNKNOWN_PROTOCOL,
        VHOST_XSTATS_MAX,
};

struct vhost_stats {
        uint64_t pkts;
        uint64_t bytes;
        uint64_t missed_pkts;
        uint64_t xstats[VHOST_XSTATS_MAX];
};

struct vhost_queue {
        int vid;
        rte_atomic32_t allow_queuing;
        rte_atomic32_t while_queuing;
        struct pmd_internal *internal;
        struct rte_mempool *mb_pool;
        uint16_t port;
        uint16_t virtqueue_id;
        struct vhost_stats stats;
        int intr_enable;
        rte_spinlock_t intr_lock;
};

struct pmd_internal {
        rte_atomic32_t dev_attached;
        char *iface_name;
        uint64_t flags;
        uint64_t disable_flags;
        uint16_t max_queues;
        int vid;
        rte_atomic32_t started;
        uint8_t vlan_strip;
};

struct internal_list {
        TAILQ_ENTRY(internal_list) next;
        struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
        TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct rte_eth_link pmd_link = {
                .link_speed = 10000,
                .link_duplex = ETH_LINK_FULL_DUPLEX,
                .link_status = ETH_LINK_DOWN
};

struct rte_vhost_vring_state {
        rte_spinlock_t lock;

        bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
        bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
        unsigned int index;
        unsigned int max_vring;
};

static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];

#define VHOST_XSTATS_NAME_SIZE 64

struct vhost_xstats_name_off {
        char name[VHOST_XSTATS_NAME_SIZE];
        uint64_t offset;
};

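/*
 * Each entry maps an xstats display name to the byte offset of its
 * counter inside struct vhost_queue. vhost_dev_xstats_get() walks these
 * tables and sums the counter found at "offset" across all Rx (or Tx)
 * queues, so adding a statistic only requires a new enum value and a
 * new table entry.
 */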
/* "rx_" is prepended to the name string here */
static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
        {"good_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
        {"total_bytes",
         offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
        {"missed_pkts",
         offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
        {"broadcast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
        {"multicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
        {"unicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
        {"undersize_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
        {"size_64_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
        {"size_65_to_127_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
        {"size_128_to_255_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
        {"size_256_to_511_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
        {"size_512_to_1023_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
        {"size_1024_to_1522_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
        {"size_1523_to_max_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
        {"errors_with_bad_CRC",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
        {"fragmented_errors",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
        {"jabber_errors",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
        {"unknown_protos_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
};

/* "tx_" is prepended to the name string here */
static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
        {"good_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_PKT])},
        {"total_bytes",
         offsetof(struct vhost_queue, stats.xstats[VHOST_BYTE])},
        {"missed_pkts",
         offsetof(struct vhost_queue, stats.xstats[VHOST_MISSED_PKT])},
        {"broadcast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
        {"multicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
        {"unicast_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
        {"undersize_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
        {"size_64_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
        {"size_65_to_127_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
        {"size_128_to_255_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
        {"size_256_to_511_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
        {"size_512_to_1023_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
        {"size_1024_to_1522_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
        {"size_1523_to_max_packets",
         offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
        {"errors_with_bad_CRC",
         offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
};

#define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
                                sizeof(vhost_rxport_stat_strings[0]))

#define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
                                sizeof(vhost_txport_stat_strings[0]))

static int
vhost_dev_xstats_reset(struct rte_eth_dev *dev)
{
        struct vhost_queue *vq = NULL;
        unsigned int i = 0;

        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                vq = dev->data->rx_queues[i];
                if (!vq)
                        continue;
                memset(&vq->stats, 0, sizeof(vq->stats));
        }
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                vq = dev->data->tx_queues[i];
                if (!vq)
                        continue;
                memset(&vq->stats, 0, sizeof(vq->stats));
        }

        return 0;
}

static int
vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
                           struct rte_eth_xstat_name *xstats_names,
                           unsigned int limit __rte_unused)
{
        unsigned int t = 0;
        int count = 0;
        int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;

        if (!xstats_names)
                return nstats;
        for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
                snprintf(xstats_names[count].name,
                         sizeof(xstats_names[count].name),
                         "rx_%s", vhost_rxport_stat_strings[t].name);
                count++;
        }
        for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
                snprintf(xstats_names[count].name,
                         sizeof(xstats_names[count].name),
                         "tx_%s", vhost_txport_stat_strings[t].name);
                count++;
        }
        return count;
}

static int
vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
                     unsigned int n)
{
        unsigned int i;
        unsigned int t;
        unsigned int count = 0;
        struct vhost_queue *vq = NULL;
        unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;

        if (n < nxstats)
                return nxstats;

        for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
                xstats[count].value = 0;
                for (i = 0; i < dev->data->nb_rx_queues; i++) {
                        vq = dev->data->rx_queues[i];
                        if (!vq)
                                continue;
                        xstats[count].value +=
                                *(uint64_t *)(((char *)vq)
                                + vhost_rxport_stat_strings[t].offset);
                }
                xstats[count].id = count;
                count++;
        }
        for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
                xstats[count].value = 0;
                for (i = 0; i < dev->data->nb_tx_queues; i++) {
                        vq = dev->data->tx_queues[i];
                        if (!vq)
                                continue;
                        xstats[count].value +=
                                *(uint64_t *)(((char *)vq)
                                + vhost_txport_stat_strings[t].offset);
                }
                xstats[count].id = count;
                count++;
        }
        return count;
}

static inline void
vhost_count_xcast_packets(struct vhost_queue *vq,
                                struct rte_mbuf *mbuf)
{
        struct rte_ether_addr *ea = NULL;
        struct vhost_stats *pstats = &vq->stats;

        ea = rte_pktmbuf_mtod(mbuf, struct rte_ether_addr *);
        if (rte_is_multicast_ether_addr(ea)) {
                if (rte_is_broadcast_ether_addr(ea))
                        pstats->xstats[VHOST_BROADCAST_PKT]++;
                else
                        pstats->xstats[VHOST_MULTICAST_PKT]++;
        } else {
                pstats->xstats[VHOST_UNICAST_PKT]++;
        }
}

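/*
 * Sort a burst into RFC 2819-style size buckets. For lengths strictly
 * between 64 and 1024 the bucket index is floor(log2(len)) - 4,
 * computed as 32 - clz(len) - 5: 65..127 maps to VHOST_65_TO_127_PKT,
 * 128..255 to VHOST_128_TO_255_PKT, and so on. This relies on the
 * vhost_xstats_pkts enumerators being declared in exactly that order.
 */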
static void
vhost_update_packet_xstats(struct vhost_queue *vq, struct rte_mbuf **bufs,
                           uint16_t count, uint64_t nb_bytes,
                           uint64_t nb_missed)
{
        uint32_t pkt_len = 0;
        uint64_t i = 0;
        uint64_t index;
        struct vhost_stats *pstats = &vq->stats;

        pstats->xstats[VHOST_BYTE] += nb_bytes;
        pstats->xstats[VHOST_MISSED_PKT] += nb_missed;
        /*
         * The cast type (unicast/multicast/broadcast) of missed packets
         * is counted per packet by the caller (see the RFC 2863 loop in
         * eth_vhost_tx()), so it must not also be lumped in here as
         * unicast, or unicast drops would be counted twice.
         */

        for (i = 0; i < count; i++) {
                pstats->xstats[VHOST_PKT]++;
                pkt_len = bufs[i]->pkt_len;
                if (pkt_len == 64) {
                        pstats->xstats[VHOST_64_PKT]++;
                } else if (pkt_len > 64 && pkt_len < 1024) {
                        index = (sizeof(pkt_len) * 8)
                                - __builtin_clz(pkt_len) - 5;
                        pstats->xstats[index]++;
                } else {
                        if (pkt_len < 64)
                                pstats->xstats[VHOST_UNDERSIZE_PKT]++;
                        else if (pkt_len <= 1522)
                                pstats->xstats[VHOST_1024_TO_1522_PKT]++;
                        else if (pkt_len > 1522)
                                pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
                }
                vhost_count_xcast_packets(vq, bufs[i]);
        }
}

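/*
 * Rx burst: packets the guest transmits show up on its TX virtqueue,
 * so the host Rx path dequeues from vq->virtqueue_id (an odd-numbered
 * guest TX ring) into mbufs allocated from mb_pool. The
 * allow_queuing/while_queuing pair lets control paths fence out this
 * fast path; see update_queuing_status().
 */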
static uint16_t
eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
        struct vhost_queue *r = q;
        uint16_t i, nb_rx = 0;
        uint16_t nb_receive = nb_bufs;
        uint64_t nb_bytes = 0;

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                return 0;

        rte_atomic32_set(&r->while_queuing, 1);

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                goto out;

        /* Dequeue packets from guest TX queue */
        while (nb_receive) {
                uint16_t nb_pkts;
                uint16_t num = (uint16_t)RTE_MIN(nb_receive,
                                                 VHOST_MAX_PKT_BURST);

                nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
                                                  r->mb_pool, &bufs[nb_rx],
                                                  num);

                nb_rx += nb_pkts;
                nb_receive -= nb_pkts;
                if (nb_pkts < num)
                        break;
        }

        r->stats.pkts += nb_rx;

        for (i = 0; likely(i < nb_rx); i++) {
                bufs[i]->port = r->port;
                bufs[i]->vlan_tci = 0;

                if (r->internal->vlan_strip)
                        rte_vlan_strip(bufs[i]);

                nb_bytes += bufs[i]->pkt_len;
        }

        r->stats.bytes += nb_bytes;
        vhost_update_packet_xstats(r, bufs, nb_rx, nb_bytes, 0);

out:
        rte_atomic32_set(&r->while_queuing, 0);

        return nb_rx;
}

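/*
 * Tx burst: the mirror of eth_vhost_rx(). After optional VLAN tag
 * insertion, packets are enqueued onto the guest's RX virtqueue (an
 * even-numbered ring); whatever the guest cannot take is dropped and
 * accounted as missed.
 */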
static uint16_t
eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
        struct vhost_queue *r = q;
        uint16_t i, nb_tx = 0;
        uint16_t nb_send = 0;
        uint64_t nb_bytes = 0;
        uint64_t nb_missed = 0;

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                return 0;

        rte_atomic32_set(&r->while_queuing, 1);

        if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
                goto out;

        for (i = 0; i < nb_bufs; i++) {
                struct rte_mbuf *m = bufs[i];

                /* Do VLAN tag insertion */
                if (m->ol_flags & PKT_TX_VLAN_PKT) {
                        int error = rte_vlan_insert(&m);
                        if (unlikely(error)) {
                                rte_pktmbuf_free(m);
                                continue;
                        }
                }

                bufs[nb_send] = m;
                ++nb_send;
        }

        /* Enqueue packets to guest RX queue */
        while (nb_send) {
                uint16_t nb_pkts;
                uint16_t num = (uint16_t)RTE_MIN(nb_send,
                                                 VHOST_MAX_PKT_BURST);

                nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
                                                  &bufs[nb_tx], num);

                nb_tx += nb_pkts;
                nb_send -= nb_pkts;
                if (nb_pkts < num)
                        break;
        }

        for (i = 0; likely(i < nb_tx); i++)
                nb_bytes += bufs[i]->pkt_len;

        nb_missed = nb_bufs - nb_tx;

        r->stats.pkts += nb_tx;
        r->stats.bytes += nb_bytes;
        r->stats.missed_pkts += nb_missed;

        vhost_update_packet_xstats(r, bufs, nb_tx, nb_bytes, nb_missed);

        /* According to RFC2863, ifHCOutUcastPkts, ifHCOutMulticastPkts and
         * ifHCOutBroadcastPkts counters are increased when packets are not
         * transmitted successfully.
         */
        for (i = nb_tx; i < nb_bufs; i++)
                vhost_count_xcast_packets(r, bufs[i]);

        for (i = 0; likely(i < nb_tx); i++)
                rte_pktmbuf_free(bufs[i]);
out:
        rte_atomic32_set(&r->while_queuing, 0);

        return nb_tx;
}

static inline struct internal_list *
find_internal_resource(char *ifname)
{
        int found = 0;
        struct internal_list *list;
        struct pmd_internal *internal;

        if (ifname == NULL)
                return NULL;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                internal = list->eth_dev->data->dev_private;
                if (!strcmp(internal->iface_name, ifname)) {
                        found = 1;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        if (!found)
                return NULL;

        return list;
}

static int
eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)
{
        struct rte_intr_handle *handle = eth_dev->intr_handle;
        struct rte_epoll_event rev;
        int epfd, ret;

        if (!handle)
                return 0;

        if (handle->efds[rxq_idx] == handle->elist[rxq_idx].fd)
                return 0;

        VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",
                        rxq_idx);

        if (handle->elist[rxq_idx].fd != -1)
                VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",
                                handle->elist[rxq_idx].fd);

        /*
         * First remove invalid epoll event, and then install
         * the new one. May be solved with a proper API in the
         * future.
         */
        epfd = handle->elist[rxq_idx].epfd;
        rev = handle->elist[rxq_idx];
        ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
                        &handle->elist[rxq_idx]);
        if (ret) {
                VHOST_LOG(ERR, "Delete epoll event failed.\n");
                return ret;
        }

        rev.fd = handle->efds[rxq_idx];
        handle->elist[rxq_idx] = rev;
        ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd,
                        &handle->elist[rxq_idx]);
        if (ret) {
                VHOST_LOG(ERR, "Add epoll event failed.\n");
                return ret;
        }

        return 0;
}

static int
eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)
{
        struct rte_vhost_vring vring;
        struct vhost_queue *vq;
        int old_intr_enable, ret = 0;

        vq = dev->data->rx_queues[qid];
        if (!vq) {
                VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
                return -1;
        }

        rte_spinlock_lock(&vq->intr_lock);
        old_intr_enable = vq->intr_enable;
        vq->intr_enable = 1;
        ret = eth_vhost_update_intr(dev, qid);
        rte_spinlock_unlock(&vq->intr_lock);

        if (ret < 0) {
                VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
                vq->intr_enable = old_intr_enable;
                return ret;
        }

        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
        if (ret < 0) {
                VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
                return ret;
        }
        VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);
        rte_wmb();

        return ret;
}

static int
eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)
{
        struct rte_vhost_vring vring;
        struct vhost_queue *vq;
        int ret = 0;

        vq = dev->data->rx_queues[qid];
        if (!vq) {
                VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);
                return -1;
        }

        ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
        if (ret < 0) {
                VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);
                return ret;
        }
        VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
        rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
        rte_wmb();

        vq->intr_enable = 0;

        return 0;
}

static void
eth_vhost_uninstall_intr(struct rte_eth_dev *dev)
{
        struct rte_intr_handle *intr_handle = dev->intr_handle;

        if (intr_handle) {
                free(intr_handle->intr_vec);
                free(intr_handle);
        }

        dev->intr_handle = NULL;
}

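/*
 * Rx interrupt support: each vring's kickfd (the eventfd the frontend
 * signals when it adds buffers to the ring) is exported through the
 * ethdev interrupt handle so applications can epoll-wait on Rx queues.
 * Queues whose vring is not ready yet are skipped here and picked up
 * later by vring_conf_update().
 */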
static int
eth_vhost_install_intr(struct rte_eth_dev *dev)
{
        struct rte_vhost_vring vring;
        struct vhost_queue *vq;
        int nb_rxq = dev->data->nb_rx_queues;
        int i;
        int ret;

        /* uninstall firstly if we are reconnecting */
        if (dev->intr_handle)
                eth_vhost_uninstall_intr(dev);

        dev->intr_handle = malloc(sizeof(*dev->intr_handle));
        if (!dev->intr_handle) {
                VHOST_LOG(ERR, "Fail to allocate intr_handle\n");
                return -ENOMEM;
        }
        memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));

        dev->intr_handle->efd_counter_size = sizeof(uint64_t);

        dev->intr_handle->intr_vec =
                malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));

        if (!dev->intr_handle->intr_vec) {
                VHOST_LOG(ERR,
                        "Failed to allocate memory for interrupt vector\n");
                free(dev->intr_handle);
                dev->intr_handle = NULL;
                return -ENOMEM;
        }

        VHOST_LOG(INFO, "Prepare intr vec\n");
        for (i = 0; i < nb_rxq; i++) {
                dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
                dev->intr_handle->efds[i] = -1;
                vq = dev->data->rx_queues[i];
                if (!vq) {
                        VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);
                        continue;
                }

                ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
                if (ret < 0) {
                        VHOST_LOG(INFO,
                                "Failed to get rxq-%d's vring, skip!\n", i);
                        continue;
                }

                if (vring.kickfd < 0) {
                        VHOST_LOG(INFO,
                                "rxq-%d's kickfd is invalid, skip!\n", i);
                        continue;
                }
                dev->intr_handle->efds[i] = vring.kickfd;
                VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);
        }

        dev->intr_handle->nb_efd = nb_rxq;
        dev->intr_handle->max_intr = nb_rxq + 1;
        dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;

        return 0;
}

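/*
 * Toggle the per-queue allow_queuing flag and spin until any burst
 * function still inside its critical section (while_queuing == 1)
 * drains out. This handshake is what makes stopping the port or
 * detaching the vhost device safe against concurrent rx/tx_pkt_burst
 * calls.
 */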
static void
update_queuing_status(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal = dev->data->dev_private;
        struct vhost_queue *vq;
        unsigned int i;
        int allow_queuing = 1;

        if (!dev->data->rx_queues || !dev->data->tx_queues)
                return;

        if (rte_atomic32_read(&internal->started) == 0 ||
            rte_atomic32_read(&internal->dev_attached) == 0)
                allow_queuing = 0;

        /* Wait until rx/tx_pkt_burst stops accessing vhost device */
        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                vq = dev->data->rx_queues[i];
                if (vq == NULL)
                        continue;
                rte_atomic32_set(&vq->allow_queuing, allow_queuing);
                while (rte_atomic32_read(&vq->while_queuing))
                        rte_pause();
        }

        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                vq = dev->data->tx_queues[i];
                if (vq == NULL)
                        continue;
                rte_atomic32_set(&vq->allow_queuing, allow_queuing);
                while (rte_atomic32_read(&vq->while_queuing))
                        rte_pause();
        }
}

static void
queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)
{
        struct vhost_queue *vq;
        int i;

        for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
                vq = eth_dev->data->rx_queues[i];
                if (!vq)
                        continue;
                vq->vid = internal->vid;
                vq->internal = internal;
                vq->port = eth_dev->data->port_id;
        }
        for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
                vq = eth_dev->data->tx_queues[i];
                if (!vq)
                        continue;
                vq->vid = internal->vid;
                vq->internal = internal;
                vq->port = eth_dev->data->port_id;
        }
}

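/*
 * Vhost library callback, invoked from the vhost-user protocol thread
 * once a frontend (e.g. QEMU's virtio-net device) completes the
 * connection handshake on this port's socket.
 */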
static int
new_device(int vid)
{
        struct rte_eth_dev *eth_dev;
        struct internal_list *list;
        struct pmd_internal *internal;
        struct rte_eth_conf *dev_conf;
        unsigned i;
        char ifname[PATH_MAX];
#ifdef RTE_LIBRTE_VHOST_NUMA
        int newnode;
#endif

        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
        list = find_internal_resource(ifname);
        if (list == NULL) {
                VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);
                return -1;
        }

        eth_dev = list->eth_dev;
        internal = eth_dev->data->dev_private;
        dev_conf = &eth_dev->data->dev_conf;

#ifdef RTE_LIBRTE_VHOST_NUMA
        newnode = rte_vhost_get_numa_node(vid);
        if (newnode >= 0)
                eth_dev->data->numa_node = newnode;
#endif

        internal->vid = vid;
        if (rte_atomic32_read(&internal->started) == 1) {
                queue_setup(eth_dev, internal);

                if (dev_conf->intr_conf.rxq) {
                        if (eth_vhost_install_intr(eth_dev) < 0) {
                                VHOST_LOG(INFO,
                                        "Failed to install interrupt handler.\n");
                                return -1;
                        }
                }
        } else {
                VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");
        }

        for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
                rte_vhost_enable_guest_notification(vid, i, 0);

        rte_vhost_get_mtu(vid, &eth_dev->data->mtu);

        eth_dev->data->dev_link.link_status = ETH_LINK_UP;

        rte_atomic32_set(&internal->dev_attached, 1);
        update_queuing_status(eth_dev);

        VHOST_LOG(INFO, "Vhost device %d created\n", vid);

        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);

        return 0;
}

static void
destroy_device(int vid)
{
        struct rte_eth_dev *eth_dev;
        struct pmd_internal *internal;
        struct vhost_queue *vq;
        struct internal_list *list;
        char ifname[PATH_MAX];
        unsigned i;
        struct rte_vhost_vring_state *state;

        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
        list = find_internal_resource(ifname);
        if (list == NULL) {
                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
                return;
        }
        eth_dev = list->eth_dev;
        internal = eth_dev->data->dev_private;

        rte_atomic32_set(&internal->dev_attached, 0);
        update_queuing_status(eth_dev);

        eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;

        if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
                for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
                        vq = eth_dev->data->rx_queues[i];
                        if (!vq)
                                continue;
                        vq->vid = -1;
                }
                for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
                        vq = eth_dev->data->tx_queues[i];
                        if (!vq)
                                continue;
                        vq->vid = -1;
                }
        }

        state = vring_states[eth_dev->data->port_id];
        rte_spinlock_lock(&state->lock);
        for (i = 0; i <= state->max_vring; i++) {
                state->cur[i] = false;
                state->seen[i] = false;
        }
        state->max_vring = 0;
        rte_spinlock_unlock(&state->lock);

        VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
        eth_vhost_uninstall_intr(eth_dev);

        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
}

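/*
 * A vring's kickfd can be replaced after the initial new_device
 * notification (e.g. when the guest driver reinitializes the device),
 * so refresh the eventfd tracked by the interrupt handle whenever an
 * Rx vring is re-enabled.
 */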
static int
vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)
{
        struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
        struct pmd_internal *internal = eth_dev->data->dev_private;
        struct vhost_queue *vq;
        struct rte_vhost_vring vring;
        int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;
        int ret = 0;

        /*
         * The vring kickfd may be changed after the new device notification.
         * Update it when the vring state is updated.
         */
        if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
            rte_atomic32_read(&internal->dev_attached) &&
            rte_atomic32_read(&internal->started) &&
            dev_conf->intr_conf.rxq) {
                ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
                if (ret) {
                        VHOST_LOG(ERR, "Failed to get vring %d information.\n",
                                        vring_id);
                        return ret;
                }
                eth_dev->intr_handle->efds[rx_idx] = vring.kickfd;

                vq = eth_dev->data->rx_queues[rx_idx];
                if (!vq) {
                        VHOST_LOG(ERR, "rxq%d is not setup yet\n", rx_idx);
                        return -1;
                }

                rte_spinlock_lock(&vq->intr_lock);
                if (vq->intr_enable)
                        ret = eth_vhost_update_intr(eth_dev, rx_idx);
                rte_spinlock_unlock(&vq->intr_lock);
        }

        return ret;
}

static int
vring_state_changed(int vid, uint16_t vring, int enable)
{
        struct rte_vhost_vring_state *state;
        struct rte_eth_dev *eth_dev;
        struct internal_list *list;
        char ifname[PATH_MAX];

        rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
        list = find_internal_resource(ifname);
        if (list == NULL) {
                VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);
                return -1;
        }

        eth_dev = list->eth_dev;
        /* won't be NULL */
        state = vring_states[eth_dev->data->port_id];

        if (enable && vring_conf_update(vid, eth_dev, vring))
                VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",
                          (int)vring);

        rte_spinlock_lock(&state->lock);
        if (state->cur[vring] == enable) {
                rte_spinlock_unlock(&state->lock);
                return 0;
        }
        state->cur[vring] = enable;
        state->max_vring = RTE_MAX(vring, state->max_vring);
        rte_spinlock_unlock(&state->lock);

        VHOST_LOG(INFO, "vring%u is %s\n",
                        vring, enable ? "enabled" : "disabled");

        rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);

        return 0;
}

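/* Callbacks registered with the vhost library for each socket. */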
static struct vhost_device_ops vhost_ops = {
        .new_device          = new_device,
        .destroy_device      = destroy_device,
        .vring_state_changed = vring_state_changed,
};

static int
vhost_driver_setup(struct rte_eth_dev *eth_dev)
{
        struct pmd_internal *internal = eth_dev->data->dev_private;
        struct internal_list *list = NULL;
        struct rte_vhost_vring_state *vring_state = NULL;
        unsigned int numa_node = eth_dev->device->numa_node;
        const char *name = eth_dev->device->name;

        /* Don't try to set up again if it has already been done. */
        list = find_internal_resource(internal->iface_name);
        if (list)
                return 0;

        list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
        if (list == NULL)
                return -1;

        vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
                                         0, numa_node);
        if (vring_state == NULL)
                goto free_list;

        list->eth_dev = eth_dev;
        pthread_mutex_lock(&internal_list_lock);
        TAILQ_INSERT_TAIL(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);

        rte_spinlock_init(&vring_state->lock);
        vring_states[eth_dev->data->port_id] = vring_state;

        if (rte_vhost_driver_register(internal->iface_name, internal->flags))
                goto list_remove;

        if (internal->disable_flags) {
                if (rte_vhost_driver_disable_features(internal->iface_name,
                                                      internal->disable_flags))
                        goto drv_unreg;
        }

        if (rte_vhost_driver_callback_register(internal->iface_name,
                                               &vhost_ops) < 0) {
                VHOST_LOG(ERR, "Can't register callbacks\n");
                goto drv_unreg;
        }

        if (rte_vhost_driver_start(internal->iface_name) < 0) {
                VHOST_LOG(ERR, "Failed to start driver for %s\n",
                          internal->iface_name);
                goto drv_unreg;
        }

        return 0;

drv_unreg:
        rte_vhost_driver_unregister(internal->iface_name);
list_remove:
        vring_states[eth_dev->data->port_id] = NULL;
        pthread_mutex_lock(&internal_list_lock);
        TAILQ_REMOVE(&internal_list, list, next);
        pthread_mutex_unlock(&internal_list_lock);
        rte_free(vring_state);
free_list:
        rte_free(list);

        return -1;
}

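/*
 * Public helper: report one not-yet-seen vring enable/disable event
 * per call, round-robin across vrings. Even vring indices are guest RX
 * rings (this PMD's Tx side) and odd indices are guest TX rings (this
 * PMD's Rx side), hence queue_id = idx / 2 and rx = idx & 1.
 */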
int
rte_eth_vhost_get_queue_event(uint16_t port_id,
                struct rte_eth_vhost_queue_event *event)
{
        struct rte_vhost_vring_state *state;
        unsigned int i;
        int idx;

        if (port_id >= RTE_MAX_ETHPORTS) {
                VHOST_LOG(ERR, "Invalid port id\n");
                return -1;
        }

        state = vring_states[port_id];
        if (!state) {
                VHOST_LOG(ERR, "Unused port\n");
                return -1;
        }

        rte_spinlock_lock(&state->lock);
        for (i = 0; i <= state->max_vring; i++) {
                idx = state->index++ % (state->max_vring + 1);

                if (state->cur[idx] != state->seen[idx]) {
                        state->seen[idx] = state->cur[idx];
                        event->queue_id = idx / 2;
                        event->rx = idx & 1;
                        event->enable = state->cur[idx];
                        rte_spinlock_unlock(&state->lock);
                        return 0;
                }
        }
        rte_spinlock_unlock(&state->lock);

        return -1;
}

int
rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)
{
        struct internal_list *list;
        struct rte_eth_dev *eth_dev;
        struct vhost_queue *vq;
        int vid = -1;

        if (!rte_eth_dev_is_valid_port(port_id))
                return -1;

        pthread_mutex_lock(&internal_list_lock);

        TAILQ_FOREACH(list, &internal_list, next) {
                eth_dev = list->eth_dev;
                if (eth_dev->data->port_id == port_id) {
                        vq = eth_dev->data->rx_queues[0];
                        if (vq)
                                vid = vq->vid;
                        break;
                }
        }

        pthread_mutex_unlock(&internal_list_lock);

        return vid;
}

static int
eth_dev_configure(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal = dev->data->dev_private;
        const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;

        /* NOTE: the same process has to operate a vhost interface
         * from beginning to end (from eth_dev configure to eth_dev close).
         * It is the user's responsibility at the moment.
         */
        if (vhost_driver_setup(dev) < 0)
                return -1;

        internal->vlan_strip = !!(rxmode->offloads & DEV_RX_OFFLOAD_VLAN_STRIP);

        return 0;
}

static int
eth_dev_start(struct rte_eth_dev *eth_dev)
{
        struct pmd_internal *internal = eth_dev->data->dev_private;
        struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;

        queue_setup(eth_dev, internal);

        if (rte_atomic32_read(&internal->dev_attached) == 1) {
                if (dev_conf->intr_conf.rxq) {
                        if (eth_vhost_install_intr(eth_dev) < 0) {
                                VHOST_LOG(INFO,
                                        "Failed to install interrupt handler.\n");
                                return -1;
                        }
                }
        }

        rte_atomic32_set(&internal->started, 1);
        update_queuing_status(eth_dev);

        return 0;
}

static int
eth_dev_stop(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal = dev->data->dev_private;

        dev->data->dev_started = 0;
        rte_atomic32_set(&internal->started, 0);
        update_queuing_status(dev);

        return 0;
}

static int
eth_dev_close(struct rte_eth_dev *dev)
{
        struct pmd_internal *internal;
        struct internal_list *list;
        unsigned int i;
        int ret;

        if (rte_eal_process_type() != RTE_PROC_PRIMARY)
                return 0;

        internal = dev->data->dev_private;
        if (!internal)
                return 0;

        ret = eth_dev_stop(dev);

        list = find_internal_resource(internal->iface_name);
        if (list) {
                rte_vhost_driver_unregister(internal->iface_name);
                pthread_mutex_lock(&internal_list_lock);
                TAILQ_REMOVE(&internal_list, list, next);
                pthread_mutex_unlock(&internal_list_lock);
                rte_free(list);
        }

        if (dev->data->rx_queues)
                for (i = 0; i < dev->data->nb_rx_queues; i++)
                        rte_free(dev->data->rx_queues[i]);

        if (dev->data->tx_queues)
                for (i = 0; i < dev->data->nb_tx_queues; i++)
                        rte_free(dev->data->tx_queues[i]);

        rte_free(internal->iface_name);
        rte_free(internal);

        dev->data->dev_private = NULL;

        rte_free(vring_states[dev->data->port_id]);
        vring_states[dev->data->port_id] = NULL;

        return ret;
}

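/*
 * Queue setup: ethdev Rx queue i maps to guest TX vring 2i + 1, and
 * ethdev Tx queue i maps to guest RX vring 2i, matching the virtio
 * convention of (RX, TX) ring pairs per queue.
 */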
static int
eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
                   uint16_t nb_rx_desc __rte_unused,
                   unsigned int socket_id,
                   const struct rte_eth_rxconf *rx_conf __rte_unused,
                   struct rte_mempool *mb_pool)
{
        struct vhost_queue *vq;

        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
                        RTE_CACHE_LINE_SIZE, socket_id);
        if (vq == NULL) {
                VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");
                return -ENOMEM;
        }

        vq->mb_pool = mb_pool;
        vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
        rte_spinlock_init(&vq->intr_lock);
        dev->data->rx_queues[rx_queue_id] = vq;

        return 0;
}

static int
eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
                   uint16_t nb_tx_desc __rte_unused,
                   unsigned int socket_id,
                   const struct rte_eth_txconf *tx_conf __rte_unused)
{
        struct vhost_queue *vq;

        vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
                        RTE_CACHE_LINE_SIZE, socket_id);
        if (vq == NULL) {
                VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");
                return -ENOMEM;
        }

        vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
        rte_spinlock_init(&vq->intr_lock);
        dev->data->tx_queues[tx_queue_id] = vq;

        return 0;
}

static int
eth_dev_info(struct rte_eth_dev *dev,
             struct rte_eth_dev_info *dev_info)
{
        struct pmd_internal *internal;

        internal = dev->data->dev_private;
        if (internal == NULL) {
                VHOST_LOG(ERR, "Invalid device specified\n");
                return -ENODEV;
        }

        dev_info->max_mac_addrs = 1;
        dev_info->max_rx_pktlen = (uint32_t)-1;
        dev_info->max_rx_queues = internal->max_queues;
        dev_info->max_tx_queues = internal->max_queues;
        dev_info->min_rx_bufsize = 0;

        dev_info->tx_offload_capa = DEV_TX_OFFLOAD_MULTI_SEGS |
                                DEV_TX_OFFLOAD_VLAN_INSERT;
        dev_info->rx_offload_capa = DEV_RX_OFFLOAD_VLAN_STRIP;

        return 0;
}

static int
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
        unsigned i;
        unsigned long rx_total = 0, tx_total = 0;
        unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
        struct vhost_queue *vq;

        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
                        i < dev->data->nb_rx_queues; i++) {
                if (dev->data->rx_queues[i] == NULL)
                        continue;
                vq = dev->data->rx_queues[i];
                stats->q_ipackets[i] = vq->stats.pkts;
                rx_total += stats->q_ipackets[i];

                stats->q_ibytes[i] = vq->stats.bytes;
                rx_total_bytes += stats->q_ibytes[i];
        }

        for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
                        i < dev->data->nb_tx_queues; i++) {
                if (dev->data->tx_queues[i] == NULL)
                        continue;
                vq = dev->data->tx_queues[i];
                stats->q_opackets[i] = vq->stats.pkts;
                tx_total += stats->q_opackets[i];

                stats->q_obytes[i] = vq->stats.bytes;
                tx_total_bytes += stats->q_obytes[i];
        }

        stats->ipackets = rx_total;
        stats->opackets = tx_total;
        stats->ibytes = rx_total_bytes;
        stats->obytes = tx_total_bytes;

        return 0;
}

static int
eth_stats_reset(struct rte_eth_dev *dev)
{
        struct vhost_queue *vq;
        unsigned i;

        for (i = 0; i < dev->data->nb_rx_queues; i++) {
                if (dev->data->rx_queues[i] == NULL)
                        continue;
                vq = dev->data->rx_queues[i];
                vq->stats.pkts = 0;
                vq->stats.bytes = 0;
        }
        for (i = 0; i < dev->data->nb_tx_queues; i++) {
                if (dev->data->tx_queues[i] == NULL)
                        continue;
                vq = dev->data->tx_queues[i];
                vq->stats.pkts = 0;
                vq->stats.bytes = 0;
                vq->stats.missed_pkts = 0;
        }

        return 0;
}

static void
eth_queue_release(void *q)
{
        rte_free(q);
}

static int
eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
{
        /*
         * vHost does not hang onto mbufs: eth_vhost_tx() copies packet
         * data into the virtqueue and releases the mbufs, so there is
         * nothing to clean up here.
         */
        return 0;
}

static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
                int wait_to_complete __rte_unused)
{
        return 0;
}

static uint32_t
eth_rx_queue_count(struct rte_eth_dev *dev, uint16_t rx_queue_id)
{
        struct vhost_queue *vq;

        vq = dev->data->rx_queues[rx_queue_id];
        if (vq == NULL)
                return 0;

        return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
}

static const struct eth_dev_ops ops = {
        .dev_start = eth_dev_start,
        .dev_stop = eth_dev_stop,
        .dev_close = eth_dev_close,
        .dev_configure = eth_dev_configure,
        .dev_infos_get = eth_dev_info,
        .rx_queue_setup = eth_rx_queue_setup,
        .tx_queue_setup = eth_tx_queue_setup,
        .rx_queue_release = eth_queue_release,
        .tx_queue_release = eth_queue_release,
        .tx_done_cleanup = eth_tx_done_cleanup,
        .link_update = eth_link_update,
        .stats_get = eth_stats_get,
        .stats_reset = eth_stats_reset,
        .xstats_reset = vhost_dev_xstats_reset,
        .xstats_get = vhost_dev_xstats_get,
        .xstats_get_names = vhost_dev_xstats_get_names,
        .rx_queue_intr_enable = eth_rxq_intr_enable,
        .rx_queue_intr_disable = eth_rxq_intr_disable,
};

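/*
 * Allocate and wire up one ethdev for a vhost-user socket: a locally
 * administered MAC derived from the bytes "VHOST" plus the port id,
 * the Rx/Tx burst functions, and the per-device state parsed from the
 * devargs.
 */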
static int
eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
        int16_t queues, const unsigned int numa_node, uint64_t flags,
        uint64_t disable_flags)
{
        const char *name = rte_vdev_device_name(dev);
        struct rte_eth_dev_data *data;
        struct pmd_internal *internal = NULL;
        struct rte_eth_dev *eth_dev = NULL;
        struct rte_ether_addr *eth_addr = NULL;

        VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",
                numa_node);

        /* reserve an ethdev entry */
        eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
        if (eth_dev == NULL)
                goto error;
        data = eth_dev->data;

        eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
        if (eth_addr == NULL)
                goto error;
        data->mac_addrs = eth_addr;
        *eth_addr = base_eth_addr;
        eth_addr->addr_bytes[5] = eth_dev->data->port_id;

        /* now put it all together
         * - store queue data in internal,
         * - point eth_dev_data to internals
         * - and point eth_dev structure to new eth_dev_data structure
         */
        internal = eth_dev->data->dev_private;
        internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
                                                 0, numa_node);
        if (internal->iface_name == NULL)
                goto error;
        strcpy(internal->iface_name, iface_name);

        data->nb_rx_queues = queues;
        data->nb_tx_queues = queues;
        internal->max_queues = queues;
        internal->vid = -1;
        internal->flags = flags;
        internal->disable_flags = disable_flags;
        data->dev_link = pmd_link;
        data->dev_flags = RTE_ETH_DEV_INTR_LSC |
                                RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
        data->promiscuous = 1;
        data->all_multicast = 1;

        eth_dev->dev_ops = &ops;
        eth_dev->rx_queue_count = eth_rx_queue_count;

        /* finally assign rx and tx ops */
        eth_dev->rx_pkt_burst = eth_vhost_rx;
        eth_dev->tx_pkt_burst = eth_vhost_tx;

        rte_eth_dev_probing_finish(eth_dev);
        return 0;

error:
        if (internal)
                rte_free(internal->iface_name);
        rte_eth_dev_release_port(eth_dev);

        return -1;
}

static inline int
open_iface(const char *key __rte_unused, const char *value, void *extra_args)
{
        const char **iface_name = extra_args;

        if (value == NULL)
                return -1;

        *iface_name = value;

        return 0;
}

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
        uint16_t *n = extra_args;

        if (value == NULL || extra_args == NULL)
                return -EINVAL;

        *n = (uint16_t)strtoul(value, NULL, 0);
        if (*n == USHRT_MAX && errno == ERANGE)
                return -1;

        return 0;
}

static int
rte_pmd_vhost_probe(struct rte_vdev_device *dev)
{
        struct rte_kvargs *kvlist = NULL;
        int ret = 0;
        char *iface_name;
        uint16_t queues;
        uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
        uint64_t disable_flags = 0;
        int client_mode = 0;
        int iommu_support = 0;
        int postcopy_support = 0;
        int tso = 0;
        int linear_buf = 0;
        int ext_buf = 0;
        struct rte_eth_dev *eth_dev;
        const char *name = rte_vdev_device_name(dev);

        VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);

        if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
                eth_dev = rte_eth_dev_attach_secondary(name);
                if (!eth_dev) {
                        VHOST_LOG(ERR, "Failed to probe %s\n", name);
                        return -1;
                }
                eth_dev->rx_pkt_burst = eth_vhost_rx;
                eth_dev->tx_pkt_burst = eth_vhost_tx;
                eth_dev->dev_ops = &ops;
                if (dev->device.numa_node == SOCKET_ID_ANY)
                        dev->device.numa_node = rte_socket_id();
                eth_dev->device = &dev->device;
                rte_eth_dev_probing_finish(eth_dev);
                return 0;
        }

        kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);
        if (kvlist == NULL)
                return -1;

        if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
                                         &open_iface, &iface_name);
                if (ret < 0)
                        goto out_free;
        } else {
                ret = -1;
                goto out_free;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
                                         &open_int, &queues);
                if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT) {
                        ret = -1;
                        goto out_free;
                }
        } else {
                queues = 1;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
                                         &open_int, &client_mode);
                if (ret < 0)
                        goto out_free;

                if (client_mode)
                        flags |= RTE_VHOST_USER_CLIENT;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
                                         &open_int, &iommu_support);
                if (ret < 0)
                        goto out_free;

                if (iommu_support)
                        flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
                ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
                                         &open_int, &postcopy_support);
                if (ret < 0)
                        goto out_free;

                if (postcopy_support)
                        flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
                ret = rte_kvargs_process(kvlist,
                                ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
                                &open_int, &tso);
                if (ret < 0)
                        goto out_free;

                if (tso == 0) {
                        disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
                        disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);
                }
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
                ret = rte_kvargs_process(kvlist,
                                ETH_VHOST_LINEAR_BUF,
                                &open_int, &linear_buf);
                if (ret < 0)
                        goto out_free;

                if (linear_buf == 1)
                        flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;
        }

        if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
                ret = rte_kvargs_process(kvlist,
                                ETH_VHOST_EXT_BUF,
                                &open_int, &ext_buf);
                if (ret < 0)
                        goto out_free;

                if (ext_buf == 1)
                        flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;
        }

        if (dev->device.numa_node == SOCKET_ID_ANY)
                dev->device.numa_node = rte_socket_id();

        ret = eth_dev_vhost_create(dev, iface_name, queues,
                                   dev->device.numa_node, flags, disable_flags);
        if (ret == -1)
                VHOST_LOG(ERR, "Failed to create %s\n", name);

out_free:
        rte_kvargs_free(kvlist);
        return ret;
}

static int
rte_pmd_vhost_remove(struct rte_vdev_device *dev)
{
        const char *name;
        struct rte_eth_dev *eth_dev = NULL;

        name = rte_vdev_device_name(dev);
        VHOST_LOG(INFO, "Un-initializing pmd_vhost for %s\n", name);

        /* find an ethdev entry */
        eth_dev = rte_eth_dev_allocated(name);
        if (eth_dev == NULL)
                return 0;

        eth_dev_close(eth_dev);
        rte_eth_dev_release_port(eth_dev);

        return 0;
}

static struct rte_vdev_driver pmd_vhost_drv = {
        .probe = rte_pmd_vhost_probe,
        .remove = rte_pmd_vhost_remove,
};

RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
        "iface=<ifc> "
        "queues=<int> "
        "client=<0|1> "
        "iommu-support=<0|1> "
        "postcopy-support=<0|1> "
        "tso=<0|1> "
        "linear-buffer=<0|1> "
        "ext-buffer=<0|1>");
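
/*
 * Example usage (the socket path is illustrative): create a vhost-user
 * port with testpmd acting as the server side of the socket,
 *
 *   dpdk-testpmd --vdev 'net_vhost0,iface=/tmp/sock0,queues=1' -- -i
 *
 * then point a frontend such as QEMU's vhost-user virtio-net device at
 * /tmp/sock0; new_device() fires once the handshake completes.
 */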