/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016 IGEL Co., Ltd.
 * Copyright(c) 2016-2018 Intel Corporation
 */
#include <ethdev_driver.h>
#include <ethdev_vdev.h>
#include <rte_malloc.h>
#include <rte_memcpy.h>
#include <rte_bus_vdev.h>
#include <rte_kvargs.h>
#include <rte_vhost.h>
#include <rte_spinlock.h>

#include "rte_eth_vhost.h"

RTE_LOG_REGISTER_DEFAULT(vhost_logtype, NOTICE);

#define VHOST_LOG(level, ...) \
    rte_log(RTE_LOG_ ## level, vhost_logtype, __VA_ARGS__)

enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};
#define ETH_VHOST_IFACE_ARG "iface"
#define ETH_VHOST_QUEUES_ARG "queues"
#define ETH_VHOST_CLIENT_ARG "client"
#define ETH_VHOST_IOMMU_SUPPORT "iommu-support"
#define ETH_VHOST_POSTCOPY_SUPPORT "postcopy-support"
#define ETH_VHOST_VIRTIO_NET_F_HOST_TSO "tso"
#define ETH_VHOST_LINEAR_BUF "linear-buffer"
#define ETH_VHOST_EXT_BUF "ext-buffer"
#define VHOST_MAX_PKT_BURST 32
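
/*
 * Illustrative usage of the options above (an added sketch, not part of
 * the driver): the PMD is attached through a vdev argument string, e.g.
 *
 *   dpdk-testpmd -l 0-3 --vdev 'net_vhost0,iface=/tmp/sock0,queues=1,client=0'
 *
 * "iface" is the vhost-user socket path and "queues" the number of
 * queue pairs; the remaining keys toggle the matching vhost-user
 * features.
 */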
static const char *valid_arguments[] = {
    ETH_VHOST_IFACE_ARG,
    ETH_VHOST_QUEUES_ARG,
    ETH_VHOST_CLIENT_ARG,
    ETH_VHOST_IOMMU_SUPPORT,
    ETH_VHOST_POSTCOPY_SUPPORT,
    ETH_VHOST_VIRTIO_NET_F_HOST_TSO,
    ETH_VHOST_LINEAR_BUF,
    ETH_VHOST_EXT_BUF,
    NULL
};
static struct rte_ether_addr base_eth_addr = {
    rte_atomic32_t allow_queuing;
    rte_atomic32_t while_queuing;
    struct pmd_internal *internal;
    struct rte_mempool *mb_pool;
    uint16_t virtqueue_id;
    struct vhost_stats stats;
    rte_spinlock_t intr_lock;

    rte_atomic32_t dev_attached;
    uint64_t disable_flags;
    rte_atomic32_t started;

struct internal_list {
    TAILQ_ENTRY(internal_list) next;
    struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
    TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct rte_eth_link pmd_link = {
    .link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
    .link_status = RTE_ETH_LINK_DOWN
};
struct rte_vhost_vring_state {
    rte_spinlock_t lock;

    bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
    bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
    unsigned int index;
    unsigned int max_vring;
};

static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
vhost_dev_xstats_reset(struct rte_eth_dev *dev)

    struct vhost_queue *vq;

    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        vq = dev->data->rx_queues[i];
        ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);

    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        vq = dev->data->tx_queues[i];
        ret = rte_vhost_vring_stats_reset(vq->vid, vq->virtqueue_id);
vhost_dev_xstats_get_names(struct rte_eth_dev *dev,
        struct rte_eth_xstat_name *xstats_names,

    struct rte_vhost_stat_name *name;
    struct vhost_queue *vq;
    int ret, i, count = 0, nstats = 0;

    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        vq = dev->data->rx_queues[i];
        ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);

    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        vq = dev->data->tx_queues[i];
        ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id, NULL, 0);

    if (!xstats_names || limit < (unsigned int)nstats)

    name = calloc(nstats, sizeof(*name));

    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        vq = dev->data->rx_queues[i];
        ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
                name + count, nstats - count);

    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        vq = dev->data->tx_queues[i];
        ret = rte_vhost_vring_stats_get_names(vq->vid, vq->virtqueue_id,
                name + count, nstats - count);
    /* strlcpy, unlike strncpy, guarantees NUL termination */
    for (i = 0; i < count; i++)
        strlcpy(xstats_names[i].name, name[i].name, RTE_ETH_XSTATS_NAME_SIZE);
vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,

    struct rte_vhost_stat *stats;
    struct vhost_queue *vq;
    int ret, i, count = 0, nstats = 0;

    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        vq = dev->data->rx_queues[i];
        ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);

    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        vq = dev->data->tx_queues[i];
        ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id, NULL, 0);

    if (!xstats || n < (unsigned int)nstats)

    stats = calloc(nstats, sizeof(*stats));

    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        vq = dev->data->rx_queues[i];
        ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
                stats + count, nstats - count);

    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        vq = dev->data->tx_queues[i];
        ret = rte_vhost_vring_stats_get(vq->vid, vq->virtqueue_id,
                stats + count, nstats - count);

    for (i = 0; i < count; i++) {
        xstats[i].id = stats[i].id;
        xstats[i].value = stats[i].value;
eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)

    struct vhost_queue *r = q;
    uint16_t i, nb_rx = 0;
    uint16_t nb_receive = nb_bufs;

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))

    rte_atomic32_set(&r->while_queuing, 1);

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))

    /* Dequeue packets from guest TX queue */

        uint16_t num = (uint16_t)RTE_MIN(nb_receive,
                VHOST_MAX_PKT_BURST);

        nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
                r->mb_pool, &bufs[nb_rx],

        nb_receive -= nb_pkts;

    r->stats.pkts += nb_rx;

    for (i = 0; likely(i < nb_rx); i++) {
        bufs[i]->port = r->port;
        bufs[i]->vlan_tci = 0;

        if (r->internal->vlan_strip)
            rte_vlan_strip(bufs[i]);

        r->stats.bytes += bufs[i]->pkt_len;

    rte_atomic32_set(&r->while_queuing, 0);
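
/*
 * Added sketch: an application drives the RX path above through the
 * generic ethdev burst API; "port_id" and the drop-only processing are
 * illustrative assumptions.
 *
 *   struct rte_mbuf *pkts[VHOST_MAX_PKT_BURST];
 *   uint16_t n = rte_eth_rx_burst(port_id, 0, pkts, VHOST_MAX_PKT_BURST);
 *   for (uint16_t k = 0; k < n; k++)
 *       rte_pktmbuf_free(pkts[k]);
 */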
eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)

    struct vhost_queue *r = q;
    uint16_t i, nb_tx = 0;
    uint16_t nb_send = 0;
    uint64_t nb_bytes = 0;
    uint64_t nb_missed = 0;

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))

    rte_atomic32_set(&r->while_queuing, 1);

    if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))

    for (i = 0; i < nb_bufs; i++) {
        struct rte_mbuf *m = bufs[i];

        /* Do VLAN tag insertion */
        if (m->ol_flags & RTE_MBUF_F_TX_VLAN) {
            int error = rte_vlan_insert(&m);
            if (unlikely(error)) {

    /* Enqueue packets to guest RX queue */

        uint16_t num = (uint16_t)RTE_MIN(nb_send,
                VHOST_MAX_PKT_BURST);

        nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,

    for (i = 0; likely(i < nb_tx); i++)
        nb_bytes += bufs[i]->pkt_len;

    nb_missed = nb_bufs - nb_tx;

    r->stats.pkts += nb_tx;
    r->stats.bytes += nb_bytes;
    r->stats.missed_pkts += nb_missed;

    for (i = 0; likely(i < nb_tx); i++)
        rte_pktmbuf_free(bufs[i]);

    rte_atomic32_set(&r->while_queuing, 0);
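
/*
 * Added sketch: eth_vhost_tx() copies accepted packets into the guest
 * ring and frees those mbufs itself, so a caller follows the usual
 * tx_burst contract and only handles the rejected tail ("port_id" and
 * "n" are illustrative):
 *
 *   uint16_t sent = rte_eth_tx_burst(port_id, 0, bufs, n);
 *   for (uint16_t k = sent; k < n; k++)
 *       rte_pktmbuf_free(bufs[k]);  // or retry later
 */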
static inline struct internal_list *
find_internal_resource(char *ifname)

    struct internal_list *list;
    struct pmd_internal *internal;

    pthread_mutex_lock(&internal_list_lock);

    TAILQ_FOREACH(list, &internal_list, next) {
        internal = list->eth_dev->data->dev_private;
        if (!strcmp(internal->iface_name, ifname)) {

    pthread_mutex_unlock(&internal_list_lock);
eth_vhost_update_intr(struct rte_eth_dev *eth_dev, uint16_t rxq_idx)

    struct rte_intr_handle *handle = eth_dev->intr_handle;
    struct rte_epoll_event rev, *elist;

    elist = rte_intr_elist_index_get(handle, rxq_idx);
    if (rte_intr_efds_index_get(handle, rxq_idx) == elist->fd)

    VHOST_LOG(INFO, "kickfd for rxq-%d was changed, updating handler.\n",

        VHOST_LOG(ERR, "Unexpected previous kickfd value (Got %d, expected -1).\n",

    /*
     * First remove the invalid epoll event, and then install
     * the new one. May be solved with a proper API in the
     * future.
     */
    ret = rte_epoll_ctl(epfd, EPOLL_CTL_DEL, rev.fd,
        VHOST_LOG(ERR, "Delete epoll event failed.\n");

    rev.fd = rte_intr_efds_index_get(handle, rxq_idx);
    if (rte_intr_elist_index_set(handle, rxq_idx, rev))

    elist = rte_intr_elist_index_get(handle, rxq_idx);
    ret = rte_epoll_ctl(epfd, EPOLL_CTL_ADD, rev.fd, elist);
        VHOST_LOG(ERR, "Add epoll event failed.\n");
eth_rxq_intr_enable(struct rte_eth_dev *dev, uint16_t qid)

    struct rte_vhost_vring vring;
    struct vhost_queue *vq;
    int old_intr_enable, ret = 0;

    vq = dev->data->rx_queues[qid];
        VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);

    rte_spinlock_lock(&vq->intr_lock);
    old_intr_enable = vq->intr_enable;
    vq->intr_enable = 1;
    ret = eth_vhost_update_intr(dev, qid);
    rte_spinlock_unlock(&vq->intr_lock);

        VHOST_LOG(ERR, "Failed to update rxq%d's intr\n", qid);
        vq->intr_enable = old_intr_enable;

    ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
        VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);

    VHOST_LOG(INFO, "Enable interrupt for rxq%d\n", qid);
    rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 1);

eth_rxq_intr_disable(struct rte_eth_dev *dev, uint16_t qid)

    struct rte_vhost_vring vring;
    struct vhost_queue *vq;

    vq = dev->data->rx_queues[qid];
        VHOST_LOG(ERR, "rxq%d is not setup yet\n", qid);

    ret = rte_vhost_get_vhost_vring(vq->vid, (qid << 1) + 1, &vring);
        VHOST_LOG(ERR, "Failed to get rxq%d's vring\n", qid);

    VHOST_LOG(INFO, "Disable interrupt for rxq%d\n", qid);
    rte_vhost_enable_guest_notification(vq->vid, (qid << 1) + 1, 0);
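
/*
 * Added application-side sketch for the two handlers above (assumes
 * the port was configured with intr_conf.rxq = 1; "port_id" and "qid"
 * are illustrative): register the queue with the per-thread epoll set,
 * arm the interrupt, block, then disarm before polling again.
 *
 *   struct rte_epoll_event ev;
 *   rte_eth_dev_rx_intr_ctl_q(port_id, qid, RTE_EPOLL_PER_THREAD,
 *           RTE_INTR_EVENT_ADD, NULL);
 *   rte_eth_dev_rx_intr_enable(port_id, qid);
 *   rte_epoll_wait(RTE_EPOLL_PER_THREAD, &ev, 1, -1);
 *   rte_eth_dev_rx_intr_disable(port_id, qid);
 */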
eth_vhost_uninstall_intr(struct rte_eth_dev *dev)

    struct rte_intr_handle *intr_handle = dev->intr_handle;

    if (intr_handle != NULL) {
        rte_intr_vec_list_free(intr_handle);
        rte_intr_instance_free(intr_handle);

    dev->intr_handle = NULL;

eth_vhost_install_intr(struct rte_eth_dev *dev)

    struct rte_vhost_vring vring;
    struct vhost_queue *vq;
    int nb_rxq = dev->data->nb_rx_queues;

    /* uninstall first if we are reconnecting */
    if (dev->intr_handle != NULL)
        eth_vhost_uninstall_intr(dev);

    dev->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_PRIVATE);
    if (dev->intr_handle == NULL) {
        VHOST_LOG(ERR, "Failed to allocate intr_handle\n");

    if (rte_intr_efd_counter_size_set(dev->intr_handle, sizeof(uint64_t)))

    if (rte_intr_vec_list_alloc(dev->intr_handle, NULL, nb_rxq)) {
            "Failed to allocate memory for interrupt vector\n");
        rte_intr_instance_free(dev->intr_handle);

    VHOST_LOG(INFO, "Prepare intr vec\n");
    for (i = 0; i < nb_rxq; i++) {
        if (rte_intr_vec_list_index_set(dev->intr_handle, i, RTE_INTR_VEC_RXTX_OFFSET + i))
        if (rte_intr_efds_index_set(dev->intr_handle, i, -1))
        vq = dev->data->rx_queues[i];
            VHOST_LOG(INFO, "rxq-%d not setup yet, skip!\n", i);

        ret = rte_vhost_get_vhost_vring(vq->vid, (i << 1) + 1, &vring);
                "Failed to get rxq-%d's vring, skip!\n", i);

        if (vring.kickfd < 0) {
                "rxq-%d's kickfd is invalid, skip!\n", i);

        if (rte_intr_efds_index_set(dev->intr_handle, i, vring.kickfd))
        VHOST_LOG(INFO, "Installed intr vec for rxq-%d\n", i);

    if (rte_intr_nb_efd_set(dev->intr_handle, nb_rxq))

    if (rte_intr_max_intr_set(dev->intr_handle, nb_rxq + 1))

    if (rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_VDEV))
update_queuing_status(struct rte_eth_dev *dev)

    struct pmd_internal *internal = dev->data->dev_private;
    struct vhost_queue *vq;
    struct rte_vhost_vring_state *state;
    int allow_queuing = 1;

    if (!dev->data->rx_queues || !dev->data->tx_queues)

    if (rte_atomic32_read(&internal->started) == 0 ||
            rte_atomic32_read(&internal->dev_attached) == 0)

    state = vring_states[dev->data->port_id];

    /* Wait until rx/tx_pkt_burst stops accessing vhost device */
    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        vq = dev->data->rx_queues[i];

        if (allow_queuing && state->cur[vq->virtqueue_id])
            rte_atomic32_set(&vq->allow_queuing, 1);
        else
            rte_atomic32_set(&vq->allow_queuing, 0);
        while (rte_atomic32_read(&vq->while_queuing))

    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        vq = dev->data->tx_queues[i];

        if (allow_queuing && state->cur[vq->virtqueue_id])
            rte_atomic32_set(&vq->allow_queuing, 1);
        else
            rte_atomic32_set(&vq->allow_queuing, 0);
        while (rte_atomic32_read(&vq->while_queuing))
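
/*
 * Added note: the burst paths and this control function synchronize
 * through the allow_queuing/while_queuing pair. A burst function
 * publishes while_queuing for the duration of its vhost access and
 * re-checks allow_queuing after doing so; the control path clears
 * allow_queuing and then spins until while_queuing drops to zero, so
 * no datapath thread still references the vhost device when it is
 * reconfigured or detached.
 */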
queue_setup(struct rte_eth_dev *eth_dev, struct pmd_internal *internal)

    struct vhost_queue *vq;

    for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
        vq = eth_dev->data->rx_queues[i];

        vq->vid = internal->vid;
        vq->internal = internal;
        vq->port = eth_dev->data->port_id;

    for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
        vq = eth_dev->data->tx_queues[i];

        vq->vid = internal->vid;
        vq->internal = internal;
        vq->port = eth_dev->data->port_id;
    struct rte_eth_dev *eth_dev;
    struct internal_list *list;
    struct pmd_internal *internal;
    struct rte_eth_conf *dev_conf;
    char ifname[PATH_MAX];
#ifdef RTE_LIBRTE_VHOST_NUMA
    int newnode;
#endif

    rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
    list = find_internal_resource(ifname);
        VHOST_LOG(INFO, "Invalid device name: %s\n", ifname);

    eth_dev = list->eth_dev;
    internal = eth_dev->data->dev_private;
    dev_conf = &eth_dev->data->dev_conf;

#ifdef RTE_LIBRTE_VHOST_NUMA
    newnode = rte_vhost_get_numa_node(vid);
    if (newnode >= 0)
        eth_dev->data->numa_node = newnode;
#endif

    if (rte_atomic32_read(&internal->started) == 1) {
        queue_setup(eth_dev, internal);

        if (dev_conf->intr_conf.rxq) {
            if (eth_vhost_install_intr(eth_dev) < 0) {
                    "Failed to install interrupt handler.");

        VHOST_LOG(INFO, "RX/TX queues do not exist yet\n");

    for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
        rte_vhost_enable_guest_notification(vid, i, 0);

    rte_vhost_get_mtu(vid, &eth_dev->data->mtu);

    eth_dev->data->dev_link.link_status = RTE_ETH_LINK_UP;

    rte_atomic32_set(&internal->dev_attached, 1);
    update_queuing_status(eth_dev);

    VHOST_LOG(INFO, "Vhost device %d created\n", vid);

    rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
destroy_device(int vid)

    struct rte_eth_dev *eth_dev;
    struct pmd_internal *internal;
    struct vhost_queue *vq;
    struct internal_list *list;
    char ifname[PATH_MAX];
    struct rte_vhost_vring_state *state;

    rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
    list = find_internal_resource(ifname);
        VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);

    eth_dev = list->eth_dev;
    internal = eth_dev->data->dev_private;

    rte_atomic32_set(&internal->dev_attached, 0);
    update_queuing_status(eth_dev);

    eth_dev->data->dev_link.link_status = RTE_ETH_LINK_DOWN;

    if (eth_dev->data->rx_queues && eth_dev->data->tx_queues) {
        for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
            vq = eth_dev->data->rx_queues[i];

        for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
            vq = eth_dev->data->tx_queues[i];

    state = vring_states[eth_dev->data->port_id];
    rte_spinlock_lock(&state->lock);
    for (i = 0; i <= state->max_vring; i++) {
        state->cur[i] = false;
        state->seen[i] = false;
    }
    state->max_vring = 0;
    rte_spinlock_unlock(&state->lock);

    VHOST_LOG(INFO, "Vhost device %d destroyed\n", vid);
    eth_vhost_uninstall_intr(eth_dev);

    rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
vring_conf_update(int vid, struct rte_eth_dev *eth_dev, uint16_t vring_id)

    struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;
    struct pmd_internal *internal = eth_dev->data->dev_private;
    struct vhost_queue *vq;
    struct rte_vhost_vring vring;
    int rx_idx = vring_id % 2 ? (vring_id - 1) >> 1 : -1;

    /*
     * The vring kickfd may be changed after the new device notification.
     * Update it when the vring state is updated.
     */
    if (rx_idx >= 0 && rx_idx < eth_dev->data->nb_rx_queues &&
            rte_atomic32_read(&internal->dev_attached) &&
            rte_atomic32_read(&internal->started) &&
            dev_conf->intr_conf.rxq) {
        ret = rte_vhost_get_vhost_vring(vid, vring_id, &vring);
            VHOST_LOG(ERR, "Failed to get vring %d information.\n",

        if (rte_intr_efds_index_set(eth_dev->intr_handle, rx_idx,

        vq = eth_dev->data->rx_queues[rx_idx];
            VHOST_LOG(ERR, "rxq%d is not setup yet\n", rx_idx);

        rte_spinlock_lock(&vq->intr_lock);
        ret = eth_vhost_update_intr(eth_dev, rx_idx);
        rte_spinlock_unlock(&vq->intr_lock);
vring_state_changed(int vid, uint16_t vring, int enable)

    struct rte_vhost_vring_state *state;
    struct rte_eth_dev *eth_dev;
    struct internal_list *list;
    char ifname[PATH_MAX];

    rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
    list = find_internal_resource(ifname);
        VHOST_LOG(ERR, "Invalid interface name: %s\n", ifname);

    eth_dev = list->eth_dev;

    state = vring_states[eth_dev->data->port_id];

    if (enable && vring_conf_update(vid, eth_dev, vring))
        VHOST_LOG(INFO, "Failed to update vring-%d configuration.\n",

    rte_spinlock_lock(&state->lock);
    if (state->cur[vring] == enable) {
        rte_spinlock_unlock(&state->lock);

    state->cur[vring] = enable;
    state->max_vring = RTE_MAX(vring, state->max_vring);
    rte_spinlock_unlock(&state->lock);

    update_queuing_status(eth_dev);

    VHOST_LOG(INFO, "vring%u is %s\n",
            vring, enable ? "enabled" : "disabled");

    rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);
static struct rte_vhost_device_ops vhost_ops = {
    .new_device = new_device,
    .destroy_device = destroy_device,
    .vring_state_changed = vring_state_changed,
};
vhost_driver_setup(struct rte_eth_dev *eth_dev)

    struct pmd_internal *internal = eth_dev->data->dev_private;
    struct internal_list *list = NULL;
    struct rte_vhost_vring_state *vring_state = NULL;
    unsigned int numa_node = eth_dev->device->numa_node;
    const char *name = eth_dev->device->name;

    /* Don't try to set up again if it has already been done. */
    list = find_internal_resource(internal->iface_name);

    list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);

    vring_state = rte_zmalloc_socket(name, sizeof(*vring_state),
    if (vring_state == NULL)

    list->eth_dev = eth_dev;
    pthread_mutex_lock(&internal_list_lock);
    TAILQ_INSERT_TAIL(&internal_list, list, next);
    pthread_mutex_unlock(&internal_list_lock);

    rte_spinlock_init(&vring_state->lock);
    vring_states[eth_dev->data->port_id] = vring_state;

    if (rte_vhost_driver_register(internal->iface_name, internal->flags))

    if (internal->disable_flags) {
        if (rte_vhost_driver_disable_features(internal->iface_name,
                internal->disable_flags))

    if (rte_vhost_driver_callback_register(internal->iface_name,
        VHOST_LOG(ERR, "Can't register callbacks\n");

    if (rte_vhost_driver_start(internal->iface_name) < 0) {
        VHOST_LOG(ERR, "Failed to start driver for %s\n",
                internal->iface_name);

    rte_vhost_driver_unregister(internal->iface_name);

    vring_states[eth_dev->data->port_id] = NULL;
    pthread_mutex_lock(&internal_list_lock);
    TAILQ_REMOVE(&internal_list, list, next);
    pthread_mutex_unlock(&internal_list_lock);
    rte_free(vring_state);
rte_eth_vhost_get_queue_event(uint16_t port_id,
        struct rte_eth_vhost_queue_event *event)

    struct rte_vhost_vring_state *state;

    if (port_id >= RTE_MAX_ETHPORTS) {
        VHOST_LOG(ERR, "Invalid port id\n");

    state = vring_states[port_id];
        VHOST_LOG(ERR, "Unused port\n");

    rte_spinlock_lock(&state->lock);
    for (i = 0; i <= state->max_vring; i++) {
        idx = state->index++ % (state->max_vring + 1);

        if (state->cur[idx] != state->seen[idx]) {
            state->seen[idx] = state->cur[idx];
            event->queue_id = idx / 2;
            event->enable = state->cur[idx];
            rte_spinlock_unlock(&state->lock);

    rte_spinlock_unlock(&state->lock);
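
/*
 * Added sketch: draining queue-state events from an application,
 * typically inside an RTE_ETH_EVENT_QUEUE_STATE callback ("port_id"
 * is illustrative):
 *
 *   struct rte_eth_vhost_queue_event ev;
 *   while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *       printf("queue %u %s %s\n", ev.queue_id,
 *               ev.rx ? "rx" : "tx",
 *               ev.enable ? "enabled" : "disabled");
 */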
rte_eth_vhost_get_vid_from_port_id(uint16_t port_id)

    struct internal_list *list;
    struct rte_eth_dev *eth_dev;
    struct vhost_queue *vq;

    if (!rte_eth_dev_is_valid_port(port_id))

    pthread_mutex_lock(&internal_list_lock);

    TAILQ_FOREACH(list, &internal_list, next) {
        eth_dev = list->eth_dev;
        if (eth_dev->data->port_id == port_id) {
            vq = eth_dev->data->rx_queues[0];

    pthread_mutex_unlock(&internal_list_lock);
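
/*
 * Added sketch: the returned vid plugs straight into the librte_vhost
 * API ("port_id" is illustrative):
 *
 *   int vid = rte_eth_vhost_get_vid_from_port_id(port_id);
 *   if (vid >= 0)
 *       printf("%u vrings\n", rte_vhost_get_vring_num(vid));
 */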
eth_dev_configure(struct rte_eth_dev *dev)

    struct pmd_internal *internal = dev->data->dev_private;
    const struct rte_eth_rxmode *rxmode = &dev->data->dev_conf.rxmode;

    /* NOTE: the same process has to operate a vhost interface
     * from beginning to end (from eth_dev configure to eth_dev close).
     * It is the user's responsibility at the moment.
     */
    if (vhost_driver_setup(dev) < 0)

    internal->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
eth_dev_start(struct rte_eth_dev *eth_dev)

    struct pmd_internal *internal = eth_dev->data->dev_private;
    struct rte_eth_conf *dev_conf = &eth_dev->data->dev_conf;

    queue_setup(eth_dev, internal);

    if (rte_atomic32_read(&internal->dev_attached) == 1) {
        if (dev_conf->intr_conf.rxq) {
            if (eth_vhost_install_intr(eth_dev) < 0) {
                    "Failed to install interrupt handler.");

    rte_atomic32_set(&internal->started, 1);
    update_queuing_status(eth_dev);
eth_dev_stop(struct rte_eth_dev *dev)

    struct pmd_internal *internal = dev->data->dev_private;

    dev->data->dev_started = 0;
    rte_atomic32_set(&internal->started, 0);
    update_queuing_status(dev);
eth_dev_close(struct rte_eth_dev *dev)

    struct pmd_internal *internal;
    struct internal_list *list;
    unsigned int i, ret;

    if (rte_eal_process_type() != RTE_PROC_PRIMARY)

    internal = dev->data->dev_private;

    ret = eth_dev_stop(dev);

    list = find_internal_resource(internal->iface_name);

    rte_vhost_driver_unregister(internal->iface_name);
    pthread_mutex_lock(&internal_list_lock);
    TAILQ_REMOVE(&internal_list, list, next);
    pthread_mutex_unlock(&internal_list_lock);

    if (dev->data->rx_queues)
        for (i = 0; i < dev->data->nb_rx_queues; i++)
            rte_free(dev->data->rx_queues[i]);

    if (dev->data->tx_queues)
        for (i = 0; i < dev->data->nb_tx_queues; i++)
            rte_free(dev->data->tx_queues[i]);

    rte_free(internal->iface_name);

    dev->data->dev_private = NULL;

    rte_free(vring_states[dev->data->port_id]);
    vring_states[dev->data->port_id] = NULL;
eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
        uint16_t nb_rx_desc __rte_unused,
        unsigned int socket_id,
        const struct rte_eth_rxconf *rx_conf __rte_unused,
        struct rte_mempool *mb_pool)

    struct vhost_queue *vq;

    vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
            RTE_CACHE_LINE_SIZE, socket_id);
        VHOST_LOG(ERR, "Failed to allocate memory for rx queue\n");

    vq->mb_pool = mb_pool;
    vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
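    /*
     * Added note: virtqueue numbering interleaves per queue pair, so
     * ethdev rxq N maps to guest TX ring N * VIRTIO_QNUM + VIRTIO_TXQ
     * (rxq 0 -> virtqueue 1), and eth_tx_queue_setup() below maps
     * txq N to guest RX ring N * VIRTIO_QNUM + VIRTIO_RXQ
     * (txq 0 -> virtqueue 0): the host receives what the guest sends.
     */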
    rte_spinlock_init(&vq->intr_lock);
    dev->data->rx_queues[rx_queue_id] = vq;
eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
        uint16_t nb_tx_desc __rte_unused,
        unsigned int socket_id,
        const struct rte_eth_txconf *tx_conf __rte_unused)

    struct vhost_queue *vq;

    vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
            RTE_CACHE_LINE_SIZE, socket_id);
        VHOST_LOG(ERR, "Failed to allocate memory for tx queue\n");

    vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
    rte_spinlock_init(&vq->intr_lock);
    dev->data->tx_queues[tx_queue_id] = vq;
eth_dev_info(struct rte_eth_dev *dev,
        struct rte_eth_dev_info *dev_info)

    struct pmd_internal *internal;

    internal = dev->data->dev_private;
    if (internal == NULL) {
        VHOST_LOG(ERR, "Invalid device specified\n");

    dev_info->max_mac_addrs = 1;
    dev_info->max_rx_pktlen = (uint32_t)-1;
    dev_info->max_rx_queues = internal->max_queues;
    dev_info->max_tx_queues = internal->max_queues;
    dev_info->min_rx_bufsize = 0;

    dev_info->tx_offload_capa = RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
            RTE_ETH_TX_OFFLOAD_VLAN_INSERT;
    dev_info->rx_offload_capa = RTE_ETH_RX_OFFLOAD_VLAN_STRIP;
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)

    unsigned long rx_total = 0, tx_total = 0;
    unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
    struct vhost_queue *vq;

    for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
            i < dev->data->nb_rx_queues; i++) {
        if (dev->data->rx_queues[i] == NULL)

        vq = dev->data->rx_queues[i];
        stats->q_ipackets[i] = vq->stats.pkts;
        rx_total += stats->q_ipackets[i];

        stats->q_ibytes[i] = vq->stats.bytes;
        rx_total_bytes += stats->q_ibytes[i];

    for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
            i < dev->data->nb_tx_queues; i++) {
        if (dev->data->tx_queues[i] == NULL)

        vq = dev->data->tx_queues[i];
        stats->q_opackets[i] = vq->stats.pkts;
        tx_total += stats->q_opackets[i];

        stats->q_obytes[i] = vq->stats.bytes;
        tx_total_bytes += stats->q_obytes[i];

    stats->ipackets = rx_total;
    stats->opackets = tx_total;
    stats->ibytes = rx_total_bytes;
    stats->obytes = tx_total_bytes;
eth_stats_reset(struct rte_eth_dev *dev)

    struct vhost_queue *vq;

    for (i = 0; i < dev->data->nb_rx_queues; i++) {
        if (dev->data->rx_queues[i] == NULL)

        vq = dev->data->rx_queues[i];
        vq->stats.bytes = 0;

    for (i = 0; i < dev->data->nb_tx_queues; i++) {
        if (dev->data->tx_queues[i] == NULL)

        vq = dev->data->tx_queues[i];
        vq->stats.bytes = 0;
        vq->stats.missed_pkts = 0;
eth_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)

    rte_free(dev->data->rx_queues[qid]);

eth_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)

    rte_free(dev->data->tx_queues[qid]);
eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)

    /*
     * vHost does not hang onto mbufs: eth_vhost_tx() copies packet
     * data into the guest ring and frees the mbufs, so there is
     * nothing to clean up.
     */

eth_link_update(struct rte_eth_dev *dev __rte_unused,
        int wait_to_complete __rte_unused)
eth_rx_queue_count(void *rx_queue)

    struct vhost_queue *vq;

    return rte_vhost_rx_queue_count(vq->vid, vq->virtqueue_id);
#define CLB_VAL_IDX 0
#define CLB_MSK_IDX 1
#define CLB_MATCH_IDX 2

vhost_monitor_callback(const uint64_t value,
        const uint64_t opaque[RTE_POWER_MONITOR_OPAQUE_SZ])

    const uint64_t m = opaque[CLB_MSK_IDX];
    const uint64_t v = opaque[CLB_VAL_IDX];
    const uint64_t c = opaque[CLB_MATCH_IDX];

    if (c)
        return (value & m) == v ? -1 : 0;
    else
        return (value & m) == v ? 0 : -1;
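
/*
 * Added note: per the rte_power_monitor() callback convention, a
 * non-zero return aborts entering the power-optimized state and 0
 * permits it; with the match flag set, equality under the mask aborts
 * the sleep, otherwise inequality does.
 */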
vhost_get_monitor_addr(void *rx_queue, struct rte_power_monitor_cond *pmc)

    struct vhost_queue *vq = rx_queue;
    struct rte_vhost_power_monitor_cond vhost_pmc;

    ret = rte_vhost_get_monitor_addr(vq->vid, vq->virtqueue_id,

    pmc->addr = vhost_pmc.addr;
    pmc->opaque[CLB_VAL_IDX] = vhost_pmc.val;
    pmc->opaque[CLB_MSK_IDX] = vhost_pmc.mask;
    pmc->opaque[CLB_MATCH_IDX] = vhost_pmc.match;
    pmc->size = vhost_pmc.size;
    pmc->fn = vhost_monitor_callback;
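
/*
 * Added sketch: an application reaches this handler via the generic
 * ethdev power-management API and then arms the monitor ("port_id",
 * "qid" and the wakeup deadline are illustrative):
 *
 *   struct rte_power_monitor_cond pmc;
 *   if (rte_eth_get_monitor_addr(port_id, qid, &pmc) == 0)
 *       rte_power_monitor(&pmc, rte_get_tsc_cycles() + deadline);
 */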
static const struct eth_dev_ops ops = {
    .dev_start = eth_dev_start,
    .dev_stop = eth_dev_stop,
    .dev_close = eth_dev_close,
    .dev_configure = eth_dev_configure,
    .dev_infos_get = eth_dev_info,
    .rx_queue_setup = eth_rx_queue_setup,
    .tx_queue_setup = eth_tx_queue_setup,
    .rx_queue_release = eth_rx_queue_release,
    .tx_queue_release = eth_tx_queue_release,
    .tx_done_cleanup = eth_tx_done_cleanup,
    .link_update = eth_link_update,
    .stats_get = eth_stats_get,
    .stats_reset = eth_stats_reset,
    .xstats_reset = vhost_dev_xstats_reset,
    .xstats_get = vhost_dev_xstats_get,
    .xstats_get_names = vhost_dev_xstats_get_names,
    .rx_queue_intr_enable = eth_rxq_intr_enable,
    .rx_queue_intr_disable = eth_rxq_intr_disable,
    .get_monitor_addr = vhost_get_monitor_addr,
};
eth_dev_vhost_create(struct rte_vdev_device *dev, char *iface_name,
        int16_t queues, const unsigned int numa_node, uint64_t flags,
        uint64_t disable_flags)

    const char *name = rte_vdev_device_name(dev);
    struct rte_eth_dev_data *data;
    struct pmd_internal *internal = NULL;
    struct rte_eth_dev *eth_dev = NULL;
    struct rte_ether_addr *eth_addr = NULL;

    VHOST_LOG(INFO, "Creating VHOST-USER backend on numa socket %u\n",

    /* reserve an ethdev entry */
    eth_dev = rte_eth_vdev_allocate(dev, sizeof(*internal));
    if (eth_dev == NULL)

    data = eth_dev->data;

    eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
    if (eth_addr == NULL)

    data->mac_addrs = eth_addr;
    *eth_addr = base_eth_addr;
    eth_addr->addr_bytes[5] = eth_dev->data->port_id;

    /* now put it all together:
     * - store queue data in internal,
     * - point eth_dev_data to internal,
     * - and point the eth_dev structure to the new eth_dev_data structure
     */
    internal = eth_dev->data->dev_private;
    internal->iface_name = rte_malloc_socket(name, strlen(iface_name) + 1,
    if (internal->iface_name == NULL)

    strcpy(internal->iface_name, iface_name);

    data->nb_rx_queues = queues;
    data->nb_tx_queues = queues;
    internal->max_queues = queues;

    internal->flags = flags;
    internal->disable_flags = disable_flags;
    data->dev_link = pmd_link;
    data->dev_flags = RTE_ETH_DEV_INTR_LSC |
            RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;
    data->promiscuous = 1;
    data->all_multicast = 1;

    eth_dev->dev_ops = &ops;
    eth_dev->rx_queue_count = eth_rx_queue_count;

    /* finally assign rx and tx ops */
    eth_dev->rx_pkt_burst = eth_vhost_rx;
    eth_dev->tx_pkt_burst = eth_vhost_tx;

    rte_eth_dev_probing_finish(eth_dev);

    rte_free(internal->iface_name);
    rte_eth_dev_release_port(eth_dev);
open_iface(const char *key __rte_unused, const char *value, void *extra_args)

    const char **iface_name = extra_args;

    *iface_name = value;

open_int(const char *key __rte_unused, const char *value, void *extra_args)

    uint16_t *n = extra_args;

    if (value == NULL || extra_args == NULL)

    *n = (uint16_t)strtoul(value, NULL, 0);
    if (*n == USHRT_MAX && errno == ERANGE)
rte_pmd_vhost_probe(struct rte_vdev_device *dev)

    struct rte_kvargs *kvlist = NULL;

    uint64_t flags = RTE_VHOST_USER_NET_STATS_ENABLE;
    uint64_t disable_flags = 0;
    int client_mode = 0;
    int iommu_support = 0;
    int postcopy_support = 0;

    struct rte_eth_dev *eth_dev;
    const char *name = rte_vdev_device_name(dev);

    VHOST_LOG(INFO, "Initializing pmd_vhost for %s\n", name);

    if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
        eth_dev = rte_eth_dev_attach_secondary(name);
            VHOST_LOG(ERR, "Failed to probe %s\n", name);

        eth_dev->rx_pkt_burst = eth_vhost_rx;
        eth_dev->tx_pkt_burst = eth_vhost_tx;
        eth_dev->dev_ops = &ops;
        if (dev->device.numa_node == SOCKET_ID_ANY)
            dev->device.numa_node = rte_socket_id();
        eth_dev->device = &dev->device;
        rte_eth_dev_probing_finish(eth_dev);

    kvlist = rte_kvargs_parse(rte_vdev_device_args(dev), valid_arguments);

    if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
        ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
                &open_iface, &iface_name);

    if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
        ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
                &open_int, &queues);
        if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)

    if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
        ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
                &open_int, &client_mode);

            flags |= RTE_VHOST_USER_CLIENT;

    if (rte_kvargs_count(kvlist, ETH_VHOST_IOMMU_SUPPORT) == 1) {
        ret = rte_kvargs_process(kvlist, ETH_VHOST_IOMMU_SUPPORT,
                &open_int, &iommu_support);

            flags |= RTE_VHOST_USER_IOMMU_SUPPORT;

    if (rte_kvargs_count(kvlist, ETH_VHOST_POSTCOPY_SUPPORT) == 1) {
        ret = rte_kvargs_process(kvlist, ETH_VHOST_POSTCOPY_SUPPORT,
                &open_int, &postcopy_support);

        if (postcopy_support)
            flags |= RTE_VHOST_USER_POSTCOPY_SUPPORT;

    if (rte_kvargs_count(kvlist, ETH_VHOST_VIRTIO_NET_F_HOST_TSO) == 1) {
        ret = rte_kvargs_process(kvlist,
                ETH_VHOST_VIRTIO_NET_F_HOST_TSO,

            disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO4);
            disable_flags |= (1ULL << VIRTIO_NET_F_HOST_TSO6);

    if (rte_kvargs_count(kvlist, ETH_VHOST_LINEAR_BUF) == 1) {
        ret = rte_kvargs_process(kvlist,
                ETH_VHOST_LINEAR_BUF,
                &open_int, &linear_buf);

        if (linear_buf == 1)
            flags |= RTE_VHOST_USER_LINEARBUF_SUPPORT;

    if (rte_kvargs_count(kvlist, ETH_VHOST_EXT_BUF) == 1) {
        ret = rte_kvargs_process(kvlist,
                ETH_VHOST_EXT_BUF,
                &open_int, &ext_buf);

            flags |= RTE_VHOST_USER_EXTBUF_SUPPORT;

    if (dev->device.numa_node == SOCKET_ID_ANY)
        dev->device.numa_node = rte_socket_id();

    ret = eth_dev_vhost_create(dev, iface_name, queues,
            dev->device.numa_node, flags, disable_flags);
        VHOST_LOG(ERR, "Failed to create %s\n", name);

    rte_kvargs_free(kvlist);
rte_pmd_vhost_remove(struct rte_vdev_device *dev)

    struct rte_eth_dev *eth_dev = NULL;

    name = rte_vdev_device_name(dev);
    VHOST_LOG(INFO, "Un-Initializing pmd_vhost for %s\n", name);

    /* find an ethdev entry */
    eth_dev = rte_eth_dev_allocated(name);
    if (eth_dev == NULL)

    eth_dev_close(eth_dev);
    rte_eth_dev_release_port(eth_dev);
static struct rte_vdev_driver pmd_vhost_drv = {
    .probe = rte_pmd_vhost_probe,
    .remove = rte_pmd_vhost_remove,
};

RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
    "iface=<ifc> "
    "queues=<int> "
    "client=<0|1> "
    "iommu-support=<0|1> "
    "postcopy-support=<0|1> "
    "tso=<0|1> "
    "linear-buffer=<0|1> "
    "ext-buffer=<0|1>");