/*-
 *   BSD LICENSE
 *
 *   Copyright (c) 2016 IGEL Co., Ltd.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of IGEL Co.,Ltd. nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <unistd.h>
#include <pthread.h>
#include <stdbool.h>

#include <rte_mbuf.h>
#include <rte_ethdev.h>
#include <rte_malloc.h>
#include <rte_memcpy.h>
#include <rte_vdev.h>
#include <rte_kvargs.h>
#include <rte_vhost.h>
#include <rte_spinlock.h>

#include "rte_eth_vhost.h"
enum {VIRTIO_RXQ, VIRTIO_TXQ, VIRTIO_QNUM};

#define ETH_VHOST_IFACE_ARG		"iface"
#define ETH_VHOST_QUEUES_ARG		"queues"
#define ETH_VHOST_CLIENT_ARG		"client"
#define ETH_VHOST_DEQUEUE_ZERO_COPY	"dequeue-zero-copy"
#define VHOST_MAX_PKT_BURST 32

static const char *valid_arguments[] = {
	ETH_VHOST_IFACE_ARG,
	ETH_VHOST_QUEUES_ARG,
	ETH_VHOST_CLIENT_ARG,
	ETH_VHOST_DEQUEUE_ZERO_COPY,
	NULL
};
/* Base MAC for created ports; byte 5 is overwritten with the port id */
static struct ether_addr base_eth_addr = {
	.addr_bytes = {
		0x56 /* V */,
		0x48 /* H */,
		0x4F /* O */,
		0x53 /* S */,
		0x54 /* T */,
		0x00
	}
};
enum vhost_xstats_pkts {
	VHOST_UNDERSIZE_PKT = 0,
	VHOST_64_PKT,
	VHOST_65_TO_127_PKT,
	VHOST_128_TO_255_PKT,
	VHOST_256_TO_511_PKT,
	VHOST_512_TO_1023_PKT,
	VHOST_1024_TO_1522_PKT,
	VHOST_1523_TO_MAX_PKT,
	VHOST_BROADCAST_PKT,
	VHOST_MULTICAST_PKT,
	VHOST_UNICAST_PKT,
	VHOST_ERRORS_PKT,
	VHOST_ERRORS_FRAGMENTED,
	VHOST_ERRORS_JABBER,
	VHOST_UNKNOWN_PROTOCOL,
	VHOST_XSTATS_MAX,
};
struct vhost_stats {
	uint64_t pkts;
	uint64_t bytes;
	uint64_t missed_pkts;
	uint64_t xstats[VHOST_XSTATS_MAX];
};
struct vhost_queue {
	int vid;
	rte_atomic32_t allow_queuing;
	rte_atomic32_t while_queuing;
	struct pmd_internal *internal;
	struct rte_mempool *mb_pool;
	uint8_t port;
	uint16_t virtqueue_id;
	struct vhost_stats stats;
};
struct pmd_internal {
	rte_atomic32_t dev_attached;
	char *dev_name;
	char *iface_name;
	uint16_t max_queues;
	rte_atomic32_t started;
};
struct internal_list {
	TAILQ_ENTRY(internal_list) next;
	struct rte_eth_dev *eth_dev;
};

TAILQ_HEAD(internal_list_head, internal_list);
static struct internal_list_head internal_list =
	TAILQ_HEAD_INITIALIZER(internal_list);

static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct rte_eth_link pmd_link = {
		.link_speed = 10000,
		.link_duplex = ETH_LINK_FULL_DUPLEX,
		.link_status = ETH_LINK_DOWN
};
struct rte_vhost_vring_state {
	rte_spinlock_t lock;

	bool cur[RTE_MAX_QUEUES_PER_PORT * 2];
	bool seen[RTE_MAX_QUEUES_PER_PORT * 2];
	unsigned int index;
	unsigned int max_vring;
};

static struct rte_vhost_vring_state *vring_states[RTE_MAX_ETHPORTS];
#define VHOST_XSTATS_NAME_SIZE 64

struct vhost_xstats_name_off {
	char name[VHOST_XSTATS_NAME_SIZE];
	uint64_t offset;
};
/* [rx]_ is prepended to the name string here */
static const struct vhost_xstats_name_off vhost_rxport_stat_strings[] = {
	{"good_packets",
	 offsetof(struct vhost_queue, stats.pkts)},
	{"total_bytes",
	 offsetof(struct vhost_queue, stats.bytes)},
	{"missed_pkts",
	 offsetof(struct vhost_queue, stats.missed_pkts)},
	{"broadcast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
	{"multicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
	{"unicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
	{"undersize_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
	{"size_64_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
	{"size_65_to_127_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
	{"size_128_to_255_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
	{"size_256_to_511_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
	{"size_512_to_1023_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
	{"size_1024_to_1522_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
	{"size_1523_to_max_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
	{"errors_with_bad_CRC",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
	{"fragmented_errors",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_FRAGMENTED])},
	{"jabber_errors",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_JABBER])},
	{"unknown_protos_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNKNOWN_PROTOCOL])},
};
/* [tx]_ is prepended to the name string here */
static const struct vhost_xstats_name_off vhost_txport_stat_strings[] = {
	{"good_packets",
	 offsetof(struct vhost_queue, stats.pkts)},
	{"total_bytes",
	 offsetof(struct vhost_queue, stats.bytes)},
	{"missed_pkts",
	 offsetof(struct vhost_queue, stats.missed_pkts)},
	{"broadcast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_BROADCAST_PKT])},
	{"multicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_MULTICAST_PKT])},
	{"unicast_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNICAST_PKT])},
	{"undersize_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_UNDERSIZE_PKT])},
	{"size_64_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_64_PKT])},
	{"size_65_to_127_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_65_TO_127_PKT])},
	{"size_128_to_255_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_128_TO_255_PKT])},
	{"size_256_to_511_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_256_TO_511_PKT])},
	{"size_512_to_1023_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_512_TO_1023_PKT])},
	{"size_1024_to_1522_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1024_TO_1522_PKT])},
	{"size_1523_to_max_packets",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_1523_TO_MAX_PKT])},
	{"errors_with_bad_CRC",
	 offsetof(struct vhost_queue, stats.xstats[VHOST_ERRORS_PKT])},
};
#define VHOST_NB_XSTATS_RXPORT (sizeof(vhost_rxport_stat_strings) / \
				sizeof(vhost_rxport_stat_strings[0]))

#define VHOST_NB_XSTATS_TXPORT (sizeof(vhost_txport_stat_strings) / \
				sizeof(vhost_txport_stat_strings[0]))
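
/*
 * Usage sketch (illustrative, not part of the original driver): these
 * tables feed the generic ethdev xstats API, so an application would read
 * the counters roughly like this (array sizes here are assumptions made
 * for the example only):
 *
 *	struct rte_eth_xstat_name names[64];
 *	struct rte_eth_xstat values[64];
 *	int n = rte_eth_xstats_get_names(port_id, names, 64);
 *	if (n > 0 && n <= 64 && rte_eth_xstats_get(port_id, values, n) == n)
 *		for (int i = 0; i < n; i++)
 *			printf("%s: %" PRIu64 "\n",
 *			       names[values[i].id].name, values[i].value);
 *
 * The "rx_"/"tx_" prefixes are added by vhost_dev_xstats_get_names().
 */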
static void
vhost_dev_xstats_reset(struct rte_eth_dev *dev)
{
	struct vhost_queue *vq = NULL;
	unsigned int i = 0;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		vq = dev->data->rx_queues[i];
		if (!vq)
			continue;
		memset(&vq->stats, 0, sizeof(vq->stats));
	}
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		vq = dev->data->tx_queues[i];
		if (!vq)
			continue;
		memset(&vq->stats, 0, sizeof(vq->stats));
	}
}
static int
vhost_dev_xstats_get_names(struct rte_eth_dev *dev __rte_unused,
			   struct rte_eth_xstat_name *xstats_names,
			   unsigned int limit __rte_unused)
{
	unsigned int t = 0;
	int count = 0;
	int nstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;

	if (!xstats_names)
		return nstats;
	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
		snprintf(xstats_names[count].name,
			 sizeof(xstats_names[count].name),
			 "rx_%s", vhost_rxport_stat_strings[t].name);
		count++;
	}
	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
		snprintf(xstats_names[count].name,
			 sizeof(xstats_names[count].name),
			 "tx_%s", vhost_txport_stat_strings[t].name);
		count++;
	}
	return count;
}
static int
vhost_dev_xstats_get(struct rte_eth_dev *dev, struct rte_eth_xstat *xstats,
		     unsigned int n)
{
	unsigned int i;
	unsigned int t;
	unsigned int count = 0;
	struct vhost_queue *vq = NULL;
	unsigned int nxstats = VHOST_NB_XSTATS_RXPORT + VHOST_NB_XSTATS_TXPORT;

	if (n < nxstats)
		return nxstats;

	/* Derive unicast counts: everything counted minus broadcast and
	 * multicast; on the TX side, missed packets are included as well.
	 */
	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		vq = dev->data->rx_queues[i];
		if (!vq)
			continue;
		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
	}
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		vq = dev->data->tx_queues[i];
		if (!vq)
			continue;
		vq->stats.xstats[VHOST_UNICAST_PKT] = vq->stats.pkts
				+ vq->stats.missed_pkts
				- (vq->stats.xstats[VHOST_BROADCAST_PKT]
				+ vq->stats.xstats[VHOST_MULTICAST_PKT]);
	}
	/* Sum each per-queue counter into a single per-port xstat entry */
	for (t = 0; t < VHOST_NB_XSTATS_RXPORT; t++) {
		xstats[count].value = 0;
		for (i = 0; i < dev->data->nb_rx_queues; i++) {
			vq = dev->data->rx_queues[i];
			if (!vq)
				continue;
			xstats[count].value +=
				*(uint64_t *)(((char *)vq)
				+ vhost_rxport_stat_strings[t].offset);
		}
		xstats[count].id = count;
		count++;
	}
	for (t = 0; t < VHOST_NB_XSTATS_TXPORT; t++) {
		xstats[count].value = 0;
		for (i = 0; i < dev->data->nb_tx_queues; i++) {
			vq = dev->data->tx_queues[i];
			if (!vq)
				continue;
			xstats[count].value +=
				*(uint64_t *)(((char *)vq)
				+ vhost_txport_stat_strings[t].offset);
		}
		xstats[count].id = count;
		count++;
	}
	return count;
}
static void
vhost_count_multicast_broadcast(struct vhost_queue *vq,
				struct rte_mbuf *mbuf)
{
	struct ether_addr *ea = NULL;
	struct vhost_stats *pstats = &vq->stats;

	ea = rte_pktmbuf_mtod(mbuf, struct ether_addr *);
	if (is_multicast_ether_addr(ea)) {
		if (is_broadcast_ether_addr(ea))
			pstats->xstats[VHOST_BROADCAST_PKT]++;
		else
			pstats->xstats[VHOST_MULTICAST_PKT]++;
	}
}
static void
vhost_update_packet_xstats(struct vhost_queue *vq,
			   struct rte_mbuf **bufs,
			   uint16_t count)
{
	uint32_t pkt_len = 0;
	uint64_t i = 0;
	uint64_t index;
	struct vhost_stats *pstats = &vq->stats;

	for (i = 0; i < count; i++) {
		pkt_len = bufs[i]->pkt_len;
		if (pkt_len == 64) {
			pstats->xstats[VHOST_64_PKT]++;
		} else if (pkt_len > 64 && pkt_len < 1024) {
			index = (sizeof(pkt_len) * 8)
				- __builtin_clz(pkt_len) - 5;
			pstats->xstats[index]++;
		} else {
			if (pkt_len < 64)
				pstats->xstats[VHOST_UNDERSIZE_PKT]++;
			else if (pkt_len <= 1522)
				pstats->xstats[VHOST_1024_TO_1522_PKT]++;
			else if (pkt_len > 1522)
				pstats->xstats[VHOST_1523_TO_MAX_PKT]++;
		}
		vhost_count_multicast_broadcast(vq, bufs[i]);
	}
}
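
/*
 * Worked example for the __builtin_clz() bucket computation above: for
 * 64 < pkt_len < 1024 the index is 32 - clz(pkt_len) - 5, which equals
 * floor(log2(pkt_len)) - 4. So pkt_len = 100 gives clz = 25 and
 * index = 2 (VHOST_65_TO_127_PKT), while pkt_len = 600 gives clz = 22
 * and index = 5 (VHOST_512_TO_1023_PKT), matching the layout of
 * enum vhost_xstats_pkts.
 */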
static uint16_t
eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
	struct vhost_queue *r = q;
	uint16_t i, nb_rx = 0;
	uint16_t nb_receive = nb_bufs;

	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
		return 0;

	rte_atomic32_set(&r->while_queuing, 1);

	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
		goto out;

	/* Dequeue packets from guest TX queue */
	while (nb_receive) {
		uint16_t nb_pkts;
		uint16_t num = (uint16_t)RTE_MIN(nb_receive,
						 VHOST_MAX_PKT_BURST);

		nb_pkts = rte_vhost_dequeue_burst(r->vid, r->virtqueue_id,
						  r->mb_pool, &bufs[nb_rx],
						  num);

		nb_rx += nb_pkts;
		nb_receive -= nb_pkts;
		if (nb_pkts < num)
			break;
	}

	r->stats.pkts += nb_rx;

	for (i = 0; likely(i < nb_rx); i++) {
		bufs[i]->port = r->port;
		r->stats.bytes += bufs[i]->pkt_len;
	}

	vhost_update_packet_xstats(r, bufs, nb_rx);

out:
	rte_atomic32_set(&r->while_queuing, 0);

	return nb_rx;
}
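
/*
 * Usage sketch (illustrative): eth_vhost_rx() is installed as the port's
 * rx_pkt_burst handler below, so applications reach it through the
 * generic polling API, e.g.:
 *
 *	struct rte_mbuf *pkts[32];
 *	uint16_t n = rte_eth_rx_burst(port_id, 0, pkts, 32);
 *
 * Requests larger than VHOST_MAX_PKT_BURST are split into several
 * rte_vhost_dequeue_burst() calls by the loop above.
 */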
static uint16_t
eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
{
	struct vhost_queue *r = q;
	uint16_t i, nb_tx = 0;
	uint16_t nb_send = nb_bufs;

	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
		return 0;

	rte_atomic32_set(&r->while_queuing, 1);

	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
		goto out;

	/* Enqueue packets to guest RX queue */
	while (nb_send) {
		uint16_t nb_pkts;
		uint16_t num = (uint16_t)RTE_MIN(nb_send,
						 VHOST_MAX_PKT_BURST);

		nb_pkts = rte_vhost_enqueue_burst(r->vid, r->virtqueue_id,
						  &bufs[nb_tx], num);

		nb_tx += nb_pkts;
		nb_send -= nb_pkts;
		if (nb_pkts < num)
			break;
	}

	r->stats.pkts += nb_tx;
	r->stats.missed_pkts += nb_bufs - nb_tx;

	for (i = 0; likely(i < nb_tx); i++)
		r->stats.bytes += bufs[i]->pkt_len;

	vhost_update_packet_xstats(r, bufs, nb_tx);

	/* According to RFC 2863 (page 42, ifHCOutMulticastPkts and
	 * ifHCOutBroadcastPkts), the "multicast" and "broadcast" counters
	 * are incremented even for packets that were not transmitted
	 * successfully.
	 */
	for (i = nb_tx; i < nb_bufs; i++)
		vhost_count_multicast_broadcast(r, bufs[i]);

	for (i = 0; likely(i < nb_tx); i++)
		rte_pktmbuf_free(bufs[i]);

out:
	rte_atomic32_set(&r->while_queuing, 0);

	return nb_tx;
}
static int
eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
{
	return 0;
}
static inline struct internal_list *
find_internal_resource(char *ifname)
{
	int found = 0;
	struct internal_list *list;
	struct pmd_internal *internal;

	if (ifname == NULL)
		return NULL;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		internal = list->eth_dev->data->dev_private;
		if (!strcmp(internal->iface_name, ifname)) {
			found = 1;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	if (!found)
		return NULL;

	return list;
}
static void
update_queuing_status(struct rte_eth_dev *dev)
{
	struct pmd_internal *internal = dev->data->dev_private;
	struct vhost_queue *vq;
	unsigned int i;
	int allow_queuing = 1;

	if (rte_atomic32_read(&internal->started) == 0 ||
	    rte_atomic32_read(&internal->dev_attached) == 0)
		allow_queuing = 0;

	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		vq = dev->data->rx_queues[i];
		if (vq == NULL)
			continue;
		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
		while (rte_atomic32_read(&vq->while_queuing))
			rte_pause();
	}

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		vq = dev->data->tx_queues[i];
		if (vq == NULL)
			continue;
		rte_atomic32_set(&vq->allow_queuing, allow_queuing);
		while (rte_atomic32_read(&vq->while_queuing))
			rte_pause();
	}
}
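
/*
 * Note on the handshake above: the burst functions re-check allow_queuing
 * after raising while_queuing, so clearing allow_queuing and then spinning
 * until while_queuing drops guarantees that no rx/tx burst is still inside
 * the vhost device when this function returns, without putting a lock on
 * the data path.
 */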
static int
new_device(int vid)
{
	struct rte_eth_dev *eth_dev;
	struct internal_list *list;
	struct pmd_internal *internal;
	struct vhost_queue *vq;
	unsigned int i;
	char ifname[PATH_MAX];
#ifdef RTE_LIBRTE_VHOST_NUMA
	int newnode;
#endif

	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
	list = find_internal_resource(ifname);
	if (list == NULL) {
		RTE_LOG(INFO, PMD, "Invalid device name: %s\n", ifname);
		return -1;
	}

	eth_dev = list->eth_dev;
	internal = eth_dev->data->dev_private;

#ifdef RTE_LIBRTE_VHOST_NUMA
	newnode = rte_vhost_get_numa_node(vid);
	if (newnode >= 0)
		eth_dev->data->numa_node = newnode;
#endif

	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
		vq = eth_dev->data->rx_queues[i];
		if (vq == NULL)
			continue;
		vq->vid = vid;
		vq->internal = internal;
		vq->port = eth_dev->data->port_id;
	}
	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
		vq = eth_dev->data->tx_queues[i];
		if (vq == NULL)
			continue;
		vq->vid = vid;
		vq->internal = internal;
		vq->port = eth_dev->data->port_id;
	}

	for (i = 0; i < rte_vhost_get_vring_num(vid); i++)
		rte_vhost_enable_guest_notification(vid, i, 0);

	rte_vhost_get_mtu(vid, &eth_dev->data->mtu);

	eth_dev->data->dev_link.link_status = ETH_LINK_UP;

	rte_atomic32_set(&internal->dev_attached, 1);
	update_queuing_status(eth_dev);

	RTE_LOG(INFO, PMD, "New connection established\n");

	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);

	return 0;
}
static void
destroy_device(int vid)
{
	struct rte_eth_dev *eth_dev;
	struct pmd_internal *internal;
	struct vhost_queue *vq;
	struct internal_list *list;
	char ifname[PATH_MAX];
	unsigned int i;
	struct rte_vhost_vring_state *state;

	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
	list = find_internal_resource(ifname);
	if (list == NULL) {
		RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
		return;
	}
	eth_dev = list->eth_dev;
	internal = eth_dev->data->dev_private;

	rte_atomic32_set(&internal->dev_attached, 0);
	update_queuing_status(eth_dev);

	eth_dev->data->dev_link.link_status = ETH_LINK_DOWN;

	for (i = 0; i < eth_dev->data->nb_rx_queues; i++) {
		vq = eth_dev->data->rx_queues[i];
		if (vq == NULL)
			continue;
		vq->vid = -1;
	}
	for (i = 0; i < eth_dev->data->nb_tx_queues; i++) {
		vq = eth_dev->data->tx_queues[i];
		if (vq == NULL)
			continue;
		vq->vid = -1;
	}

	state = vring_states[eth_dev->data->port_id];
	rte_spinlock_lock(&state->lock);
	for (i = 0; i <= state->max_vring; i++) {
		state->cur[i] = false;
		state->seen[i] = false;
	}
	state->max_vring = 0;
	rte_spinlock_unlock(&state->lock);

	RTE_LOG(INFO, PMD, "Connection closed\n");

	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_INTR_LSC, NULL);
}
static int
vring_state_changed(int vid, uint16_t vring, int enable)
{
	struct rte_vhost_vring_state *state;
	struct rte_eth_dev *eth_dev;
	struct internal_list *list;
	char ifname[PATH_MAX];

	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));
	list = find_internal_resource(ifname);
	if (list == NULL) {
		RTE_LOG(ERR, PMD, "Invalid interface name: %s\n", ifname);
		return -1;
	}

	eth_dev = list->eth_dev;
	/* won't be NULL */
	state = vring_states[eth_dev->data->port_id];
	rte_spinlock_lock(&state->lock);
	state->cur[vring] = enable;
	state->max_vring = RTE_MAX(vring, state->max_vring);
	rte_spinlock_unlock(&state->lock);

	RTE_LOG(INFO, PMD, "vring%u is %s\n",
		vring, enable ? "enabled" : "disabled");

	_rte_eth_dev_callback_process(eth_dev, RTE_ETH_EVENT_QUEUE_STATE, NULL);

	return 0;
}
static struct vhost_device_ops vhost_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};
int
rte_eth_vhost_get_queue_event(uint8_t port_id,
			      struct rte_eth_vhost_queue_event *event)
{
	struct rte_vhost_vring_state *state;
	unsigned int i;
	int idx;

	if (port_id >= RTE_MAX_ETHPORTS) {
		RTE_LOG(ERR, PMD, "Invalid port id\n");
		return -1;
	}

	state = vring_states[port_id];
	if (!state) {
		RTE_LOG(ERR, PMD, "Unused port\n");
		return -1;
	}

	rte_spinlock_lock(&state->lock);
	for (i = 0; i <= state->max_vring; i++) {
		idx = state->index++ % (state->max_vring + 1);

		if (state->cur[idx] != state->seen[idx]) {
			state->seen[idx] = state->cur[idx];
			event->queue_id = idx / 2;
			/* 0 means RX queue, 1 means TX queue */
			event->rx = idx & 1;
			event->enable = state->cur[idx];
			rte_spinlock_unlock(&state->lock);
			return 0;
		}
	}
	rte_spinlock_unlock(&state->lock);

	return -1;
}
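
/*
 * Usage sketch (illustrative): an application typically drains all pending
 * queue-state changes after an RTE_ETH_EVENT_QUEUE_STATE callback fires,
 * e.g.:
 *
 *	struct rte_eth_vhost_queue_event ev;
 *	while (rte_eth_vhost_get_queue_event(port_id, &ev) == 0)
 *		printf("queue %u (%s) %s\n", ev.queue_id,
 *		       ev.rx ? "rx" : "tx",
 *		       ev.enable ? "enabled" : "disabled");
 */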
int
rte_eth_vhost_get_vid_from_port_id(uint8_t port_id)
{
	struct internal_list *list;
	struct rte_eth_dev *eth_dev;
	struct vhost_queue *vq;
	int vid = -1;

	if (!rte_eth_dev_is_valid_port(port_id))
		return -1;

	pthread_mutex_lock(&internal_list_lock);

	TAILQ_FOREACH(list, &internal_list, next) {
		eth_dev = list->eth_dev;
		if (eth_dev->data->port_id == port_id) {
			vq = eth_dev->data->rx_queues[0];
			if (vq)
				vid = vq->vid;
			break;
		}
	}

	pthread_mutex_unlock(&internal_list_lock);

	return vid;
}
static int
eth_dev_start(struct rte_eth_dev *dev)
{
	struct pmd_internal *internal = dev->data->dev_private;

	rte_atomic32_set(&internal->started, 1);
	update_queuing_status(dev);

	return 0;
}

static void
eth_dev_stop(struct rte_eth_dev *dev)
{
	struct pmd_internal *internal = dev->data->dev_private;

	rte_atomic32_set(&internal->started, 0);
	update_queuing_status(dev);
}
static void
eth_dev_close(struct rte_eth_dev *dev)
{
	struct pmd_internal *internal;
	struct internal_list *list;

	internal = dev->data->dev_private;
	if (!internal)
		return;

	rte_vhost_driver_unregister(internal->iface_name);

	list = find_internal_resource(internal->iface_name);
	if (!list)
		return;

	pthread_mutex_lock(&internal_list_lock);
	TAILQ_REMOVE(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);
	rte_free(list);

	free(internal->dev_name);
	free(internal->iface_name);
	rte_free(internal);
}
static int
eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
		   uint16_t nb_rx_desc __rte_unused,
		   unsigned int socket_id,
		   const struct rte_eth_rxconf *rx_conf __rte_unused,
		   struct rte_mempool *mb_pool)
{
	struct vhost_queue *vq;

	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
			RTE_CACHE_LINE_SIZE, socket_id);
	if (vq == NULL) {
		RTE_LOG(ERR, PMD, "Failed to allocate memory for rx queue\n");
		return -ENOMEM;
	}

	vq->mb_pool = mb_pool;
	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
	dev->data->rx_queues[rx_queue_id] = vq;

	return 0;
}
static int
eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
		   uint16_t nb_tx_desc __rte_unused,
		   unsigned int socket_id,
		   const struct rte_eth_txconf *tx_conf __rte_unused)
{
	struct vhost_queue *vq;

	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
			RTE_CACHE_LINE_SIZE, socket_id);
	if (vq == NULL) {
		RTE_LOG(ERR, PMD, "Failed to allocate memory for tx queue\n");
		return -ENOMEM;
	}

	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
	dev->data->tx_queues[tx_queue_id] = vq;

	return 0;
}
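
/*
 * Note on the virtqueue numbering used by both setup functions: with
 * VIRTIO_QNUM == 2, ethdev rx queue q reads the guest TX ring
 * (q * 2 + VIRTIO_TXQ = 2q + 1) and ethdev tx queue q writes the guest
 * RX ring (q * 2 + VIRTIO_RXQ = 2q); e.g. rx queue 1 -> vring 3 and
 * tx queue 1 -> vring 2.
 */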
static void
eth_dev_info(struct rte_eth_dev *dev,
	     struct rte_eth_dev_info *dev_info)
{
	struct pmd_internal *internal;

	internal = dev->data->dev_private;
	if (internal == NULL) {
		RTE_LOG(ERR, PMD, "Invalid device specified\n");
		return;
	}

	dev_info->max_mac_addrs = 1;
	dev_info->max_rx_pktlen = (uint32_t)-1;
	dev_info->max_rx_queues = internal->max_queues;
	dev_info->max_tx_queues = internal->max_queues;
	dev_info->min_rx_bufsize = 0;
}
static void
eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
{
	unsigned int i;
	unsigned long rx_total = 0, tx_total = 0, tx_missed_total = 0;
	unsigned long rx_total_bytes = 0, tx_total_bytes = 0;
	struct vhost_queue *vq;

	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
			i < dev->data->nb_rx_queues; i++) {
		if (dev->data->rx_queues[i] == NULL)
			continue;
		vq = dev->data->rx_queues[i];
		stats->q_ipackets[i] = vq->stats.pkts;
		rx_total += stats->q_ipackets[i];

		stats->q_ibytes[i] = vq->stats.bytes;
		rx_total_bytes += stats->q_ibytes[i];
	}

	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
			i < dev->data->nb_tx_queues; i++) {
		if (dev->data->tx_queues[i] == NULL)
			continue;
		vq = dev->data->tx_queues[i];
		stats->q_opackets[i] = vq->stats.pkts;
		tx_missed_total += vq->stats.missed_pkts;
		tx_total += stats->q_opackets[i];

		stats->q_obytes[i] = vq->stats.bytes;
		tx_total_bytes += stats->q_obytes[i];
	}

	stats->ipackets = rx_total;
	stats->opackets = tx_total;
	stats->oerrors = tx_missed_total;
	stats->ibytes = rx_total_bytes;
	stats->obytes = tx_total_bytes;
}
static void
eth_stats_reset(struct rte_eth_dev *dev)
{
	struct vhost_queue *vq;
	unsigned int i;

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		if (dev->data->rx_queues[i] == NULL)
			continue;
		vq = dev->data->rx_queues[i];
		vq->stats.pkts = 0;
		vq->stats.bytes = 0;
	}
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		if (dev->data->tx_queues[i] == NULL)
			continue;
		vq = dev->data->tx_queues[i];
		vq->stats.pkts = 0;
		vq->stats.bytes = 0;
		vq->stats.missed_pkts = 0;
	}
}
static void
eth_queue_release(void *q)
{
	rte_free(q);
}

static int
eth_tx_done_cleanup(void *txq __rte_unused, uint32_t free_cnt __rte_unused)
{
	/*
	 * vHost does not hang onto mbufs: eth_vhost_tx() copies packet data
	 * into the guest rings and frees the mbufs itself, so there is
	 * nothing to clean up here.
	 */
	return 0;
}

static int
eth_link_update(struct rte_eth_dev *dev __rte_unused,
		int wait_to_complete __rte_unused)
{
	return 0;
}
static const struct eth_dev_ops ops = {
	.dev_start = eth_dev_start,
	.dev_stop = eth_dev_stop,
	.dev_close = eth_dev_close,
	.dev_configure = eth_dev_configure,
	.dev_infos_get = eth_dev_info,
	.rx_queue_setup = eth_rx_queue_setup,
	.tx_queue_setup = eth_tx_queue_setup,
	.rx_queue_release = eth_queue_release,
	.tx_queue_release = eth_queue_release,
	.tx_done_cleanup = eth_tx_done_cleanup,
	.link_update = eth_link_update,
	.stats_get = eth_stats_get,
	.stats_reset = eth_stats_reset,
	.xstats_reset = vhost_dev_xstats_reset,
	.xstats_get = vhost_dev_xstats_get,
	.xstats_get_names = vhost_dev_xstats_get_names,
};
static struct rte_vdev_driver pmd_vhost_drv;

static int
eth_dev_vhost_create(const char *name, char *iface_name, int16_t queues,
		     const unsigned int numa_node, uint64_t flags)
{
	struct rte_eth_dev_data *data = NULL;
	struct pmd_internal *internal = NULL;
	struct rte_eth_dev *eth_dev = NULL;
	struct ether_addr *eth_addr = NULL;
	struct rte_vhost_vring_state *vring_state = NULL;
	struct internal_list *list = NULL;

	RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa socket %u\n",
		numa_node);

	/* now do all data allocation - for eth_dev structure, dummy pci driver
	 * and internal (private) data
	 */
	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
	if (data == NULL)
		goto error;

	internal = rte_zmalloc_socket(name, sizeof(*internal), 0, numa_node);
	if (internal == NULL)
		goto error;

	list = rte_zmalloc_socket(name, sizeof(*list), 0, numa_node);
	if (list == NULL)
		goto error;

	/* reserve an ethdev entry */
	eth_dev = rte_eth_dev_allocate(name);
	if (eth_dev == NULL)
		goto error;

	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
	if (eth_addr == NULL)
		goto error;
	*eth_addr = base_eth_addr;
	eth_addr->addr_bytes[5] = eth_dev->data->port_id;

	vring_state = rte_zmalloc_socket(name,
			sizeof(*vring_state), 0, numa_node);
	if (vring_state == NULL)
		goto error;

	/* now put it all together
	 * - store queue data in internal,
	 * - store numa_node info in ethdev data
	 * - point eth_dev_data to internals
	 * - and point eth_dev structure to new eth_dev_data structure
	 */
	internal->dev_name = strdup(name);
	if (internal->dev_name == NULL)
		goto error;
	internal->iface_name = strdup(iface_name);
	if (internal->iface_name == NULL)
		goto error;

	list->eth_dev = eth_dev;
	pthread_mutex_lock(&internal_list_lock);
	TAILQ_INSERT_TAIL(&internal_list, list, next);
	pthread_mutex_unlock(&internal_list_lock);

	rte_spinlock_init(&vring_state->lock);
	vring_states[eth_dev->data->port_id] = vring_state;

	data->dev_private = internal;
	data->port_id = eth_dev->data->port_id;
	memmove(data->name, eth_dev->data->name, sizeof(data->name));
	data->nb_rx_queues = queues;
	data->nb_tx_queues = queues;
	internal->max_queues = queues;
	data->dev_link = pmd_link;
	data->mac_addrs = eth_addr;

	/* We'll replace the 'data' originally allocated by eth_dev. So the
	 * vhost PMD resources won't be shared between multi processes.
	 */
	eth_dev->data = data;
	eth_dev->dev_ops = &ops;
	eth_dev->driver = NULL;
	data->dev_flags =
		RTE_ETH_DEV_DETACHABLE | RTE_ETH_DEV_INTR_LSC;
	data->kdrv = RTE_KDRV_NONE;
	data->drv_name = pmd_vhost_drv.driver.name;
	data->numa_node = numa_node;

	/* finally assign rx and tx ops */
	eth_dev->rx_pkt_burst = eth_vhost_rx;
	eth_dev->tx_pkt_burst = eth_vhost_tx;

	if (rte_vhost_driver_register(iface_name, flags))
		goto error;

	if (rte_vhost_driver_callback_register(iface_name, &vhost_ops) < 0) {
		RTE_LOG(ERR, PMD, "Can't register callbacks\n");
		goto error;
	}

	if (rte_vhost_driver_start(iface_name) < 0) {
		RTE_LOG(ERR, PMD, "Failed to start driver for %s\n",
			iface_name);
		goto error;
	}

	return data->port_id;

error:
	if (internal)
		free(internal->dev_name);
	rte_free(vring_state);
	rte_free(eth_addr);
	if (eth_dev)
		rte_eth_dev_release_port(eth_dev);
	rte_free(internal);
	rte_free(list);
	rte_free(data);

	return -1;
}
static inline int
open_iface(const char *key __rte_unused, const char *value, void *extra_args)
{
	const char **iface_name = extra_args;

	if (value == NULL)
		return -1;

	*iface_name = value;

	return 0;
}

static inline int
open_int(const char *key __rte_unused, const char *value, void *extra_args)
{
	uint16_t *n = extra_args;

	if (value == NULL || extra_args == NULL)
		return -EINVAL;

	*n = (uint16_t)strtoul(value, NULL, 0);
	if (*n == USHRT_MAX && errno == ERANGE)
		return -1;

	return 0;
}
static int
rte_pmd_vhost_probe(const char *name, const char *params)
{
	struct rte_kvargs *kvlist = NULL;
	int ret = 0;
	char *iface_name;
	uint16_t queues;
	uint64_t flags = 0;
	int client_mode = 0;
	int dequeue_zero_copy = 0;

	RTE_LOG(INFO, PMD, "Initializing pmd_vhost for %s\n", name);

	kvlist = rte_kvargs_parse(params, valid_arguments);
	if (kvlist == NULL)
		return -1;

	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
					 &open_iface, &iface_name);
		if (ret < 0)
			goto out_free;
	} else {
		ret = -1;
		goto out_free;
	}

	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
					 &open_int, &queues);
		if (ret < 0 || queues > RTE_MAX_QUEUES_PER_PORT)
			goto out_free;
	} else
		queues = 1;

	if (rte_kvargs_count(kvlist, ETH_VHOST_CLIENT_ARG) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_VHOST_CLIENT_ARG,
					 &open_int, &client_mode);
		if (ret < 0)
			goto out_free;

		if (client_mode)
			flags |= RTE_VHOST_USER_CLIENT;
	}

	if (rte_kvargs_count(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_VHOST_DEQUEUE_ZERO_COPY,
					 &open_int, &dequeue_zero_copy);
		if (ret < 0)
			goto out_free;

		if (dequeue_zero_copy)
			flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;
	}

	eth_dev_vhost_create(name, iface_name, queues, rte_socket_id(), flags);

out_free:
	rte_kvargs_free(kvlist);
	return ret;
}
static int
rte_pmd_vhost_remove(const char *name)
{
	struct rte_eth_dev *eth_dev = NULL;
	unsigned int i;

	RTE_LOG(INFO, PMD, "Un-Initializing pmd_vhost for %s\n", name);

	/* find an ethdev entry */
	eth_dev = rte_eth_dev_allocated(name);
	if (eth_dev == NULL)
		return -ENODEV;

	eth_dev_stop(eth_dev);

	eth_dev_close(eth_dev);

	rte_free(vring_states[eth_dev->data->port_id]);
	vring_states[eth_dev->data->port_id] = NULL;

	for (i = 0; i < eth_dev->data->nb_rx_queues; i++)
		rte_free(eth_dev->data->rx_queues[i]);
	for (i = 0; i < eth_dev->data->nb_tx_queues; i++)
		rte_free(eth_dev->data->tx_queues[i]);

	rte_free(eth_dev->data->mac_addrs);
	rte_free(eth_dev->data);

	rte_eth_dev_release_port(eth_dev);

	return 0;
}
static struct rte_vdev_driver pmd_vhost_drv = {
	.probe = rte_pmd_vhost_probe,
	.remove = rte_pmd_vhost_remove,
};

RTE_PMD_REGISTER_VDEV(net_vhost, pmd_vhost_drv);
RTE_PMD_REGISTER_ALIAS(net_vhost, eth_vhost);
RTE_PMD_REGISTER_PARAM_STRING(net_vhost,
	"iface=<ifc> "
	"queues=<int>");
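
/*
 * Usage sketch (illustrative; the socket path is an example only): the PMD
 * is instantiated from the EAL command line, e.g.
 *
 *	testpmd -l 0-1 -n 4 \
 *		--vdev 'net_vhost0,iface=/tmp/sock0,queues=1,client=1'
 *
 * which creates a vhost-user port backed by /tmp/sock0 in client mode;
 * adding dequeue-zero-copy=1 would also set
 * RTE_VHOST_USER_DEQUEUE_ZERO_COPY when the driver is registered.
 */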