/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <ctype.h>
#include <errno.h>
#include <getopt.h>
#include <inttypes.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>

#include "main.h"

#define MAX_QUEUES 128

/* default number of mbufs in all pools (see --total-num-mbufs) */
#define NUM_MBUFS_DEFAULT 0x24000

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
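/* 0x2600 = 9728-byte frames; MAX_MTU excludes the 14-byte Ethernet
 * header and the 4-byte CRC, i.e. 9710 bytes.
 */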
/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1

#define DMA_RING_SIZE 4096

#define ASYNC_ENQUEUE_VHOST 1
#define ASYNC_DEQUEUE_VHOST 2
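/* The two values above are OR'ed into a per-socket bitmask
 * (dma_for_vhost.async_flag, set in open_dma()) that records which
 * directions have an async DMA channel bound.
 */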
/* number of mbufs in all pools - if specified on command-line. */
static int total_num_mbufs = NUM_MBUFS_DEFAULT;

struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;

/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
/* Enable RX mergeable buffers (disabled by default, see --mergeable). */
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by the user */
static char *socket_files;
static int nb_sockets;
static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];

/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * it fixes the bug where IPv4 forwarding in the guest
		 * cannot forward packets from one virtio device to another.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used to batch packet enqueues for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
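/* Indexed by lcore_id * RTE_MAX_VHOST_DEVICE + vid: each (core, device)
 * pair owns a private buffer, so the Tx batching path needs no locks.
 */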
#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
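/* i.e. ceil(tsc_hz / 1e6) TSC ticks per microsecond, multiplied by the
 * drain interval of BURST_TX_DRAIN_US microseconds.
 */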
static int vid2socketid[RTE_MAX_VHOST_DEVICE];

static inline uint32_t
get_async_flag_by_socketid(int socketid)
{
	return dma_bind[socketid].async_flag;
}

static inline void
init_vid2socketid_array(int vid, int socketid)
{
	vid2socketid[vid] = socketid;
}

static inline bool
is_dma_configured(int16_t dev_id)
{
	int i;

	for (i = 0; i < dma_count; i++)
		if (dmas_id[i] == dev_id)
			return true;
	return false;
}
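/*
 * Parse the --dmas argument. From the parsing below, the expected format
 * is a bracketed, comma-separated list of bindings such as
 * "[txd0@dma_name0,rxd0@dma_name1]" (the DMA names are illustrative):
 * txdN binds a DMA device to the enqueue (VIRTIO_RXQ) path of vhost
 * socket N, rxdN to its dequeue (VIRTIO_TXQ) path, and the name after
 * '@' is resolved with rte_dma_get_dev_id_by_name().
 */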
static int
open_dma(const char *value)
{
	struct dma_for_vhost *dma_info = dma_bind;
	char *input = strndup(value, strlen(value) + 1);
	char *addrs = input;
	char *ptrs[2];
	char *start, *end, *substr;
	int64_t socketid, vring_id;

	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE
	};

	int dev_id;
	int ret = 0;
	uint16_t i = 0;
	char *dma_arg[RTE_MAX_VHOST_DEVICE];
	int args_nr;

	while (isblank(*addrs))
		addrs++;
	if (*addrs == '\0') {
		ret = -1;
		goto out;
	}

	/* process DMA devices within bracket. */
	addrs++;
	substr = strtok(addrs, ";]");
	if (!substr) {
		ret = -1;
		goto out;
	}

	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
	if (args_nr <= 0) {
		ret = -1;
		goto out;
	}

	while (i < args_nr) {
		char *arg_temp = dma_arg[i];
		char *txd, *rxd;
		uint8_t sub_nr;
		int async_flag;

		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
		if (sub_nr != 2) {
			ret = -1;
			goto out;
		}

		txd = strstr(ptrs[0], "txd");
		rxd = strstr(ptrs[0], "rxd");
		if (txd) {
			start = txd;
			vring_id = VIRTIO_RXQ;
			async_flag = ASYNC_ENQUEUE_VHOST;
		} else if (rxd) {
			start = rxd;
			vring_id = VIRTIO_TXQ;
			async_flag = ASYNC_DEQUEUE_VHOST;
		} else {
			ret = -1;
			goto out;
		}

		start += 3;
		socketid = strtol(start, &end, 0);
		if (end == start) {
			ret = -1;
			goto out;
		}

		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
		if (dev_id < 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to find DMA %s.\n", ptrs[1]);
			ret = -1;
			goto out;
		}

		/* DMA device is already configured, so skip */
		if (is_dma_configured(dev_id))
			goto done;

		if (rte_dma_info_get(dev_id, &info) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
			ret = -1;
			goto out;
		}

		if (info.max_vchans < 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_configure(dev_id, &dev_config) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to configure DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		/* Check the max desc supported by DMA device */
		rte_dma_info_get(dev_id, &info);
		if (info.nb_vchans != 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
					dev_id);
			ret = -1;
			goto out;
		}

		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);

		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to set up DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_start(dev_id) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to start DMA %u.\n", dev_id);
			ret = -1;
			goto out;
		}

		dmas_id[dma_count++] = dev_id;

done:
		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
		(dma_info + socketid)->async_flag |= async_flag;
		i++;
	}
out:
	free(input);
	return ret;
}
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));

	return 0;
}
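/*
 * Example: with 8 pools, get_eth_conf() maps VLAN 1000 to pool 0,
 * VLAN 1001 to pool 1, ..., VLAN 1007 to pool 7 (one pool bit per device).
 */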
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
		vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
	else
		vmdq_conf_default.rxmode.mtu = MAX_MTU;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);
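	/*
	 * Hypothetical example: a NIC reporting max_rx_queues = 192 with
	 * vmdq_queue_num = 128 and max_vmdq_pools = 32 yields 64 PF queues
	 * and, with num_devices = 32, 128 VMDq queues (4 queues per pool).
	 */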
	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}
/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* reject overlong paths */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}
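/* socket_files is a flat array of PATH_MAX-sized slots; slot i holds the
 * path registered for vhost socket i (see get_socketid_by_vid() below).
 */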
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}

/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx; effective only if Rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on Rx; effective only if Rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1] disable/enable TX checksum offload.\n"
	"		--tso [0|1] disable/enable TCP segment offload.\n"
	"		--client register a vhost-user socket as client mode.\n"
	"		--dmas register dma channel for specific vhost device.\n"
	"		--total-num-mbufs [0-N] set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n",
	       prgname);
}
enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
#define OPT_NUM_MBUFS           "total-num-mbufs"
	OPT_NUM_MBUFS_NUM,
};
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{OPT_NUM_MBUFS, required_argument,
				NULL, OPT_NUM_MBUFS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_NUM_MBUFS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for total-num-mbufs [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}

			if (total_num_mbufs < ret)
				total_num_mbufs = ret;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}
/*
 * Update the global variable num_ports and the ports array according to
 * the number of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}
static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, with a VMDq pool.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}

/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}

static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}
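/*
 * With async enqueue the application keeps ownership of the submitted
 * mbufs until their DMA copies complete; the helper below polls the
 * completion status and frees the mbufs that vhost hands back.
 */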
static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count)
		free_pkts(p_cpl, complete_count);
}

static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
		 struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	/* In async mode the enqueued mbufs are freed when their DMA copies
	 * complete (see complete_async_pkts()), so free here only on the
	 * sync path.
	 */
	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}
static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();

	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}

/*
 * Check if the destination MAC of a packet belongs to a local VM, and if
 * so fetch its VLAN tag and the length offset to restore.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC are the same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW vlan strip reduces the packet length by the length of the
	 * VLAN tag, so the packet length needs to be restored by adding
	 * it back.
	 */
	*offset  = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}
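/*
 * Prepare Tx offload metadata for a packet received via LRO: parse the
 * headers, request TCP segmentation (TSO) plus IP checksum offload, and
 * seed the TCP checksum field with the pseudo-header checksum that NICs
 * expect when performing TSO.
 */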
static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}
static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}

/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* Check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}
static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static uint16_t
async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	uint16_t enqueue_count;
	uint16_t enqueue_fail = 0;
	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_async_pkts(dev);
	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
				pkts, rx_count, dma_id, 0);

	enqueue_fail = rx_count - enqueue_count;
	if (enqueue_fail)
		free_pkts(&pkts[enqueue_count], enqueue_fail);

	return enqueue_count;
}

static uint16_t
sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
}
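/*
 * The sync/async enqueue wrappers above share one prototype, so the data
 * path can dispatch through vdev_queue_ops[] (see init_vhost_queue_ops())
 * without branching on the async/sync mode for every burst.
 */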
static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, we wait and retry when there are
	 * not enough free slots in the queue to hold @rx_count packets,
	 * to reduce packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
					VIRTIO_RXQ, pkts, rx_count);
	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}
static uint16_t
async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mempool *mbuf_pool,
		struct rte_mbuf **pkts, uint16_t count)
{
	int nr_inflight;
	uint16_t dequeue_count;
	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;

	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);

	return dequeue_count;
}

static uint16_t
sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mempool *mbuf_pool,
		struct rte_mbuf **pkts, uint16_t count)
{
	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}
/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The route is done in function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}
static void
vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
{
	uint16_t n_pkt = 0;
	int pkts_inflight;

	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);

	struct rte_mbuf *m_cpl[pkts_inflight];

	while (pkts_inflight) {
		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
							pkts_inflight, dma_id, 0);
		free_pkts(m_cpl, n_pkt);
		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
									queue_id);
	}
}
/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	/* dma_bind is indexed by socket id, as elsewhere in this file. */
	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled) {
		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_RXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled) {
		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_TXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
		dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = false;
	}

	rte_free(vdev);
}
static inline int
get_socketid_by_vid(int vid)
{
	int i;
	char ifname[PATH_MAX];
	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));

	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		if (strcmp(file, ifname) == 0)
			return i;
	}

	return -1;
}

static int
init_vhost_queue_ops(int vid)
{
	if (builtin_net_driver) {
		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
	} else {
		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
		else
			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;

		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
		else
			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
	}

	return 0;
}

static inline int
vhost_async_channel_register(int vid)
{
	int rx_ret = 0, tx_ret = 0;

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
		if (rx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
	}

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
		if (tx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
	}

	return rx_ret | tx_ret;
}
/*
 * A new device is added to a data core. First the device is added to the main linked list
 * and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	int ret;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	int socketid = get_socketid_by_vid(vid);
	if (socketid == -1)
		return -1;

	init_vid2socketid_array(vid, socketid);

	ret = vhost_async_channel_register(vid);

	if (init_vhost_queue_ops(vid) != 0)
		return -1;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
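	/*
	 * Each vhost device owns one VMDq pool; its Rx queue is the first
	 * queue of that pool, queues_per_pool entries past vmdq_queue_base.
	 */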
	/* Reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
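	/*
	 * Notifications are left disabled because the data cores busy-poll
	 * the virtio rings; guest-to-host interrupts would only add overhead.
	 */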
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return ret;
}

static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
		if (!enable)
			vhost_clear_queue_thread_unsafe(vdev, queue_id);
	}

	return 0;
}
/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};
/*
 * This is a thread that wakes up periodically to print stats if the user
 * has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx       = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:       %" PRIu64 "\n"
				"TX dropped:     %" PRIu64 "\n"
				"TX successful:  %" PRIu64 "\n"
				"RX total:       %" PRIu64 "\n"
				"RX dropped:     %" PRIu64 "\n"
				"RX successful:  %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}
static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive an INT signal, unregister all vhost drivers */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}
static void
reset_dma(void)
{
	int i;

	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
		int j;

		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
			dma_bind[i].dmas[j].async_enabled = false;
		}
	}

	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
		dmas_id[i] = INVALID_DMA_ID;
}
/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global variable num_ports and the global array ports
	 * according to the number of system ports, and get the number of
	 * valid ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback so the L2 switch can do the forwarding. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		/* ASYNC_COPY applies per socket; use a local copy so the
		 * flag does not leak to later sockets.
		 */
		uint64_t file_flags = flags;

		if (dma_count && get_async_flag_by_socketid(i) != 0)
			file_flags |= RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, file_flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}