/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <sys/eventfd.h>
#include <sys/param.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_pause.h>
#include <rte_dmadev.h>
#include <rte_vhost_async.h>
#define MAX_QUEUES 128

#define NUM_MBUFS_DEFAULT 0x24000

/* The maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE 128
#define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600
#define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
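/*
 * Worked example: JUMBO_FRAME_MAX_SIZE is 0x2600 (9728) bytes, so with a
 * 14-byte Ethernet header and 4-byte CRC the resulting MAX_MTU is
 * 9728 - (14 + 4) = 9710 bytes.
 */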
/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
#define INVALID_DMA_ID -1

#define DMA_RING_SIZE 4096

#define ASYNC_ENQUEUE_VHOST 1
#define ASYNC_DEQUEUE_VHOST 2
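/*
 * ASYNC_ENQUEUE_VHOST/ASYNC_DEQUEUE_VHOST are OR-ed into a socket's
 * async_flag when a DMA device is bound to its enqueue (Rx) or dequeue (Tx)
 * path, so each direction can independently use the async data path.
 */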
/* Number of mbufs in all pools - if specified on command line. */
static int total_num_mbufs = NUM_MBUFS_DEFAULT;

struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
static int dma_count;

/* Mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* Number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;

/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user. */
static char *socket_files;
static int nb_sockets;

static struct vhost_queue_ops vdev_queue_ops[RTE_MAX_VHOST_DEVICE];
/* Empty VMDq configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350;
		 * it fixes a bug where IPv4 forwarding in the guest
		 * could not forward packets from one virtio device
		 * to another.
		 */
		.offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = RTE_ETH_MQ_TX_NONE,
		.offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
			     RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
			     RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
			     RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
			     RTE_ETH_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * Pool and queue limits below are placeholders and
		 * should be overridden separately in code with the
		 * values the device actually supports.
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = RTE_ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
/* Ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];
/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used for batching packet enqueues for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];

#define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
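/*
 * Worked example: the macro first rounds the TSC ticks-per-microsecond up,
 * then scales by the drain interval; with a 2.3 GHz TSC this is
 * ceil(2300000000 / 1000000) * 100 = 230000 cycles between forced drains.
 */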
static int vid2socketid[RTE_MAX_VHOST_DEVICE];

static inline uint32_t
get_async_flag_by_socketid(int socketid)
{
	return dma_bind[socketid].async_flag;
}

static inline void
init_vid2socketid_array(int vid, int socketid)
{
	vid2socketid[vid] = socketid;
}

static inline bool
is_dma_configured(int16_t dev_id)
{
	int i;

	for (i = 0; i < dma_count; i++)
		if (dmas_id[i] == dev_id)
			return true;
	return false;
}
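/*
 * Parse the --dmas argument. The expected form (addresses here are
 * illustrative) is "[txd0@0000:00:04.0,rxd0@0000:00:04.1]": "txd<N>"/"rxd<N>"
 * select the enqueue/dequeue direction of socket N, and the string after
 * '@' names the DMA device to bind to that direction.
 */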
static int
open_dma(const char *value)
{
	struct dma_for_vhost *dma_info = dma_bind;
	char *input = strndup(value, strlen(value) + 1);
	char *addrs = input;
	char *ptrs[2];
	char *start, *end, *substr;
	int64_t socketid, vring_id;

	struct rte_dma_info info;
	struct rte_dma_conf dev_config = { .nb_vchans = 1 };
	struct rte_dma_vchan_conf qconf = {
		.direction = RTE_DMA_DIR_MEM_TO_MEM,
		.nb_desc = DMA_RING_SIZE
	};

	int dev_id;
	int ret = 0;
	uint16_t i = 0;
	char *dma_arg[RTE_MAX_VHOST_DEVICE];
	int args_nr;

	while (isblank(*addrs))
		addrs++;
	if (*addrs == '\0') {
		ret = -1;
		goto out;
	}

	/* Process DMA devices within the bracket. */
	addrs++;
	substr = strtok(addrs, ";]");

	args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');

	while (i < args_nr) {
		char *arg_temp = dma_arg[i];
		char *txd, *rxd;
		uint8_t sub_nr;
		int async_flag;

		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');

		txd = strstr(ptrs[0], "txd");
		rxd = strstr(ptrs[0], "rxd");
		if (txd) {
			start = txd;
			vring_id = VIRTIO_RXQ;
			async_flag = ASYNC_ENQUEUE_VHOST;
		} else if (rxd) {
			start = rxd;
			vring_id = VIRTIO_TXQ;
			async_flag = ASYNC_DEQUEUE_VHOST;
		} else {
			ret = -1;
			goto out;
		}

		start += 3;
		socketid = strtol(start, &end, 0);

		dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
		if (dev_id < 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to find DMA %s.\n", ptrs[1]);
			ret = -1;
			goto out;
		}

		/* DMA device is already configured, so skip. */
		if (is_dma_configured(dev_id))
			goto done;

		if (rte_dma_info_get(dev_id, &info) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
			ret = -1;
			goto out;
		}

		if (info.max_vchans < 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_configure(dev_id, &dev_config) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to configure DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		/* Check the max desc supported by the DMA device. */
		rte_dma_info_get(dev_id, &info);
		if (info.nb_vchans != 1) {
			RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
					dev_id);
			ret = -1;
			goto out;
		}

		qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);

		if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to set up DMA %d.\n", dev_id);
			ret = -1;
			goto out;
		}

		if (rte_dma_start(dev_id) != 0) {
			RTE_LOG(ERR, VHOST_CONFIG, "Fail to start DMA %u.\n", dev_id);
			ret = -1;
			goto out;
		}

		dmas_id[dma_count++] = dev_id;

done:
		(dma_info + socketid)->dmas[vring_id].dev_id = dev_id;
		(dma_info + socketid)->async_flag |= async_flag;
		i++;
	}
out:
	free(input);
	return ret;
}
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
/*
 * Initialises a given port using global settings and with the Rx buffers
 * coming from the mbuf_pool passed as a parameter.
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (dev_info.max_vmdq_pools == 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to get VMDq info.\n");
		return -1;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Configure the number of supported virtio devices based on VMDQ limits. */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	tx_rings = (uint16_t)rte_lcore_count();

	if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
		vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
	else
		vmdq_conf_default.rxmode.mtu = MAX_MTU;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);
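	/*
	 * Illustrative example: a NIC reporting max_rx_queues = 128,
	 * vmdq_queue_num = 128 and max_vmdq_pools = 64 yields
	 * queues_per_pool = 2, so pool N owns queues
	 * vmdq_queue_base + N * 2 and vmdq_queue_base + N * 2 + 1.
	 */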
	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}

	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf, mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));

	return 0;
}
/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	/* Reject paths that do not fit in PATH_MAX. */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL)
		return -1;

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* Parse the hexadecimal string. */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* Parse the unsigned integer string. */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}
/*
 * Display usage.
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"	--vm2vm [0|1|2]\n"
	"	--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"	--socket-file <path>\n"
	"	-p PORTMASK: Set mask for ports to be used by application\n"
	"	--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"	--rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
	"	--rx-retry-delay [0-N]: timeout (in microseconds) between retries on Rx. Takes effect only if retries on Rx are enabled\n"
	"	--rx-retry-num [0-N]: the number of retries on Rx. Takes effect only if retries on Rx are enabled\n"
	"	--mergeable [0|1]: disable(default)/enable Rx mergeable buffers\n"
	"	--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"	--socket-file: The path of the socket file.\n"
	"	--tx-csum [0|1]: disable/enable TX checksum offload.\n"
	"	--tso [0|1]: disable/enable TCP segmentation offload (TSO).\n"
	"	--client: register a vhost-user socket as client mode.\n"
	"	--dmas: register a DMA channel for a specific vhost device.\n"
	"	--total-num-mbufs [0-N]: set the number of mbufs to be allocated in mbuf pools, the default value is 147456.\n",
		prgname);
}
enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
#define OPT_NUM_MBUFS           "total-num-mbufs"
	OPT_NUM_MBUFS_NUM,
};
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{OPT_NUM_MBUFS, required_argument,
				NULL, OPT_NUM_MBUFS_NUM},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				RTE_ETH_VMDQ_ACCEPT_BROADCAST |
				RTE_ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;

		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;

		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;

		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;

		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;

		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;

		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			break;

		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;

		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case OPT_NUM_MBUFS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for total-num-mbufs [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}

			if (total_num_mbufs < ret)
				total_num_mbufs = ret;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}
/*
 * Update the global vars num_ports and ports[] according to the number of
 * system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}
static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}
/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, to a VMDq pool.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->src_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] =
			pkt_hdr->src_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
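/*
 * Note on the device life cycle: a vhost device starts in
 * DEVICE_MAC_LEARNING, moves to DEVICE_RX once link_vmdq() has learned its
 * MAC from the first transmitted packet, and is parked in DEVICE_SAFE_REMOVE
 * by the data core while it is being torn down.
 */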
/*
 * Removes MAC address and VLAN tag from VMDq. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings. */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers. */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}
static void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;
	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
	if (complete_count)
		free_pkts(p_cpl, complete_count);
}
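/*
 * With the async data path, rte_vhost_submit_enqueue_burst() only starts the
 * DMA copies; rte_vhost_poll_enqueue_completed() later returns the mbufs
 * whose copies have finished so the application can free them here.
 */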
static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}
static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	uint16_t ret;
	uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	ret = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev, VIRTIO_RXQ, m, nr_xmit);

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(m, nr_xmit);
}
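/*
 * Ownership note: in the sync path the enqueue copies packet data into the
 * vring, so the whole burst can be freed immediately. In the async path the
 * mbufs stay in flight until the DMA completes and are freed by
 * complete_async_pkts() instead.
 */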
static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (unlikely(vdev->remove == 1))
			continue;

		vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}
/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}
/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * return its VLAN tag and the length offset to restore.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip reduces the packet length by the size of the VLAN
	 * tag, so the length has to be restored by adding the tag size back.
	 */
	*offset = RTE_VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}
static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);

	m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= RTE_MBUF_F_TX_IPV4;
		m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= RTE_MBUF_F_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}
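/*
 * Background: when TSO is requested, the NIC expects the TCP checksum field
 * to be pre-filled with the pseudo-header checksum (computed above with
 * rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum()); the hardware then fills in
 * the full checksum for each segment it produces.
 */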
static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}
/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* Check if destination is a local VM. */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue. */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= RTE_MBUF_F_TX_VLAN;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & RTE_MBUF_F_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}
static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}
static uint16_t
async_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	uint16_t enqueue_count;
	uint16_t enqueue_fail = 0;
	uint16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_RXQ].dev_id;

	complete_async_pkts(dev);
	enqueue_count = rte_vhost_submit_enqueue_burst(dev->vid, queue_id,
				pkts, rx_count, dma_id, 0);

	enqueue_fail = rx_count - enqueue_count;
	if (enqueue_fail)
		free_pkts(&pkts[enqueue_count], enqueue_fail);

	return enqueue_count;
}

static uint16_t
sync_enqueue_pkts(struct vhost_dev *dev, uint16_t queue_id,
		struct rte_mbuf **pkts, uint32_t rx_count)
{
	return rte_vhost_enqueue_burst(dev->vid, queue_id, pkts, rx_count);
}
static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, here we wait and retry when there
	 * are not enough free slots in the queue to hold @rx_count
	 * packets, to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	enqueue_count = vdev_queue_ops[vdev->vid].enqueue_pkt_burst(vdev,
					VIRTIO_RXQ, pkts, rx_count);
	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!dma_bind[vid2socketid[vdev->vid]].dmas[VIRTIO_RXQ].async_enabled)
		free_pkts(pkts, rx_count);
}
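/*
 * Worked example with the defaults: up to BURST_RX_RETRIES (4) retries of
 * BURST_RX_WAIT_US (15) microseconds each, i.e. at most ~60 us of waiting
 * per burst before giving up and letting the enqueue drop the excess.
 */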
uint16_t async_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			    struct rte_mempool *mbuf_pool,
			    struct rte_mbuf **pkts, uint16_t count)
{
	int nr_inflight;
	uint16_t dequeue_count;
	int16_t dma_id = dma_bind[vid2socketid[dev->vid]].dmas[VIRTIO_TXQ].dev_id;

	dequeue_count = rte_vhost_async_try_dequeue_burst(dev->vid, queue_id,
			mbuf_pool, pkts, count, &nr_inflight, dma_id, 0);

	return dequeue_count;
}

uint16_t sync_dequeue_pkts(struct vhost_dev *dev, uint16_t queue_id,
			   struct rte_mempool *mbuf_pool,
			   struct rte_mbuf **pkts, uint16_t count)
{
	return rte_vhost_dequeue_burst(dev->vid, queue_id, mbuf_pool, pkts, count);
}
static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	count = vdev_queue_ops[vdev->vid].dequeue_pkt_burst(vdev,
				VIRTIO_TXQ, mbuf_pool, pkts, MAX_PKT_BURST);

	/* Setup VMDq for the first packet. */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}
/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices.
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}
static void
vhost_clear_queue_thread_unsafe(struct vhost_dev *vdev, uint16_t queue_id)
{
	uint16_t n_pkt = 0;
	int pkts_inflight;

	int16_t dma_id = dma_bind[vid2socketid[vdev->vid]].dmas[queue_id].dev_id;
	pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid, queue_id);

	struct rte_mbuf *m_cpl[pkts_inflight];

	while (pkts_inflight) {
		n_pkt = rte_vhost_clear_queue_thread_unsafe(vdev->vid, queue_id, m_cpl,
							pkts_inflight, dma_id, 0);
		free_pkts(m_cpl, n_pkt);
		pkts_inflight = rte_vhost_async_get_inflight_thread_unsafe(vdev->vid,
									queue_id);
	}
}
/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. The remove flag is set first and the data core
 * is waited on with rte_pause(), so that dev->remove=1 cannot be re-ordered
 * past the accesses it guards.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_RXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
		dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
	}

	if (dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled) {
		vhost_clear_queue_thread_unsafe(vdev, VIRTIO_TXQ);
		rte_vhost_async_channel_unregister(vid, VIRTIO_TXQ);
		dma_bind[vid].dmas[VIRTIO_TXQ].async_enabled = false;
	}

	rte_free(vdev);
}
static int
get_socketid_by_vid(int vid)
{
	int i;
	char ifname[PATH_MAX];
	rte_vhost_get_ifname(vid, ifname, sizeof(ifname));

	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		if (strcmp(file, ifname) == 0)
			return i;
	}

	return -1;
}
static int
init_vhost_queue_ops(int vid)
{
	if (builtin_net_driver) {
		vdev_queue_ops[vid].enqueue_pkt_burst = builtin_enqueue_pkts;
		vdev_queue_ops[vid].dequeue_pkt_burst = builtin_dequeue_pkts;
	} else {
		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled)
			vdev_queue_ops[vid].enqueue_pkt_burst = async_enqueue_pkts;
		else
			vdev_queue_ops[vid].enqueue_pkt_burst = sync_enqueue_pkts;

		if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled)
			vdev_queue_ops[vid].dequeue_pkt_burst = async_dequeue_pkts;
		else
			vdev_queue_ops[vid].dequeue_pkt_burst = sync_dequeue_pkts;
	}

	return 0;
}
static int
vhost_async_channel_register(int vid)
{
	int rx_ret = 0, tx_ret = 0;

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
		rx_ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
		if (rx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_RXQ].async_enabled = true;
	}

	if (dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].dev_id != INVALID_DMA_ID) {
		tx_ret = rte_vhost_async_channel_register(vid, VIRTIO_TXQ);
		if (tx_ret == 0)
			dma_bind[vid2socketid[vid]].dmas[VIRTIO_TXQ].async_enabled = true;
	}

	return rx_ret | tx_ret;
}
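/*
 * Note that the enqueue and dequeue directions register independently, so a
 * device may run DMA-accelerated in one direction and fall back to the sync
 * data path in the other; init_vhost_queue_ops() picks the matching
 * callbacks per direction.
 */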
/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	int ret;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	int socketid = get_socketid_by_vid(vid);
	if (socketid == -1)
		return -1;

	init_vid2socketid_array(vid, socketid);

	ret = vhost_async_channel_register(vid);

	if (init_vhost_queue_ops(vid) != 0)
		return -1;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* Reset the ready flag. */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return ret;
}
static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (dma_bind[vid2socketid[vid]].dmas[queue_id].async_enabled) {
		if (!enable)
			vhost_clear_queue_thread_unsafe(vdev, queue_id);
	}

	return 0;
}

/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct rte_vhost_device_ops virtio_net_device_ops =
{
	.new_device = new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};
/*
 * This is a thread that wakes up periodically to print stats if the user
 * has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total = vdev->stats.tx_total;
			tx = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:      %" PRIu64 "\n"
				"TX dropped:    %" PRIu64 "\n"
				"TX successful: %" PRIu64 "\n"
				"RX total:      %" PRIu64 "\n"
				"RX dropped:    %" PRIu64 "\n"
				"RX successful: %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}
static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}
/* When we receive a SIGINT, unregister the vhost driver. */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}
static void
reset_dma(void)
{
	int i;

	for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
		int j;

		for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
			dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
			dma_bind[i].dmas[j].async_enabled = false;
		}
	}

	for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
		dmas_id[i] = INVALID_DMA_ID;
}
/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* initialize dma structures */
	reset_dma();

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global vars num_ports and ports[] and get the value of
	 * valid_num_ports according to the number of system ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", total_num_mbufs,
					    MBUF_CACHE_SIZE, 0, MBUF_DATA_SIZE,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loop back to let the L2 switch do it. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);

	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	for (i = 0; i < dma_count; i++) {
		if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
			RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
			rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
		}
	}

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (dma_count && get_async_flag_by_socketid(i) != 0)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous)
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}