1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2017 Intel Corporation
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
23 #include <rte_vhost.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
33 #define MAX_QUEUES 128
36 /* the maximum number of external ports supported */
37 #define MAX_SUP_PORTS 1
39 #define MBUF_CACHE_SIZE 128
40 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
42 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
44 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
45 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
47 #define JUMBO_FRAME_MAX_SIZE 0x2600
48 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
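/* With JUMBO_FRAME_MAX_SIZE of 0x2600 (9728 bytes), MAX_MTU works out to
 * 9728 - (14 + 4) = 9710 bytes.
 */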
50 /* State of virtio device. */
51 #define DEVICE_MAC_LEARNING 0
53 #define DEVICE_SAFE_REMOVE 2
55 /* Configurable number of RX/TX ring descriptors */
56 #define RTE_TEST_RX_DESC_DEFAULT 1024
57 #define RTE_TEST_TX_DESC_DEFAULT 512
59 #define INVALID_PORT_ID 0xFF
60 #define INVALID_DMA_ID -1
62 #define DMA_RING_SIZE 4096
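/*
 * DMA devices are bound to vhost ports through the --dmas option, which is
 * parsed by open_dma() below. The expected format is a bracketed list of
 * "txd<vid>@<DMA device name>" entries, e.g. (illustrative addresses only):
 *   --dmas [txd0@0000:00:04.0,txd1@0000:00:04.1]
 */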
64 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
65 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
68 /* mask of enabled ports */
69 static uint32_t enabled_port_mask = 0;
71 /* Promiscuous mode */
72 static uint32_t promiscuous;
74 /* Number of devices/queues to support. */
75 static uint32_t num_queues = 0;
76 static uint32_t num_devices;
78 static struct rte_mempool *mbuf_pool;
81 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
88 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
91 static uint32_t enable_stats = 0;
92 /* Enable retries on RX. */
93 static uint32_t enable_retry = 1;
95 /* Disable TX checksum offload */
96 static uint32_t enable_tx_csum;
98 /* Disable TSO offload */
99 static uint32_t enable_tso;
101 static int client_mode;
103 static int builtin_net_driver;
105 /* Specify timeout (in microseconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
114 /* empty VMDq configuration structure. Filled in programmatically */
115 static struct rte_eth_conf vmdq_conf_default = {
117 .mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
120 * VLAN stripping is necessary for 1G NICs such as the I350;
121 * it fixes a bug where IPv4 forwarding in the guest cannot
122 * forward packets from one virtio device to another.
124 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
128 .mq_mode = RTE_ETH_MQ_TX_NONE,
129 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
130 RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
131 RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
132 RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
133 RTE_ETH_TX_OFFLOAD_TCP_TSO),
137 * should be overridden separately in code with
141 .nb_queue_pools = RTE_ETH_8_POOLS,
142 .enable_default_pool = 0,
145 .pool_map = {{0, 0},},
151 static unsigned lcore_ids[RTE_MAX_LCORE];
152 static uint16_t ports[RTE_MAX_ETHPORTS];
153 static unsigned num_ports = 0; /**< The number of ports specified in command line */
154 static uint16_t num_pf_queues, num_vmdq_queues;
155 static uint16_t vmdq_pool_base, vmdq_queue_base;
156 static uint16_t queues_per_pool;
158 const uint16_t vlan_tags[] = {
159 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
160 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
161 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
162 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
163 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
164 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
165 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
166 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
169 /* ethernet addresses of ports */
170 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
172 static struct vhost_dev_tailq_list vhost_dev_list =
173 TAILQ_HEAD_INITIALIZER(vhost_dev_list);
175 static struct lcore_info lcore_info[RTE_MAX_LCORE];
177 /* Used for queueing bursts of TX packets. */
181 struct rte_mbuf *m_table[MAX_PKT_BURST];
184 struct vhost_bufftable {
187 struct rte_mbuf *m_table[MAX_PKT_BURST];
190 /* TX queue for each data core. */
191 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
194 * Vhost TX buffer for each data core.
195 * Every data core maintains a TX buffer for every vhost device,
196 * which is used to batch packet enqueues for higher performance.
198 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
200 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
201 / US_PER_S * BURST_TX_DRAIN_US)
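/* For example, with a 2 GHz TSC this evaluates to 2e9 / 1e6 * 100 = 200000
 * cycles, i.e. the TX table is drained roughly every 100 us.
 */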
204 is_dma_configured(int16_t dev_id)
208 for (i = 0; i < dma_count; i++)
209 if (dmas_id[i] == dev_id)
215 open_dma(const char *value)
217 struct dma_for_vhost *dma_info = dma_bind;
218 char *input = strndup(value, strlen(value) + 1);
221 char *start, *end, *substr;
224 struct rte_dma_info info;
225 struct rte_dma_conf dev_config = { .nb_vchans = 1 };
226 struct rte_dma_vchan_conf qconf = {
227 .direction = RTE_DMA_DIR_MEM_TO_MEM,
228 .nb_desc = DMA_RING_SIZE
234 char *dma_arg[RTE_MAX_VHOST_DEVICE];
237 while (isblank(*addrs))
239 if (*addrs == '\0') {
244 /* Process the DMA devices within the brackets. */
246 substr = strtok(addrs, ";]");
252 args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
258 while (i < args_nr) {
259 char *arg_temp = dma_arg[i];
262 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
268 start = strstr(ptrs[0], "txd");
275 vid = strtol(start, &end, 0);
281 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
283 RTE_LOG(ERR, VHOST_CONFIG, "Failed to find DMA %s.\n", ptrs[1]);
288 /* DMA device is already configured, so skip */
289 if (is_dma_configured(dev_id))
292 if (rte_dma_info_get(dev_id, &info) != 0) {
293 RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
298 if (info.max_vchans < 1) {
299 RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
304 if (rte_dma_configure(dev_id, &dev_config) != 0) {
305 RTE_LOG(ERR, VHOST_CONFIG, "Failed to configure DMA %d.\n", dev_id);
310 /* Check the max desc supported by DMA device */
311 rte_dma_info_get(dev_id, &info);
312 if (info.nb_vchans != 1) {
313 RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
319 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
321 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
322 RTE_LOG(ERR, VHOST_CONFIG, "Failed to set up DMA %d.\n", dev_id);
327 if (rte_dma_start(dev_id) != 0) {
328 RTE_LOG(ERR, VHOST_CONFIG, "Failed to start DMA %u.\n", dev_id);
333 dmas_id[dma_count++] = dev_id;
336 (dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
345 * Builds up the correct configuration for VMDQ VLAN pool map
346 * according to the pool & queue limits.
349 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
351 struct rte_eth_vmdq_rx_conf conf;
352 struct rte_eth_vmdq_rx_conf *def_conf =
353 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
356 memset(&conf, 0, sizeof(conf));
357 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
358 conf.nb_pool_maps = num_devices;
359 conf.enable_loop_back = def_conf->enable_loop_back;
360 conf.rx_mode = def_conf->rx_mode;
362 for (i = 0; i < conf.nb_pool_maps; i++) {
363 conf.pool_map[i].vlan_id = vlan_tags[i];
364 conf.pool_map[i].pools = (1UL << i);
367 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
368 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
369 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
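/*
 * Note: the mapping built above assigns one VLAN per pool, i.e. with 8 pools
 * VLAN 1000 maps to pool 0, VLAN 1001 to pool 1, ..., VLAN 1007 to pool 7
 * (see vlan_tags[] above).
 */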
374 * Initialises a given port using global settings and with the rx buffers
375 * coming from the mbuf_pool passed as parameter
378 port_init(uint16_t port)
380 struct rte_eth_dev_info dev_info;
381 struct rte_eth_conf port_conf;
382 struct rte_eth_rxconf *rxconf;
383 struct rte_eth_txconf *txconf;
384 int16_t rx_rings, tx_rings;
385 uint16_t rx_ring_size, tx_ring_size;
389 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
390 retval = rte_eth_dev_info_get(port, &dev_info);
392 RTE_LOG(ERR, VHOST_PORT,
393 "Error during getting device (port %u) info: %s\n",
394 port, strerror(-retval));
399 rxconf = &dev_info.default_rxconf;
400 txconf = &dev_info.default_txconf;
401 rxconf->rx_drop_en = 1;
403 /* Configure the number of supported virtio devices based on VMDQ limits. */
404 num_devices = dev_info.max_vmdq_pools;
406 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
407 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
409 tx_rings = (uint16_t)rte_lcore_count();
412 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
413 vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
415 vmdq_conf_default.rxmode.mtu = MAX_MTU;
418 /* Get port configuration. */
419 retval = get_eth_conf(&port_conf, num_devices);
422 /* NIC queues are divided into pf queues and vmdq queues. */
423 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
424 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
425 num_vmdq_queues = num_devices * queues_per_pool;
426 num_queues = num_pf_queues + num_vmdq_queues;
427 vmdq_queue_base = dev_info.vmdq_queue_base;
428 vmdq_pool_base = dev_info.vmdq_pool_base;
429 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
430 num_pf_queues, num_devices, queues_per_pool);
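/* Purely illustrative numbers: a NIC reporting max_rx_queues = 136,
 * vmdq_queue_num = 128 and max_vmdq_pools = 64 yields 8 PF queues,
 * 2 queues per pool and 128 VMDq queues, i.e. 136 queues in total.
 */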
432 if (!rte_eth_dev_is_valid_port(port))
435 rx_rings = (uint16_t)dev_info.max_rx_queues;
436 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
437 port_conf.txmode.offloads |=
438 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
439 /* Configure ethernet device. */
440 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
442 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
443 port, strerror(-retval));
447 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
450 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
451 "for port %u: %s.\n", port, strerror(-retval));
454 if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
455 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
456 "for Rx queues on port %u.\n", port);
460 /* Set up the queues. */
461 rxconf->offloads = port_conf.rxmode.offloads;
462 for (q = 0; q < rx_rings; q ++) {
463 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
464 rte_eth_dev_socket_id(port),
468 RTE_LOG(ERR, VHOST_PORT,
469 "Failed to setup rx queue %u of port %u: %s.\n",
470 q, port, strerror(-retval));
474 txconf->offloads = port_conf.txmode.offloads;
475 for (q = 0; q < tx_rings; q ++) {
476 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
477 rte_eth_dev_socket_id(port),
480 RTE_LOG(ERR, VHOST_PORT,
481 "Failed to setup tx queue %u of port %u: %s.\n",
482 q, port, strerror(-retval));
487 /* Start the device. */
488 retval = rte_eth_dev_start(port);
490 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
491 port, strerror(-retval));
496 retval = rte_eth_promiscuous_enable(port);
498 RTE_LOG(ERR, VHOST_PORT,
499 "Failed to enable promiscuous mode on port %u: %s\n",
500 port, rte_strerror(-retval));
505 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
507 RTE_LOG(ERR, VHOST_PORT,
508 "Failed to get MAC address on port %u: %s\n",
509 port, rte_strerror(-retval));
513 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
514 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
515 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
516 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
522 * Set socket file path.
525 us_vhost_parse_socket_path(const char *q_arg)
529 /* reject socket paths that don't fit in PATH_MAX */
530 if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
534 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
535 if (socket_files == NULL) {
540 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
547 * Parse the portmask provided at run time.
550 parse_portmask(const char *portmask)
557 /* parse hexadecimal string */
558 pm = strtoul(portmask, &end, 16);
559 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
567 * Parse num options at run time.
570 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
577 /* parse unsigned int string */
578 num = strtoul(q_arg, &end, 10);
579 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
582 if (num > max_valid_value)
593 us_vhost_usage(const char *prgname)
595 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
597 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
598 " --socket-file <path>\n"
600 " -p PORTMASK: Set mask for ports to be used by application\n"
601 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
602 " --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
603 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
604 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
605 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
606 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
607 " --socket-file: The path of the socket file.\n"
608 " --tx-csum [0|1] disable/enable TX checksum offload.\n"
609 " --tso [0|1] disable/enable TCP segment offload.\n"
610 " --client register a vhost-user socket as client mode.\n"
611 " --dmas register dma channel for specific vhost device.\n",
616 #define OPT_VM2VM "vm2vm"
618 #define OPT_RX_RETRY "rx-retry"
620 #define OPT_RX_RETRY_DELAY "rx-retry-delay"
621 OPT_RX_RETRY_DELAY_NUM,
622 #define OPT_RX_RETRY_NUMB "rx-retry-num"
623 OPT_RX_RETRY_NUMB_NUM,
624 #define OPT_MERGEABLE "mergeable"
626 #define OPT_STATS "stats"
628 #define OPT_SOCKET_FILE "socket-file"
630 #define OPT_TX_CSUM "tx-csum"
632 #define OPT_TSO "tso"
634 #define OPT_CLIENT "client"
636 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
637 OPT_BUILTIN_NET_DRIVER_NUM,
638 #define OPT_DMAS "dmas"
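/*
 * Illustrative invocation (socket path and DMA address are examples only):
 *   dpdk-vhost -l 1-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --dmas [txd0@0000:00:04.0] --client --stats 1
 */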
643 * Parse the arguments given in the command line of the application.
646 us_vhost_parse_args(int argc, char **argv)
651 const char *prgname = argv[0];
652 static struct option long_option[] = {
653 {OPT_VM2VM, required_argument,
654 NULL, OPT_VM2VM_NUM},
655 {OPT_RX_RETRY, required_argument,
656 NULL, OPT_RX_RETRY_NUM},
657 {OPT_RX_RETRY_DELAY, required_argument,
658 NULL, OPT_RX_RETRY_DELAY_NUM},
659 {OPT_RX_RETRY_NUMB, required_argument,
660 NULL, OPT_RX_RETRY_NUMB_NUM},
661 {OPT_MERGEABLE, required_argument,
662 NULL, OPT_MERGEABLE_NUM},
663 {OPT_STATS, required_argument,
664 NULL, OPT_STATS_NUM},
665 {OPT_SOCKET_FILE, required_argument,
666 NULL, OPT_SOCKET_FILE_NUM},
667 {OPT_TX_CSUM, required_argument,
668 NULL, OPT_TX_CSUM_NUM},
669 {OPT_TSO, required_argument,
671 {OPT_CLIENT, no_argument,
672 NULL, OPT_CLIENT_NUM},
673 {OPT_BUILTIN_NET_DRIVER, no_argument,
674 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
675 {OPT_DMAS, required_argument,
680 /* Parse command line */
681 while ((opt = getopt_long(argc, argv, "p:P",
682 long_option, &option_index)) != EOF) {
686 enabled_port_mask = parse_portmask(optarg);
687 if (enabled_port_mask == 0) {
688 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
689 us_vhost_usage(prgname);
696 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
697 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
698 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
702 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
704 RTE_LOG(INFO, VHOST_CONFIG,
705 "Invalid argument for "
707 us_vhost_usage(prgname);
710 vm2vm_mode = (vm2vm_type)ret;
713 case OPT_RX_RETRY_NUM:
714 ret = parse_num_opt(optarg, 1);
716 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
717 us_vhost_usage(prgname);
723 case OPT_TX_CSUM_NUM:
724 ret = parse_num_opt(optarg, 1);
726 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
727 us_vhost_usage(prgname);
730 enable_tx_csum = ret;
734 ret = parse_num_opt(optarg, 1);
736 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
737 us_vhost_usage(prgname);
743 case OPT_RX_RETRY_DELAY_NUM:
744 ret = parse_num_opt(optarg, INT32_MAX);
746 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
747 us_vhost_usage(prgname);
750 burst_rx_delay_time = ret;
753 case OPT_RX_RETRY_NUMB_NUM:
754 ret = parse_num_opt(optarg, INT32_MAX);
756 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
757 us_vhost_usage(prgname);
760 burst_rx_retry_num = ret;
763 case OPT_MERGEABLE_NUM:
764 ret = parse_num_opt(optarg, 1);
766 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
767 us_vhost_usage(prgname);
774 ret = parse_num_opt(optarg, INT32_MAX);
776 RTE_LOG(INFO, VHOST_CONFIG,
777 "Invalid argument for stats [0..N]\n");
778 us_vhost_usage(prgname);
784 /* Set socket file path. */
785 case OPT_SOCKET_FILE_NUM:
786 if (us_vhost_parse_socket_path(optarg) == -1) {
787 RTE_LOG(INFO, VHOST_CONFIG,
788 "Invalid argument for socket name (Max %d characters)\n",
790 us_vhost_usage(prgname);
796 if (open_dma(optarg) == -1) {
797 RTE_LOG(INFO, VHOST_CONFIG,
799 us_vhost_usage(prgname);
808 case OPT_BUILTIN_NET_DRIVER_NUM:
809 builtin_net_driver = 1;
812 /* Invalid option - print options. */
814 us_vhost_usage(prgname);
819 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
820 if (enabled_port_mask & (1 << i))
821 ports[num_ports++] = i;
824 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
825 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
826 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
834 * Update the global variable num_ports and the array ports according to the
835 * number of system ports, and return the number of valid ports.
837 static unsigned check_ports_num(unsigned nb_ports)
839 unsigned valid_num_ports = num_ports;
842 if (num_ports > nb_ports) {
843 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
844 num_ports, nb_ports);
845 num_ports = nb_ports;
848 for (portid = 0; portid < num_ports; portid ++) {
849 if (!rte_eth_dev_is_valid_port(ports[portid])) {
850 RTE_LOG(INFO, VHOST_PORT,
851 "\nSpecified port ID(%u) is not valid\n",
853 ports[portid] = INVALID_PORT_ID;
857 return valid_num_ports;
860 static __rte_always_inline struct vhost_dev *
861 find_vhost_dev(struct rte_ether_addr *mac)
863 struct vhost_dev *vdev;
865 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
866 if (vdev->ready == DEVICE_RX &&
867 rte_is_same_ether_addr(mac, &vdev->mac_address))
875 * This function learns the MAC address of the device and registers it along with a
876 * VLAN tag in the VMDQ.
879 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
881 struct rte_ether_hdr *pkt_hdr;
884 /* Learn MAC address of guest device from packet */
885 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
887 if (find_vhost_dev(&pkt_hdr->src_addr)) {
888 RTE_LOG(ERR, VHOST_DATA,
889 "(%d) device is using a registered MAC!\n",
894 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
895 vdev->mac_address.addr_bytes[i] =
896 pkt_hdr->src_addr.addr_bytes[i];
898 /* vlan_tag currently uses the device_id. */
899 vdev->vlan_tag = vlan_tags[vdev->vid];
901 /* Print out VMDQ registration info. */
902 RTE_LOG(INFO, VHOST_DATA,
903 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
904 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
907 /* Register the MAC address. */
908 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
909 (uint32_t)vdev->vid + vmdq_pool_base);
911 RTE_LOG(ERR, VHOST_DATA,
912 "(%d) failed to add device MAC address to VMDQ\n",
915 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
917 /* Set device as ready for RX. */
918 vdev->ready = DEVICE_RX;
924 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
925 * queue before disabling RX on the device.
928 unlink_vmdq(struct vhost_dev *vdev)
932 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
934 if (vdev->ready == DEVICE_RX) {
935 /* Clear MAC and VLAN settings. */
936 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
937 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
938 vdev->mac_address.addr_bytes[i] = 0;
942 /* Clear out the receive buffers. */
943 rx_count = rte_eth_rx_burst(ports[0],
944 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
947 for (i = 0; i < rx_count; i++)
948 rte_pktmbuf_free(pkts_burst[i]);
950 rx_count = rte_eth_rx_burst(ports[0],
951 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
954 vdev->ready = DEVICE_MAC_LEARNING;
959 free_pkts(struct rte_mbuf **pkts, uint16_t n)
962 rte_pktmbuf_free(pkts[n]);
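/*
 * Poll for completed async (DMA-accelerated) enqueues on the Rx queue. With
 * async enqueue the application keeps ownership of the mbufs until completion
 * is reported, so the completed packets are freed here and the in-flight
 * counter is decremented accordingly.
 */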
965 static __rte_always_inline void
966 complete_async_pkts(struct vhost_dev *vdev)
968 struct rte_mbuf *p_cpl[MAX_PKT_BURST];
969 uint16_t complete_count;
970 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
972 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
973 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
974 if (complete_count) {
975 free_pkts(p_cpl, complete_count);
976 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
981 static __rte_always_inline void
982 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
987 if (builtin_net_driver) {
988 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
990 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
994 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
996 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
998 src_vdev->stats.tx_total++;
999 src_vdev->stats.tx += ret;
1003 static __rte_always_inline void
1004 drain_vhost(struct vhost_dev *vdev)
1007 uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1008 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1009 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1011 if (builtin_net_driver) {
1012 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
1013 } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1014 uint16_t enqueue_fail = 0;
1015 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1017 complete_async_pkts(vdev);
1018 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
1019 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
1021 enqueue_fail = nr_xmit - ret;
1023 free_pkts(&m[ret], nr_xmit - ret);
1025 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1030 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1032 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1036 if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1037 free_pkts(m, nr_xmit);
1040 static __rte_always_inline void
1041 drain_vhost_table(void)
1043 uint16_t lcore_id = rte_lcore_id();
1044 struct vhost_bufftable *vhost_txq;
1045 struct vhost_dev *vdev;
1048 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1049 if (unlikely(vdev->remove == 1))
1052 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1054 cur_tsc = rte_rdtsc();
1055 if (unlikely(cur_tsc - vhost_txq->pre_tsc
1056 > MBUF_TABLE_DRAIN_TSC)) {
1057 RTE_LOG_DP(DEBUG, VHOST_DATA,
1058 "Vhost TX queue drained after timeout with burst size %u\n",
1062 vhost_txq->pre_tsc = cur_tsc;
1068 * Check if the packet destination MAC address is for a local device. If so, put
1069 * the packet on that device's Rx queue. If not, return.
1071 static __rte_always_inline int
1072 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1074 struct rte_ether_hdr *pkt_hdr;
1075 struct vhost_dev *dst_vdev;
1076 struct vhost_bufftable *vhost_txq;
1077 uint16_t lcore_id = rte_lcore_id();
1078 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1080 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1084 if (vdev->vid == dst_vdev->vid) {
1085 RTE_LOG_DP(DEBUG, VHOST_DATA,
1086 "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1091 RTE_LOG_DP(DEBUG, VHOST_DATA,
1092 "(%d) TX: MAC address is local\n", dst_vdev->vid);
1094 if (unlikely(dst_vdev->remove)) {
1095 RTE_LOG_DP(DEBUG, VHOST_DATA,
1096 "(%d) device is marked for removal\n", dst_vdev->vid);
1100 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1101 vhost_txq->m_table[vhost_txq->len++] = m;
1104 vdev->stats.tx_total++;
1108 if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1109 drain_vhost(dst_vdev);
1111 vhost_txq->pre_tsc = rte_rdtsc();
1117 * Check if the destination MAC of a packet belongs to a local VM;
1118 * if so, get its VLAN tag and the length offset.
1120 static __rte_always_inline int
1121 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1122 uint32_t *offset, uint16_t *vlan_tag)
1124 struct vhost_dev *dst_vdev;
1125 struct rte_ether_hdr *pkt_hdr =
1126 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1128 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1132 if (vdev->vid == dst_vdev->vid) {
1133 RTE_LOG_DP(DEBUG, VHOST_DATA,
1134 "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1140 * HW VLAN stripping reduces the packet length by the
1141 * length of the VLAN tag, so the packet length needs to be
1142 * restored by adding it back.
1144 *offset = RTE_VLAN_HLEN;
1145 *vlan_tag = vlan_tags[vdev->vid];
1147 RTE_LOG_DP(DEBUG, VHOST_DATA,
1148 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1149 vdev->vid, dst_vdev->vid, *vlan_tag);
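/*
 * Prepare a TCP packet received from the guest (flagged with RTE_MBUF_F_RX_LRO)
 * for TSO on the physical port: fill in the l2/l3/l4 header lengths, request
 * IP/TCP offloads via ol_flags and seed the TCP checksum with the pseudo-header
 * checksum, as the NIC expects for TSO.
 */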
1154 static void virtio_tx_offload(struct rte_mbuf *m)
1156 struct rte_net_hdr_lens hdr_lens;
1157 struct rte_ipv4_hdr *ipv4_hdr;
1158 struct rte_tcp_hdr *tcp_hdr;
1162 ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1163 m->l2_len = hdr_lens.l2_len;
1164 m->l3_len = hdr_lens.l3_len;
1165 m->l4_len = hdr_lens.l4_len;
1167 l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1168 tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1169 m->l2_len + m->l3_len);
1171 m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1172 if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1173 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1174 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1176 ipv4_hdr->hdr_checksum = 0;
1177 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1178 } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1179 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1180 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
1184 static __rte_always_inline void
1185 do_drain_mbuf_table(struct mbuf_table *tx_q)
1189 count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1190 tx_q->m_table, tx_q->len);
1191 if (unlikely(count < tx_q->len))
1192 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1198 * This function routes the TX packet to the correct interface. This
1199 * may be a local device or the physical port.
1201 static __rte_always_inline void
1202 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1204 struct mbuf_table *tx_q;
1205 unsigned offset = 0;
1206 const uint16_t lcore_id = rte_lcore_id();
1207 struct rte_ether_hdr *nh;
1210 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1211 if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1212 struct vhost_dev *vdev2;
1214 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1216 sync_virtio_xmit(vdev2, vdev, m);
1221 /* Check if destination is a local VM. */
1222 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1225 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1226 if (unlikely(find_local_dest(vdev, m, &offset,
1228 rte_pktmbuf_free(m);
1233 RTE_LOG_DP(DEBUG, VHOST_DATA,
1234 "(%d) TX: MAC address is external\n", vdev->vid);
1238 /* Add packet to the port Tx queue. */
1239 tx_q = &lcore_tx_queue[lcore_id];
1241 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1242 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1243 /* Guest has inserted the vlan tag. */
1244 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1245 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1246 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1247 (vh->vlan_tci != vlan_tag_be))
1248 vh->vlan_tci = vlan_tag_be;
1250 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1253 * Find the right segment to adjust data_len when the offset is
1254 * bigger than the tailroom size.
1256 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1257 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1258 m->data_len += offset;
1260 struct rte_mbuf *seg = m;
1262 while ((seg->next != NULL) &&
1263 (offset > rte_pktmbuf_tailroom(seg)))
1266 seg->data_len += offset;
1268 m->pkt_len += offset;
1271 m->vlan_tci = vlan_tag;
1274 if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1275 virtio_tx_offload(m);
1277 tx_q->m_table[tx_q->len++] = m;
1279 vdev->stats.tx_total++;
1283 if (unlikely(tx_q->len == MAX_PKT_BURST))
1284 do_drain_mbuf_table(tx_q);
1288 static __rte_always_inline void
1289 drain_mbuf_table(struct mbuf_table *tx_q)
1291 static uint64_t prev_tsc;
1297 cur_tsc = rte_rdtsc();
1298 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1301 RTE_LOG_DP(DEBUG, VHOST_DATA,
1302 "TX queue drained after timeout with burst size %u\n",
1304 do_drain_mbuf_table(tx_q);
1308 static __rte_always_inline void
1309 drain_eth_rx(struct vhost_dev *vdev)
1311 uint16_t rx_count, enqueue_count;
1312 struct rte_mbuf *pkts[MAX_PKT_BURST];
1314 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1315 pkts, MAX_PKT_BURST);
1321 * When "enable_retry" is set, wait and retry when there are
1322 * not enough free slots in the queue to hold @rx_count packets,
1323 * to reduce packet loss.
1326 unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1330 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1331 rte_delay_us(burst_rx_delay_time);
1332 if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1338 if (builtin_net_driver) {
1339 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1341 } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1342 uint16_t enqueue_fail = 0;
1343 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1345 complete_async_pkts(vdev);
1346 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1347 VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
1348 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1350 enqueue_fail = rx_count - enqueue_count;
1352 free_pkts(&pkts[enqueue_count], enqueue_fail);
1355 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1360 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1362 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1366 if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1367 free_pkts(pkts, rx_count);
1370 static __rte_always_inline void
1371 drain_virtio_tx(struct vhost_dev *vdev)
1373 struct rte_mbuf *pkts[MAX_PKT_BURST];
1377 if (builtin_net_driver) {
1378 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1379 pkts, MAX_PKT_BURST);
1381 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1382 mbuf_pool, pkts, MAX_PKT_BURST);
1385 /* setup VMDq for the first packet */
1386 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1387 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1388 free_pkts(pkts, count);
1391 for (i = 0; i < count; ++i)
1392 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1396 * Main function of vhost-switch. It basically does:
1398 * for each vhost device {
1401 * Which drains the host eth Rx queue linked to the vhost device,
1402 * and delivers all of the packets to the guest virtio Rx ring associated with
1403 * this vhost device.
1405 * - drain_virtio_tx()
1407 * Which drains the guest virtio Tx queue and delivers all of the packets
1408 * to the target, which could be another vhost device, or the
1409 * physical eth dev. The routing is done in the function "virtio_tx_route".
1413 switch_worker(void *arg __rte_unused)
1416 unsigned lcore_id = rte_lcore_id();
1417 struct vhost_dev *vdev;
1418 struct mbuf_table *tx_q;
1420 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1422 tx_q = &lcore_tx_queue[lcore_id];
1423 for (i = 0; i < rte_lcore_count(); i++) {
1424 if (lcore_ids[i] == lcore_id) {
1431 drain_mbuf_table(tx_q);
1432 drain_vhost_table();
1434 * Inform the configuration core that we have exited the
1435 * linked list and that no devices are in use if requested.
1437 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1438 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1441 * Process vhost devices
1443 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1445 if (unlikely(vdev->remove)) {
1447 vdev->ready = DEVICE_SAFE_REMOVE;
1451 if (likely(vdev->ready == DEVICE_RX))
1454 if (likely(!vdev->remove))
1455 drain_virtio_tx(vdev);
1463 * Remove a device from the specific data core linked list and from the
1464 * main linked list. Synchronization occurs through the use of the
1465 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1466 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1469 destroy_device(int vid)
1471 struct vhost_dev *vdev = NULL;
1475 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1476 if (vdev->vid == vid)
1481 /* Set the remove flag. */
1483 while (vdev->ready != DEVICE_SAFE_REMOVE) {
1487 for (i = 0; i < RTE_MAX_LCORE; i++)
1488 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1490 if (builtin_net_driver)
1491 vs_vhost_net_remove(vdev);
1493 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1495 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1498 /* Set the dev_removal_flag on each lcore. */
1499 RTE_LCORE_FOREACH_WORKER(lcore)
1500 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1503 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1504 * we can be sure that they can no longer access the device removed
1505 * from the linked lists and that the devices are no longer in use.
1507 RTE_LCORE_FOREACH_WORKER(lcore) {
1508 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1512 lcore_info[vdev->coreid].device_num--;
1514 RTE_LOG(INFO, VHOST_DATA,
1515 "(%d) device has been removed from data core\n",
1518 if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1520 int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1521 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
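/*
 * Drain all in-flight DMA copies before unregistering the async channel; the
 * data cores have already acknowledged the removal, so this queue is no longer
 * touched by the datapath.
 */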
1523 while (vdev->pkts_inflight) {
1524 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1525 m_cpl, vdev->pkts_inflight, dma_id, 0);
1526 free_pkts(m_cpl, n_pkt);
1527 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1530 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1531 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1538 * A new device is added to a data core. First the device is added to the main linked list
1539 * and then allocated to a specific data core.
1544 int lcore, core_add = 0;
1546 uint32_t device_num_min = num_devices;
1547 struct vhost_dev *vdev;
1548 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1550 RTE_LOG(INFO, VHOST_DATA,
1551 "(%d) couldn't allocate memory for vhost dev\n",
1557 for (i = 0; i < RTE_MAX_LCORE; i++) {
1558 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1559 = rte_zmalloc("vhost bufftable",
1560 sizeof(struct vhost_bufftable),
1561 RTE_CACHE_LINE_SIZE);
1563 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1564 RTE_LOG(INFO, VHOST_DATA,
1565 "(%d) couldn't allocate memory for vhost TX\n", vid);
1570 if (builtin_net_driver)
1571 vs_vhost_net_setup(vdev);
1573 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1574 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1576 /* Reset ready flag. */
1577 vdev->ready = DEVICE_MAC_LEARNING;
1580 /* Find a suitable lcore to add the device. */
1581 RTE_LCORE_FOREACH_WORKER(lcore) {
1582 if (lcore_info[lcore].device_num < device_num_min) {
1583 device_num_min = lcore_info[lcore].device_num;
1587 vdev->coreid = core_add;
1589 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1591 lcore_info[vdev->coreid].device_num++;
1593 /* Disable notifications. */
1594 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1595 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1597 RTE_LOG(INFO, VHOST_DATA,
1598 "(%d) device has been added to data core %d\n",
1601 if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1604 ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1606 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
1614 vring_state_changed(int vid, uint16_t queue_id, int enable)
1616 struct vhost_dev *vdev = NULL;
1618 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1619 if (vdev->vid == vid)
1625 if (queue_id != VIRTIO_RXQ)
1628 if (dma_bind[vid].dmas[queue_id].async_enabled) {
1631 int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1632 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1634 while (vdev->pkts_inflight) {
1635 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1636 m_cpl, vdev->pkts_inflight, dma_id, 0);
1637 free_pkts(m_cpl, n_pkt);
1638 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1647 * These callbacks allow devices to be added to the data core when configuration
1648 * has fully completed.
1650 static const struct rte_vhost_device_ops virtio_net_device_ops =
1652 .new_device = new_device,
1653 .destroy_device = destroy_device,
1654 .vring_state_changed = vring_state_changed,
1658 * This is a thread that will wake up periodically to print stats if the user has
1662 print_stats(__rte_unused void *arg)
1664 struct vhost_dev *vdev;
1665 uint64_t tx_dropped, rx_dropped;
1666 uint64_t tx, tx_total, rx, rx_total;
1667 const char clr[] = { 27, '[', '2', 'J', '\0' };
1668 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1671 sleep(enable_stats);
1673 /* Clear screen and move to top left */
1674 printf("%s%s\n", clr, top_left);
1675 printf("Device statistics =================================\n");
1677 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1678 tx_total = vdev->stats.tx_total;
1679 tx = vdev->stats.tx;
1680 tx_dropped = tx_total - tx;
1682 rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1684 rx = __atomic_load_n(&vdev->stats.rx_atomic,
1686 rx_dropped = rx_total - rx;
1688 printf("Statistics for device %d\n"
1689 "-----------------------\n"
1690 "TX total: %" PRIu64 "\n"
1691 "TX dropped: %" PRIu64 "\n"
1692 "TX successful: %" PRIu64 "\n"
1693 "RX total: %" PRIu64 "\n"
1694 "RX dropped: %" PRIu64 "\n"
1695 "RX successful: %" PRIu64 "\n",
1697 tx_total, tx_dropped, tx,
1698 rx_total, rx_dropped, rx);
1701 printf("===================================================\n");
1710 unregister_drivers(int socket_num)
1714 for (i = 0; i < socket_num; i++) {
1715 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1717 RTE_LOG(ERR, VHOST_CONFIG,
1718 "Fail to unregister vhost driver for %s.\n",
1719 socket_files + i * PATH_MAX);
1723 /* When we receive an INT signal, unregister the vhost driver. */
1725 sigint_handler(__rte_unused int signum)
1727 /* Unregister vhost driver. */
1728 unregister_drivers(nb_sockets);
1734 * While creating an mbuf pool, one key thing is to figure out how
1735 * many mbuf entries are enough for our use. FYI, here are some
1738 * - Each Rx queue reserves @nr_rx_desc mbufs at queue setup stage.
1740 * - For each switch core (a CPU core that does the packet switching), we
1741 * also need to make some reservation for receiving the packets from the
1742 * virtio Tx queue. How many are enough depends on the usage. It's normally
1743 * a simple calculation like the following:
1745 * MAX_PKT_BURST * max packet size / mbuf size
1747 * So, we definitely need to allocate more mbufs when TSO is enabled.
1749 * - Similarly, for each switching core, we should reserve @nr_rx_desc
1750 * mbufs for receiving the packets from the physical NIC device.
1752 * - We also need to make sure, for each switch core, we have allocated
1753 * enough mbufs to fill up the mbuf cache.
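 *
 * For example (rough numbers, assuming MAX_PKT_BURST is 32 and the default
 * 2 KB + headroom mbuf data size): with an MTU of 1500 the per-core burst
 * reservation is (1500 + 2176) * 32 / (2176 - 128) = 57 mbufs, which is then
 * increased by @nr_rx_desc and bounded below by the cache size.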
1756 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1757 uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1760 uint32_t nr_mbufs_per_core;
1761 uint32_t mtu = 1500;
1768 nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
1769 (mbuf_size - RTE_PKTMBUF_HEADROOM);
1770 nr_mbufs_per_core += nr_rx_desc;
1771 nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1773 nr_mbufs = nr_queues * nr_rx_desc;
1774 nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1775 nr_mbufs *= nr_port;
1777 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1778 nr_mbuf_cache, 0, mbuf_size,
1780 if (mbuf_pool == NULL)
1781 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1789 for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1792 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1793 dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1794 dma_bind[i].dmas[j].async_enabled = false;
1798 for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1799 dmas_id[i] = INVALID_DMA_ID;
1803 * Main function, does initialisation and calls the per-lcore functions.
1806 main(int argc, char *argv[])
1808 unsigned lcore_id, core_id = 0;
1809 unsigned nb_ports, valid_num_ports;
1812 static pthread_t tid;
1813 uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1815 signal(SIGINT, sigint_handler);
1818 ret = rte_eal_init(argc, argv);
1820 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1824 /* initialize dma structures */
1827 /* parse app arguments */
1828 ret = us_vhost_parse_args(argc, argv);
1830 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1832 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1833 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1835 if (rte_lcore_is_enabled(lcore_id))
1836 lcore_ids[core_id++] = lcore_id;
1839 if (rte_lcore_count() > RTE_MAX_LCORE)
1840 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1842 /* Get the number of physical ports. */
1843 nb_ports = rte_eth_dev_count_avail();
1846 * Update the global variable num_ports and the global array ports,
1847 * and get the number of valid ports according to the number of system ports.
1849 valid_num_ports = check_ports_num(nb_ports);
1851 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1852 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1853 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1858 * FIXME: here we are trying to allocate mbufs big enough for
1859 * @MAX_QUEUES, but the truth is we're never going to use that
1860 * many queues here. We probably should only do allocation for
1861 * those queues we are going to use.
1863 create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1864 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1866 if (vm2vm_mode == VM2VM_HARDWARE) {
1867 /* Enable VT loop back to let L2 switch to do it. */
1868 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1869 RTE_LOG(DEBUG, VHOST_CONFIG,
1870 "Enable loop back for L2 switch in vmdq.\n");
1873 /* initialize all ports */
1874 RTE_ETH_FOREACH_DEV(portid) {
1875 /* skip ports that are not enabled */
1876 if ((enabled_port_mask & (1 << portid)) == 0) {
1877 RTE_LOG(INFO, VHOST_PORT,
1878 "Skipping disabled port %d\n", portid);
1881 if (port_init(portid) != 0)
1882 rte_exit(EXIT_FAILURE,
1883 "Cannot initialize network ports\n");
1886 /* Enable stats if the user option is set. */
1888 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1891 rte_exit(EXIT_FAILURE,
1892 "Cannot create print-stats thread\n");
1895 /* Launch all data cores. */
1896 RTE_LCORE_FOREACH_WORKER(lcore_id)
1897 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1900 flags |= RTE_VHOST_USER_CLIENT;
1902 for (i = 0; i < dma_count; i++) {
1903 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1904 RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1905 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1909 /* Register vhost user driver to handle vhost messages. */
1910 for (i = 0; i < nb_sockets; i++) {
1911 char *file = socket_files + i * PATH_MAX;
1914 flags |= RTE_VHOST_USER_ASYNC_COPY;
1916 ret = rte_vhost_driver_register(file, flags);
1918 unregister_drivers(i);
1919 rte_exit(EXIT_FAILURE,
1920 "vhost driver register failure.\n");
1923 if (builtin_net_driver)
1924 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1926 if (mergeable == 0) {
1927 rte_vhost_driver_disable_features(file,
1928 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1931 if (enable_tx_csum == 0) {
1932 rte_vhost_driver_disable_features(file,
1933 1ULL << VIRTIO_NET_F_CSUM);
1936 if (enable_tso == 0) {
1937 rte_vhost_driver_disable_features(file,
1938 1ULL << VIRTIO_NET_F_HOST_TSO4);
1939 rte_vhost_driver_disable_features(file,
1940 1ULL << VIRTIO_NET_F_HOST_TSO6);
1941 rte_vhost_driver_disable_features(file,
1942 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1943 rte_vhost_driver_disable_features(file,
1944 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1948 rte_vhost_driver_enable_features(file,
1949 1ULL << VIRTIO_NET_F_CTRL_RX);
1952 ret = rte_vhost_driver_callback_register(file,
1953 &virtio_net_device_ops);
1955 rte_exit(EXIT_FAILURE,
1956 "failed to register vhost driver callbacks.\n");
1959 if (rte_vhost_driver_start(file) < 0) {
1960 rte_exit(EXIT_FAILURE,
1961 "failed to start vhost driver.\n");
1965 RTE_LCORE_FOREACH_WORKER(lcore_id)
1966 rte_eal_wait_lcore(lcore_id);
1968 /* clean up the EAL */