1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2017 Intel Corporation
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
23 #include <rte_vhost.h>
26 #include <rte_pause.h>
27 #include <rte_dmadev.h>
28 #include <rte_vhost_async.h>
33 #define MAX_QUEUES 128
36 /* the maximum number of external ports supported */
37 #define MAX_SUP_PORTS 1
39 #define MBUF_CACHE_SIZE 128
40 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
42 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
44 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
45 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
47 #define JUMBO_FRAME_MAX_SIZE 0x2600
48 #define MAX_MTU (JUMBO_FRAME_MAX_SIZE - (RTE_ETHER_HDR_LEN + RTE_ETHER_CRC_LEN))
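/*
 * For illustration: JUMBO_FRAME_MAX_SIZE is 0x2600 = 9728 bytes, and with
 * RTE_ETHER_HDR_LEN = 14 and RTE_ETHER_CRC_LEN = 4 this yields
 * MAX_MTU = 9728 - (14 + 4) = 9710 bytes.
 */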
50 /* State of virtio device. */
51 #define DEVICE_MAC_LEARNING 0
53 #define DEVICE_SAFE_REMOVE 2
55 /* Configurable number of RX/TX ring descriptors */
56 #define RTE_TEST_RX_DESC_DEFAULT 1024
57 #define RTE_TEST_TX_DESC_DEFAULT 512
59 #define INVALID_PORT_ID 0xFF
60 #define INVALID_DMA_ID -1
62 #define DMA_RING_SIZE 4096
64 struct dma_for_vhost dma_bind[RTE_MAX_VHOST_DEVICE];
65 int16_t dmas_id[RTE_DMADEV_DEFAULT_MAX];
68 /* mask of enabled ports */
69 static uint32_t enabled_port_mask = 0;
71 /* Promiscuous mode */
72 static uint32_t promiscuous;
74 /* number of devices/queues to support */
75 static uint32_t num_queues = 0;
76 static uint32_t num_devices;
78 static struct rte_mempool *mbuf_pool;
81 /* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
88 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
91 static uint32_t enable_stats = 0;
92 /* Enable retries on RX. */
93 static uint32_t enable_retry = 1;
95 /* Disable TX checksum offload */
96 static uint32_t enable_tx_csum;
98 /* Disable TSO offload */
99 static uint32_t enable_tso;
101 static int client_mode;
103 static int builtin_net_driver;
105 /* Specify the timeout (in microseconds) between retries on RX. */
106 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
107 /* Specify the number of retries on RX. */
108 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
110 /* Socket file paths. Can be set by user */
111 static char *socket_files;
112 static int nb_sockets;
114 /* Empty VMDQ configuration structure. Filled in programmatically. */
115 static struct rte_eth_conf vmdq_conf_default = {
117 .mq_mode = RTE_ETH_MQ_RX_VMDQ_ONLY,
120 * VLAN stripping is necessary for 1G NICs such as the I350;
121 * it fixes a bug where IPv4 forwarding in the guest cannot
122 * forward packets from one virtio dev to another virtio dev.
124 .offloads = RTE_ETH_RX_OFFLOAD_VLAN_STRIP,
128 .mq_mode = RTE_ETH_MQ_TX_NONE,
129 .offloads = (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM |
130 RTE_ETH_TX_OFFLOAD_TCP_CKSUM |
131 RTE_ETH_TX_OFFLOAD_VLAN_INSERT |
132 RTE_ETH_TX_OFFLOAD_MULTI_SEGS |
133 RTE_ETH_TX_OFFLOAD_TCP_TSO),
137 * should be overridden separately in code with
141 .nb_queue_pools = RTE_ETH_8_POOLS,
142 .enable_default_pool = 0,
145 .pool_map = {{0, 0},},
151 static unsigned lcore_ids[RTE_MAX_LCORE];
152 static uint16_t ports[RTE_MAX_ETHPORTS];
153 static unsigned num_ports = 0; /**< The number of ports specified on the command line */
154 static uint16_t num_pf_queues, num_vmdq_queues;
155 static uint16_t vmdq_pool_base, vmdq_queue_base;
156 static uint16_t queues_per_pool;
158 const uint16_t vlan_tags[] = {
159 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
160 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
161 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
162 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
163 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
164 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
165 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
166 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
169 /* ethernet addresses of ports */
170 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
172 static struct vhost_dev_tailq_list vhost_dev_list =
173 TAILQ_HEAD_INITIALIZER(vhost_dev_list);
175 static struct lcore_info lcore_info[RTE_MAX_LCORE];
177 /* Used for queueing bursts of TX packets. */
181 struct rte_mbuf *m_table[MAX_PKT_BURST];
184 struct vhost_bufftable {
187 struct rte_mbuf *m_table[MAX_PKT_BURST];
190 /* TX queue for each data core. */
191 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
194 * Vhost TX buffer for each data core.
195 * Every data core maintains a TX buffer for every vhost device,
196 * which is used to batch packet enqueues for higher performance.
198 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * RTE_MAX_VHOST_DEVICE];
200 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
201 / US_PER_S * BURST_TX_DRAIN_US)
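/*
 * For illustration: the macro converts BURST_TX_DRAIN_US (100 us) into TSC
 * cycles, rounding the cycles-per-microsecond value up. On a hypothetical
 * 2.0 GHz TSC: (2000000000 + 1000000 - 1) / 1000000 = 2000 cycles per us,
 * so the drain period is 2000 * 100 = 200000 TSC cycles.
 */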
204 is_dma_configured(int16_t dev_id)
208 for (i = 0; i < dma_count; i++)
209 if (dmas_id[i] == dev_id)
215 open_dma(const char *value)
217 struct dma_for_vhost *dma_info = dma_bind;
218 char *input = strndup(value, strlen(value) + 1);
221 char *start, *end, *substr;
224 struct rte_dma_info info;
225 struct rte_dma_conf dev_config = { .nb_vchans = 1 };
226 struct rte_dma_vchan_conf qconf = {
227 .direction = RTE_DMA_DIR_MEM_TO_MEM,
228 .nb_desc = DMA_RING_SIZE
234 char *dma_arg[RTE_MAX_VHOST_DEVICE];
237 while (isblank(*addrs))
239 if (*addrs == '\0') {
244 /* Process the DMA devices listed within the brackets. */
246 substr = strtok(addrs, ";]");
252 args_nr = rte_strsplit(substr, strlen(substr), dma_arg, RTE_MAX_VHOST_DEVICE, ',');
258 while (i < args_nr) {
259 char *arg_temp = dma_arg[i];
262 sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
268 start = strstr(ptrs[0], "txd");
275 vid = strtol(start, &end, 0);
281 dev_id = rte_dma_get_dev_id_by_name(ptrs[1]);
283 RTE_LOG(ERR, VHOST_CONFIG, "Failed to find DMA %s.\n", ptrs[1]);
288 /* DMA device is already configured, so skip */
289 if (is_dma_configured(dev_id))
292 if (rte_dma_info_get(dev_id, &info) != 0) {
293 RTE_LOG(ERR, VHOST_CONFIG, "Error with rte_dma_info_get()\n");
298 if (info.max_vchans < 1) {
299 RTE_LOG(ERR, VHOST_CONFIG, "No channels available on device %d\n", dev_id);
304 if (rte_dma_configure(dev_id, &dev_config) != 0) {
305 RTE_LOG(ERR, VHOST_CONFIG, "Failed to configure DMA %d.\n", dev_id);
310 /* Check the max number of descriptors supported by the DMA device. */
311 rte_dma_info_get(dev_id, &info);
312 if (info.nb_vchans != 1) {
313 RTE_LOG(ERR, VHOST_CONFIG, "No configured queues reported by DMA %d.\n",
319 qconf.nb_desc = RTE_MIN(DMA_RING_SIZE, info.max_desc);
321 if (rte_dma_vchan_setup(dev_id, 0, &qconf) != 0) {
322 RTE_LOG(ERR, VHOST_CONFIG, "Failed to set up DMA %d.\n", dev_id);
327 if (rte_dma_start(dev_id) != 0) {
328 RTE_LOG(ERR, VHOST_CONFIG, "Failed to start DMA %d.\n", dev_id);
333 dmas_id[dma_count++] = dev_id;
336 (dma_info + vid)->dmas[VIRTIO_RXQ].dev_id = dev_id;
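/*
 * Based on the parsing above, a --dmas argument is a bracketed list of
 * comma-separated "txd<vid>@<DMA device name>" bindings, terminated by ';'
 * or ']'. For example (the device names are illustrative only):
 *
 *   --dmas [txd0@0000:00:04.0,txd1@0000:00:04.1]
 *
 * binds vhost devices 0 and 1 to those two DMA channels for async enqueue.
 */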
345 * Builds up the correct configuration for VMDQ VLAN pool map
346 * according to the pool & queue limits.
349 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
351 struct rte_eth_vmdq_rx_conf conf;
352 struct rte_eth_vmdq_rx_conf *def_conf =
353 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
356 memset(&conf, 0, sizeof(conf));
357 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
358 conf.nb_pool_maps = num_devices;
359 conf.enable_loop_back = def_conf->enable_loop_back;
360 conf.rx_mode = def_conf->rx_mode;
362 for (i = 0; i < conf.nb_pool_maps; i++) {
363 conf.pool_map[i].vlan_id = vlan_tags[i];
364 conf.pool_map[i].pools = (1UL << i);
367 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
368 (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
369 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
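/*
 * For illustration: with num_devices = 8, the loop above builds a one-to-one
 * VLAN-to-pool map from the vlan_tags[] table, i.e. VLAN 1000 -> pool 0
 * (bit 0 set), VLAN 1001 -> pool 1, ..., VLAN 1007 -> pool 7.
 */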
374 * Initialises a given port using global settings and with the rx buffers
375 * coming from the mbuf_pool passed as a parameter.
378 port_init(uint16_t port)
380 struct rte_eth_dev_info dev_info;
381 struct rte_eth_conf port_conf;
382 struct rte_eth_rxconf *rxconf;
383 struct rte_eth_txconf *txconf;
384 int16_t rx_rings, tx_rings;
385 uint16_t rx_ring_size, tx_ring_size;
389 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
390 retval = rte_eth_dev_info_get(port, &dev_info);
392 RTE_LOG(ERR, VHOST_PORT,
393 "Error during getting device (port %u) info: %s\n",
394 port, strerror(-retval));
399 rxconf = &dev_info.default_rxconf;
400 txconf = &dev_info.default_txconf;
401 rxconf->rx_drop_en = 1;
403 /* Configure the number of supported virtio devices based on VMDQ limits. */
404 num_devices = dev_info.max_vmdq_pools;
406 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
407 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
409 tx_rings = (uint16_t)rte_lcore_count();
412 if (dev_info.max_mtu != UINT16_MAX && dev_info.max_rx_pktlen > dev_info.max_mtu)
413 vmdq_conf_default.rxmode.mtu = dev_info.max_mtu;
415 vmdq_conf_default.rxmode.mtu = MAX_MTU;
418 /* Get port configuration. */
419 retval = get_eth_conf(&port_conf, num_devices);
422 /* NIC queues are divided into PF queues and VMDQ queues. */
423 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
424 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
425 num_vmdq_queues = num_devices * queues_per_pool;
426 num_queues = num_pf_queues + num_vmdq_queues;
427 vmdq_queue_base = dev_info.vmdq_queue_base;
428 vmdq_pool_base = dev_info.vmdq_pool_base;
429 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
430 num_pf_queues, num_devices, queues_per_pool);
432 if (!rte_eth_dev_is_valid_port(port))
435 rx_rings = (uint16_t)dev_info.max_rx_queues;
436 if (dev_info.tx_offload_capa & RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE)
437 port_conf.txmode.offloads |=
438 RTE_ETH_TX_OFFLOAD_MBUF_FAST_FREE;
439 /* Configure ethernet device. */
440 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
442 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
443 port, strerror(-retval));
447 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
450 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
451 "for port %u: %s.\n", port, strerror(-retval));
454 if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
455 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
456 "for Rx queues on port %u.\n", port);
460 /* Setup the queues. */
461 rxconf->offloads = port_conf.rxmode.offloads;
462 for (q = 0; q < rx_rings; q++) {
463 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
464 rte_eth_dev_socket_id(port),
468 RTE_LOG(ERR, VHOST_PORT,
469 "Failed to setup rx queue %u of port %u: %s.\n",
470 q, port, strerror(-retval));
474 txconf->offloads = port_conf.txmode.offloads;
475 for (q = 0; q < tx_rings; q++) {
476 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
477 rte_eth_dev_socket_id(port),
480 RTE_LOG(ERR, VHOST_PORT,
481 "Failed to setup tx queue %u of port %u: %s.\n",
482 q, port, strerror(-retval));
487 /* Start the device. */
488 retval = rte_eth_dev_start(port);
490 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
491 port, strerror(-retval));
496 retval = rte_eth_promiscuous_enable(port);
498 RTE_LOG(ERR, VHOST_PORT,
499 "Failed to enable promiscuous mode on port %u: %s\n",
500 port, rte_strerror(-retval));
505 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
507 RTE_LOG(ERR, VHOST_PORT,
508 "Failed to get MAC address on port %u: %s\n",
509 port, rte_strerror(-retval));
513 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
514 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
515 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
516 port, RTE_ETHER_ADDR_BYTES(&vmdq_ports_eth_addr[port]));
522 * Set socket file path.
525 us_vhost_parse_socket_path(const char *q_arg)
529 /* Reject socket paths that do not fit within PATH_MAX. */
530 if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
534 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
535 if (socket_files == NULL) {
540 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
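/*
 * socket_files is kept as one flat buffer of nb_sockets fixed-size PATH_MAX
 * slots; entry i is addressed as socket_files + i * PATH_MAX, which is how
 * the registration and unregistration loops later index it.
 */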
547 * Parse the portmask provided at run time.
550 parse_portmask(const char *portmask)
557 /* parse hexadecimal string */
558 pm = strtoul(portmask, &end, 16);
559 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
567 * Parse numeric options at run time.
570 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
577 /* parse unsigned int string */
578 num = strtoul(q_arg, &end, 10);
579 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
582 if (num > max_valid_value)
593 us_vhost_usage(const char *prgname)
595 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
597 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
598 " --socket-file <path>\n"
600 " -p PORTMASK: Set mask for ports to be used by application\n"
601 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
602 " --rx-retry [0|1]: disable/enable(default) retries on Rx. Enable retry if destination queue is full\n"
603 " --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if retries on RX are enabled\n"
604 " --rx-retry-num [0-N]: the number of retries on RX. This only takes effect if retries on RX are enabled\n"
605 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
606 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
607 " --socket-file: The path of the socket file.\n"
608 " --tx-csum [0|1] disable/enable TX checksum offload.\n"
609 " --tso [0|1] disable/enable TCP segment offload.\n"
610 " --client register a vhost-user socket as client mode.\n"
612 " --dmas register dma channels for specific vhost devices.\n",
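/*
 * Example invocation (binary name, cores and PCI values are illustrative):
 *
 *   ./dpdk-vhost -l 0-3 -n 4 -- -p 0x1 --socket-file /tmp/sock0 \
 *       --stats 1 --mergeable 0 --vm2vm 1
 *
 * EAL options go before "--", the application options listed above go after.
 */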
617 #define OPT_VM2VM "vm2vm"
619 #define OPT_RX_RETRY "rx-retry"
621 #define OPT_RX_RETRY_DELAY "rx-retry-delay"
622 OPT_RX_RETRY_DELAY_NUM,
623 #define OPT_RX_RETRY_NUMB "rx-retry-num"
624 OPT_RX_RETRY_NUMB_NUM,
625 #define OPT_MERGEABLE "mergeable"
627 #define OPT_STATS "stats"
629 #define OPT_SOCKET_FILE "socket-file"
631 #define OPT_TX_CSUM "tx-csum"
633 #define OPT_TSO "tso"
635 #define OPT_CLIENT "client"
637 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
638 OPT_BUILTIN_NET_DRIVER_NUM,
639 #define OPT_DMAS "dmas"
644 * Parse the arguments given in the command line of the application.
647 us_vhost_parse_args(int argc, char **argv)
652 const char *prgname = argv[0];
653 static struct option long_option[] = {
654 {OPT_VM2VM, required_argument,
655 NULL, OPT_VM2VM_NUM},
656 {OPT_RX_RETRY, required_argument,
657 NULL, OPT_RX_RETRY_NUM},
658 {OPT_RX_RETRY_DELAY, required_argument,
659 NULL, OPT_RX_RETRY_DELAY_NUM},
660 {OPT_RX_RETRY_NUMB, required_argument,
661 NULL, OPT_RX_RETRY_NUMB_NUM},
662 {OPT_MERGEABLE, required_argument,
663 NULL, OPT_MERGEABLE_NUM},
664 {OPT_STATS, required_argument,
665 NULL, OPT_STATS_NUM},
666 {OPT_SOCKET_FILE, required_argument,
667 NULL, OPT_SOCKET_FILE_NUM},
668 {OPT_TX_CSUM, required_argument,
669 NULL, OPT_TX_CSUM_NUM},
670 {OPT_TSO, required_argument,
672 {OPT_CLIENT, no_argument,
673 NULL, OPT_CLIENT_NUM},
674 {OPT_BUILTIN_NET_DRIVER, no_argument,
675 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
676 {OPT_DMAS, required_argument,
681 /* Parse command line */
682 while ((opt = getopt_long(argc, argv, "p:P",
683 long_option, &option_index)) != EOF) {
687 enabled_port_mask = parse_portmask(optarg);
688 if (enabled_port_mask == 0) {
689 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
690 us_vhost_usage(prgname);
697 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
698 RTE_ETH_VMDQ_ACCEPT_BROADCAST |
699 RTE_ETH_VMDQ_ACCEPT_MULTICAST;
703 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
705 RTE_LOG(INFO, VHOST_CONFIG,
706 "Invalid argument for "
708 us_vhost_usage(prgname);
711 vm2vm_mode = (vm2vm_type)ret;
714 case OPT_RX_RETRY_NUM:
715 ret = parse_num_opt(optarg, 1);
717 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
718 us_vhost_usage(prgname);
724 case OPT_TX_CSUM_NUM:
725 ret = parse_num_opt(optarg, 1);
727 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
728 us_vhost_usage(prgname);
731 enable_tx_csum = ret;
735 ret = parse_num_opt(optarg, 1);
737 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
738 us_vhost_usage(prgname);
744 case OPT_RX_RETRY_DELAY_NUM:
745 ret = parse_num_opt(optarg, INT32_MAX);
747 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
748 us_vhost_usage(prgname);
751 burst_rx_delay_time = ret;
754 case OPT_RX_RETRY_NUMB_NUM:
755 ret = parse_num_opt(optarg, INT32_MAX);
757 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
758 us_vhost_usage(prgname);
761 burst_rx_retry_num = ret;
764 case OPT_MERGEABLE_NUM:
765 ret = parse_num_opt(optarg, 1);
767 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
768 us_vhost_usage(prgname);
775 ret = parse_num_opt(optarg, INT32_MAX);
777 RTE_LOG(INFO, VHOST_CONFIG,
778 "Invalid argument for stats [0..N]\n");
779 us_vhost_usage(prgname);
785 /* Set socket file path. */
786 case OPT_SOCKET_FILE_NUM:
787 if (us_vhost_parse_socket_path(optarg) == -1) {
788 RTE_LOG(INFO, VHOST_CONFIG,
789 "Invalid argument for socket name (Max %d characters)\n",
791 us_vhost_usage(prgname);
797 if (open_dma(optarg) == -1) {
798 RTE_LOG(INFO, VHOST_CONFIG,
800 us_vhost_usage(prgname);
809 case OPT_BUILTIN_NET_DRIVER_NUM:
810 builtin_net_driver = 1;
813 /* Invalid option - print options. */
815 us_vhost_usage(prgname);
820 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
821 if (enabled_port_mask & (1 << i))
822 ports[num_ports++] = i;
825 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
826 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
827 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
835 * Update the global variable num_ports and the ports array according to the
836 * number of system ports, and return the number of valid ports.
838 static unsigned check_ports_num(unsigned nb_ports)
840 unsigned valid_num_ports = num_ports;
843 if (num_ports > nb_ports) {
844 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
845 num_ports, nb_ports);
846 num_ports = nb_ports;
849 for (portid = 0; portid < num_ports; portid ++) {
850 if (!rte_eth_dev_is_valid_port(ports[portid])) {
851 RTE_LOG(INFO, VHOST_PORT,
852 "\nSpecified port ID(%u) is not valid\n",
854 ports[portid] = INVALID_PORT_ID;
858 return valid_num_ports;
861 static __rte_always_inline struct vhost_dev *
862 find_vhost_dev(struct rte_ether_addr *mac)
864 struct vhost_dev *vdev;
866 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
867 if (vdev->ready == DEVICE_RX &&
868 rte_is_same_ether_addr(mac, &vdev->mac_address))
876 * This function learns the MAC address of the device and registers it,
877 * along with a VLAN tag, with a VMDQ pool.
880 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
882 struct rte_ether_hdr *pkt_hdr;
885 /* Learn MAC address of guest device from packet */
886 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
888 if (find_vhost_dev(&pkt_hdr->src_addr)) {
889 RTE_LOG(ERR, VHOST_DATA,
890 "(%d) device is using a registered MAC!\n",
895 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
896 vdev->mac_address.addr_bytes[i] =
897 pkt_hdr->src_addr.addr_bytes[i];
899 /* vlan_tag currently uses the device_id. */
900 vdev->vlan_tag = vlan_tags[vdev->vid];
902 /* Print out VMDQ registration info. */
903 RTE_LOG(INFO, VHOST_DATA,
904 "(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
905 vdev->vid, RTE_ETHER_ADDR_BYTES(&vdev->mac_address),
908 /* Register the MAC address. */
909 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
910 (uint32_t)vdev->vid + vmdq_pool_base);
912 RTE_LOG(ERR, VHOST_DATA,
913 "(%d) failed to add device MAC address to VMDQ\n",
916 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
918 /* Set device as ready for RX. */
919 vdev->ready = DEVICE_RX;
925 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
926 * queue before disabling RX on the device.
929 unlink_vmdq(struct vhost_dev *vdev)
933 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
935 if (vdev->ready == DEVICE_RX) {
936 /* Clear MAC and VLAN settings. */
937 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
938 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
939 vdev->mac_address.addr_bytes[i] = 0;
943 /* Clear out the receive buffers. */
944 rx_count = rte_eth_rx_burst(ports[0],
945 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
948 for (i = 0; i < rx_count; i++)
949 rte_pktmbuf_free(pkts_burst[i]);
951 rx_count = rte_eth_rx_burst(ports[0],
952 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
955 vdev->ready = DEVICE_MAC_LEARNING;
960 free_pkts(struct rte_mbuf **pkts, uint16_t n)
963 rte_pktmbuf_free(pkts[n]);
966 static __rte_always_inline void
967 complete_async_pkts(struct vhost_dev *vdev)
969 struct rte_mbuf *p_cpl[MAX_PKT_BURST];
970 uint16_t complete_count;
971 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
973 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
974 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST, dma_id, 0);
975 if (complete_count) {
976 free_pkts(p_cpl, complete_count);
977 __atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
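/*
 * pkts_inflight counts enqueued mbufs whose DMA copies have not yet
 * completed: it is increased when packets are submitted with
 * rte_vhost_submit_enqueue_burst() and decreased here, and it is later used
 * to drain outstanding copies when a device or vring is torn down.
 */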
982 static __rte_always_inline void
983 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
988 if (builtin_net_driver) {
989 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
991 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
995 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
997 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
999 src_vdev->stats.tx_total++;
1000 src_vdev->stats.tx += ret;
1004 static __rte_always_inline void
1005 drain_vhost(struct vhost_dev *vdev)
1008 uint32_t buff_idx = rte_lcore_id() * RTE_MAX_VHOST_DEVICE + vdev->vid;
1009 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
1010 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
1012 if (builtin_net_driver) {
1013 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
1014 } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1015 uint16_t enqueue_fail = 0;
1016 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1018 complete_async_pkts(vdev);
1019 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ, m, nr_xmit, dma_id, 0);
1020 __atomic_add_fetch(&vdev->pkts_inflight, ret, __ATOMIC_SEQ_CST);
1022 enqueue_fail = nr_xmit - ret;
1024 free_pkts(&m[ret], nr_xmit - ret);
1026 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1031 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
1033 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
1037 if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1038 free_pkts(m, nr_xmit);
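/*
 * Note on mbuf ownership: in the async (DMA) path the submitted mbufs are
 * still referenced by the in-flight copies, so they are freed later by
 * complete_async_pkts(); only in the sync paths are they freed immediately,
 * which is what the async_enabled check above guards.
 */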
1041 static __rte_always_inline void
1042 drain_vhost_table(void)
1044 uint16_t lcore_id = rte_lcore_id();
1045 struct vhost_bufftable *vhost_txq;
1046 struct vhost_dev *vdev;
1049 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1050 if (unlikely(vdev->remove == 1))
1053 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + vdev->vid];
1055 cur_tsc = rte_rdtsc();
1056 if (unlikely(cur_tsc - vhost_txq->pre_tsc
1057 > MBUF_TABLE_DRAIN_TSC)) {
1058 RTE_LOG_DP(DEBUG, VHOST_DATA,
1059 "Vhost TX queue drained after timeout with burst size %u\n",
1063 vhost_txq->pre_tsc = cur_tsc;
1069 * Check if the packet destination MAC address is for a local device. If so,
1070 * put the packet on that device's RX queue. If not, return.
1072 static __rte_always_inline int
1073 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1075 struct rte_ether_hdr *pkt_hdr;
1076 struct vhost_dev *dst_vdev;
1077 struct vhost_bufftable *vhost_txq;
1078 uint16_t lcore_id = rte_lcore_id();
1079 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1081 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1085 if (vdev->vid == dst_vdev->vid) {
1086 RTE_LOG_DP(DEBUG, VHOST_DATA,
1087 "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
1092 RTE_LOG_DP(DEBUG, VHOST_DATA,
1093 "(%d) TX: MAC address is local\n", dst_vdev->vid);
1095 if (unlikely(dst_vdev->remove)) {
1096 RTE_LOG_DP(DEBUG, VHOST_DATA,
1097 "(%d) device is marked for removal\n", dst_vdev->vid);
1101 vhost_txq = vhost_txbuff[lcore_id * RTE_MAX_VHOST_DEVICE + dst_vdev->vid];
1102 vhost_txq->m_table[vhost_txq->len++] = m;
1105 vdev->stats.tx_total++;
1109 if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
1110 drain_vhost(dst_vdev);
1112 vhost_txq->pre_tsc = rte_rdtsc();
1118 * Check if the destination MAC of a packet belongs to a local VM,
1119 * and if so, get its VLAN tag and the length offset.
1121 static __rte_always_inline int
1122 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1123 uint32_t *offset, uint16_t *vlan_tag)
1125 struct vhost_dev *dst_vdev;
1126 struct rte_ether_hdr *pkt_hdr =
1127 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1129 dst_vdev = find_vhost_dev(&pkt_hdr->dst_addr);
1133 if (vdev->vid == dst_vdev->vid) {
1134 RTE_LOG_DP(DEBUG, VHOST_DATA,
1135 "(%d) TX: src and dst MAC are the same. Dropping packet.\n",
1141 * HW VLAN stripping reduces the packet length
1142 * by the length of the VLAN tag, so the packet
1143 * length needs to be restored by adding it back.
1145 *offset = RTE_VLAN_HLEN;
1146 *vlan_tag = vlan_tags[vdev->vid];
1148 RTE_LOG_DP(DEBUG, VHOST_DATA,
1149 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1150 vdev->vid, dst_vdev->vid, *vlan_tag);
1155 static void virtio_tx_offload(struct rte_mbuf *m)
1157 struct rte_net_hdr_lens hdr_lens;
1158 struct rte_ipv4_hdr *ipv4_hdr;
1159 struct rte_tcp_hdr *tcp_hdr;
1163 ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
1164 m->l2_len = hdr_lens.l2_len;
1165 m->l3_len = hdr_lens.l3_len;
1166 m->l4_len = hdr_lens.l4_len;
1168 l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
1169 tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
1170 m->l2_len + m->l3_len);
1172 m->ol_flags |= RTE_MBUF_F_TX_TCP_SEG;
1173 if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
1174 m->ol_flags |= RTE_MBUF_F_TX_IPV4;
1175 m->ol_flags |= RTE_MBUF_F_TX_IP_CKSUM;
1177 ipv4_hdr->hdr_checksum = 0;
1178 tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
1179 } else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1180 m->ol_flags |= RTE_MBUF_F_TX_IPV6;
1181 tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
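/*
 * For TSO, the TCP checksum field must be pre-filled with the pseudo-header
 * checksum only, which is what rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum()
 * compute above; the NIC then finalizes the checksum for each segment.
 */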
1185 static __rte_always_inline void
1186 do_drain_mbuf_table(struct mbuf_table *tx_q)
1190 count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1191 tx_q->m_table, tx_q->len);
1192 if (unlikely(count < tx_q->len))
1193 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1199 * This function routes the TX packet to the correct interface. This
1200 * may be a local device or the physical port.
1202 static __rte_always_inline void
1203 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1205 struct mbuf_table *tx_q;
1206 unsigned offset = 0;
1207 const uint16_t lcore_id = rte_lcore_id();
1208 struct rte_ether_hdr *nh;
1211 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1212 if (unlikely(rte_is_broadcast_ether_addr(&nh->dst_addr))) {
1213 struct vhost_dev *vdev2;
1215 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1217 sync_virtio_xmit(vdev2, vdev, m);
1222 /* Check if the destination is a local VM. */
1223 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1226 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1227 if (unlikely(find_local_dest(vdev, m, &offset,
1229 rte_pktmbuf_free(m);
1234 RTE_LOG_DP(DEBUG, VHOST_DATA,
1235 "(%d) TX: MAC address is external\n", vdev->vid);
1239 /* Add packet to the port TX queue. */
1240 tx_q = &lcore_tx_queue[lcore_id];
1242 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1243 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1244 /* Guest has inserted the vlan tag. */
1245 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1246 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1247 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1248 (vh->vlan_tci != vlan_tag_be))
1249 vh->vlan_tci = vlan_tag_be;
1251 m->ol_flags |= RTE_MBUF_F_TX_VLAN;
1254 * Find the right segment to adjust the data length when the offset is
1255 * bigger than the tailroom size.
1257 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1258 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1259 m->data_len += offset;
1261 struct rte_mbuf *seg = m;
1263 while ((seg->next != NULL) &&
1264 (offset > rte_pktmbuf_tailroom(seg)))
1267 seg->data_len += offset;
1269 m->pkt_len += offset;
1272 m->vlan_tci = vlan_tag;
1275 if (m->ol_flags & RTE_MBUF_F_RX_LRO)
1276 virtio_tx_offload(m);
1278 tx_q->m_table[tx_q->len++] = m;
1280 vdev->stats.tx_total++;
1284 if (unlikely(tx_q->len == MAX_PKT_BURST))
1285 do_drain_mbuf_table(tx_q);
1289 static __rte_always_inline void
1290 drain_mbuf_table(struct mbuf_table *tx_q)
1292 static uint64_t prev_tsc;
1298 cur_tsc = rte_rdtsc();
1299 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1302 RTE_LOG_DP(DEBUG, VHOST_DATA,
1303 "TX queue drained after timeout with burst size %u\n",
1305 do_drain_mbuf_table(tx_q);
1309 static __rte_always_inline void
1310 drain_eth_rx(struct vhost_dev *vdev)
1312 uint16_t rx_count, enqueue_count;
1313 struct rte_mbuf *pkts[MAX_PKT_BURST];
1315 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1316 pkts, MAX_PKT_BURST);
1322 * When "enable_retry" is set, here we wait and retry when there
1323 * are not enough free slots in the queue to hold @rx_count packets,
1324 * to diminish packet loss.
1327 unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1331 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1332 rte_delay_us(burst_rx_delay_time);
1333 if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1339 if (builtin_net_driver) {
1340 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1342 } else if (dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled) {
1343 uint16_t enqueue_fail = 0;
1344 int16_t dma_id = dma_bind[vdev->vid].dmas[VIRTIO_RXQ].dev_id;
1346 complete_async_pkts(vdev);
1347 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1348 VIRTIO_RXQ, pkts, rx_count, dma_id, 0);
1349 __atomic_add_fetch(&vdev->pkts_inflight, enqueue_count, __ATOMIC_SEQ_CST);
1351 enqueue_fail = rx_count - enqueue_count;
1353 free_pkts(&pkts[enqueue_count], enqueue_fail);
1356 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1361 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1363 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1367 if (!dma_bind[vdev->vid].dmas[VIRTIO_RXQ].async_enabled)
1368 free_pkts(pkts, rx_count);
1371 static __rte_always_inline void
1372 drain_virtio_tx(struct vhost_dev *vdev)
1374 struct rte_mbuf *pkts[MAX_PKT_BURST];
1378 if (builtin_net_driver) {
1379 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1380 pkts, MAX_PKT_BURST);
1382 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1383 mbuf_pool, pkts, MAX_PKT_BURST);
1386 /* Set up VMDQ for the first packet. */
1387 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1388 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1389 free_pkts(pkts, count);
1392 for (i = 0; i < count; ++i)
1393 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1397 * Main function of vhost-switch. It basically does:
1399 * for each vhost device {
1402 * Which drains the host eth Rx queue linked to the vhost device,
1403 * and delivers all of them to the guest virtio Rx ring associated with
1404 * this vhost device.
1406 * - drain_virtio_tx()
1408 * Which drains the guest virtio Tx queue and delivers all of them
1409 * to the target, which could be another vhost device, or the
1410 * physical eth dev. The route is done in function "virtio_tx_route".
1414 switch_worker(void *arg __rte_unused)
1417 unsigned lcore_id = rte_lcore_id();
1418 struct vhost_dev *vdev;
1419 struct mbuf_table *tx_q;
1421 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1423 tx_q = &lcore_tx_queue[lcore_id];
1424 for (i = 0; i < rte_lcore_count(); i++) {
1425 if (lcore_ids[i] == lcore_id) {
1432 drain_mbuf_table(tx_q);
1433 drain_vhost_table();
1435 * Inform the configuration core that we have exited the
1436 * linked list and that no devices are in use if requested.
1438 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1439 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1442 * Process vhost devices
1444 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1446 if (unlikely(vdev->remove)) {
1448 vdev->ready = DEVICE_SAFE_REMOVE;
1452 if (likely(vdev->ready == DEVICE_RX))
1455 if (likely(!vdev->remove))
1456 drain_virtio_tx(vdev);
1464 * Remove a device from the specific data core linked list and from the
1465 * main linked list. Synchronization occurs through the use of the
1466 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1467 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1470 destroy_device(int vid)
1472 struct vhost_dev *vdev = NULL;
1476 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1477 if (vdev->vid == vid)
1482 /* Set the remove flag. */
1484 while (vdev->ready != DEVICE_SAFE_REMOVE) {
1488 for (i = 0; i < RTE_MAX_LCORE; i++)
1489 rte_free(vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]);
1491 if (builtin_net_driver)
1492 vs_vhost_net_remove(vdev);
1494 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1496 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1499 /* Set the dev_removal_flag on each lcore. */
1500 RTE_LCORE_FOREACH_WORKER(lcore)
1501 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1504 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1505 * we can be sure that they can no longer access the device removed
1506 * from the linked lists and that the devices are no longer in use.
1508 RTE_LCORE_FOREACH_WORKER(lcore) {
1509 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1513 lcore_info[vdev->coreid].device_num--;
1515 RTE_LOG(INFO, VHOST_DATA,
1516 "(%d) device has been removed from data core\n",
1519 if (dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled) {
1521 int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1522 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1524 while (vdev->pkts_inflight) {
1525 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
1526 m_cpl, vdev->pkts_inflight, dma_id, 0);
1527 free_pkts(m_cpl, n_pkt);
1528 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1531 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1532 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = false;
1539 * A new device is added to a data core. First the device is added to the main linked list
1540 * and then allocated to a specific data core.
1545 int lcore, core_add = 0;
1547 uint32_t device_num_min = num_devices;
1548 struct vhost_dev *vdev;
1549 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1551 RTE_LOG(INFO, VHOST_DATA,
1552 "(%d) couldn't allocate memory for vhost dev\n",
1558 for (i = 0; i < RTE_MAX_LCORE; i++) {
1559 vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid]
1560 = rte_zmalloc("vhost bufftable",
1561 sizeof(struct vhost_bufftable),
1562 RTE_CACHE_LINE_SIZE);
1564 if (vhost_txbuff[i * RTE_MAX_VHOST_DEVICE + vid] == NULL) {
1565 RTE_LOG(INFO, VHOST_DATA,
1566 "(%d) couldn't allocate memory for vhost TX\n", vid);
1571 if (builtin_net_driver)
1572 vs_vhost_net_setup(vdev);
1574 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1575 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1577 /* Reset ready flag. */
1578 vdev->ready = DEVICE_MAC_LEARNING;
1581 /* Find a suitable lcore to add the device. */
1582 RTE_LCORE_FOREACH_WORKER(lcore) {
1583 if (lcore_info[lcore].device_num < device_num_min) {
1584 device_num_min = lcore_info[lcore].device_num;
1588 vdev->coreid = core_add;
1590 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1592 lcore_info[vdev->coreid].device_num++;
1594 /* Disable notifications. */
1595 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1596 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
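/*
 * Guest notifications are disabled because the switch cores poll the virtio
 * rings continuously in switch_worker(); with polling there is no need for
 * the guest to kick the host, so suppressing notifications avoids the
 * associated eventfd writes and exits.
 */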
1598 RTE_LOG(INFO, VHOST_DATA,
1599 "(%d) device has been added to data core %d\n",
1602 if (dma_bind[vid].dmas[VIRTIO_RXQ].dev_id != INVALID_DMA_ID) {
1605 ret = rte_vhost_async_channel_register(vid, VIRTIO_RXQ);
1607 dma_bind[vid].dmas[VIRTIO_RXQ].async_enabled = true;
1615 vring_state_changed(int vid, uint16_t queue_id, int enable)
1617 struct vhost_dev *vdev = NULL;
1619 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1620 if (vdev->vid == vid)
1626 if (queue_id != VIRTIO_RXQ)
1629 if (dma_bind[vid].dmas[queue_id].async_enabled) {
1632 int16_t dma_id = dma_bind[vid].dmas[VIRTIO_RXQ].dev_id;
1633 struct rte_mbuf *m_cpl[vdev->pkts_inflight];
1635 while (vdev->pkts_inflight) {
1636 n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
1637 m_cpl, vdev->pkts_inflight, dma_id, 0);
1638 free_pkts(m_cpl, n_pkt);
1639 __atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
1648 * These callbacks allow devices to be added to the data core when configuration
1649 * has been fully completed.
1651 static const struct rte_vhost_device_ops virtio_net_device_ops =
1653 .new_device = new_device,
1654 .destroy_device = destroy_device,
1655 .vring_state_changed = vring_state_changed,
1659 * This is a thread that will wake up periodically to print stats if the user has
1663 print_stats(__rte_unused void *arg)
1665 struct vhost_dev *vdev;
1666 uint64_t tx_dropped, rx_dropped;
1667 uint64_t tx, tx_total, rx, rx_total;
1668 const char clr[] = { 27, '[', '2', 'J', '\0' };
1669 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1672 sleep(enable_stats);
1674 /* Clear screen and move to top left */
1675 printf("%s%s\n", clr, top_left);
1676 printf("Device statistics =================================\n");
1678 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1679 tx_total = vdev->stats.tx_total;
1680 tx = vdev->stats.tx;
1681 tx_dropped = tx_total - tx;
1683 rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1685 rx = __atomic_load_n(&vdev->stats.rx_atomic,
1687 rx_dropped = rx_total - rx;
1689 printf("Statistics for device %d\n"
1690 "-----------------------\n"
1691 "TX total: %" PRIu64 "\n"
1692 "TX dropped: %" PRIu64 "\n"
1693 "TX successful: %" PRIu64 "\n"
1694 "RX total: %" PRIu64 "\n"
1695 "RX dropped: %" PRIu64 "\n"
1696 "RX successful: %" PRIu64 "\n",
1698 tx_total, tx_dropped, tx,
1699 rx_total, rx_dropped, rx);
1702 printf("===================================================\n");
1711 unregister_drivers(int socket_num)
1715 for (i = 0; i < socket_num; i++) {
1716 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1718 RTE_LOG(ERR, VHOST_CONFIG,
1719 "Failed to unregister vhost driver for %s.\n",
1720 socket_files + i * PATH_MAX);
1724 /* When we receive a SIGINT, unregister the vhost driver. */
1726 sigint_handler(__rte_unused int signum)
1728 /* Unregister vhost driver. */
1729 unregister_drivers(nb_sockets);
1735 * While creating an mbuf pool, one key thing is to figure out how
1736 * many mbuf entries are enough for our use. FYI, here are some
1739 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1741 * - For each switch core (a CPU core that does the packet switching), we
1742 * also need to make some reservation for receiving the packets from the virtio
1743 * Tx queue. How many is enough depends on the usage. It's normally
1744 * a simple calculation like the following:
1746 * MAX_PKT_BURST * max packet size / mbuf size
1748 * So, we definitely need to allocate more mbufs when TSO is enabled.
1750 * - Similarly, for each switching core, we should reserve @nr_rx_desc
1751 * mbufs for receiving the packets from the physical NIC device.
1753 * - We also need to make sure, for each switch core, we have allocated
1754 * enough mbufs to fill up the mbuf cache.
1757 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1758 uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1761 uint32_t nr_mbufs_per_core;
1762 uint32_t mtu = 1500;
1769 nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
1770 (mbuf_size - RTE_PKTMBUF_HEADROOM);
1771 nr_mbufs_per_core += nr_rx_desc;
1772 nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1774 nr_mbufs = nr_queues * nr_rx_desc;
1775 nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1776 nr_mbufs *= nr_port;
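/*
 * For illustration, with the defaults used by main() below (nr_queues =
 * MAX_QUEUES = 128, nr_rx_desc = 1024, mbuf_size = RTE_MBUF_DEFAULT_BUF_SIZE
 * = 2176 with a 128-byte headroom), and assuming MAX_PKT_BURST is 32 and
 * mtu stays at 1500:
 *
 *   nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) + 1024 = 1081
 *   nr_mbufs = (128 * 1024 + 1081 * nr_switch_core) * nr_port
 */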
1778 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1779 nr_mbuf_cache, 0, mbuf_size,
1781 if (mbuf_pool == NULL)
1782 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1790 for (i = 0; i < RTE_MAX_VHOST_DEVICE; i++) {
1793 for (j = 0; j < RTE_MAX_QUEUES_PER_PORT * 2; j++) {
1794 dma_bind[i].dmas[j].dev_id = INVALID_DMA_ID;
1795 dma_bind[i].dmas[j].async_enabled = false;
1799 for (i = 0; i < RTE_DMADEV_DEFAULT_MAX; i++)
1800 dmas_id[i] = INVALID_DMA_ID;
1804 * Main function, does initialisation and calls the per-lcore functions.
1807 main(int argc, char *argv[])
1809 unsigned lcore_id, core_id = 0;
1810 unsigned nb_ports, valid_num_ports;
1813 static pthread_t tid;
1814 uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
1816 signal(SIGINT, sigint_handler);
1819 ret = rte_eal_init(argc, argv);
1821 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1825 /* initialize dma structures */
1828 /* parse app arguments */
1829 ret = us_vhost_parse_args(argc, argv);
1831 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1833 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1834 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1836 if (rte_lcore_is_enabled(lcore_id))
1837 lcore_ids[core_id++] = lcore_id;
1840 if (rte_lcore_count() > RTE_MAX_LCORE)
1841 rte_exit(EXIT_FAILURE, "Not enough cores\n");
1843 /* Get the number of physical ports. */
1844 nb_ports = rte_eth_dev_count_avail();
1847 * Update the global variable num_ports and the global ports array,
1848 * and get the number of valid ports according to the number of system ports.
1850 valid_num_ports = check_ports_num(nb_ports);
1852 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1853 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1854 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1859 * FIXME: here we are trying to allocate mbufs big enough for
1860 * @MAX_QUEUES, but the truth is we're never going to use that
1861 * many queues here. We probably should only do allocation for
1862 * those queues we are going to use.
1864 create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1865 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1867 if (vm2vm_mode == VM2VM_HARDWARE) {
1868 /* Enable VT loopback so that the NIC's embedded L2 switch does the forwarding. */
1869 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1870 RTE_LOG(DEBUG, VHOST_CONFIG,
1871 "Enable loop back for L2 switch in vmdq.\n");
1874 /* initialize all ports */
1875 RTE_ETH_FOREACH_DEV(portid) {
1876 /* skip ports that are not enabled */
1877 if ((enabled_port_mask & (1 << portid)) == 0) {
1878 RTE_LOG(INFO, VHOST_PORT,
1879 "Skipping disabled port %d\n", portid);
1882 if (port_init(portid) != 0)
1883 rte_exit(EXIT_FAILURE,
1884 "Cannot initialize network ports\n");
1887 /* Enable stats if the user option is set. */
1889 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1892 rte_exit(EXIT_FAILURE,
1893 "Cannot create print-stats thread\n");
1896 /* Launch all data cores. */
1897 RTE_LCORE_FOREACH_WORKER(lcore_id)
1898 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1901 flags |= RTE_VHOST_USER_CLIENT;
1903 for (i = 0; i < dma_count; i++) {
1904 if (rte_vhost_async_dma_configure(dmas_id[i], 0) < 0) {
1905 RTE_LOG(ERR, VHOST_PORT, "Failed to configure DMA in vhost.\n");
1906 rte_exit(EXIT_FAILURE, "Cannot use given DMA device\n");
1910 /* Register vhost user driver to handle vhost messages. */
1911 for (i = 0; i < nb_sockets; i++) {
1912 char *file = socket_files + i * PATH_MAX;
1915 flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1917 ret = rte_vhost_driver_register(file, flags);
1919 unregister_drivers(i);
1920 rte_exit(EXIT_FAILURE,
1921 "vhost driver register failure.\n");
1924 if (builtin_net_driver)
1925 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1927 if (mergeable == 0) {
1928 rte_vhost_driver_disable_features(file,
1929 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1932 if (enable_tx_csum == 0) {
1933 rte_vhost_driver_disable_features(file,
1934 1ULL << VIRTIO_NET_F_CSUM);
1937 if (enable_tso == 0) {
1938 rte_vhost_driver_disable_features(file,
1939 1ULL << VIRTIO_NET_F_HOST_TSO4);
1940 rte_vhost_driver_disable_features(file,
1941 1ULL << VIRTIO_NET_F_HOST_TSO6);
1942 rte_vhost_driver_disable_features(file,
1943 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1944 rte_vhost_driver_disable_features(file,
1945 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1949 rte_vhost_driver_enable_features(file,
1950 1ULL << VIRTIO_NET_F_CTRL_RX);
1953 ret = rte_vhost_driver_callback_register(file,
1954 &virtio_net_device_ops);
1956 rte_exit(EXIT_FAILURE,
1957 "failed to register vhost driver callbacks.\n");
1960 if (rte_vhost_driver_start(file) < 0) {
1961 rte_exit(EXIT_FAILURE,
1962 "failed to start vhost driver.\n");
1966 RTE_LCORE_FOREACH_WORKER(lcore_id)
1967 rte_eal_wait_lcore(lcore_id);
1969 /* clean up the EAL */