/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */
#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_net.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "ioat.h"
#include "main.h"
#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */
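/*
 * Worst case, a full RX burst therefore waits up to
 * BURST_RX_RETRIES * BURST_RX_WAIT_US = 60us before packets are dropped.
 */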
#define JUMBO_FRAME_MAX_SIZE    0x2600
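/* 0x2600 is 9728 bytes: room for a 9K jumbo frame plus Ethernet overhead. */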
/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX	    1
#define DEVICE_SAFE_REMOVE  2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Promiscuous mode */
static uint32_t promiscuous;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
static uint32_t num_devices;

static struct rte_mempool *mbuf_pool;
static int mergeable;
/* Enable VM2VM communications. If this is disabled then the MAC address comparison is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;

static int builtin_net_driver;

static int async_vhost_driver;

static char *dma_type;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;
/* empty VMDq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		/*
		 * VLAN strip is necessary for 1G NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could
		 * not forward packets from one virtio dev to another.
		 */
		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
			     DEV_TX_OFFLOAD_TCP_CKSUM |
			     DEV_TX_OFFLOAD_VLAN_INSERT |
			     DEV_TX_OFFLOAD_MULTI_SEGS |
			     DEV_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified on the command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];
/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

struct vhost_bufftable {
	uint32_t len;
	uint64_t pre_tsc;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};
/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/*
 * Vhost TX buffer for each data core.
 * Every data core maintains a TX buffer for every vhost device,
 * which is used to batch packet enqueues for higher performance.
 */
struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
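/* Flat 2-D table: the buffer for (lcore, device) lives at index lcore_id * MAX_VHOST_DEVICE + vid. */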
#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)
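/*
 * I.e. ceil(tsc_hz / 1000000) * 100: with a 2GHz TSC, for example, this is
 * 2000 cycles/us * 100us = 200000 cycles between forced TX drains.
 */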
static int
open_dma(const char *value)
{
	if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
		return open_ioat(value);

	return -1;
}
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;
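	/* Map the i-th VLAN tag to the i-th pool: .pools is a bitmask with only bit i set. */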
	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf));
	rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf));
	return 0;
}
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified on the command line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));
		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
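	/* One TX queue per enabled lcore, so each worker core can transmit without locking. */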
	tx_rings = (uint16_t)rte_lcore_count();

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval != 0)
		return retval;

	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
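	/*
	 * Illustrative example (actual values come from the NIC): a device
	 * exposing 128 VMDq queues over 64 pools gives queues_per_pool = 2,
	 * so 64 virtio devices share 128 VMDq queues on top of the PF queues.
	 */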
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}
	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
					rte_eth_dev_socket_id(port),
					rxconf, mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
					rte_eth_dev_socket_id(port),
					txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}
/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* Reject paths that do not fit in PATH_MAX - 1 characters */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}
/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. This only takes effect if RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. This only takes effect if RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
	"		--tso [0|1]: disable/enable TCP segmentation offload.\n"
	"		--client: register a vhost-user socket as client mode.\n"
	"		--dma-type: register the DMA type for the vhost async driver (e.g. \"ioat\").\n"
	"		--dmas: register a DMA channel for a specific vhost device.\n",
	       prgname);
}
enum {
#define OPT_VM2VM               "vm2vm"
	OPT_VM2VM_NUM = 256,
#define OPT_RX_RETRY            "rx-retry"
	OPT_RX_RETRY_NUM,
#define OPT_RX_RETRY_DELAY      "rx-retry-delay"
	OPT_RX_RETRY_DELAY_NUM,
#define OPT_RX_RETRY_NUMB       "rx-retry-num"
	OPT_RX_RETRY_NUMB_NUM,
#define OPT_MERGEABLE           "mergeable"
	OPT_MERGEABLE_NUM,
#define OPT_STATS               "stats"
	OPT_STATS_NUM,
#define OPT_SOCKET_FILE         "socket-file"
	OPT_SOCKET_FILE_NUM,
#define OPT_TX_CSUM             "tx-csum"
	OPT_TX_CSUM_NUM,
#define OPT_TSO                 "tso"
	OPT_TSO_NUM,
#define OPT_CLIENT              "client"
	OPT_CLIENT_NUM,
#define OPT_BUILTIN_NET_DRIVER  "builtin-net-driver"
	OPT_BUILTIN_NET_DRIVER_NUM,
#define OPT_DMA_TYPE            "dma-type"
	OPT_DMA_TYPE_NUM,
#define OPT_DMAS                "dmas"
	OPT_DMAS_NUM,
};
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{OPT_VM2VM, required_argument,
				NULL, OPT_VM2VM_NUM},
		{OPT_RX_RETRY, required_argument,
				NULL, OPT_RX_RETRY_NUM},
		{OPT_RX_RETRY_DELAY, required_argument,
				NULL, OPT_RX_RETRY_DELAY_NUM},
		{OPT_RX_RETRY_NUMB, required_argument,
				NULL, OPT_RX_RETRY_NUMB_NUM},
		{OPT_MERGEABLE, required_argument,
				NULL, OPT_MERGEABLE_NUM},
		{OPT_STATS, required_argument,
				NULL, OPT_STATS_NUM},
		{OPT_SOCKET_FILE, required_argument,
				NULL, OPT_SOCKET_FILE_NUM},
		{OPT_TX_CSUM, required_argument,
				NULL, OPT_TX_CSUM_NUM},
		{OPT_TSO, required_argument,
				NULL, OPT_TSO_NUM},
		{OPT_CLIENT, no_argument,
				NULL, OPT_CLIENT_NUM},
		{OPT_BUILTIN_NET_DRIVER, no_argument,
				NULL, OPT_BUILTIN_NET_DRIVER_NUM},
		{OPT_DMA_TYPE, required_argument,
				NULL, OPT_DMA_TYPE_NUM},
		{OPT_DMAS, required_argument,
				NULL, OPT_DMAS_NUM},
		{NULL, 0, 0, 0},
	};
	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			break;
		case OPT_VM2VM_NUM:
			ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for "
					"vm2vm [0|1|2]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			vm2vm_mode = (vm2vm_type)ret;
			break;
		case OPT_RX_RETRY_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_retry = ret;
			break;
		case OPT_TX_CSUM_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tx_csum = ret;
			break;
		case OPT_TSO_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_tso = ret;
			break;
		case OPT_RX_RETRY_DELAY_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_delay_time = ret;
			break;
		case OPT_RX_RETRY_NUMB_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			burst_rx_retry_num = ret;
			break;
		case OPT_MERGEABLE_NUM:
			ret = parse_num_opt(optarg, 1);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			mergeable = !!ret;
			if (ret) {
				vmdq_conf_default.rxmode.offloads |=
					DEV_RX_OFFLOAD_JUMBO_FRAME;
				vmdq_conf_default.rxmode.max_rx_pkt_len
					= JUMBO_FRAME_MAX_SIZE;
			}
			break;
		case OPT_STATS_NUM:
			ret = parse_num_opt(optarg, INT32_MAX);
			if (ret == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for stats [0..N]\n");
				us_vhost_usage(prgname);
				return -1;
			}
			enable_stats = ret;
			break;
		/* Set socket file path. */
		case OPT_SOCKET_FILE_NUM:
			if (us_vhost_parse_socket_path(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Invalid argument for socket name (Max %d characters)\n",
					PATH_MAX);
				us_vhost_usage(prgname);
				return -1;
			}
			break;
		case OPT_DMA_TYPE_NUM:
			dma_type = optarg;
			break;

		case OPT_DMAS_NUM:
			if (open_dma(optarg) == -1) {
				RTE_LOG(INFO, VHOST_CONFIG,
					"Wrong DMA args\n");
				us_vhost_usage(prgname);
				return -1;
			}
			async_vhost_driver = 1;
			break;

		case OPT_CLIENT_NUM:
			client_mode = 1;
			break;

		case OPT_BUILTIN_NET_DRIVER_NUM:
			builtin_net_driver = 1;
			break;
		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}
/*
 * Update the global variable num_ports and the ports array according to the
 * number of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number (%u) exceeds total system port number (%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID (%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}
static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}
/*
 * This function learns the MAC address of the device and registers it,
 * along with a VLAN tag, with a VMDQ pool.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac " RTE_ETHER_ADDR_PRT_FMT " and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding
 * buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}
static inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}
static __rte_always_inline void
complete_async_pkts(struct vhost_dev *vdev)
{
	struct rte_mbuf *p_cpl[MAX_PKT_BURST];
	uint16_t complete_count;

	complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
					VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
	if (complete_count) {
		free_pkts(p_cpl, complete_count);
		__atomic_sub_fetch(&vdev->pkts_inflight, complete_count, __ATOMIC_SEQ_CST);
	}
}
static __rte_always_inline void
sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
		 struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		__atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}
static __rte_always_inline void
drain_vhost(struct vhost_dev *vdev)
{
	int ret;
	uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
	uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
	struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[nr_xmit];

		complete_async_pkts(vdev);
		ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
					m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
		__atomic_add_fetch(&vdev->pkts_inflight, ret - cpu_cpl_nr, __ATOMIC_SEQ_CST);

		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = nr_xmit - ret;
		if (enqueue_fail)
			free_pkts(&m[ret], nr_xmit - ret);
	} else {
		ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						m, nr_xmit);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, ret,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(m, nr_xmit);
}
static __rte_always_inline void
drain_vhost_table(void)
{
	uint16_t lcore_id = rte_lcore_id();
	struct vhost_bufftable *vhost_txq;
	struct vhost_dev *vdev;
	uint64_t cur_tsc;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
						+ vdev->vid];

		cur_tsc = rte_rdtsc();
		if (unlikely(cur_tsc - vhost_txq->pre_tsc
				> MBUF_TABLE_DRAIN_TSC)) {
			RTE_LOG_DP(DEBUG, VHOST_DATA,
				"Vhost TX queue drained after timeout with burst size %u\n",
				vhost_txq->len);
			drain_vhost(vdev);
			vhost_txq->len = 0;
			vhost_txq->pre_tsc = cur_tsc;
		}
	}
}
/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;
	struct vhost_bufftable *vhost_txq;
	uint16_t lcore_id = rte_lcore_id();
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC addresses are the same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
	vhost_txq->m_table[vhost_txq->len++] = m;

	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
		drain_vhost(dst_vdev);
		vhost_txq->len = 0;
		vhost_txq->pre_tsc = rte_rdtsc();
	}
	return 0;
}
/*
 * Check if the destination MAC of a packet belongs to a local VM, and if so
 * get its VLAN tag and length offset.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC addresses are the same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip will reduce the packet length by the length of the
	 * VLAN tag, so we need to restore the packet length by adding it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}
static void virtio_tx_offload(struct rte_mbuf *m)
{
	struct rte_net_hdr_lens hdr_lens;
	struct rte_ipv4_hdr *ipv4_hdr;
	struct rte_tcp_hdr *tcp_hdr;
	uint32_t ptype;
	void *l3_hdr;

	ptype = rte_net_get_ptype(m, &hdr_lens, RTE_PTYPE_ALL_MASK);
	m->l2_len = hdr_lens.l2_len;
	m->l3_len = hdr_lens.l3_len;
	m->l4_len = hdr_lens.l4_len;

	l3_hdr = rte_pktmbuf_mtod_offset(m, void *, m->l2_len);
	tcp_hdr = rte_pktmbuf_mtod_offset(m, struct rte_tcp_hdr *,
		m->l2_len + m->l3_len);
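	/*
	 * For TSO the NIC expects the TCP checksum field to be pre-filled
	 * with the pseudo-header checksum only; rte_ipv4/6_phdr_cksum()
	 * below computes exactly that.
	 */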
	m->ol_flags |= PKT_TX_TCP_SEG;
	if ((ptype & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV4) {
		m->ol_flags |= PKT_TX_IPV4;
		m->ol_flags |= PKT_TX_IP_CKSUM;
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		tcp_hdr->cksum = rte_ipv4_phdr_cksum(l3_hdr, m->ol_flags);
	} else { /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		m->ol_flags |= PKT_TX_IPV6;
		tcp_hdr->cksum = rte_ipv6_phdr_cksum(l3_hdr, m->ol_flags);
	}
}
static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}
/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				sync_virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* Check if the destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
		return;

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port TX queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right segment to adjust the data length when the
		 * offset is bigger than the tailroom size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_RX_LRO)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}
static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}
static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);

	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, we wait and retry when there are not
	 * enough free slots in the queue to hold @rx_count packets, to
	 * reduce packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else if (async_vhost_driver) {
		uint32_t cpu_cpl_nr = 0;
		uint16_t enqueue_fail = 0;
		struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];

		complete_async_pkts(vdev);
		enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
					VIRTIO_RXQ, pkts, rx_count,
					m_cpu_cpl, &cpu_cpl_nr);
		__atomic_add_fetch(&vdev->pkts_inflight, enqueue_count - cpu_cpl_nr,
					__ATOMIC_SEQ_CST);

		if (cpu_cpl_nr)
			free_pkts(m_cpu_cpl, cpu_cpl_nr);

		enqueue_fail = rx_count - enqueue_count;
		if (enqueue_fail)
			free_pkts(&pkts[enqueue_count], enqueue_fail);
	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}

	if (enable_stats) {
		__atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
				__ATOMIC_SEQ_CST);
		__atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
				__ATOMIC_SEQ_CST);
	}

	if (!async_vhost_driver)
		free_pkts(pkts, rx_count);
}
static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}
/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth RX queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio RX ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio TX queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in the function
 *      "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);
		drain_vhost_table();
		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}
/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;
	uint16_t i;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	for (i = 0; i < RTE_MAX_LCORE; i++)
		rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_WORKER(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	if (async_vhost_driver) {
		uint16_t n_pkt = 0;
		struct rte_mbuf *m_cpl[vdev->pkts_inflight];

		while (vdev->pkts_inflight) {
			n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, VIRTIO_RXQ,
						m_cpl, vdev->pkts_inflight);
			free_pkts(m_cpl, n_pkt);
			__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
		}

		rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
	}

	rte_free(vdev);
}
/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint16_t i;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	for (i = 0; i < RTE_MAX_LCORE; i++) {
		vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
			= rte_zmalloc("vhost bufftable",
				sizeof(struct vhost_bufftable),
				RTE_CACHE_LINE_SIZE);

		if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
			RTE_LOG(INFO, VHOST_DATA,
				"(%d) couldn't allocate memory for vhost TX\n", vid);
			return -1;
		}
	}

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
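	/* I.e. this device owns the first queue of its dedicated VMDq pool. */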
	/* Reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_WORKER(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);
	if (async_vhost_driver) {
		struct rte_vhost_async_config config = {0};
		struct rte_vhost_async_channel_ops channel_ops;

		if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
			channel_ops.transfer_data = ioat_transfer_data_cb;
			channel_ops.check_completed_copies =
				ioat_check_completed_copies_cb;

			config.features = RTE_VHOST_ASYNC_INORDER;
			config.async_threshold = 256;
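			/*
			 * Packets up to async_threshold bytes are copied by
			 * the CPU; only larger ones are offloaded to the
			 * DMA engine.
			 */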
			return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
				config, &channel_ops);
		}
	}

	return 0;
}
static int
vring_state_changed(int vid, uint16_t queue_id, int enable)
{
	struct vhost_dev *vdev = NULL;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return -1;

	if (queue_id != VIRTIO_RXQ)
		return 0;

	if (async_vhost_driver) {
		if (!enable) {
			uint16_t n_pkt = 0;
			struct rte_mbuf *m_cpl[vdev->pkts_inflight];

			while (vdev->pkts_inflight) {
				n_pkt = rte_vhost_clear_queue_thread_unsafe(vid, queue_id,
							m_cpl, vdev->pkts_inflight);
				free_pkts(m_cpl, n_pkt);
				__atomic_sub_fetch(&vdev->pkts_inflight, n_pkt, __ATOMIC_SEQ_CST);
			}
		}
	}

	return 0;
}
/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
	.vring_state_changed = vring_state_changed,
};
/*
 * This is a thread that wakes up periodically to print statistics if the
 * user has enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total   = __atomic_load_n(&vdev->stats.rx_total_atomic,
				__ATOMIC_SEQ_CST);
			rx         = __atomic_load_n(&vdev->stats.rx_atomic,
				__ATOMIC_SEQ_CST);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:              %" PRIu64 "\n"
				"TX dropped:            %" PRIu64 "\n"
				"TX successful:         %" PRIu64 "\n"
				"RX total:              %" PRIu64 "\n"
				"RX dropped:            %" PRIu64 "\n"
				"RX successful:         %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");
	}

	return NULL;
}
static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Fail to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}
/* When we receive an INT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}
/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
 *
 * - For each switch core (a CPU core that does the packet switching),
 *   we need to also make some reservation for receiving the packets
 *   from the virtio Tx queue. How many is enough depends on the usage.
 *   It's normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So, we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should serve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, we have allocated
 *   enough mbufs to fill up the mbuf cache.
 */
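/*
 * Illustrative sizing, assuming MAX_PKT_BURST is 32 and the default
 * 2176-byte mbuf data room (128-byte headroom): each switch core reserves
 * roughly (1500 + 2176) * 32 / 2048 = 57 mbufs plus 1024 RX descriptors,
 * while the 128 queues * 1024 descriptors term dominates the per-port
 * total at 131072 mbufs.
 */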
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}
/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");
	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global variable num_ports and the global ports array,
	 * and get the number of valid ports according to the system port count.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback so the NIC's internal L2 switch handles VM-to-VM traffic. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}
	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;

		if (async_vhost_driver)
			flags = flags | RTE_VHOST_USER_ASYNC_COPY;

		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous)
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}
	RTE_LCORE_FOREACH_WORKER(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	/* clean up the EAL */
	rte_eal_cleanup();

	return 0;
}