/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2017 Intel Corporation
 */

#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>
#include <rte_vhost.h>
#include <rte_ip.h>
#include <rte_tcp.h>
#include <rte_pause.h>

#include "main.h"
#ifndef MAX_QUEUES
#define MAX_QUEUES 128
#endif

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

#define MBUF_CACHE_SIZE	128
#define MBUF_DATA_SIZE	RTE_MBUF_DEFAULT_BUF_SIZE

#define BURST_TX_DRAIN_US 100	/* TX drain every ~100us */

#define BURST_RX_WAIT_US 15	/* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4	/* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE	0x2600	/* 9728 bytes */

/* State of virtio device. */
#define DEVICE_MAC_LEARNING	0
#define DEVICE_RX		1
#define DEVICE_SAFE_REMOVE	2

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512

#define INVALID_PORT_ID 0xFF

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64
61 /* mask of enabled ports */
62 static uint32_t enabled_port_mask = 0;
64 /* Promiscuous mode */
65 static uint32_t promiscuous;
67 /* number of devices/queues to support*/
68 static uint32_t num_queues = 0;
69 static uint32_t num_devices;
71 static struct rte_mempool *mbuf_pool;
74 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
81 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;

/* Disable TX checksum offload */
static uint32_t enable_tx_csum;

/* Disable TSO offload */
static uint32_t enable_tso;

static int client_mode;
static int dequeue_zero_copy;

static int builtin_net_driver;

/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Socket file paths. Can be set by user */
static char *socket_files;
static int nb_sockets;
/* Empty VMDQ configuration structure. Filled in programmatically. */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		/*
		 * VLAN strip is necessary for 1G NICs such as I350,
		 * and fixes the bug where IPv4 forwarding in the guest
		 * can't forward packets from one virtio dev to another.
		 */
		.offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
		.offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
			     DEV_TX_OFFLOAD_TCP_CKSUM |
			     DEV_TX_OFFLOAD_VLAN_INSERT |
			     DEV_TX_OFFLOAD_MULTI_SEGS |
			     DEV_TX_OFFLOAD_TCP_TSO),
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint16_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */
static uint16_t num_pf_queues, num_vmdq_queues;
static uint16_t vmdq_pool_base, vmdq_queue_base;
static uint16_t queues_per_pool;

const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
/* ethernet addresses of ports */
static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

static struct vhost_dev_tailq_list vhost_dev_list =
	TAILQ_HEAD_INITIALIZER(vhost_dev_list);

static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

#define MBUF_TABLE_DRAIN_TSC	((rte_get_tsc_hz() + US_PER_S - 1) \
				 / US_PER_S * BURST_TX_DRAIN_US)

#define VLAN_HLEN	4
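/*
 * Worked example for MBUF_TABLE_DRAIN_TSC (a sketch, assuming a 2.0 GHz
 * TSC): the ticks-per-microsecond value is rounded up,
 * (2000000000 + 1000000 - 1) / 1000000 = 2000, then scaled by
 * BURST_TX_DRAIN_US, giving a drain period of 200000 TSC ticks, i.e.
 * roughly 100 us between forced flushes of a partially filled burst.
 */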
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	struct rte_eth_vmdq_rx_conf *def_conf =
		&vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back = def_conf->enable_loop_back;
	conf.rx_mode = def_conf->rx_mode;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		   sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));

	return 0;
}
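/*
 * For illustration: with num_devices = 8, the loop above produces the map
 * VLAN 1000 -> pool 0, 1001 -> pool 1, ..., 1007 -> pool 7, so each vhost
 * device owns one VMDQ pool and incoming frames are steered to it purely
 * by their VLAN tag.
 */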
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter.
 */
static inline int
port_init(uint16_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	struct rte_eth_rxconf *rxconf;
	struct rte_eth_txconf *txconf;
	int16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	retval = rte_eth_dev_info_get(port, &dev_info);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Error during getting device (port %u) info: %s\n",
			port, strerror(-retval));

		return retval;
	}

	rxconf = &dev_info.default_rxconf;
	txconf = &dev_info.default_txconf;
	rxconf->rx_drop_en = 1;

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;

	rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
	tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;

	/*
	 * When dequeue zero copy is enabled, the guest Tx used vring will be
	 * updated only when the corresponding mbuf is freed. Thus, the nb_tx_desc
	 * (tx_ring_size here) must be small enough so that the driver will
	 * hit the free threshold easily and free mbufs timely. Otherwise,
	 * the guest Tx vring would be starved.
	 */
	if (dequeue_zero_copy)
		tx_ring_size = 64;

	tx_rings = (uint16_t)rte_lcore_count();
	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;
	/* NIC queues are divided into pf queues and vmdq queues. */
	num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
	queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
	num_vmdq_queues = num_devices * queues_per_pool;
	num_queues = num_pf_queues + num_vmdq_queues;
	vmdq_queue_base = dev_info.vmdq_queue_base;
	vmdq_pool_base = dev_info.vmdq_pool_base;
	printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
		num_pf_queues, num_devices, queues_per_pool);

	if (!rte_eth_dev_is_valid_port(port))
		return -1;

	rx_rings = (uint16_t)dev_info.max_rx_queues;
	if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
		port_conf.txmode.offloads |=
			DEV_TX_OFFLOAD_MBUF_FAST_FREE;
	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
			port, strerror(-retval));
		return retval;
	}

	retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
		&tx_ring_size);
	if (retval != 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
			"for port %u: %s.\n", port, strerror(-retval));
		return retval;
	}
	if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
		RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
			"for Rx queues on port %u.\n", port);
		return -1;
	}
	/* Setup the queues. */
	rxconf->offloads = port_conf.rxmode.offloads;
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
						rte_eth_dev_socket_id(port),
						rxconf,
						mbuf_pool);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup rx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}
	txconf->offloads = port_conf.txmode.offloads;
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
						rte_eth_dev_socket_id(port),
						txconf);
		if (retval < 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to setup tx queue %u of port %u: %s.\n",
				q, port, strerror(-retval));
			return retval;
		}
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
			port, strerror(-retval));
		return retval;
	}

	if (promiscuous) {
		retval = rte_eth_promiscuous_enable(port);
		if (retval != 0) {
			RTE_LOG(ERR, VHOST_PORT,
				"Failed to enable promiscuous mode on port %u: %s\n",
				port, rte_strerror(-retval));
			return retval;
		}
	}

	retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_PORT,
			"Failed to get MAC address on port %u: %s\n",
			port, rte_strerror(-retval));
		return retval;
	}

	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
		" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
		port,
		vmdq_ports_eth_addr[port].addr_bytes[0],
		vmdq_ports_eth_addr[port].addr_bytes[1],
		vmdq_ports_eth_addr[port].addr_bytes[2],
		vmdq_ports_eth_addr[port].addr_bytes[3],
		vmdq_ports_eth_addr[port].addr_bytes[4],
		vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}
/*
 * Set socket file path.
 */
static int
us_vhost_parse_socket_path(const char *q_arg)
{
	char *old;

	/* Reject paths that don't fit in PATH_MAX. */
	if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
		return -1;

	old = socket_files;
	socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
	if (socket_files == NULL) {
		free(old);
		return -1;
	}

	strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
	nb_sockets++;

	return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return 0;

	return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}
/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--socket-file <path>\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on RX. Retries are attempted if the destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if RX retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on RX. Only takes effect if RX retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--socket-file: The path of the socket file.\n"
	"		--tx-csum [0|1]: disable/enable TX checksum offload.\n"
	"		--tso [0|1]: disable/enable TCP segmentation offload (TSO).\n"
	"		--client: register a vhost-user socket as client mode.\n"
	"		--dequeue-zero-copy: enable dequeue zero copy\n",
	       prgname);
}
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"socket-file", required_argument, NULL, 0},
		{"tx-csum", required_argument, NULL, 0},
		{"tso", required_argument, NULL, 0},
		{"client", no_argument, &client_mode, 1},
		{"dequeue-zero-copy", no_argument, &dequeue_zero_copy, 1},
		{"builtin-net-driver", no_argument, &builtin_net_driver, 1},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:P",
			long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 'P':
			promiscuous = 1;
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
				ETH_VMDQ_ACCEPT_BROADCAST |
				ETH_VMDQ_ACCEPT_MULTICAST;
			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				vm2vm_mode = (vm2vm_type)ret;
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				enable_retry = ret;
			}
			/* Enable/disable TX checksum offload. */
			if (!strncmp(long_option[option_index].name, "tx-csum", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				enable_tx_csum = ret;
			}

			/* Enable/disable TSO offload. */
			if (!strncmp(long_option[option_index].name, "tso", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				enable_tso = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				burst_rx_delay_time = ret;
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				burst_rx_retry_num = ret;
			}
			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				mergeable = !!ret;
				if (ret) {
					vmdq_conf_default.rxmode.offloads |=
						DEV_RX_OFFLOAD_JUMBO_FRAME;
					vmdq_conf_default.rxmode.max_rx_pkt_len
						= JUMBO_FRAME_MAX_SIZE;
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				enable_stats = ret;
			}

			/* Set socket file path. */
			if (!strncmp(long_option[option_index].name,
						"socket-file", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_socket_path(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for socket name (Max %d characters)\n",
						PATH_MAX);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}
	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	return 0;
}
/*
 * Update the global var NUM_PORTS and array PORTS according to the system
 * port count, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (!rte_eth_dev_is_valid_port(ports[portid])) {
			RTE_LOG(INFO, VHOST_PORT,
				"\nSpecified port ID(%u) is not valid\n",
				ports[portid]);
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}
static __rte_always_inline struct vhost_dev *
find_vhost_dev(struct rte_ether_addr *mac)
{
	struct vhost_dev *vdev;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->ready == DEVICE_RX &&
		    rte_is_same_ether_addr(mac, &vdev->mac_address))
			return vdev;
	}

	return NULL;
}

/*
 * This function learns the MAC address of the device and registers it along
 * with a VLAN tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	if (find_vhost_dev(&pkt_hdr->s_addr)) {
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) device is using a registered MAC!\n",
			vdev->vid);
		return -1;
	}

	for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
		vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	vdev->vlan_tag = vlan_tags[vdev->vid];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA,
		"(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
		vdev->vid,
		vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
		vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
		vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
		vdev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
				(uint32_t)vdev->vid + vmdq_pool_base);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA,
			"(%d) failed to add device MAC address to VMDQ\n",
			vdev->vid);

	rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	vdev->ready = DEVICE_RX;

	return 0;
}
/*
 * Removes the MAC address and VLAN tag from VMDQ. Ensures that nothing is
 * adding buffers to the RX queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (vdev->ready == DEVICE_RX) {
		/* Clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
		for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
			vdev->mac_address.addr_bytes[i] = 0;

		vdev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		vdev->ready = DEVICE_MAC_LEARNING;
	}
}
static __rte_always_inline void
virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
	    struct rte_mbuf *m)
{
	uint16_t ret;

	if (builtin_net_driver) {
		ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
	} else {
		ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
	}

	if (enable_stats) {
		rte_atomic64_inc(&dst_vdev->stats.rx_total_atomic);
		rte_atomic64_add(&dst_vdev->stats.rx_atomic, ret);
		src_vdev->stats.tx_total++;
		src_vdev->stats.tx += ret;
	}
}

/*
 * Check if the packet destination MAC address is for a local device. If so
 * then put the packet on that device's RX queue. If not then return.
 */
static __rte_always_inline int
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
	struct rte_ether_hdr *pkt_hdr;
	struct vhost_dev *dst_vdev;

	pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return -1;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return 0;
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is local\n", dst_vdev->vid);

	if (unlikely(dst_vdev->remove)) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) device is marked for removal\n", dst_vdev->vid);
		return 0;
	}

	virtio_xmit(dst_vdev, vdev, m);
	return 0;
}
/*
 * Check if the destination MAC of a packet is a local VM, and if so get its
 * VLAN tag and the length offset to restore.
 */
static __rte_always_inline int
find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
	uint32_t *offset, uint16_t *vlan_tag)
{
	struct vhost_dev *dst_vdev;
	struct rte_ether_hdr *pkt_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
	if (!dst_vdev)
		return 0;

	if (vdev->vid == dst_vdev->vid) {
		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"(%d) TX: src and dst MAC is same. Dropping packet.\n",
			vdev->vid);
		return -1;
	}

	/*
	 * HW VLAN strip shortens the packet by the length of the VLAN tag,
	 * so the packet length has to be restored by adding it back.
	 */
	*offset = VLAN_HLEN;
	*vlan_tag = vlan_tags[vdev->vid];

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
		vdev->vid, dst_vdev->vid, *vlan_tag);

	return 0;
}
static uint16_t
get_psd_sum(void *l3_hdr, uint64_t ol_flags)
{
	if (ol_flags & PKT_TX_IPV4)
		return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
	else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
		return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
}

static void virtio_tx_offload(struct rte_mbuf *m)
{
	void *l3_hdr;
	struct rte_ipv4_hdr *ipv4_hdr = NULL;
	struct rte_tcp_hdr *tcp_hdr = NULL;
	struct rte_ether_hdr *eth_hdr =
		rte_pktmbuf_mtod(m, struct rte_ether_hdr *);

	l3_hdr = (char *)eth_hdr + m->l2_len;

	if (m->ol_flags & PKT_TX_IPV4) {
		ipv4_hdr = l3_hdr;
		ipv4_hdr->hdr_checksum = 0;
		m->ol_flags |= PKT_TX_IP_CKSUM;
	}

	tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
	tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
}
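/*
 * Note on the checksum seeding above: for TSO, the NIC expects the TCP
 * checksum field to be pre-filled with the pseudo-header checksum only
 * (no payload), which is what rte_ipv4_phdr_cksum()/rte_ipv6_phdr_cksum()
 * compute; the hardware then finalizes the checksum of each segment.
 */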
static __rte_always_inline void
free_pkts(struct rte_mbuf **pkts, uint16_t n)
{
	while (n--)
		rte_pktmbuf_free(pkts[n]);
}

static __rte_always_inline void
do_drain_mbuf_table(struct mbuf_table *tx_q)
{
	uint16_t count;

	count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
				 tx_q->m_table, tx_q->len);
	if (unlikely(count < tx_q->len))
		free_pkts(&tx_q->m_table[count], tx_q->len - count);

	tx_q->len = 0;
}
/*
 * This function routes the TX packet to the correct interface. This
 * may be a local device or the physical port.
 */
static __rte_always_inline void
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	unsigned offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct rte_ether_hdr *nh;

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
		struct vhost_dev *vdev2;

		TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
			if (vdev2 != vdev)
				virtio_xmit(vdev2, vdev, m);
		}
		goto queue2nic;
	}

	/* Check if destination is a local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
		rte_pktmbuf_free(m);
		return;
	}

	if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
		if (unlikely(find_local_dest(vdev, m, &offset,
					     &vlan_tag) != 0)) {
			rte_pktmbuf_free(m);
			return;
		}
	}

	RTE_LOG_DP(DEBUG, VHOST_DATA,
		"(%d) TX: MAC address is external\n", vdev->vid);

queue2nic:

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];

	nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
	if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
		/* Guest has inserted the vlan tag. */
		struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
		uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
		if ((vm2vm_mode == VM2VM_HARDWARE) &&
			(vh->vlan_tci != vlan_tag_be))
			vh->vlan_tci = vlan_tag_be;
	} else {
		m->ol_flags |= PKT_TX_VLAN_PKT;

		/*
		 * Find the right seg to adjust the data len when offset is
		 * bigger than tail room size.
		 */
		if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
			if (likely(offset <= rte_pktmbuf_tailroom(m)))
				m->data_len += offset;
			else {
				struct rte_mbuf *seg = m;

				while ((seg->next != NULL) &&
					(offset > rte_pktmbuf_tailroom(seg)))
					seg = seg->next;

				seg->data_len += offset;
			}
			m->pkt_len += offset;
		}

		m->vlan_tci = vlan_tag;
	}

	if (m->ol_flags & PKT_TX_TCP_SEG)
		virtio_tx_offload(m);

	tx_q->m_table[tx_q->len++] = m;
	if (enable_stats) {
		vdev->stats.tx_total++;
		vdev->stats.tx++;
	}

	if (unlikely(tx_q->len == MAX_PKT_BURST))
		do_drain_mbuf_table(tx_q);
}
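/*
 * Routing summary for virtio_tx_route() (restating the code above):
 *   - broadcast dst MAC: replicate to every other vhost device, then also
 *     queue the packet to the NIC;
 *   - VM2VM_SOFTWARE: packets whose dst MAC belongs to a local vhost
 *     device are handed to virtio_tx_local() and never reach the NIC;
 *   - VM2VM_HARDWARE: packets always go to the NIC, tagged with the
 *     destination's VLAN so the NIC's VMDQ switch (with loopback enabled)
 *     forwards them back;
 *   - everything else goes out through the physical port.
 */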
static __rte_always_inline void
drain_mbuf_table(struct mbuf_table *tx_q)
{
	static uint64_t prev_tsc;
	uint64_t cur_tsc;

	if (tx_q->len == 0)
		return;

	cur_tsc = rte_rdtsc();
	if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
		prev_tsc = cur_tsc;

		RTE_LOG_DP(DEBUG, VHOST_DATA,
			"TX queue drained after timeout with burst size %u\n",
			tx_q->len);
		do_drain_mbuf_table(tx_q);
	}
}

static __rte_always_inline void
drain_eth_rx(struct vhost_dev *vdev)
{
	uint16_t rx_count, enqueue_count;
	struct rte_mbuf *pkts[MAX_PKT_BURST];

	rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
				    pkts, MAX_PKT_BURST);
	if (!rx_count)
		return;

	/*
	 * When "enable_retry" is set, we wait and retry when there are
	 * not enough free slots in the queue to hold @rx_count packets,
	 * to diminish packet loss.
	 */
	if (enable_retry &&
	    unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
			VIRTIO_RXQ))) {
		uint32_t retry;

		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			if (rx_count <= rte_vhost_avail_entries(vdev->vid,
					VIRTIO_RXQ))
				break;
		}
	}

	if (builtin_net_driver) {
		enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
						pkts, rx_count);
	} else {
		enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
						pkts, rx_count);
	}
	if (enable_stats) {
		rte_atomic64_add(&vdev->stats.rx_total_atomic, rx_count);
		rte_atomic64_add(&vdev->stats.rx_atomic, enqueue_count);
	}

	free_pkts(pkts, rx_count);
}

static __rte_always_inline void
drain_virtio_tx(struct vhost_dev *vdev)
{
	struct rte_mbuf *pkts[MAX_PKT_BURST];
	uint16_t count;
	uint16_t i;

	if (builtin_net_driver) {
		count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
					pkts, MAX_PKT_BURST);
	} else {
		count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
					mbuf_pool, pkts, MAX_PKT_BURST);
	}

	/* setup VMDq for the first packet */
	if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
		if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
			free_pkts(pkts, count);
	}

	for (i = 0; i < count; ++i)
		virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
}
/*
 * Main function of vhost-switch. It basically does:
 *
 * for each vhost device {
 *    - drain_eth_rx()
 *
 *      Which drains the host eth Rx queue linked to the vhost device,
 *      and delivers all of the packets to the guest virtio Rx ring
 *      associated with this vhost device.
 *
 *    - drain_virtio_tx()
 *
 *      Which drains the guest virtio Tx queue and delivers all of the
 *      packets to the target, which could be another vhost device, or
 *      the physical eth dev. The routing is done in "virtio_tx_route".
 * }
 */
static int
switch_worker(void *arg __rte_unused)
{
	unsigned i;
	unsigned lcore_id = rte_lcore_id();
	struct vhost_dev *vdev;
	struct mbuf_table *tx_q;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < rte_lcore_count(); i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		drain_mbuf_table(tx_q);

		/*
		 * Inform the configuration core that we have exited the
		 * linked list and that no devices are in use if requested.
		 */
		if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process vhost devices
		 */
		TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
			      lcore_vdev_entry) {
			if (unlikely(vdev->remove)) {
				unlink_vmdq(vdev);
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX))
				drain_eth_rx(vdev);

			if (likely(!vdev->remove))
				drain_virtio_tx(vdev);
		}
	}

	return 0;
}
/*
 * Remove a device from the specific data core linked list and from the
 * main linked list. Synchronization occurs through the use of the
 * lcore dev_removal_flag. The device is made volatile here to avoid
 * re-ordering of dev->remove=1, which can cause an infinite loop in the
 * rte_pause loop.
 */
static void
destroy_device(int vid)
{
	struct vhost_dev *vdev = NULL;
	int lcore;

	TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
		if (vdev->vid == vid)
			break;
	}
	if (!vdev)
		return;
	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE)
		rte_pause();

	if (builtin_net_driver)
		vs_vhost_net_remove(vdev);

	TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
		     lcore_vdev_entry);
	TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore)
		lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
	 * we can be sure that they can no longer access the device removed
	 * from the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	lcore_info[vdev->coreid].device_num--;

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been removed from data core\n",
		vdev->vid);

	rte_free(vdev);
}
/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(int vid)
{
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA,
			"(%d) couldn't allocate memory for vhost dev\n",
			vid);
		return -1;
	}
	vdev->vid = vid;

	if (builtin_net_driver)
		vs_vhost_net_setup(vdev);

	TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
	vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;

	/* Reset ready flag */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].device_num < device_num_min) {
			device_num_min = lcore_info[lcore].device_num;
			core_add = lcore;
		}
	}
	vdev->coreid = core_add;

	TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
			  lcore_vdev_entry);
	lcore_info[vdev->coreid].device_num++;

	/* Disable notifications. */
	rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
	rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);

	RTE_LOG(INFO, VHOST_DATA,
		"(%d) device has been added to data core %d\n",
		vid, vdev->coreid);

	return 0;
}
/*
 * These callbacks allow devices to be added to the data core when
 * configuration has fully completed.
 */
static const struct vhost_device_ops virtio_net_device_ops =
{
	.new_device =  new_device,
	.destroy_device = destroy_device,
};

/*
 * This thread wakes up periodically to print statistics if the user has
 * enabled them.
 */
static void *
print_stats(__rte_unused void *arg)
{
	struct vhost_dev *vdev;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s\n", clr, top_left);
		printf("Device statistics =================================\n");

		TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
			tx_total   = vdev->stats.tx_total;
			tx         = vdev->stats.tx;
			tx_dropped = tx_total - tx;

			rx_total   = rte_atomic64_read(&vdev->stats.rx_total_atomic);
			rx         = rte_atomic64_read(&vdev->stats.rx_atomic);
			rx_dropped = rx_total - rx;

			printf("Statistics for device %d\n"
				"-----------------------\n"
				"TX total:              %" PRIu64 "\n"
				"TX dropped:            %" PRIu64 "\n"
				"TX successful:         %" PRIu64 "\n"
				"RX total:              %" PRIu64 "\n"
				"RX dropped:            %" PRIu64 "\n"
				"RX successful:         %" PRIu64 "\n",
				vdev->vid,
				tx_total, tx_dropped, tx,
				rx_total, rx_dropped, rx);
		}

		printf("===================================================\n");

		fflush(stdout);
	}

	return NULL;
}
static void
unregister_drivers(int socket_num)
{
	int i, ret;

	for (i = 0; i < socket_num; i++) {
		ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
		if (ret != 0)
			RTE_LOG(ERR, VHOST_CONFIG,
				"Failed to unregister vhost driver for %s.\n",
				socket_files + i * PATH_MAX);
	}
}

/* When we receive a SIGINT signal, unregister the vhost driver */
static void
sigint_handler(__rte_unused int signum)
{
	/* Unregister vhost driver. */
	unregister_drivers(nb_sockets);

	exit(0);
}

/*
 * While creating an mbuf pool, one key thing is to figure out how
 * many mbuf entries are enough for our use. FYI, here are some
 * guidelines:
 *
 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage.
 *
 * - For each switch core (a CPU core that does the packet switching),
 *   we also need to reserve some mbufs for receiving the packets from
 *   the virtio Tx queue. How many is enough depends on the usage. It's
 *   normally a simple calculation like the following:
 *
 *       MAX_PKT_BURST * max packet size / mbuf size
 *
 *   So we definitely need to allocate more mbufs when TSO is enabled.
 *
 * - Similarly, for each switching core, we should reserve @nr_rx_desc
 *   mbufs for receiving the packets from the physical NIC device.
 *
 * - We also need to make sure, for each switch core, that we have
 *   allocated enough mbufs to fill up the mbuf cache.
 */
static void
create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
	uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
{
	uint32_t nr_mbufs;
	uint32_t nr_mbufs_per_core;
	uint32_t mtu = 1500;

	if (mergeable)
		mtu = 9000;
	if (enable_tso)
		mtu = 64 * 1024;

	nr_mbufs_per_core  = (mtu + mbuf_size) * MAX_PKT_BURST /
			(mbuf_size - RTE_PKTMBUF_HEADROOM);
	nr_mbufs_per_core += nr_rx_desc;
	nr_mbufs_per_core  = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);

	nr_mbufs  = nr_queues * nr_rx_desc;
	nr_mbufs += nr_mbufs_per_core * nr_switch_core;
	nr_mbufs *= nr_port;

	mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
					    nr_mbuf_cache, 0, mbuf_size,
					    rte_socket_id());
	if (mbuf_pool == NULL)
		rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
}
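/*
 * Worked example of the sizing above (a sketch with illustrative numbers,
 * assuming MAX_PKT_BURST = 32, RTE_PKTMBUF_HEADROOM = 128 and the default
 * mbuf_size of RTE_MBUF_DEFAULT_BUF_SIZE = 2176): with mtu = 1500,
 * nr_mbufs_per_core = (1500 + 2176) * 32 / (2176 - 128) = 57, plus
 * nr_rx_desc = 1024 gives 1081. With nr_queues = 128 and one port,
 * nr_mbufs = 128 * 1024 + 1081 * nr_switch_core, i.e. roughly 134K mbufs
 * for three switch cores.
 */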
/*
 * Main function, does initialisation and calls the per-lcore functions.
 */
int
main(int argc, char *argv[])
{
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret, i;
	uint16_t portid;
	static pthread_t tid;
	uint64_t flags = 0;

	signal(SIGINT, sigint_handler);

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
		TAILQ_INIT(&lcore_info[lcore_id].vdev_list);

		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;
	}

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count_avail();

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get value of var VALID_NUM_PORTS according to system ports number
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}
	/*
	 * FIXME: here we are trying to allocate mbufs big enough for
	 * @MAX_QUEUES, but the truth is we're never going to use that
	 * many queues here. We probably should only do allocation for
	 * those queues we are going to use.
	 */
	create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
			 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/* Enable VT loopback to let the NIC's L2 switch do the forwarding. */
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
		RTE_LOG(DEBUG, VHOST_CONFIG,
			"Enable loop back for L2 switch in vmdq.\n");
	}

	/* initialize all ports */
	RTE_ETH_FOREACH_DEV(portid) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Enable stats if the user option is set. */
	if (enable_stats) {
		ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
					print_stats, NULL);
		if (ret < 0)
			rte_exit(EXIT_FAILURE,
				"Cannot create print-stats thread\n");
	}

	/* Launch all data cores. */
	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_remote_launch(switch_worker, NULL, lcore_id);
	if (client_mode)
		flags |= RTE_VHOST_USER_CLIENT;

	if (dequeue_zero_copy)
		flags |= RTE_VHOST_USER_DEQUEUE_ZERO_COPY;

	/* Register vhost user driver to handle vhost messages. */
	for (i = 0; i < nb_sockets; i++) {
		char *file = socket_files + i * PATH_MAX;
		ret = rte_vhost_driver_register(file, flags);
		if (ret != 0) {
			unregister_drivers(i);
			rte_exit(EXIT_FAILURE,
				"vhost driver register failure.\n");
		}

		if (builtin_net_driver)
			rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);

		if (mergeable == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_MRG_RXBUF);
		}

		if (enable_tx_csum == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_CSUM);
		}

		if (enable_tso == 0) {
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_HOST_TSO6);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO4);
			rte_vhost_driver_disable_features(file,
				1ULL << VIRTIO_NET_F_GUEST_TSO6);
		}

		if (promiscuous) {
			rte_vhost_driver_enable_features(file,
				1ULL << VIRTIO_NET_F_CTRL_RX);
		}

		ret = rte_vhost_driver_callback_register(file,
			&virtio_net_device_ops);
		if (ret != 0) {
			rte_exit(EXIT_FAILURE,
				"failed to register vhost driver callbacks.\n");
		}

		if (rte_vhost_driver_start(file) < 0) {
			rte_exit(EXIT_FAILURE,
				"failed to start vhost driver.\n");
		}
	}

	RTE_LCORE_FOREACH_SLAVE(lcore_id)
		rte_eal_wait_lcore(lcore_id);

	return 0;
}