1 /* SPDX-License-Identifier: BSD-3-Clause
2 * Copyright(c) 2010-2017 Intel Corporation
7 #include <linux/if_ether.h>
8 #include <linux/if_vlan.h>
9 #include <linux/virtio_net.h>
10 #include <linux/virtio_ring.h>
13 #include <sys/eventfd.h>
14 #include <sys/param.h>
17 #include <rte_cycles.h>
18 #include <rte_ethdev.h>
20 #include <rte_string_fns.h>
21 #include <rte_malloc.h>
22 #include <rte_vhost.h>
25 #include <rte_pause.h>
31 #define MAX_QUEUES 128
34 /* the maximum number of external ports supported */
35 #define MAX_SUP_PORTS 1
37 #define MBUF_CACHE_SIZE 128
38 #define MBUF_DATA_SIZE RTE_MBUF_DEFAULT_BUF_SIZE
40 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
42 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
43 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
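/* Maximum RX packet length used when mergeable buffers turn on jumbo frames (0x2600 = 9728 bytes). */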
45 #define JUMBO_FRAME_MAX_SIZE 0x2600
47 /* State of virtio device. */
48 #define DEVICE_MAC_LEARNING 0
50 #define DEVICE_SAFE_REMOVE 2
52 /* Configurable number of RX/TX ring descriptors */
53 #define RTE_TEST_RX_DESC_DEFAULT 1024
54 #define RTE_TEST_TX_DESC_DEFAULT 512
56 #define INVALID_PORT_ID 0xFF
58 /* mask of enabled ports */
59 static uint32_t enabled_port_mask = 0;
61 /* Promiscuous mode */
62 static uint32_t promiscuous;
64 /* Number of devices/queues to support. */
65 static uint32_t num_queues = 0;
66 static uint32_t num_devices;
68 static struct rte_mempool *mbuf_pool;
71 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
78 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
81 static uint32_t enable_stats = 0;
82 /* Enable retries on RX. */
83 static uint32_t enable_retry = 1;
85 /* Disable TX checksum offload */
86 static uint32_t enable_tx_csum;
88 /* Disable TSO offload */
89 static uint32_t enable_tso;
91 static int client_mode;
93 static int builtin_net_driver;
95 static int async_vhost_driver;
97 static char *dma_type;
99 /* Specify the timeout (in microseconds) between retries on RX. */
100 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
101 /* Specify the number of retries on RX. */
102 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
104 /* Socket file paths. Can be set by user */
105 static char *socket_files;
106 static int nb_sockets;
108 /* Empty VMDQ configuration structure. Filled in programmatically. */
109 static struct rte_eth_conf vmdq_conf_default = {
111 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
114 * VLAN strip is necessary for 1G NICs such as the I350;
115 * it fixes a bug where IPv4 forwarding in the guest cannot
116 * forward packets from one virtio dev to another virtio dev.
118 .offloads = DEV_RX_OFFLOAD_VLAN_STRIP,
122 .mq_mode = ETH_MQ_TX_NONE,
123 .offloads = (DEV_TX_OFFLOAD_IPV4_CKSUM |
124 DEV_TX_OFFLOAD_TCP_CKSUM |
125 DEV_TX_OFFLOAD_VLAN_INSERT |
126 DEV_TX_OFFLOAD_MULTI_SEGS |
127 DEV_TX_OFFLOAD_TCP_TSO),
131 * should be overridden separately in code with
135 .nb_queue_pools = ETH_8_POOLS,
136 .enable_default_pool = 0,
139 .pool_map = {{0, 0},},
145 static unsigned lcore_ids[RTE_MAX_LCORE];
146 static uint16_t ports[RTE_MAX_ETHPORTS];
147 static unsigned num_ports = 0; /**< The number of ports specified in command line */
148 static uint16_t num_pf_queues, num_vmdq_queues;
149 static uint16_t vmdq_pool_base, vmdq_queue_base;
150 static uint16_t queues_per_pool;
152 const uint16_t vlan_tags[] = {
153 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
154 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
155 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
156 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
157 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
158 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
159 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
160 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
163 /* ethernet addresses of ports */
164 static struct rte_ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
166 static struct vhost_dev_tailq_list vhost_dev_list =
167 TAILQ_HEAD_INITIALIZER(vhost_dev_list);
169 static struct lcore_info lcore_info[RTE_MAX_LCORE];
171 /* Used for queueing bursts of TX packets. */
175 struct rte_mbuf *m_table[MAX_PKT_BURST];
178 struct vhost_bufftable {
181 struct rte_mbuf *m_table[MAX_PKT_BURST];
184 /* TX queue for each data core. */
185 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
188 * Vhost TX buffer for each data core.
189 * Every data core maintains a TX buffer for every vhost device,
190 * which is used to batch packet enqueues for higher performance.
192 struct vhost_bufftable *vhost_txbuff[RTE_MAX_LCORE * MAX_VHOST_DEVICE];
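/*
 * Number of TSC cycles in BURST_TX_DRAIN_US: the cycles-per-microsecond
 * value is rounded up before being scaled by the drain interval.
 */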
194 #define MBUF_TABLE_DRAIN_TSC ((rte_get_tsc_hz() + US_PER_S - 1) \
195 / US_PER_S * BURST_TX_DRAIN_US)
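/* Open the DMA channels described by @value; only the ioat backend is handled here. */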
199 open_dma(const char *value)
201 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0)
202 return open_ioat(value);
208 * Builds up the correct configuration for VMDQ VLAN pool map
209 * according to the pool & queue limits.
212 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
214 struct rte_eth_vmdq_rx_conf conf;
215 struct rte_eth_vmdq_rx_conf *def_conf =
216 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
219 memset(&conf, 0, sizeof(conf));
220 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
221 conf.nb_pool_maps = num_devices;
222 conf.enable_loop_back = def_conf->enable_loop_back;
223 conf.rx_mode = def_conf->rx_mode;
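/* Map one VLAN tag to each pool so that every virtio device (pool) receives its own VLAN. */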
225 for (i = 0; i < conf.nb_pool_maps; i++) {
226 conf.pool_map[i].vlan_id = vlan_tags[ i ];
227 conf.pool_map[i].pools = (1UL << i);
230 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
231 (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
232 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
237 * Initialises a given port using global settings and with the rx buffers
238 * coming from the mbuf_pool passed as a parameter.
241 port_init(uint16_t port)
243 struct rte_eth_dev_info dev_info;
244 struct rte_eth_conf port_conf;
245 struct rte_eth_rxconf *rxconf;
246 struct rte_eth_txconf *txconf;
247 int16_t rx_rings, tx_rings;
248 uint16_t rx_ring_size, tx_ring_size;
252 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
253 retval = rte_eth_dev_info_get(port, &dev_info);
255 RTE_LOG(ERR, VHOST_PORT,
256 "Error during getting device (port %u) info: %s\n",
257 port, strerror(-retval));
262 rxconf = &dev_info.default_rxconf;
263 txconf = &dev_info.default_txconf;
264 rxconf->rx_drop_en = 1;
266 /* Configure the number of supported virtio devices based on VMDQ limits. */
267 num_devices = dev_info.max_vmdq_pools;
269 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
270 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
272 tx_rings = (uint16_t)rte_lcore_count();
274 /* Get port configuration. */
275 retval = get_eth_conf(&port_conf, num_devices);
278 /* NIC queues are divided into pf queues and vmdq queues. */
279 num_pf_queues = dev_info.max_rx_queues - dev_info.vmdq_queue_num;
280 queues_per_pool = dev_info.vmdq_queue_num / dev_info.max_vmdq_pools;
281 num_vmdq_queues = num_devices * queues_per_pool;
282 num_queues = num_pf_queues + num_vmdq_queues;
283 vmdq_queue_base = dev_info.vmdq_queue_base;
284 vmdq_pool_base = dev_info.vmdq_pool_base;
285 printf("pf queue num: %u, configured vmdq pool num: %u, each vmdq pool has %u queues\n",
286 num_pf_queues, num_devices, queues_per_pool);
288 if (!rte_eth_dev_is_valid_port(port))
291 rx_rings = (uint16_t)dev_info.max_rx_queues;
292 if (dev_info.tx_offload_capa & DEV_TX_OFFLOAD_MBUF_FAST_FREE)
293 port_conf.txmode.offloads |=
294 DEV_TX_OFFLOAD_MBUF_FAST_FREE;
295 /* Configure ethernet device. */
296 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
298 RTE_LOG(ERR, VHOST_PORT, "Failed to configure port %u: %s.\n",
299 port, strerror(-retval));
303 retval = rte_eth_dev_adjust_nb_rx_tx_desc(port, &rx_ring_size,
306 RTE_LOG(ERR, VHOST_PORT, "Failed to adjust number of descriptors "
307 "for port %u: %s.\n", port, strerror(-retval));
310 if (rx_ring_size > RTE_TEST_RX_DESC_DEFAULT) {
311 RTE_LOG(ERR, VHOST_PORT, "Mbuf pool has an insufficient size "
312 "for Rx queues on port %u.\n", port);
316 /* Setup the queues. */
317 rxconf->offloads = port_conf.rxmode.offloads;
318 for (q = 0; q < rx_rings; q ++) {
319 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
320 rte_eth_dev_socket_id(port),
324 RTE_LOG(ERR, VHOST_PORT,
325 "Failed to setup rx queue %u of port %u: %s.\n",
326 q, port, strerror(-retval));
330 txconf->offloads = port_conf.txmode.offloads;
331 for (q = 0; q < tx_rings; q ++) {
332 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
333 rte_eth_dev_socket_id(port),
336 RTE_LOG(ERR, VHOST_PORT,
337 "Failed to setup tx queue %u of port %u: %s.\n",
338 q, port, strerror(-retval));
343 /* Start the device. */
344 retval = rte_eth_dev_start(port);
346 RTE_LOG(ERR, VHOST_PORT, "Failed to start port %u: %s\n",
347 port, strerror(-retval));
352 retval = rte_eth_promiscuous_enable(port);
354 RTE_LOG(ERR, VHOST_PORT,
355 "Failed to enable promiscuous mode on port %u: %s\n",
356 port, rte_strerror(-retval));
361 retval = rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
363 RTE_LOG(ERR, VHOST_PORT,
364 "Failed to get MAC address on port %u: %s\n",
365 port, rte_strerror(-retval));
369 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
370 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
371 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
373 vmdq_ports_eth_addr[port].addr_bytes[0],
374 vmdq_ports_eth_addr[port].addr_bytes[1],
375 vmdq_ports_eth_addr[port].addr_bytes[2],
376 vmdq_ports_eth_addr[port].addr_bytes[3],
377 vmdq_ports_eth_addr[port].addr_bytes[4],
378 vmdq_ports_eth_addr[port].addr_bytes[5]);
384 * Set socket file path.
387 us_vhost_parse_socket_path(const char *q_arg)
391 /* Validate the socket path length. */
392 if (strnlen(q_arg, PATH_MAX) == PATH_MAX)
396 socket_files = realloc(socket_files, PATH_MAX * (nb_sockets + 1));
397 if (socket_files == NULL) {
402 strlcpy(socket_files + nb_sockets * PATH_MAX, q_arg, PATH_MAX);
409 * Parse the portmask provided at run time.
412 parse_portmask(const char *portmask)
419 /* parse hexadecimal string */
420 pm = strtoul(portmask, &end, 16);
421 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
429 * Parse num options at run time.
432 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
439 /* parse unsigned int string */
440 num = strtoul(q_arg, &end, 10);
441 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
444 if (num > max_valid_value)
455 us_vhost_usage(const char *prgname)
457 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
459 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
460 " --socket-file <path>\n"
462 " -p PORTMASK: Set mask for ports to be used by application\n"
463 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
464 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
465 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
466 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
467 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
468 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
469 " --socket-file: The path of the socket file.\n"
470 " --tx-csum [0|1] disable/enable TX checksum offload.\n"
471 " --tso [0|1] disable/enable TCP segment offload.\n"
472 " --client register a vhost-user socket as client mode.\n"
473 " --dma-type register dma type for your vhost async driver. For example \"ioat\" for now.\n"
474 " --dmas register dma channel for specific vhost device.\n",
479 #define OPT_VM2VM "vm2vm"
481 #define OPT_RX_RETRY "rx-retry"
483 #define OPT_RX_RETRY_DELAY "rx-retry-delay"
484 OPT_RX_RETRY_DELAY_NUM,
485 #define OPT_RX_RETRY_NUMB "rx-retry-num"
486 OPT_RX_RETRY_NUMB_NUM,
487 #define OPT_MERGEABLE "mergeable"
489 #define OPT_STATS "stats"
491 #define OPT_SOCKET_FILE "socket-file"
493 #define OPT_TX_CSUM "tx-csum"
495 #define OPT_TSO "tso"
497 #define OPT_CLIENT "client"
499 #define OPT_BUILTIN_NET_DRIVER "builtin-net-driver"
500 OPT_BUILTIN_NET_DRIVER_NUM,
501 #define OPT_DMA_TYPE "dma-type"
503 #define OPT_DMAS "dmas"
508 * Parse the arguments given in the command line of the application.
511 us_vhost_parse_args(int argc, char **argv)
516 const char *prgname = argv[0];
517 static struct option long_option[] = {
518 {OPT_VM2VM, required_argument,
519 NULL, OPT_VM2VM_NUM},
520 {OPT_RX_RETRY, required_argument,
521 NULL, OPT_RX_RETRY_NUM},
522 {OPT_RX_RETRY_DELAY, required_argument,
523 NULL, OPT_RX_RETRY_DELAY_NUM},
524 {OPT_RX_RETRY_NUMB, required_argument,
525 NULL, OPT_RX_RETRY_NUMB_NUM},
526 {OPT_MERGEABLE, required_argument,
527 NULL, OPT_MERGEABLE_NUM},
528 {OPT_STATS, required_argument,
529 NULL, OPT_STATS_NUM},
530 {OPT_SOCKET_FILE, required_argument,
531 NULL, OPT_SOCKET_FILE_NUM},
532 {OPT_TX_CSUM, required_argument,
533 NULL, OPT_TX_CSUM_NUM},
534 {OPT_TSO, required_argument,
536 {OPT_CLIENT, no_argument,
537 NULL, OPT_CLIENT_NUM},
538 {OPT_BUILTIN_NET_DRIVER, no_argument,
539 NULL, OPT_BUILTIN_NET_DRIVER_NUM},
540 {OPT_DMA_TYPE, required_argument,
541 NULL, OPT_DMA_TYPE_NUM},
542 {OPT_DMAS, required_argument,
547 /* Parse command line */
548 while ((opt = getopt_long(argc, argv, "p:P",
549 long_option, &option_index)) != EOF) {
553 enabled_port_mask = parse_portmask(optarg);
554 if (enabled_port_mask == 0) {
555 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
556 us_vhost_usage(prgname);
563 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
564 ETH_VMDQ_ACCEPT_BROADCAST |
565 ETH_VMDQ_ACCEPT_MULTICAST;
569 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
571 RTE_LOG(INFO, VHOST_CONFIG,
572 "Invalid argument for "
574 us_vhost_usage(prgname);
577 vm2vm_mode = (vm2vm_type)ret;
580 case OPT_RX_RETRY_NUM:
581 ret = parse_num_opt(optarg, 1);
583 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
584 us_vhost_usage(prgname);
590 case OPT_TX_CSUM_NUM:
591 ret = parse_num_opt(optarg, 1);
593 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tx-csum [0|1]\n");
594 us_vhost_usage(prgname);
597 enable_tx_csum = ret;
601 ret = parse_num_opt(optarg, 1);
603 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for tso [0|1]\n");
604 us_vhost_usage(prgname);
610 case OPT_RX_RETRY_DELAY_NUM:
611 ret = parse_num_opt(optarg, INT32_MAX);
613 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
614 us_vhost_usage(prgname);
617 burst_rx_delay_time = ret;
620 case OPT_RX_RETRY_NUMB_NUM:
621 ret = parse_num_opt(optarg, INT32_MAX);
623 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
624 us_vhost_usage(prgname);
627 burst_rx_retry_num = ret;
630 case OPT_MERGEABLE_NUM:
631 ret = parse_num_opt(optarg, 1);
633 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
634 us_vhost_usage(prgname);
639 vmdq_conf_default.rxmode.offloads |=
640 DEV_RX_OFFLOAD_JUMBO_FRAME;
641 vmdq_conf_default.rxmode.max_rx_pkt_len
642 = JUMBO_FRAME_MAX_SIZE;
647 ret = parse_num_opt(optarg, INT32_MAX);
649 RTE_LOG(INFO, VHOST_CONFIG,
650 "Invalid argument for stats [0..N]\n");
651 us_vhost_usage(prgname);
657 /* Set socket file path. */
658 case OPT_SOCKET_FILE_NUM:
659 if (us_vhost_parse_socket_path(optarg) == -1) {
660 RTE_LOG(INFO, VHOST_CONFIG,
661 "Invalid argument for socket name (Max %d characters)\n",
663 us_vhost_usage(prgname);
668 case OPT_DMA_TYPE_NUM:
673 if (open_dma(optarg) == -1) {
674 RTE_LOG(INFO, VHOST_CONFIG,
676 us_vhost_usage(prgname);
679 async_vhost_driver = 1;
686 case OPT_BUILTIN_NET_DRIVER_NUM:
687 builtin_net_driver = 1;
690 /* Invalid option - print options. */
692 us_vhost_usage(prgname);
697 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
698 if (enabled_port_mask & (1 << i))
699 ports[num_ports++] = i;
702 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
703 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
704 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
712 * Update the global var num_ports and the array ports according to the number of system ports,
713 * and return the number of valid ports.
715 static unsigned check_ports_num(unsigned nb_ports)
717 unsigned valid_num_ports = num_ports;
720 if (num_ports > nb_ports) {
721 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
722 num_ports, nb_ports);
723 num_ports = nb_ports;
726 for (portid = 0; portid < num_ports; portid ++) {
727 if (!rte_eth_dev_is_valid_port(ports[portid])) {
728 RTE_LOG(INFO, VHOST_PORT,
729 "\nSpecified port ID(%u) is not valid\n",
731 ports[portid] = INVALID_PORT_ID;
735 return valid_num_ports;
738 static __rte_always_inline struct vhost_dev *
739 find_vhost_dev(struct rte_ether_addr *mac)
741 struct vhost_dev *vdev;
743 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
744 if (vdev->ready == DEVICE_RX &&
745 rte_is_same_ether_addr(mac, &vdev->mac_address))
753 * This function learns the MAC address of the device and registers this along with a
754 * vlan tag to a VMDQ.
757 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
759 struct rte_ether_hdr *pkt_hdr;
762 /* Learn MAC address of guest device from packet */
763 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
765 if (find_vhost_dev(&pkt_hdr->s_addr)) {
766 RTE_LOG(ERR, VHOST_DATA,
767 "(%d) device is using a registered MAC!\n",
772 for (i = 0; i < RTE_ETHER_ADDR_LEN; i++)
773 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
775 /* vlan_tag currently uses the device_id. */
776 vdev->vlan_tag = vlan_tags[vdev->vid];
778 /* Print out VMDQ registration info. */
779 RTE_LOG(INFO, VHOST_DATA,
780 "(%d) mac %02x:%02x:%02x:%02x:%02x:%02x and vlan %d registered\n",
782 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
783 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
784 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
787 /* Register the MAC address. */
788 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address,
789 (uint32_t)vdev->vid + vmdq_pool_base);
791 RTE_LOG(ERR, VHOST_DATA,
792 "(%d) failed to add device MAC address to VMDQ\n",
795 rte_eth_dev_set_vlan_strip_on_queue(ports[0], vdev->vmdq_rx_q, 1);
797 /* Set device as ready for RX. */
798 vdev->ready = DEVICE_RX;
804 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
805 * queue before disabling RX on the device.
808 unlink_vmdq(struct vhost_dev *vdev)
812 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
814 if (vdev->ready == DEVICE_RX) {
815 /* Clear MAC and VLAN settings. */
816 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
817 for (i = 0; i < 6; i++)
818 vdev->mac_address.addr_bytes[i] = 0;
822 /* Clear out the receive buffers. */
823 rx_count = rte_eth_rx_burst(ports[0],
824 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
827 for (i = 0; i < rx_count; i++)
828 rte_pktmbuf_free(pkts_burst[i]);
830 rx_count = rte_eth_rx_burst(ports[0],
831 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
834 vdev->ready = DEVICE_MAC_LEARNING;
839 free_pkts(struct rte_mbuf **pkts, uint16_t n)
842 rte_pktmbuf_free(pkts[n]);
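/* Poll the async channel for enqueue copies that have completed and free the corresponding mbufs. */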
845 static __rte_always_inline void
846 complete_async_pkts(struct vhost_dev *vdev)
848 struct rte_mbuf *p_cpl[MAX_PKT_BURST];
849 uint16_t complete_count;
851 complete_count = rte_vhost_poll_enqueue_completed(vdev->vid,
852 VIRTIO_RXQ, p_cpl, MAX_PKT_BURST);
854 free_pkts(p_cpl, complete_count);
857 static __rte_always_inline void
858 sync_virtio_xmit(struct vhost_dev *dst_vdev, struct vhost_dev *src_vdev,
863 if (builtin_net_driver) {
864 ret = vs_enqueue_pkts(dst_vdev, VIRTIO_RXQ, &m, 1);
866 ret = rte_vhost_enqueue_burst(dst_vdev->vid, VIRTIO_RXQ, &m, 1);
870 __atomic_add_fetch(&dst_vdev->stats.rx_total_atomic, 1,
872 __atomic_add_fetch(&dst_vdev->stats.rx_atomic, ret,
874 src_vdev->stats.tx_total++;
875 src_vdev->stats.tx += ret;
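/*
 * Flush the per-lcore TX buffer of @vdev into its virtio RX ring, via the
 * builtin net driver, the async (DMA) path or the plain sync enqueue, and
 * account the result in the RX statistics.
 */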
879 static __rte_always_inline void
880 drain_vhost(struct vhost_dev *vdev)
883 uint32_t buff_idx = rte_lcore_id() * MAX_VHOST_DEVICE + vdev->vid;
884 uint16_t nr_xmit = vhost_txbuff[buff_idx]->len;
885 struct rte_mbuf **m = vhost_txbuff[buff_idx]->m_table;
887 if (builtin_net_driver) {
888 ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
889 } else if (async_vhost_driver) {
890 uint32_t cpu_cpl_nr = 0;
891 uint16_t enqueue_fail = 0;
892 struct rte_mbuf *m_cpu_cpl[nr_xmit];
894 complete_async_pkts(vdev);
895 ret = rte_vhost_submit_enqueue_burst(vdev->vid, VIRTIO_RXQ,
896 m, nr_xmit, m_cpu_cpl, &cpu_cpl_nr);
899 free_pkts(m_cpu_cpl, cpu_cpl_nr);
901 enqueue_fail = nr_xmit - ret;
903 free_pkts(&m[ret], nr_xmit - ret);
905 ret = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
910 __atomic_add_fetch(&vdev->stats.rx_total_atomic, nr_xmit,
912 __atomic_add_fetch(&vdev->stats.rx_atomic, ret,
916 if (!async_vhost_driver)
917 free_pkts(m, nr_xmit);
920 static __rte_always_inline void
921 drain_vhost_table(void)
923 uint16_t lcore_id = rte_lcore_id();
924 struct vhost_bufftable *vhost_txq;
925 struct vhost_dev *vdev;
928 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
929 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE
932 cur_tsc = rte_rdtsc();
933 if (unlikely(cur_tsc - vhost_txq->pre_tsc
934 > MBUF_TABLE_DRAIN_TSC)) {
935 RTE_LOG_DP(DEBUG, VHOST_DATA,
936 "Vhost TX queue drained after timeout with burst size %u\n",
940 vhost_txq->pre_tsc = cur_tsc;
946 * Check if the packet destination MAC address is for a local device. If so then put
947 * the packet on that device's RX queue. If not then return.
949 static __rte_always_inline int
950 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
952 struct rte_ether_hdr *pkt_hdr;
953 struct vhost_dev *dst_vdev;
954 struct vhost_bufftable *vhost_txq;
955 uint16_t lcore_id = rte_lcore_id();
956 pkt_hdr = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
958 dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
962 if (vdev->vid == dst_vdev->vid) {
963 RTE_LOG_DP(DEBUG, VHOST_DATA,
964 "(%d) TX: src and dst MAC is same. Dropping packet.\n",
969 RTE_LOG_DP(DEBUG, VHOST_DATA,
970 "(%d) TX: MAC address is local\n", dst_vdev->vid);
972 if (unlikely(dst_vdev->remove)) {
973 RTE_LOG_DP(DEBUG, VHOST_DATA,
974 "(%d) device is marked for removal\n", dst_vdev->vid);
978 vhost_txq = vhost_txbuff[lcore_id * MAX_VHOST_DEVICE + dst_vdev->vid];
979 vhost_txq->m_table[vhost_txq->len++] = m;
982 vdev->stats.tx_total++;
986 if (unlikely(vhost_txq->len == MAX_PKT_BURST)) {
987 drain_vhost(dst_vdev);
989 vhost_txq->pre_tsc = rte_rdtsc();
995 * Check if the destination MAC of a packet belongs to a local VM and,
996 * if so, get its vlan tag and offset.
998 static __rte_always_inline int
999 find_local_dest(struct vhost_dev *vdev, struct rte_mbuf *m,
1000 uint32_t *offset, uint16_t *vlan_tag)
1002 struct vhost_dev *dst_vdev;
1003 struct rte_ether_hdr *pkt_hdr =
1004 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1006 dst_vdev = find_vhost_dev(&pkt_hdr->d_addr);
1010 if (vdev->vid == dst_vdev->vid) {
1011 RTE_LOG_DP(DEBUG, VHOST_DATA,
1012 "(%d) TX: src and dst MAC is same. Dropping packet.\n",
1018 * HW VLAN strip reduces the packet length by the
1019 * length of the VLAN tag, so we need to restore the
1020 * packet length by adding it back.
1022 *offset = VLAN_HLEN;
1023 *vlan_tag = vlan_tags[vdev->vid];
1025 RTE_LOG_DP(DEBUG, VHOST_DATA,
1026 "(%d) TX: pkt to local VM device id: (%d), vlan tag: %u.\n",
1027 vdev->vid, dst_vdev->vid, *vlan_tag);
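/*
 * Return the pseudo-header checksum that seeds the TCP checksum field when
 * the full checksum calculation is offloaded to the NIC.
 */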
1033 get_psd_sum(void *l3_hdr, uint64_t ol_flags)
1035 if (ol_flags & PKT_TX_IPV4)
1036 return rte_ipv4_phdr_cksum(l3_hdr, ol_flags);
1037 else /* assume ethertype == RTE_ETHER_TYPE_IPV6 */
1038 return rte_ipv6_phdr_cksum(l3_hdr, ol_flags);
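/*
 * Prepare a TSO packet coming from a virtio device for transmission on the
 * physical port: request IPv4 header checksum offload and seed the TCP
 * checksum with the pseudo-header sum.
 */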
1041 static void virtio_tx_offload(struct rte_mbuf *m)
1044 struct rte_ipv4_hdr *ipv4_hdr = NULL;
1045 struct rte_tcp_hdr *tcp_hdr = NULL;
1046 struct rte_ether_hdr *eth_hdr =
1047 rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1049 l3_hdr = (char *)eth_hdr + m->l2_len;
1051 if (m->ol_flags & PKT_TX_IPV4) {
1053 ipv4_hdr->hdr_checksum = 0;
1054 m->ol_flags |= PKT_TX_IP_CKSUM;
1057 tcp_hdr = (struct rte_tcp_hdr *)((char *)l3_hdr + m->l3_len);
1058 tcp_hdr->cksum = get_psd_sum(l3_hdr, m->ol_flags);
1061 static __rte_always_inline void
1062 do_drain_mbuf_table(struct mbuf_table *tx_q)
1066 count = rte_eth_tx_burst(ports[0], tx_q->txq_id,
1067 tx_q->m_table, tx_q->len);
1068 if (unlikely(count < tx_q->len))
1069 free_pkts(&tx_q->m_table[count], tx_q->len - count);
1075 * This function routes the TX packet to the correct interface. This
1076 * may be a local device or the physical port.
1078 static __rte_always_inline void
1079 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1081 struct mbuf_table *tx_q;
1082 unsigned offset = 0;
1083 const uint16_t lcore_id = rte_lcore_id();
1084 struct rte_ether_hdr *nh;
1087 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1088 if (unlikely(rte_is_broadcast_ether_addr(&nh->d_addr))) {
1089 struct vhost_dev *vdev2;
1091 TAILQ_FOREACH(vdev2, &vhost_dev_list, global_vdev_entry) {
1093 sync_virtio_xmit(vdev2, vdev, m);
1098 /* Check if the destination is a local VM. */
1099 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
1102 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1103 if (unlikely(find_local_dest(vdev, m, &offset,
1105 rte_pktmbuf_free(m);
1110 RTE_LOG_DP(DEBUG, VHOST_DATA,
1111 "(%d) TX: MAC address is external\n", vdev->vid);
1115 /* Add the packet to the port TX queue. */
1116 tx_q = &lcore_tx_queue[lcore_id];
1118 nh = rte_pktmbuf_mtod(m, struct rte_ether_hdr *);
1119 if (unlikely(nh->ether_type == rte_cpu_to_be_16(RTE_ETHER_TYPE_VLAN))) {
1120 /* Guest has inserted the vlan tag. */
1121 struct rte_vlan_hdr *vh = (struct rte_vlan_hdr *) (nh + 1);
1122 uint16_t vlan_tag_be = rte_cpu_to_be_16(vlan_tag);
1123 if ((vm2vm_mode == VM2VM_HARDWARE) &&
1124 (vh->vlan_tci != vlan_tag_be))
1125 vh->vlan_tci = vlan_tag_be;
1127 m->ol_flags |= PKT_TX_VLAN_PKT;
1130 * Find the right seg to adjust the data len when offset is
1131 * bigger than tail room size.
1133 if (unlikely(vm2vm_mode == VM2VM_HARDWARE)) {
1134 if (likely(offset <= rte_pktmbuf_tailroom(m)))
1135 m->data_len += offset;
1137 struct rte_mbuf *seg = m;
1139 while ((seg->next != NULL) &&
1140 (offset > rte_pktmbuf_tailroom(seg)))
1143 seg->data_len += offset;
1145 m->pkt_len += offset;
1148 m->vlan_tci = vlan_tag;
1151 if (m->ol_flags & PKT_TX_TCP_SEG)
1152 virtio_tx_offload(m);
1154 tx_q->m_table[tx_q->len++] = m;
1156 vdev->stats.tx_total++;
1160 if (unlikely(tx_q->len == MAX_PKT_BURST))
1161 do_drain_mbuf_table(tx_q);
1165 static __rte_always_inline void
1166 drain_mbuf_table(struct mbuf_table *tx_q)
1168 static uint64_t prev_tsc;
1174 cur_tsc = rte_rdtsc();
1175 if (unlikely(cur_tsc - prev_tsc > MBUF_TABLE_DRAIN_TSC)) {
1178 RTE_LOG_DP(DEBUG, VHOST_DATA,
1179 "TX queue drained after timeout with burst size %u\n",
1181 do_drain_mbuf_table(tx_q);
1185 static __rte_always_inline void
1186 drain_eth_rx(struct vhost_dev *vdev)
1188 uint16_t rx_count, enqueue_count;
1189 struct rte_mbuf *pkts[MAX_PKT_BURST];
1191 rx_count = rte_eth_rx_burst(ports[0], vdev->vmdq_rx_q,
1192 pkts, MAX_PKT_BURST);
1198 * When "enable_retry" is set, here we wait and retry when there
1199 * are not enough free slots in the queue to hold @rx_count packets,
1200 * to diminish packet loss.
1203 unlikely(rx_count > rte_vhost_avail_entries(vdev->vid,
1207 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1208 rte_delay_us(burst_rx_delay_time);
1209 if (rx_count <= rte_vhost_avail_entries(vdev->vid,
1215 if (builtin_net_driver) {
1216 enqueue_count = vs_enqueue_pkts(vdev, VIRTIO_RXQ,
1218 } else if (async_vhost_driver) {
1219 uint32_t cpu_cpl_nr = 0;
1220 uint16_t enqueue_fail = 0;
1221 struct rte_mbuf *m_cpu_cpl[MAX_PKT_BURST];
1223 complete_async_pkts(vdev);
1224 enqueue_count = rte_vhost_submit_enqueue_burst(vdev->vid,
1225 VIRTIO_RXQ, pkts, rx_count,
1226 m_cpu_cpl, &cpu_cpl_nr);
1228 free_pkts(m_cpu_cpl, cpu_cpl_nr);
1230 enqueue_fail = rx_count - enqueue_count;
1232 free_pkts(&pkts[enqueue_count], enqueue_fail);
1235 enqueue_count = rte_vhost_enqueue_burst(vdev->vid, VIRTIO_RXQ,
1240 __atomic_add_fetch(&vdev->stats.rx_total_atomic, rx_count,
1242 __atomic_add_fetch(&vdev->stats.rx_atomic, enqueue_count,
1246 if (!async_vhost_driver)
1247 free_pkts(pkts, rx_count);
1250 static __rte_always_inline void
1251 drain_virtio_tx(struct vhost_dev *vdev)
1253 struct rte_mbuf *pkts[MAX_PKT_BURST];
1257 if (builtin_net_driver) {
1258 count = vs_dequeue_pkts(vdev, VIRTIO_TXQ, mbuf_pool,
1259 pkts, MAX_PKT_BURST);
1261 count = rte_vhost_dequeue_burst(vdev->vid, VIRTIO_TXQ,
1262 mbuf_pool, pkts, MAX_PKT_BURST);
1265 /* setup VMDq for the first packet */
1266 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && count) {
1267 if (vdev->remove || link_vmdq(vdev, pkts[0]) == -1)
1268 free_pkts(pkts, count);
1271 for (i = 0; i < count; ++i)
1272 virtio_tx_route(vdev, pkts[i], vlan_tags[vdev->vid]);
1276 * Main function of vhost-switch. It basically does:
1278 * for each vhost device {
1281 * Which drains the host eth Rx queue linked to the vhost device,
1282 * and delivers all of them to the guest virtio Rx ring associated with
1283 * this vhost device.
1285 * - drain_virtio_tx()
1287 * Which drains the guest virtio Tx queue and delivers all of them
1288 * to the target, which could be another vhost device, or the
1289 * physical eth dev. The route is done in function "virtio_tx_route".
1293 switch_worker(void *arg __rte_unused)
1296 unsigned lcore_id = rte_lcore_id();
1297 struct vhost_dev *vdev;
1298 struct mbuf_table *tx_q;
1300 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
1302 tx_q = &lcore_tx_queue[lcore_id];
1303 for (i = 0; i < rte_lcore_count(); i++) {
1304 if (lcore_ids[i] == lcore_id) {
1311 drain_mbuf_table(tx_q);
1312 drain_vhost_table();
1314 * Inform the configuration core that we have exited the
1315 * linked list and that no devices are in use if requested.
1317 if (lcore_info[lcore_id].dev_removal_flag == REQUEST_DEV_REMOVAL)
1318 lcore_info[lcore_id].dev_removal_flag = ACK_DEV_REMOVAL;
1321 * Process vhost devices
1323 TAILQ_FOREACH(vdev, &lcore_info[lcore_id].vdev_list,
1325 if (unlikely(vdev->remove)) {
1327 vdev->ready = DEVICE_SAFE_REMOVE;
1331 if (likely(vdev->ready == DEVICE_RX))
1334 if (likely(!vdev->remove))
1335 drain_virtio_tx(vdev);
1343 * Remove a device from the specific data core linked list and from the
1344 * main linked list. Synchronization occurs through the use of the
1345 * lcore dev_removal_flag. Device is made volatile here to avoid re-ordering
1346 * of dev->remove=1 which can cause an infinite loop in the rte_pause loop.
1349 destroy_device(int vid)
1351 struct vhost_dev *vdev = NULL;
1355 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1356 if (vdev->vid == vid)
1361 /* Set the remove flag. */
1363 while(vdev->ready != DEVICE_SAFE_REMOVE) {
1367 for (i = 0; i < RTE_MAX_LCORE; i++)
1368 rte_free(vhost_txbuff[i * MAX_VHOST_DEVICE + vid]);
1370 if (builtin_net_driver)
1371 vs_vhost_net_remove(vdev);
1373 TAILQ_REMOVE(&lcore_info[vdev->coreid].vdev_list, vdev,
1375 TAILQ_REMOVE(&vhost_dev_list, vdev, global_vdev_entry);
1378 /* Set the dev_removal_flag on each lcore. */
1379 RTE_LCORE_FOREACH_WORKER(lcore)
1380 lcore_info[lcore].dev_removal_flag = REQUEST_DEV_REMOVAL;
1383 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL
1384 * we can be sure that they can no longer access the device removed
1385 * from the linked lists and that the devices are no longer in use.
1387 RTE_LCORE_FOREACH_WORKER(lcore) {
1388 while (lcore_info[lcore].dev_removal_flag != ACK_DEV_REMOVAL)
1392 lcore_info[vdev->coreid].device_num--;
1394 RTE_LOG(INFO, VHOST_DATA,
1395 "(%d) device has been removed from data core\n",
1398 if (async_vhost_driver)
1399 rte_vhost_async_channel_unregister(vid, VIRTIO_RXQ);
1405 * A new device is added to a data core. First the device is added to the main linked list
1406 * and then allocated to a specific data core.
1411 int lcore, core_add = 0;
1413 uint32_t device_num_min = num_devices;
1414 struct vhost_dev *vdev;
1415 vdev = rte_zmalloc("vhost device", sizeof(*vdev), RTE_CACHE_LINE_SIZE);
1417 RTE_LOG(INFO, VHOST_DATA,
1418 "(%d) couldn't allocate memory for vhost dev\n",
1424 for (i = 0; i < RTE_MAX_LCORE; i++) {
1425 vhost_txbuff[i * MAX_VHOST_DEVICE + vid]
1426 = rte_zmalloc("vhost bufftable",
1427 sizeof(struct vhost_bufftable),
1428 RTE_CACHE_LINE_SIZE);
1430 if (vhost_txbuff[i * MAX_VHOST_DEVICE + vid] == NULL) {
1431 RTE_LOG(INFO, VHOST_DATA,
1432 "(%d) couldn't allocate memory for vhost TX\n", vid);
1437 if (builtin_net_driver)
1438 vs_vhost_net_setup(vdev);
1440 TAILQ_INSERT_TAIL(&vhost_dev_list, vdev, global_vdev_entry);
1441 vdev->vmdq_rx_q = vid * queues_per_pool + vmdq_queue_base;
1443 /* Reset the ready flag. */
1444 vdev->ready = DEVICE_MAC_LEARNING;
1447 /* Find a suitable lcore to add the device. */
1448 RTE_LCORE_FOREACH_WORKER(lcore) {
1449 if (lcore_info[lcore].device_num < device_num_min) {
1450 device_num_min = lcore_info[lcore].device_num;
1454 vdev->coreid = core_add;
1456 TAILQ_INSERT_TAIL(&lcore_info[vdev->coreid].vdev_list, vdev,
1458 lcore_info[vdev->coreid].device_num++;
1460 /* Disable notifications. */
1461 rte_vhost_enable_guest_notification(vid, VIRTIO_RXQ, 0);
1462 rte_vhost_enable_guest_notification(vid, VIRTIO_TXQ, 0);
1464 RTE_LOG(INFO, VHOST_DATA,
1465 "(%d) device has been added to data core %d\n",
1468 if (async_vhost_driver) {
1469 struct rte_vhost_async_features f;
1470 struct rte_vhost_async_channel_ops channel_ops;
1472 if (dma_type != NULL && strncmp(dma_type, "ioat", 4) == 0) {
1473 channel_ops.transfer_data = ioat_transfer_data_cb;
1474 channel_ops.check_completed_copies =
1475 ioat_check_completed_copies_cb;
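/*
 * Request in-order completion; with async_threshold set, packets shorter
 * than the threshold (256 bytes here) are expected to be copied by the CPU
 * while larger ones go through the DMA engine.
 */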
1477 f.async_inorder = 1;
1478 f.async_threshold = 256;
1480 return rte_vhost_async_channel_register(vid, VIRTIO_RXQ,
1481 f.intval, &channel_ops);
1489 * These callbacks allow devices to be added to the data core when configuration
1490 * has fully completed.
1492 static const struct vhost_device_ops virtio_net_device_ops =
1494 .new_device = new_device,
1495 .destroy_device = destroy_device,
1499 * This is a thread that will wake up after a period to print stats if the user has
1503 print_stats(__rte_unused void *arg)
1505 struct vhost_dev *vdev;
1506 uint64_t tx_dropped, rx_dropped;
1507 uint64_t tx, tx_total, rx, rx_total;
1508 const char clr[] = { 27, '[', '2', 'J', '\0' };
1509 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
1512 sleep(enable_stats);
1514 /* Clear screen and move to top left */
1515 printf("%s%s\n", clr, top_left);
1516 printf("Device statistics =================================\n");
1518 TAILQ_FOREACH(vdev, &vhost_dev_list, global_vdev_entry) {
1519 tx_total = vdev->stats.tx_total;
1520 tx = vdev->stats.tx;
1521 tx_dropped = tx_total - tx;
1523 rx_total = __atomic_load_n(&vdev->stats.rx_total_atomic,
1525 rx = __atomic_load_n(&vdev->stats.rx_atomic,
1527 rx_dropped = rx_total - rx;
1529 printf("Statistics for device %d\n"
1530 "-----------------------\n"
1531 "TX total: %" PRIu64 "\n"
1532 "TX dropped: %" PRIu64 "\n"
1533 "TX successful: %" PRIu64 "\n"
1534 "RX total: %" PRIu64 "\n"
1535 "RX dropped: %" PRIu64 "\n"
1536 "RX successful: %" PRIu64 "\n",
1538 tx_total, tx_dropped, tx,
1539 rx_total, rx_dropped, rx);
1542 printf("===================================================\n");
1551 unregister_drivers(int socket_num)
1555 for (i = 0; i < socket_num; i++) {
1556 ret = rte_vhost_driver_unregister(socket_files + i * PATH_MAX);
1558 RTE_LOG(ERR, VHOST_CONFIG,
1559 "Fail to unregister vhost driver for %s.\n",
1560 socket_files + i * PATH_MAX);
1564 /* When we receive an INT signal, unregister the vhost driver. */
1566 sigint_handler(__rte_unused int signum)
1568 /* Unregister vhost driver. */
1569 unregister_drivers(nb_sockets);
1575 * While creating an mbuf pool, one key thing is to figure out how
1576 * many mbuf entries are enough for our use. FYI, here are some
1579 * - Each rx queue would reserve @nr_rx_desc mbufs at queue setup stage
1581 * - For each switch core (a CPU core that does the packet switching), we
1582 * also need to reserve some mbufs for receiving the packets from the virtio
1583 * Tx queue. How many are enough depends on the usage; it's normally
1584 * a simple calculation like the following:
1586 * MAX_PKT_BURST * max packet size / mbuf size
1588 * So, we definitely need to allocate more mbufs when TSO is enabled.
1590 * - Similarly, for each switching core, we should reserve @nr_rx_desc
1591 * mbufs for receiving the packets from the physical NIC device.
1593 * - We also need to make sure that, for each switch core, we have allocated
1594 * enough mbufs to fill up the mbuf cache.
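 *
 * As an illustrative example (assuming MAX_PKT_BURST is 32, its value in
 * main.h, and the default 128-byte mbuf headroom): with the defaults above
 * (2176-byte mbufs, a 1500-byte MTU and 1024 Rx descriptors), each switch
 * core needs roughly
 * (1500 + 2176) * 32 / (2176 - 128) + 1024 ~= 1081 mbufs on top of the
 * per-queue reservation.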
1597 create_mbuf_pool(uint16_t nr_port, uint32_t nr_switch_core, uint32_t mbuf_size,
1598 uint32_t nr_queues, uint32_t nr_rx_desc, uint32_t nr_mbuf_cache)
1601 uint32_t nr_mbufs_per_core;
1602 uint32_t mtu = 1500;
1609 nr_mbufs_per_core = (mtu + mbuf_size) * MAX_PKT_BURST /
1610 (mbuf_size - RTE_PKTMBUF_HEADROOM);
1611 nr_mbufs_per_core += nr_rx_desc;
1612 nr_mbufs_per_core = RTE_MAX(nr_mbufs_per_core, nr_mbuf_cache);
1614 nr_mbufs = nr_queues * nr_rx_desc;
1615 nr_mbufs += nr_mbufs_per_core * nr_switch_core;
1616 nr_mbufs *= nr_port;
1618 mbuf_pool = rte_pktmbuf_pool_create("MBUF_POOL", nr_mbufs,
1619 nr_mbuf_cache, 0, mbuf_size,
1621 if (mbuf_pool == NULL)
1622 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
1626 * Main function, does initialisation and calls the per-lcore functions.
1629 main(int argc, char *argv[])
1631 unsigned lcore_id, core_id = 0;
1632 unsigned nb_ports, valid_num_ports;
1635 static pthread_t tid;
1638 signal(SIGINT, sigint_handler);
1641 ret = rte_eal_init(argc, argv);
1643 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
1647 /* parse app arguments */
1648 ret = us_vhost_parse_args(argc, argv);
1650 rte_exit(EXIT_FAILURE, "Invalid argument\n");
1652 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++) {
1653 TAILQ_INIT(&lcore_info[lcore_id].vdev_list);
1655 if (rte_lcore_is_enabled(lcore_id))
1656 lcore_ids[core_id++] = lcore_id;
1659 if (rte_lcore_count() > RTE_MAX_LCORE)
1660 rte_exit(EXIT_FAILURE,"Not enough cores\n");
1662 /* Get the number of physical ports. */
1663 nb_ports = rte_eth_dev_count_avail();
1666 * Update the global var num_ports and the global array ports,
1667 * and get the number of valid ports according to the number of system ports.
1669 valid_num_ports = check_ports_num(nb_ports);
1671 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
1672 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
1673 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
1678 * FIXME: here we are trying to allocate mbufs big enough for
1679 * @MAX_QUEUES, but the truth is we're never going to use that
1680 * many queues here. We probably should only do allocation for
1681 * those queues we are going to use.
1683 create_mbuf_pool(valid_num_ports, rte_lcore_count() - 1, MBUF_DATA_SIZE,
1684 MAX_QUEUES, RTE_TEST_RX_DESC_DEFAULT, MBUF_CACHE_SIZE);
1686 if (vm2vm_mode == VM2VM_HARDWARE) {
1687 /* Enable VT loop back to let L2 switch to do it. */
1688 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
1689 RTE_LOG(DEBUG, VHOST_CONFIG,
1690 "Enable loop back for L2 switch in vmdq.\n");
1693 /* initialize all ports */
1694 RTE_ETH_FOREACH_DEV(portid) {
1695 /* skip ports that are not enabled */
1696 if ((enabled_port_mask & (1 << portid)) == 0) {
1697 RTE_LOG(INFO, VHOST_PORT,
1698 "Skipping disabled port %d\n", portid);
1701 if (port_init(portid) != 0)
1702 rte_exit(EXIT_FAILURE,
1703 "Cannot initialize network ports\n");
1706 /* Enable stats if the user option is set. */
1708 ret = rte_ctrl_thread_create(&tid, "print-stats", NULL,
1711 rte_exit(EXIT_FAILURE,
1712 "Cannot create print-stats thread\n");
1715 /* Launch all data cores. */
1716 RTE_LCORE_FOREACH_WORKER(lcore_id)
1717 rte_eal_remote_launch(switch_worker, NULL, lcore_id);
1720 flags |= RTE_VHOST_USER_CLIENT;
1722 /* Register vhost user driver to handle vhost messages. */
1723 for (i = 0; i < nb_sockets; i++) {
1724 char *file = socket_files + i * PATH_MAX;
1726 if (async_vhost_driver)
1727 flags = flags | RTE_VHOST_USER_ASYNC_COPY;
1729 ret = rte_vhost_driver_register(file, flags);
1731 unregister_drivers(i);
1732 rte_exit(EXIT_FAILURE,
1733 "vhost driver register failure.\n");
1736 if (builtin_net_driver)
1737 rte_vhost_driver_set_features(file, VIRTIO_NET_FEATURES);
1739 if (mergeable == 0) {
1740 rte_vhost_driver_disable_features(file,
1741 1ULL << VIRTIO_NET_F_MRG_RXBUF);
1744 if (enable_tx_csum == 0) {
1745 rte_vhost_driver_disable_features(file,
1746 1ULL << VIRTIO_NET_F_CSUM);
1749 if (enable_tso == 0) {
1750 rte_vhost_driver_disable_features(file,
1751 1ULL << VIRTIO_NET_F_HOST_TSO4);
1752 rte_vhost_driver_disable_features(file,
1753 1ULL << VIRTIO_NET_F_HOST_TSO6);
1754 rte_vhost_driver_disable_features(file,
1755 1ULL << VIRTIO_NET_F_GUEST_TSO4);
1756 rte_vhost_driver_disable_features(file,
1757 1ULL << VIRTIO_NET_F_GUEST_TSO6);
1761 rte_vhost_driver_enable_features(file,
1762 1ULL << VIRTIO_NET_F_CTRL_RX);
1765 ret = rte_vhost_driver_callback_register(file,
1766 &virtio_net_device_ops);
1768 rte_exit(EXIT_FAILURE,
1769 "failed to register vhost driver callbacks.\n");
1772 if (rte_vhost_driver_start(file) < 0) {
1773 rte_exit(EXIT_FAILURE,
1774 "failed to start vhost driver.\n");
1778 RTE_LCORE_FOREACH_WORKER(lcore_id)
1779 rte_eal_wait_lcore(lcore_id);
1781 /* clean up the EAL */