4 * Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
11 * * Redistributions of source code must retain the above copyright
12 * notice, this list of conditions and the following disclaimer.
13 * * Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
17 * * Neither the name of Intel Corporation nor the names of its
18 * contributors may be used to endorse or promote products derived
19 * from this software without specific prior written permission.
21 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
25 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
52 #include <rte_virtio_net.h>
56 #define MAX_QUEUES 128
58 /* the maximum number of external ports supported */
59 #define MAX_SUP_PORTS 1
62 * Calculate the number of buffers needed per port
64 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
65 (num_switching_cores*MAX_PKT_BURST) + \
66 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
67 (num_switching_cores*MBUF_CACHE_SIZE))
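/*
 * Worked example (illustrative, assuming two switching cores): with the
 * defaults below (1024 RX descriptors, 512 TX descriptors, MAX_PKT_BURST 32
 * and MBUF_CACHE_SIZE 128) this evaluates to
 * 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs per port.
 */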
69 #define MBUF_CACHE_SIZE 128
70 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
73 * No frame data buffers allocated by the host are required for the zero copy
74 * implementation; the guest allocates the frame data buffers, and vhost uses them directly.
77 #define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
78 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
79 + RTE_PKTMBUF_HEADROOM)
80 #define MBUF_CACHE_SIZE_ZCP 0
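/*
 * Note: VIRTIO_DESCRIPTOR_LEN_ZCP (1518 bytes) matches the largest standard
 * Ethernet frame (1500-byte payload + 14-byte header + 4-byte CRC), so each
 * guest descriptor holds one full non-jumbo frame.
 */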
83 * RX and TX Prefetch, Host, and Write-back threshold values should be
84 * carefully set for optimal performance. Consult the network
85 * controller's datasheet and supporting DPDK documentation for guidance
86 * on how these parameters should be set.
88 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
89 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
90 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
93 * These default values are optimized for use with the Intel(R) 82599 10 GbE
94 * Controller and the DPDK ixgbe PMD. Consider using other values for other
95 * network controllers and/or network drivers.
97 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
98 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */
99 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */
101 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
102 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
104 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
105 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
107 #define JUMBO_FRAME_MAX_SIZE 0x2600
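/* 0x2600 is 9728 bytes, enough for a 9000-byte jumbo payload plus headers. */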
109 /* State of virtio device. */
110 #define DEVICE_MAC_LEARNING 0
112 #define DEVICE_SAFE_REMOVE 2
114 /* Config_core_flag status definitions. */
115 #define REQUEST_DEV_REMOVAL 1
116 #define ACK_DEV_REMOVAL 0
118 /* Configurable number of RX/TX ring descriptors */
119 #define RTE_TEST_RX_DESC_DEFAULT 1024
120 #define RTE_TEST_TX_DESC_DEFAULT 512
123 * These two macros need refining for the legacy and DPDK based front ends:
124 * take the max vring avail descriptors/entries from the guest minus MAX_PKT_BURST,
125 * and then round to a power of 2.
128 * For the legacy front end: 128 descriptors,
129 * half for the virtio headers, the other half for the mbufs.
131 #define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
132 #define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
134 /* Get first 4 bytes in mbuf headroom. */
135 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
136 + sizeof(struct rte_mbuf)))
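/*
 * Illustrative usage: the zero copy path stores the vring descriptor index in
 * these four bytes when a guest buffer is attached to an mbuf, and reads it
 * back when the mbuf is recycled, e.g.
 *
 *	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;           (attach_rxmbuf_zcp)
 *	vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);  (txmbuf_clean_zcp)
 */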
138 /* true if x is a power of 2 */
139 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
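/*
 * For example, POWEROF2(128) and POWEROF2(1) are true while POWEROF2(100) is
 * false; note that POWEROF2(0) also passes this test.
 */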
141 #define INVALID_PORT_ID 0xFF
143 /* Max number of devices. Limited by vmdq. */
144 #define MAX_DEVICES 64
146 /* Size of buffers used for snprintfs. */
147 #define MAX_PRINT_BUFF 6072
149 /* Maximum character device basename size. */
150 #define MAX_BASENAME_SZ 10
152 /* Maximum long option length for option parsing. */
153 #define MAX_LONG_OPT_SZ 64
155 /* Used to compare MAC addresses. */
156 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
158 /* Number of descriptors per cacheline. */
159 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
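/*
 * Illustrative: with a 64-byte cache line and the 16-byte struct vring_desc
 * (8-byte addr, 4-byte len, 2-byte flags, 2-byte next) this evaluates to 4
 * descriptors per cache line.
 */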
161 /* mask of enabled ports */
162 static uint32_t enabled_port_mask = 0;
164 /* Promiscuous mode */
165 static uint32_t promiscuous;
167 /*Number of switching cores enabled*/
168 static uint32_t num_switching_cores = 0;
170 /* number of devices/queues to support*/
171 static uint32_t num_queues = 0;
172 static uint32_t num_devices;
175 * Enable zero copy: the NIC DMAs packet data directly to/from guest-supplied
176 * buffers referenced by the hw descriptors; disabled by default.
178 static uint32_t zero_copy;
179 static int mergeable;
181 /* number of descriptors to apply*/
182 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
183 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
185 /* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
186 #define MAX_RING_DESC 4096
189 struct rte_mempool *pool;
190 struct rte_ring *ring;
192 } vpool_array[MAX_QUEUES+MAX_QUEUES];
194 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
201 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
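/*
 * Illustrative mapping of the --vm2vm option (see us_vhost_usage() below):
 * 0 disables vm2vm forwarding, 1 selects VM2VM_SOFTWARE (the default; packets
 * between local VMs are forwarded by vhost itself) and 2 selects
 * VM2VM_HARDWARE (forwarding is left to the NIC via VMDQ loopback).
 */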
203 /* The type of host physical address translated from guest physical address. */
205 PHYS_ADDR_CONTINUOUS = 0,
206 PHYS_ADDR_CROSS_SUBREG = 1,
207 PHYS_ADDR_INVALID = 2,
212 static uint32_t enable_stats = 0;
213 /* Enable retries on RX. */
214 static uint32_t enable_retry = 1;
215 /* Specify the timeout (in microseconds) between retries on RX. */
216 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
217 /* Specify the number of retries on RX. */
218 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
220 /* Character device basename. Can be set by user. */
221 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
224 /* Default configuration for rx and tx thresholds etc. */
225 static struct rte_eth_rxconf rx_conf_default = {
227 .pthresh = RX_PTHRESH,
228 .hthresh = RX_HTHRESH,
229 .wthresh = RX_WTHRESH,
235 * These default values are optimized for use with the Intel(R) 82599 10 GbE
236 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
237 * network controllers and/or network drivers.
239 static struct rte_eth_txconf tx_conf_default = {
241 .pthresh = TX_PTHRESH,
242 .hthresh = TX_HTHRESH,
243 .wthresh = TX_WTHRESH,
245 .tx_free_thresh = 0, /* Use PMD default values */
246 .tx_rs_thresh = 0, /* Use PMD default values */
249 /* Empty vmdq configuration structure. Filled in programmatically. */
250 static struct rte_eth_conf vmdq_conf_default = {
252 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
254 .header_split = 0, /**< Header Split disabled */
255 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
256 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
258 * This is necessary for 1G NICs such as the I350;
259 * it fixes a bug where IPv4 forwarding in the guest could not
260 * forward packets from one virtio dev to another virtio dev.
262 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
263 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
264 .hw_strip_crc = 0, /**< CRC stripped by hardware */
268 .mq_mode = ETH_MQ_TX_NONE,
272 * should be overridden separately in code with
276 .nb_queue_pools = ETH_8_POOLS,
277 .enable_default_pool = 0,
280 .pool_map = {{0, 0},},
285 static unsigned lcore_ids[RTE_MAX_LCORE];
286 static uint8_t ports[RTE_MAX_ETHPORTS];
287 static unsigned num_ports = 0; /**< The number of ports specified in command line */
289 static const uint16_t external_pkt_default_vlan_tag = 2000;
290 const uint16_t vlan_tags[] = {
291 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
292 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
293 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
294 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
295 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
296 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
297 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
298 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
301 /* ethernet addresses of ports */
302 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
304 /* heads for the main used and free linked lists for the data path. */
305 static struct virtio_net_data_ll *ll_root_used = NULL;
306 static struct virtio_net_data_ll *ll_root_free = NULL;
308 /* Array of data core structures containing information on individual core linked lists. */
309 static struct lcore_info lcore_info[RTE_MAX_LCORE];
311 /* Used for queueing bursts of TX packets. */
315 struct rte_mbuf *m_table[MAX_PKT_BURST];
318 /* TX queue for each data core. */
319 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
321 /* TX queue for each virtio device for zero copy. */
322 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
324 /* Vlan header struct used to insert vlan tags on TX. */
326 unsigned char h_dest[ETH_ALEN];
327 unsigned char h_source[ETH_ALEN];
330 __be16 h_vlan_encapsulated_proto;
335 uint8_t version_ihl; /**< version and header length */
336 uint8_t type_of_service; /**< type of service */
337 uint16_t total_length; /**< length of packet */
338 uint16_t packet_id; /**< packet ID */
339 uint16_t fragment_offset; /**< fragmentation offset */
340 uint8_t time_to_live; /**< time to live */
341 uint8_t next_proto_id; /**< protocol ID */
342 uint16_t hdr_checksum; /**< header checksum */
343 uint32_t src_addr; /**< source address */
344 uint32_t dst_addr; /**< destination address */
345 } __attribute__((__packed__));
347 /* Header lengths. */
349 #define VLAN_ETH_HLEN 18
351 /* Per-device statistics struct */
352 struct device_statistics {
354 rte_atomic64_t rx_total_atomic;
357 rte_atomic64_t rx_atomic;
359 } __rte_cache_aligned;
360 struct device_statistics dev_statistics[MAX_DEVICES];
363 * Builds up the correct configuration for VMDQ VLAN pool map
364 * according to the pool & queue limits.
367 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
369 struct rte_eth_vmdq_rx_conf conf;
370 struct rte_eth_vmdq_rx_conf *def_conf =
371 &vmdq_conf_default.rx_adv_conf.vmdq_rx_conf;
374 memset(&conf, 0, sizeof(conf));
375 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
376 conf.nb_pool_maps = num_devices;
377 conf.enable_loop_back = def_conf->enable_loop_back;
378 conf.rx_mode = def_conf->rx_mode;
380 for (i = 0; i < conf.nb_pool_maps; i++) {
381 conf.pool_map[i].vlan_id = vlan_tags[i];
382 conf.pool_map[i].pools = (1UL << i);
385 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
386 (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
387 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
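/*
 * Illustrative result: with num_devices == 8 the loop above maps pool i to
 * VLAN tag 1000 + i (vlan_tags[0..7]), i.e. one VMDQ pool per virtio device.
 */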
392 * Validate the device number against the max pool number obtained from
393 * dev_info. If the device number is invalid, print an error message and
394 * return -1. Each device must have its own pool.
397 validate_num_devices(uint32_t max_nb_devices)
399 if (num_devices > max_nb_devices) {
400 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
407 * Initialises a given port using global settings and with the rx buffers
408 * coming from the mbuf_pool passed as parameter
411 port_init(uint8_t port)
413 struct rte_eth_dev_info dev_info;
414 struct rte_eth_conf port_conf;
415 uint16_t rx_rings, tx_rings;
416 uint16_t rx_ring_size, tx_ring_size;
420 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
421 rte_eth_dev_info_get (port, &dev_info);
423 /*configure the number of supported virtio devices based on VMDQ limits */
424 num_devices = dev_info.max_vmdq_pools;
425 num_queues = dev_info.max_rx_queues;
428 rx_ring_size = num_rx_descriptor;
429 tx_ring_size = num_tx_descriptor;
430 tx_rings = dev_info.max_tx_queues;
432 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
433 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
434 tx_rings = (uint16_t)rte_lcore_count();
437 retval = validate_num_devices(MAX_DEVICES);
441 /* Get port configuration. */
442 retval = get_eth_conf(&port_conf, num_devices);
446 if (port >= rte_eth_dev_count()) return -1;
448 rx_rings = (uint16_t)num_queues;
449 /* Configure ethernet device. */
450 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
454 /* Setup the queues. */
455 for (q = 0; q < rx_rings; q ++) {
456 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
457 rte_eth_dev_socket_id(port), &rx_conf_default,
458 vpool_array[q].pool);
462 for (q = 0; q < tx_rings; q ++) {
463 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
464 rte_eth_dev_socket_id(port), &tx_conf_default);
469 /* Start the device. */
470 retval = rte_eth_dev_start(port);
472 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
477 rte_eth_promiscuous_enable(port);
479 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
480 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
481 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
482 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
484 vmdq_ports_eth_addr[port].addr_bytes[0],
485 vmdq_ports_eth_addr[port].addr_bytes[1],
486 vmdq_ports_eth_addr[port].addr_bytes[2],
487 vmdq_ports_eth_addr[port].addr_bytes[3],
488 vmdq_ports_eth_addr[port].addr_bytes[4],
489 vmdq_ports_eth_addr[port].addr_bytes[5]);
495 * Set character device basename.
498 us_vhost_parse_basename(const char *q_arg)
500 /* parse the basename string */
502 if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
505 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
511 * Parse the portmask provided at run time.
514 parse_portmask(const char *portmask)
521 /* parse hexadecimal string */
522 pm = strtoul(portmask, &end, 16);
523 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
534 * Parse num options at run time.
537 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
544 /* parse unsigned int string */
545 num = strtoul(q_arg, &end, 10);
546 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
549 if (num > max_valid_value)
560 us_vhost_usage(const char *prgname)
562 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
564 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
565 " --dev-basename <name>\n"
567 " -p PORTMASK: Set mask for ports to be used by application\n"
568 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
569 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
570 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
571 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
572 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
573 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
574 " --dev-basename: The basename to be used for the character device.\n"
575 " --zero-copy [0|1]: disable(default)/enable rx/tx "
577 " --rx-desc-num [0-N]: the number of descriptors on rx, "
578 "used only when zero copy is enabled.\n"
579 " --tx-desc-num [0-N]: the number of descriptors on tx, "
580 "used only when zero copy is enabled.\n",
585 * Parse the arguments given in the command line of the application.
588 us_vhost_parse_args(int argc, char **argv)
593 const char *prgname = argv[0];
594 static struct option long_option[] = {
595 {"vm2vm", required_argument, NULL, 0},
596 {"rx-retry", required_argument, NULL, 0},
597 {"rx-retry-delay", required_argument, NULL, 0},
598 {"rx-retry-num", required_argument, NULL, 0},
599 {"mergeable", required_argument, NULL, 0},
600 {"stats", required_argument, NULL, 0},
601 {"dev-basename", required_argument, NULL, 0},
602 {"zero-copy", required_argument, NULL, 0},
603 {"rx-desc-num", required_argument, NULL, 0},
604 {"tx-desc-num", required_argument, NULL, 0},
608 /* Parse command line */
609 while ((opt = getopt_long(argc, argv, "p:P",
610 long_option, &option_index)) != EOF) {
614 enabled_port_mask = parse_portmask(optarg);
615 if (enabled_port_mask == 0) {
616 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
617 us_vhost_usage(prgname);
624 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.rx_mode =
625 ETH_VMDQ_ACCEPT_BROADCAST |
626 ETH_VMDQ_ACCEPT_MULTICAST;
627 rte_vhost_feature_enable(1ULL << VIRTIO_NET_F_CTRL_RX);
632 /* Enable/disable vm2vm comms. */
633 if (!strncmp(long_option[option_index].name, "vm2vm",
635 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
637 RTE_LOG(INFO, VHOST_CONFIG,
638 "Invalid argument for "
640 us_vhost_usage(prgname);
643 vm2vm_mode = (vm2vm_type)ret;
647 /* Enable/disable retries on RX. */
648 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
649 ret = parse_num_opt(optarg, 1);
651 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
652 us_vhost_usage(prgname);
659 /* Specify the delay time (in microseconds) between retries on RX. */
660 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
661 ret = parse_num_opt(optarg, INT32_MAX);
663 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
664 us_vhost_usage(prgname);
667 burst_rx_delay_time = ret;
671 /* Specify the retries number on RX. */
672 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
673 ret = parse_num_opt(optarg, INT32_MAX);
675 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
676 us_vhost_usage(prgname);
679 burst_rx_retry_num = ret;
683 /* Enable/disable RX mergeable buffers. */
684 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
685 ret = parse_num_opt(optarg, 1);
687 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
688 us_vhost_usage(prgname);
693 vmdq_conf_default.rxmode.jumbo_frame = 1;
694 vmdq_conf_default.rxmode.max_rx_pkt_len
695 = JUMBO_FRAME_MAX_SIZE;
700 /* Enable/disable stats. */
701 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
702 ret = parse_num_opt(optarg, INT32_MAX);
704 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
705 us_vhost_usage(prgname);
712 /* Set character device basename. */
713 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
714 if (us_vhost_parse_basename(optarg) == -1) {
715 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
716 us_vhost_usage(prgname);
721 /* Enable/disable rx/tx zero copy. */
722 if (!strncmp(long_option[option_index].name,
723 "zero-copy", MAX_LONG_OPT_SZ)) {
724 ret = parse_num_opt(optarg, 1);
726 RTE_LOG(INFO, VHOST_CONFIG,
728 " for zero-copy [0|1]\n");
729 us_vhost_usage(prgname);
735 #ifdef RTE_MBUF_REFCNT
736 RTE_LOG(ERR, VHOST_CONFIG, "Before running "
737 "zero copy vhost APP, please "
738 "disable RTE_MBUF_REFCNT\n"
739 "in config file and then rebuild DPDK "
741 "Otherwise please disable zero copy "
742 "flag in command line!\n");
748 /* Specify the descriptor number on RX. */
749 if (!strncmp(long_option[option_index].name,
750 "rx-desc-num", MAX_LONG_OPT_SZ)) {
751 ret = parse_num_opt(optarg, MAX_RING_DESC);
752 if ((ret == -1) || (!POWEROF2(ret))) {
753 RTE_LOG(INFO, VHOST_CONFIG,
754 "Invalid argument for rx-desc-num[0-N],"
755 "power of 2 required.\n");
756 us_vhost_usage(prgname);
759 num_rx_descriptor = ret;
763 /* Specify the descriptor number on TX. */
764 if (!strncmp(long_option[option_index].name,
765 "tx-desc-num", MAX_LONG_OPT_SZ)) {
766 ret = parse_num_opt(optarg, MAX_RING_DESC);
767 if ((ret == -1) || (!POWEROF2(ret))) {
768 RTE_LOG(INFO, VHOST_CONFIG,
769 "Invalid argument for tx-desc-num [0-N],"
770 "power of 2 required.\n");
771 us_vhost_usage(prgname);
774 num_tx_descriptor = ret;
780 /* Invalid option - print options. */
782 us_vhost_usage(prgname);
787 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
788 if (enabled_port_mask & (1 << i))
789 ports[num_ports++] = (uint8_t)i;
792 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
793 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
794 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
798 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
799 RTE_LOG(INFO, VHOST_PORT,
800 "Vhost zero copy doesn't support software vm2vm,"
801 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
805 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
806 RTE_LOG(INFO, VHOST_PORT,
807 "Vhost zero copy doesn't support jumbo frame,"
808 "please specify '--mergeable 0' to disable the "
809 "mergeable feature.\n");
817 * Update the global vars num_ports and the ports array according to the number
818 * of ports in the system, and return the number of valid ports.
820 static unsigned check_ports_num(unsigned nb_ports)
822 unsigned valid_num_ports = num_ports;
825 if (num_ports > nb_ports) {
826 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
827 num_ports, nb_ports);
828 num_ports = nb_ports;
831 for (portid = 0; portid < num_ports; portid ++) {
832 if (ports[portid] >= nb_ports) {
833 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
834 ports[portid], (nb_ports - 1));
835 ports[portid] = INVALID_PORT_ID;
839 return valid_num_ports;
843 * Macro to print out packet contents. Wrapped in a debug define so that the
844 * data path is not affected when debug is disabled.
847 #define PRINT_PACKET(device, addr, size, header) do { \
848 char *pkt_addr = (char*)(addr); \
849 unsigned int index; \
850 char packet[MAX_PRINT_BUFF]; \
853 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
855 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
856 for (index = 0; index < (size); index++) { \
857 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
858 "%02hhx ", pkt_addr[index]); \
860 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
862 LOG_DEBUG(VHOST_DATA, "%s", packet); \
865 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
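/*
 * Illustrative usage (taken from virtio_dev_rx_zcp() below): dump the virtio
 * header that was just written into guest memory when debug is enabled:
 *
 *	PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
 */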
869 * Function to convert guest physical addresses to vhost physical addresses.
870 * This is used to convert virtio buffer addresses.
872 static inline uint64_t __attribute__((always_inline))
873 gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
874 uint32_t buf_len, hpa_type *addr_type)
876 struct virtio_memory_regions_hpa *region;
878 uint64_t vhost_pa = 0;
880 *addr_type = PHYS_ADDR_INVALID;
882 for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
883 region = &vdev->regions_hpa[regionidx];
884 if ((guest_pa >= region->guest_phys_address) &&
885 (guest_pa <= region->guest_phys_address_end)) {
886 vhost_pa = region->host_phys_addr_offset + guest_pa;
887 if (likely((guest_pa + buf_len - 1)
888 <= region->guest_phys_address_end))
889 *addr_type = PHYS_ADDR_CONTINUOUS;
891 *addr_type = PHYS_ADDR_CROSS_SUBREG;
896 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
897 vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
898 (void *)(uintptr_t)vhost_pa);
904 * Compares a packet destination MAC address to a device MAC address.
906 static inline int __attribute__((always_inline))
907 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
909 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
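/*
 * Note: MAC_ADDR_CMP (0xFFFFFFFFFFFFULL) masks the comparison down to the six
 * MAC address bytes, so on a little-endian target the two bytes that follow
 * each ether_addr in memory are ignored when both are read as 64-bit words.
 */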
913 * This function learns the MAC address of the device and registers this along with a
914 * vlan tag to a VMDQ.
917 link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
919 struct ether_hdr *pkt_hdr;
920 struct virtio_net_data_ll *dev_ll;
921 struct virtio_net *dev = vdev->dev;
924 /* Learn MAC address of guest device from packet */
925 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
927 dev_ll = ll_root_used;
929 while (dev_ll != NULL) {
930 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
931 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
934 dev_ll = dev_ll->next;
937 for (i = 0; i < ETHER_ADDR_LEN; i++)
938 vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
940 /* vlan_tag currently uses the device_id. */
941 vdev->vlan_tag = vlan_tags[dev->device_fh];
943 /* Print out VMDQ registration info. */
944 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
946 vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
947 vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
948 vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
951 /* Register the MAC address. */
952 ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
954 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
957 /* Enable stripping of the vlan tag as we handle routing. */
958 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);
960 /* Set device as ready for RX. */
961 vdev->ready = DEVICE_RX;
967 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
968 * queue before disabling RX on the device.
971 unlink_vmdq(struct vhost_dev *vdev)
975 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
977 if (vdev->ready == DEVICE_RX) {
978 /*clear MAC and VLAN settings*/
979 rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
980 for (i = 0; i < 6; i++)
981 vdev->mac_address.addr_bytes[i] = 0;
985 /*Clear out the receive buffers*/
986 rx_count = rte_eth_rx_burst(ports[0],
987 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
990 for (i = 0; i < rx_count; i++)
991 rte_pktmbuf_free(pkts_burst[i]);
993 rx_count = rte_eth_rx_burst(ports[0],
994 (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
997 vdev->ready = DEVICE_MAC_LEARNING;
1002 * Check if the packet destination MAC address is for a local device. If so then put
1003 * the packet on that device's RX queue. If not then return.
1005 static inline int __attribute__((always_inline))
1006 virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
1008 struct virtio_net_data_ll *dev_ll;
1009 struct ether_hdr *pkt_hdr;
1011 struct virtio_net *dev = vdev->dev;
1012 struct virtio_net *tdev; /* destination virtio device */
1014 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1016 /*get the used devices list*/
1017 dev_ll = ll_root_used;
1019 while (dev_ll != NULL) {
1020 if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1021 &dev_ll->vdev->mac_address)) {
1023 /* Drop the packet if the TX packet is destined for the TX device. */
1024 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1025 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1029 tdev = dev_ll->vdev->dev;
1032 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);
1034 if (unlikely(dev_ll->vdev->remove)) {
1035 /*drop the packet if the device is marked for removal*/
1036 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
1038 /*send the packet to the local virtio device*/
1039 ret = rte_vhost_enqueue_burst(tdev, VIRTIO_RXQ, &m, 1);
1042 &dev_statistics[tdev->device_fh].rx_total_atomic,
1045 &dev_statistics[tdev->device_fh].rx_atomic,
1047 dev_statistics[tdev->device_fh].tx_total++;
1048 dev_statistics[tdev->device_fh].tx += ret;
1054 dev_ll = dev_ll->next;
1061 * Check if the destination MAC of a packet belongs to a local VM,
1062 * and if so get its vlan tag and the length offset.
1064 static inline int __attribute__((always_inline))
1065 find_local_dest(struct virtio_net *dev, struct rte_mbuf *m,
1066 uint32_t *offset, uint16_t *vlan_tag)
1068 struct virtio_net_data_ll *dev_ll = ll_root_used;
1069 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1071 while (dev_ll != NULL) {
1072 if ((dev_ll->vdev->ready == DEVICE_RX)
1073 && ether_addr_cmp(&(pkt_hdr->d_addr),
1074 &dev_ll->vdev->mac_address)) {
1076 * Drop the packet if the TX packet is
1077 * destined for the TX device.
1079 if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
1080 LOG_DEBUG(VHOST_DATA,
1081 "(%"PRIu64") TX: Source and destination"
1082 " MAC addresses are the same. Dropping "
1084 dev_ll->vdev->dev->device_fh);
1089 * HW vlan strip will reduce the packet length
1090 * by the length of the vlan tag, so we need to restore
1091 * the packet length by adding it back.
1093 *offset = VLAN_HLEN;
1096 vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];
1098 LOG_DEBUG(VHOST_DATA,
1099 "(%"PRIu64") TX: pkt to local VM device id:"
1100 "(%"PRIu64") vlan tag: %d.\n",
1101 dev->device_fh, dev_ll->vdev->dev->device_fh,
1106 dev_ll = dev_ll->next;
1112 * This function routes the TX packet to the correct interface. This may be a local device
1113 * or the physical port.
1115 static inline void __attribute__((always_inline))
1116 virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, uint16_t vlan_tag)
1118 struct mbuf_table *tx_q;
1119 struct rte_mbuf **m_table;
1120 unsigned len, ret, offset = 0;
1121 const uint16_t lcore_id = rte_lcore_id();
1122 struct virtio_net *dev = vdev->dev;
1124 /*check if destination is local VM*/
1125 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0)) {
1126 rte_pktmbuf_free(m);
1130 if (vm2vm_mode == VM2VM_HARDWARE) {
1131 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0 ||
1132 offset > rte_pktmbuf_tailroom(m)) {
1133 rte_pktmbuf_free(m);
1138 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1140 /*Add packet to the port tx queue*/
1141 tx_q = &lcore_tx_queue[lcore_id];
1144 m->ol_flags = PKT_TX_VLAN_PKT;
1146 m->data_len += offset;
1147 m->pkt_len += offset;
1149 m->vlan_tci = vlan_tag;
1151 tx_q->m_table[len] = m;
1154 dev_statistics[dev->device_fh].tx_total++;
1155 dev_statistics[dev->device_fh].tx++;
1158 if (unlikely(len == MAX_PKT_BURST)) {
1159 m_table = (struct rte_mbuf **)tx_q->m_table;
1160 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1161 /* Free any buffers not handled by TX and update the port stats. */
1162 if (unlikely(ret < len)) {
1164 rte_pktmbuf_free(m_table[ret]);
1165 } while (++ret < len);
1175 * This function is called by each data core. It handles all RX/TX registered with the
1176 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
1177 * with all devices in the main linked list.
1180 switch_worker(__attribute__((unused)) void *arg)
1182 struct rte_mempool *mbuf_pool = arg;
1183 struct virtio_net *dev = NULL;
1184 struct vhost_dev *vdev = NULL;
1185 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1186 struct virtio_net_data_ll *dev_ll;
1187 struct mbuf_table *tx_q;
1188 volatile struct lcore_ll_info *lcore_ll;
1189 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
1190 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
1192 const uint16_t lcore_id = rte_lcore_id();
1193 const uint16_t num_cores = (uint16_t)rte_lcore_count();
1194 uint16_t rx_count = 0;
1198 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
1199 lcore_ll = lcore_info[lcore_id].lcore_ll;
1202 tx_q = &lcore_tx_queue[lcore_id];
1203 for (i = 0; i < num_cores; i ++) {
1204 if (lcore_ids[i] == lcore_id) {
1211 cur_tsc = rte_rdtsc();
1213 * TX burst queue drain
1215 diff_tsc = cur_tsc - prev_tsc;
1216 if (unlikely(diff_tsc > drain_tsc)) {
1219 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
1221 /*Tx any packets in the queue*/
1222 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
1223 (struct rte_mbuf **)tx_q->m_table,
1224 (uint16_t)tx_q->len);
1225 if (unlikely(ret < tx_q->len)) {
1227 rte_pktmbuf_free(tx_q->m_table[ret]);
1228 } while (++ret < tx_q->len);
1238 rte_prefetch0(lcore_ll->ll_root_used);
1240 * Inform the configuration core that we have exited the linked list and that no devices are
1241 * in use if requested.
1243 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
1244 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
1249 dev_ll = lcore_ll->ll_root_used;
1251 while (dev_ll != NULL) {
1252 /*get virtio device ID*/
1253 vdev = dev_ll->vdev;
1256 if (unlikely(vdev->remove)) {
1257 dev_ll = dev_ll->next;
1259 vdev->ready = DEVICE_SAFE_REMOVE;
1262 if (likely(vdev->ready == DEVICE_RX)) {
1264 rx_count = rte_eth_rx_burst(ports[0],
1265 vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1269 * If retry is enabled and the queue is full then we wait and retry to avoid packet loss.
1270 * Here MAX_PKT_BURST must be less than the virtio queue size.
1272 if (enable_retry && unlikely(rx_count > rte_vring_available_entries(dev, VIRTIO_RXQ))) {
1273 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1274 rte_delay_us(burst_rx_delay_time);
1275 if (rx_count <= rte_vring_available_entries(dev, VIRTIO_RXQ))
1279 ret_count = rte_vhost_enqueue_burst(dev, VIRTIO_RXQ, pkts_burst, rx_count);
1282 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
1285 &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
1287 while (likely(rx_count)) {
1289 rte_pktmbuf_free(pkts_burst[rx_count]);
1295 if (likely(!vdev->remove)) {
1296 /* Handle guest TX*/
1297 tx_count = rte_vhost_dequeue_burst(dev, VIRTIO_TXQ, mbuf_pool, pkts_burst, MAX_PKT_BURST);
1298 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1299 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING) && tx_count) {
1300 if (vdev->remove || (link_vmdq(vdev, pkts_burst[0]) == -1)) {
1302 rte_pktmbuf_free(pkts_burst[tx_count]);
1306 virtio_tx_route(vdev, pkts_burst[--tx_count], (uint16_t)dev->device_fh);
1309 /*move to the next device in the list*/
1310 dev_ll = dev_ll->next;
1318 * This function gets the number of available ring entries for zero copy rx.
1319 * Only one thread will call this function for a particular virtio device,
1320 * so it is designed as a non-thread-safe function.
1322 static inline uint32_t __attribute__((always_inline))
1323 get_available_ring_num_zcp(struct virtio_net *dev)
1325 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1328 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1329 return (uint32_t)(avail_idx - vq->last_used_idx_res);
1333 * This function gets available ring indexes for zero copy rx;
1334 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
1335 * Only one thread will call this function for a particular virtio device,
1336 * so it is designed as a non-thread-safe function.
1338 static inline uint32_t __attribute__((always_inline))
1339 get_available_ring_index_zcp(struct virtio_net *dev,
1340 uint16_t *res_base_idx, uint32_t count)
1342 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
1345 uint16_t free_entries;
1347 *res_base_idx = vq->last_used_idx_res;
1348 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1349 free_entries = (avail_idx - *res_base_idx);
1351 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
1353 "res base idx:%d, free entries:%d\n",
1354 dev->device_fh, avail_idx, *res_base_idx,
1358 * If retry is enabled and the queue is full then we wait
1359 * and retry to avoid packet loss.
1361 if (enable_retry && unlikely(count > free_entries)) {
1362 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1363 rte_delay_us(burst_rx_delay_time);
1364 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1365 free_entries = (avail_idx - *res_base_idx);
1366 if (count <= free_entries)
1371 /*check that we have enough buffers*/
1372 if (unlikely(count > free_entries))
1373 count = free_entries;
1375 if (unlikely(count == 0)) {
1376 LOG_DEBUG(VHOST_DATA,
1377 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
1378 "avail idx: %d, res base idx:%d, free entries:%d\n",
1379 dev->device_fh, avail_idx,
1380 *res_base_idx, free_entries);
1384 vq->last_used_idx_res = *res_base_idx + count;
1390 * This function puts a descriptor back on the used list.
1392 static inline void __attribute__((always_inline))
1393 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
1395 uint16_t res_cur_idx = vq->last_used_idx;
1396 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
1397 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
1398 rte_compiler_barrier();
1399 *(volatile uint16_t *)&vq->used->idx += 1;
1400 vq->last_used_idx += 1;
1402 /* Kick the guest if necessary. */
1403 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1404 eventfd_write((int)vq->kickfd, 1);
1408 * This function gets an available descriptor from the virtio vring and an un-attached
1409 * mbuf from vpool->ring, and then attaches them together. It needs to adjust the offset
1410 * of buff_addr and phys_addr according to the PMD implementation, otherwise the
1411 * frame data may be put in the wrong location in the mbuf.
1413 static inline void __attribute__((always_inline))
1414 attach_rxmbuf_zcp(struct virtio_net *dev)
1416 uint16_t res_base_idx, desc_idx;
1417 uint64_t buff_addr, phys_addr;
1418 struct vhost_virtqueue *vq;
1419 struct vring_desc *desc;
1420 struct rte_mbuf *mbuf = NULL;
1421 struct vpool *vpool;
1423 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1425 vpool = &vpool_array[vdev->vmdq_rx_q];
1426 vq = dev->virtqueue[VIRTIO_RXQ];
1429 if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
1432 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
1434 desc = &vq->desc[desc_idx];
1435 if (desc->flags & VRING_DESC_F_NEXT) {
1436 desc = &vq->desc[desc->next];
1437 buff_addr = gpa_to_vva(dev, desc->addr);
1438 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
1441 buff_addr = gpa_to_vva(dev,
1442 desc->addr + vq->vhost_hlen);
1443 phys_addr = gpa_to_hpa(vdev,
1444 desc->addr + vq->vhost_hlen,
1445 desc->len, &addr_type);
1448 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1449 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
1450 " address found when attaching RX frame buffer"
1451 " address!\n", dev->device_fh);
1452 put_desc_to_used_list_zcp(vq, desc_idx);
1457 * Check if the frame buffer address from guest crosses
1458 * sub-region or not.
1460 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1461 RTE_LOG(ERR, VHOST_DATA,
1462 "(%"PRIu64") Frame buffer address cross "
1463 "sub-regioin found when attaching RX frame "
1464 "buffer address!\n",
1466 put_desc_to_used_list_zcp(vq, desc_idx);
1469 } while (unlikely(phys_addr == 0));
1471 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1472 if (unlikely(mbuf == NULL)) {
1473 LOG_DEBUG(VHOST_DATA,
1474 "(%"PRIu64") in attach_rxmbuf_zcp: "
1475 "ring_sc_dequeue fail.\n",
1477 put_desc_to_used_list_zcp(vq, desc_idx);
1481 if (unlikely(vpool->buf_size > desc->len)) {
1482 LOG_DEBUG(VHOST_DATA,
1483 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
1484 "length(%d) of descriptor idx: %d less than room "
1485 "size required: %d\n",
1486 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
1487 put_desc_to_used_list_zcp(vq, desc_idx);
1488 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1492 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
1493 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
1494 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
1495 mbuf->data_len = desc->len;
1496 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1498 LOG_DEBUG(VHOST_DATA,
1499 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
1500 "descriptor idx:%d\n",
1501 dev->device_fh, res_base_idx, desc_idx);
1503 __rte_mbuf_raw_free(mbuf);
1509 * Detach an attached packet mbuf -
1510 * - restore original mbuf address and length values.
1511 * - reset pktmbuf data and data_len to their default values.
1512 * All other fields of the given packet mbuf will be left intact.
1515 * The attached packet mbuf.
1517 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
1519 const struct rte_mempool *mp = m->pool;
1520 void *buf = RTE_MBUF_TO_BADDR(m);
1522 uint32_t buf_len = mp->elt_size - sizeof(*m);
1523 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
1526 m->buf_len = (uint16_t)buf_len;
1528 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
1529 RTE_PKTMBUF_HEADROOM : m->buf_len;
1530 m->data_off = buf_ofs;
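/*
 * Design note: while attached, a zero copy mbuf points at the guest supplied
 * frame buffer instead of its own mempool element, so it must be detached
 * (buffer address, length and data offset restored as above) before it is
 * recycled through vpool->ring; see txmbuf_clean_zcp() and mbuf_destroy_zcp()
 * below.
 */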
1536 * This function is called after packets have been transmitted. It fetches mbufs
1537 * from vpool->pool, detaches them and puts them into vpool->ring. It also updates the
1538 * used index and kicks the guest if necessary.
1540 static inline uint32_t __attribute__((always_inline))
1541 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
1543 struct rte_mbuf *mbuf;
1544 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1545 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
1547 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
1549 LOG_DEBUG(VHOST_DATA,
1550 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
1552 dev->device_fh, mbuf_count);
1553 LOG_DEBUG(VHOST_DATA,
1554 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
1556 dev->device_fh, rte_ring_count(vpool->ring));
1558 for (index = 0; index < mbuf_count; index++) {
1559 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1560 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1561 pktmbuf_detach_zcp(mbuf);
1562 rte_ring_sp_enqueue(vpool->ring, mbuf);
1564 /* Update used index buffer information. */
1565 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
1566 vq->used->ring[used_idx].len = 0;
1568 used_idx = (used_idx + 1) & (vq->size - 1);
1571 LOG_DEBUG(VHOST_DATA,
1572 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
1574 dev->device_fh, rte_mempool_count(vpool->pool));
1575 LOG_DEBUG(VHOST_DATA,
1576 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
1578 dev->device_fh, rte_ring_count(vpool->ring));
1579 LOG_DEBUG(VHOST_DATA,
1580 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
1581 "vq->last_used_idx:%d\n",
1582 dev->device_fh, vq->last_used_idx);
1584 vq->last_used_idx += mbuf_count;
1586 LOG_DEBUG(VHOST_DATA,
1587 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
1588 "vq->last_used_idx:%d\n",
1589 dev->device_fh, vq->last_used_idx);
1591 rte_compiler_barrier();
1593 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
1595 /* Kick guest if required. */
1596 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1597 eventfd_write((int)vq->kickfd, 1);
1603 * This function is called when a virtio device is destroyed.
1604 * It fetches mbufs from vpool->pool, detaches them, and puts them into vpool->ring.
1606 static void mbuf_destroy_zcp(struct vpool *vpool)
1608 struct rte_mbuf *mbuf = NULL;
1609 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
1611 LOG_DEBUG(VHOST_CONFIG,
1612 "in mbuf_destroy_zcp: mbuf count in mempool before "
1613 "mbuf_destroy_zcp is: %d\n",
1615 LOG_DEBUG(VHOST_CONFIG,
1616 "in mbuf_destroy_zcp: mbuf count in ring before "
1617 "mbuf_destroy_zcp is : %d\n",
1618 rte_ring_count(vpool->ring));
1620 for (index = 0; index < mbuf_count; index++) {
1621 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
1622 if (likely(mbuf != NULL)) {
1623 if (likely(RTE_MBUF_INDIRECT(mbuf)))
1624 pktmbuf_detach_zcp(mbuf);
1625 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
1629 LOG_DEBUG(VHOST_CONFIG,
1630 "in mbuf_destroy_zcp: mbuf count in mempool after "
1631 "mbuf_destroy_zcp is: %d\n",
1632 rte_mempool_count(vpool->pool));
1633 LOG_DEBUG(VHOST_CONFIG,
1634 "in mbuf_destroy_zcp: mbuf count in ring after "
1635 "mbuf_destroy_zcp is : %d\n",
1636 rte_ring_count(vpool->ring));
1640 * This function updates the used ring entries and the used index counter for the zero copy RX path.
1642 static inline uint32_t __attribute__((always_inline))
1643 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
1646 struct vhost_virtqueue *vq;
1647 struct vring_desc *desc;
1648 struct rte_mbuf *buff;
1649 /* The virtio_hdr is initialised to 0. */
1650 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
1651 = {{0, 0, 0, 0, 0, 0}, 0};
1652 uint64_t buff_hdr_addr = 0;
1653 uint32_t head[MAX_PKT_BURST], packet_len = 0;
1654 uint32_t head_idx, packet_success = 0;
1655 uint16_t res_cur_idx;
1657 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);
1662 vq = dev->virtqueue[VIRTIO_RXQ];
1663 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
1665 res_cur_idx = vq->last_used_idx;
1666 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
1667 dev->device_fh, res_cur_idx, res_cur_idx + count);
1669 /* Retrieve all of the head indexes first to avoid caching issues. */
1670 for (head_idx = 0; head_idx < count; head_idx++)
1671 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
1673 /*Prefetch descriptor index. */
1674 rte_prefetch0(&vq->desc[head[packet_success]]);
1676 while (packet_success != count) {
1677 /* Get descriptor from available ring */
1678 desc = &vq->desc[head[packet_success]];
1680 buff = pkts[packet_success];
1681 LOG_DEBUG(VHOST_DATA,
1682 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
1683 "pkt[%d] descriptor idx: %d\n",
1684 dev->device_fh, packet_success,
1685 MBUF_HEADROOM_UINT32(buff));
1688 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
1689 + RTE_PKTMBUF_HEADROOM),
1690 rte_pktmbuf_data_len(buff), 0);
1692 /* Buffer address translation for virtio header. */
1693 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
1694 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1697 * If the descriptors are chained the header and data are
1698 * placed in separate buffers.
1700 if (desc->flags & VRING_DESC_F_NEXT) {
1701 desc->len = vq->vhost_hlen;
1702 desc = &vq->desc[desc->next];
1703 desc->len = rte_pktmbuf_data_len(buff);
1705 desc->len = packet_len;
1708 /* Update used ring with desc information */
1709 vq->used->ring[res_cur_idx & (vq->size - 1)].id
1710 = head[packet_success];
1711 vq->used->ring[res_cur_idx & (vq->size - 1)].len
1716 /* A header is required per buffer. */
1717 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1718 (const void *)&virtio_hdr, vq->vhost_hlen);
1720 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1722 if (likely(packet_success < count)) {
1723 /* Prefetch descriptor index. */
1724 rte_prefetch0(&vq->desc[head[packet_success]]);
1728 rte_compiler_barrier();
1730 LOG_DEBUG(VHOST_DATA,
1731 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
1732 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1733 dev->device_fh, vq->last_used_idx, vq->used->idx);
1735 *(volatile uint16_t *)&vq->used->idx += count;
1736 vq->last_used_idx += count;
1738 LOG_DEBUG(VHOST_DATA,
1739 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
1740 "vq.last_used_idx: %d, vq->used->idx: %d\n",
1741 dev->device_fh, vq->last_used_idx, vq->used->idx);
1743 /* Kick the guest if necessary. */
1744 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1745 eventfd_write((int)vq->kickfd, 1);
1751 * This function routes the TX packet to the correct interface.
1752 * This may be a local device or the physical port.
1754 static inline void __attribute__((always_inline))
1755 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
1756 uint32_t desc_idx, uint8_t need_copy)
1758 struct mbuf_table *tx_q;
1759 struct rte_mbuf **m_table;
1760 struct rte_mbuf *mbuf = NULL;
1761 unsigned len, ret, offset = 0;
1762 struct vpool *vpool;
1763 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
1764 uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;
1766 /*Add packet to the port tx queue*/
1767 tx_q = &tx_queue_zcp[vmdq_rx_q];
1770 /* Allocate an mbuf and populate the structure. */
1771 vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
1772 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
1773 if (unlikely(mbuf == NULL)) {
1774 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
1775 RTE_LOG(ERR, VHOST_DATA,
1776 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
1778 put_desc_to_used_list_zcp(vq, desc_idx);
1782 if (vm2vm_mode == VM2VM_HARDWARE) {
1783 /* Avoid using a vlan tag from any vm for an external pkt, such as
1784 * vlan_tags[dev->device_fh]; otherwise it conflicts with pool
1785 * selection: the MAC address identifies it as an external pkt
1786 * that should go to the network, while the vlan tag identifies it as
1787 * a vm2vm pkt that should be forwarded to another vm. Hardware cannot
1788 * resolve such an ambiguous situation, so the pkt would be lost.
1790 vlan_tag = external_pkt_default_vlan_tag;
1791 if (find_local_dest(dev, m, &offset, &vlan_tag) != 0) {
1792 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1793 __rte_mbuf_raw_free(mbuf);
1798 mbuf->nb_segs = m->nb_segs;
1799 mbuf->next = m->next;
1800 mbuf->data_len = m->data_len + offset;
1801 mbuf->pkt_len = mbuf->data_len;
1802 if (unlikely(need_copy)) {
1803 /* Copy the packet contents to the mbuf. */
1804 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1805 rte_pktmbuf_mtod(m, void *),
1808 mbuf->data_off = m->data_off;
1809 mbuf->buf_physaddr = m->buf_physaddr;
1810 mbuf->buf_addr = m->buf_addr;
1812 mbuf->ol_flags = PKT_TX_VLAN_PKT;
1813 mbuf->vlan_tci = vlan_tag;
1814 mbuf->l2_len = sizeof(struct ether_hdr);
1815 mbuf->l3_len = sizeof(struct ipv4_hdr);
1816 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
1818 tx_q->m_table[len] = mbuf;
1821 LOG_DEBUG(VHOST_DATA,
1822 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
1825 (mbuf->next == NULL) ? "null" : "non-null");
1828 dev_statistics[dev->device_fh].tx_total++;
1829 dev_statistics[dev->device_fh].tx++;
1832 if (unlikely(len == MAX_PKT_BURST)) {
1833 m_table = (struct rte_mbuf **)tx_q->m_table;
1834 ret = rte_eth_tx_burst(ports[0],
1835 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1838 * Free any buffers not handled by TX and update
1841 if (unlikely(ret < len)) {
1843 rte_pktmbuf_free(m_table[ret]);
1844 } while (++ret < len);
1848 txmbuf_clean_zcp(dev, vpool);
1857 * This function TXs all available packets in the virtio TX queue for one
1858 * virtio-net device. If it is the first packet, it learns the MAC address and sets up the VMDQ.
1861 static inline void __attribute__((always_inline))
1862 virtio_dev_tx_zcp(struct virtio_net *dev)
1865 struct vhost_virtqueue *vq;
1866 struct vring_desc *desc;
1867 uint64_t buff_addr = 0, phys_addr;
1868 uint32_t head[MAX_PKT_BURST];
1870 uint16_t free_entries, packet_success = 0;
1872 uint8_t need_copy = 0;
1874 struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;
1876 vq = dev->virtqueue[VIRTIO_TXQ];
1877 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1879 /* If there are no available buffers then return. */
1880 if (vq->last_used_idx_res == avail_idx)
1883 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx_zcp()\n", dev->device_fh);
1885 /* Prefetch available ring to retrieve head indexes. */
1886 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
1888 /* Get the number of free entries in the ring */
1889 free_entries = (avail_idx - vq->last_used_idx_res);
1891 /* Limit to MAX_PKT_BURST. */
1893 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
1895 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1896 dev->device_fh, free_entries);
1898 /* Retrieve all of the head indexes first to avoid caching issues. */
1899 for (i = 0; i < free_entries; i++)
1901 = vq->avail->ring[(vq->last_used_idx_res + i)
1904 vq->last_used_idx_res += free_entries;
1906 /* Prefetch descriptor index. */
1907 rte_prefetch0(&vq->desc[head[packet_success]]);
1908 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1910 while (packet_success < free_entries) {
1911 desc = &vq->desc[head[packet_success]];
1913 /* Discard first buffer as it is the virtio header */
1914 desc = &vq->desc[desc->next];
1916 /* Buffer address translation. */
1917 buff_addr = gpa_to_vva(dev, desc->addr);
1918 /* Need to check an extra VLAN_HLEN bytes to leave room for inserting the VLAN tag */
1919 phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len + VLAN_HLEN,
1922 if (likely(packet_success < (free_entries - 1)))
1923 /* Prefetch descriptor index. */
1924 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
1926 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
1927 RTE_LOG(ERR, VHOST_DATA,
1928 "(%"PRIu64") Invalid frame buffer address found"
1929 "when TX packets!\n",
1935 /* Prefetch buffer address. */
1936 rte_prefetch0((void *)(uintptr_t)buff_addr);
1939 * Setup dummy mbuf. This is copied to a real mbuf if
1940 * transmitted out the physical port.
1942 m.data_len = desc->len;
1946 m.buf_addr = (void *)(uintptr_t)buff_addr;
1947 m.buf_physaddr = phys_addr;
1950 * Check if the frame buffer address from guest crosses
1951 * sub-region or not.
1953 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
1954 RTE_LOG(ERR, VHOST_DATA,
1955 "(%"PRIu64") Frame buffer address cross "
1956 "sub-regioin found when attaching TX frame "
1957 "buffer address!\n",
1963 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1966 * If this is the first received packet we need to learn
1967 * the MAC and setup VMDQ
1969 if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
1970 if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
1972 * Discard frame if device is scheduled for
1973 * removal or a duplicate MAC address is found.
1975 packet_success += free_entries;
1976 vq->last_used_idx += packet_success;
1981 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
1987 * This function is called by each data core. It handles all RX/TX registered
1988 * with the core. For TX the specific lcore linked list is used. For RX, MAC
1989 * addresses are compared with all devices in the main linked list.
1992 switch_worker_zcp(__attribute__((unused)) void *arg)
1994 struct virtio_net *dev = NULL;
1995 struct vhost_dev *vdev = NULL;
1996 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1997 struct virtio_net_data_ll *dev_ll;
1998 struct mbuf_table *tx_q;
1999 volatile struct lcore_ll_info *lcore_ll;
2000 const uint64_t drain_tsc
2001 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2002 * BURST_TX_DRAIN_US;
2003 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2005 const uint16_t lcore_id = rte_lcore_id();
2006 uint16_t count_in_ring, rx_count = 0;
2008 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2010 lcore_ll = lcore_info[lcore_id].lcore_ll;
2014 cur_tsc = rte_rdtsc();
2016 /* TX burst queue drain */
2017 diff_tsc = cur_tsc - prev_tsc;
2018 if (unlikely(diff_tsc > drain_tsc)) {
2020 * Get mbufs from vpool.pool, detach them and
2021 * put them back into vpool.ring.
2023 dev_ll = lcore_ll->ll_root_used;
2024 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2025 /* Get virtio device ID */
2026 vdev = dev_ll->vdev;
2029 if (likely(!vdev->remove)) {
2030 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2032 LOG_DEBUG(VHOST_DATA,
2033 "TX queue drained after timeout"
2034 " with burst size %u\n",
2038 * Tx any packets in the queue
2040 ret = rte_eth_tx_burst(
2042 (uint16_t)tx_q->txq_id,
2043 (struct rte_mbuf **)
2045 (uint16_t)tx_q->len);
2046 if (unlikely(ret < tx_q->len)) {
2049 tx_q->m_table[ret]);
2050 } while (++ret < tx_q->len);
2054 txmbuf_clean_zcp(dev,
2055 &vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
2058 dev_ll = dev_ll->next;
2063 rte_prefetch0(lcore_ll->ll_root_used);
2066 * Inform the configuration core that we have exited the linked
2067 * list and that no devices are in use if requested.
2069 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2070 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2072 /* Process devices */
2073 dev_ll = lcore_ll->ll_root_used;
2075 while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
2076 vdev = dev_ll->vdev;
2078 if (unlikely(vdev->remove)) {
2079 dev_ll = dev_ll->next;
2081 vdev->ready = DEVICE_SAFE_REMOVE;
2085 if (likely(vdev->ready == DEVICE_RX)) {
2086 uint32_t index = vdev->vmdq_rx_q;
2089 = rte_ring_count(vpool_array[index].ring);
2090 uint16_t free_entries
2091 = (uint16_t)get_available_ring_num_zcp(dev);
2094 * Attach all mbufs in vpool.ring and put back
2098 i < RTE_MIN(free_entries,
2099 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2101 attach_rxmbuf_zcp(dev);
2103 /* Handle guest RX */
2104 rx_count = rte_eth_rx_burst(ports[0],
2105 vdev->vmdq_rx_q, pkts_burst,
2109 ret_count = virtio_dev_rx_zcp(dev,
2110 pkts_burst, rx_count);
2112 dev_statistics[dev->device_fh].rx_total
2114 dev_statistics[dev->device_fh].rx
2117 while (likely(rx_count)) {
2120 pkts_burst[rx_count]);
2121 rte_ring_sp_enqueue(
2122 vpool_array[index].ring,
2123 (void *)pkts_burst[rx_count]);
2128 if (likely(!vdev->remove))
2129 /* Handle guest TX */
2130 virtio_dev_tx_zcp(dev);
2132 /* Move to the next device in the list */
2133 dev_ll = dev_ll->next;
2142 * Add an entry to a used linked list. A free entry must first be found
2143 * in the free linked list using get_data_ll_free_entry().
2146 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2147 struct virtio_net_data_ll *ll_dev)
2149 struct virtio_net_data_ll *ll = *ll_root_addr;
2151 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2152 ll_dev->next = NULL;
2153 rte_compiler_barrier();
2155 /* If ll == NULL then this is the first device. */
2157 /* Increment to the tail of the linked list. */
2158 while (ll->next != NULL)
2163 *ll_root_addr = ll_dev;
2168 * Remove an entry from a used linked list. The entry must then be added to
2169 * the free linked list using put_data_ll_free_entry().
2172 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2173 struct virtio_net_data_ll *ll_dev,
2174 struct virtio_net_data_ll *ll_dev_last)
2176 struct virtio_net_data_ll *ll = *ll_root_addr;
2178 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2182 *ll_root_addr = ll_dev->next;
2184 if (likely(ll_dev_last != NULL))
2185 ll_dev_last->next = ll_dev->next;
2187 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2191 * Find and return an entry from the free linked list.
2193 static struct virtio_net_data_ll *
2194 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2196 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2197 struct virtio_net_data_ll *ll_dev;
2199 if (ll_free == NULL)
2203 *ll_root_addr = ll_free->next;
2209 * Place an entry back on to the free linked list.
2212 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2213 struct virtio_net_data_ll *ll_dev)
2215 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2220 ll_dev->next = ll_free;
2221 *ll_root_addr = ll_dev;
2225 * Creates a linked list of a given size.
2227 static struct virtio_net_data_ll *
2228 alloc_data_ll(uint32_t size)
2230 struct virtio_net_data_ll *ll_new;
2233 /* Malloc and then chain the linked list. */
2234 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2235 if (ll_new == NULL) {
2236 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2240 for (i = 0; i < size - 1; i++) {
2241 ll_new[i].vdev = NULL;
2242 ll_new[i].next = &ll_new[i+1];
2244 ll_new[i].next = NULL;
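/*
 * Illustrative sketch (hypothetical helper, never called): the typical life
 * cycle of a linked-list entry using the helpers above. An entry is claimed
 * from the free list, filled in and published on the used list; removal walks
 * the used list to find the predecessor, unlinks the entry and recycles it on
 * the free list, mirroring what new_device() and destroy_device() below do.
 */
static inline void
ll_entry_lifecycle_sketch(struct vhost_dev *vdev)
{
	struct virtio_net_data_ll *ll_dev, *cur, *prev = NULL;

	/* Claim a spare entry from the free list and publish it. */
	ll_dev = get_data_ll_free_entry(&ll_root_free);
	if (ll_dev == NULL)
		return;
	ll_dev->vdev = vdev;
	add_data_ll_entry(&ll_root_used, ll_dev);

	/* Tear-down: locate the entry and its predecessor in the used list. */
	for (cur = ll_root_used; cur != NULL; prev = cur, cur = cur->next)
		if (cur == ll_dev)
			break;
	if (cur == NULL)
		return;

	/* Unlink from the used list and recycle the entry on the free list. */
	rm_data_ll_entry(&ll_root_used, cur, prev);
	put_data_ll_free_entry(&ll_root_free, cur);
}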
2250 * Create the main linked list along with each individual core's linked list. A used and a free list
2251 * are created to manage entries.
2258 RTE_LCORE_FOREACH_SLAVE(lcore) {
2259 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2260 if (lcore_info[lcore].lcore_ll == NULL) {
2261 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2265 lcore_info[lcore].lcore_ll->device_num = 0;
2266 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2267 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
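/*
 * Each core's free list is sized so devices can be spread evenly over the
 * switching cores, rounding up when the division below leaves a remainder.
 * For example, num_devices = 6 and num_switching_cores = 4 would give each
 * core 6/4 + 1 = 2 free entries.
 */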
2268 if (num_devices % num_switching_cores)
2269 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2271 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
2274 /* Allocate devices up to a maximum of MAX_DEVICES. */
2275 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2281 * Remove a device from the specific data core's linked list and from the main linked list. Synchronization
2282 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
2283 * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
2286 destroy_device (volatile struct virtio_net *dev)
2288 struct virtio_net_data_ll *ll_lcore_dev_cur;
2289 struct virtio_net_data_ll *ll_main_dev_cur;
2290 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2291 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2292 struct vhost_dev *vdev;
2295 dev->flags &= ~VIRTIO_DEV_RUNNING;
2297 vdev = (struct vhost_dev *)dev->priv;
2298 /* Set the remove flag. */
2300 while(vdev->ready != DEVICE_SAFE_REMOVE) {
2304 /* Search for entry to be removed from lcore ll */
2305 ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
2306 while (ll_lcore_dev_cur != NULL) {
2307 if (ll_lcore_dev_cur->vdev == vdev) {
2310 ll_lcore_dev_last = ll_lcore_dev_cur;
2311 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2315 if (ll_lcore_dev_cur == NULL) {
2316 RTE_LOG(ERR, VHOST_CONFIG,
2317 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2322 /* Search for entry to be removed from main ll */
2323 ll_main_dev_cur = ll_root_used;
2324 ll_main_dev_last = NULL;
2325 while (ll_main_dev_cur != NULL) {
2326 if (ll_main_dev_cur->vdev == vdev) {
2329 ll_main_dev_last = ll_main_dev_cur;
2330 ll_main_dev_cur = ll_main_dev_cur->next;
2334 /* Remove entries from the lcore and main ll. */
2335 rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2336 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2338 /* Set the dev_removal_flag on each lcore. */
2339 RTE_LCORE_FOREACH_SLAVE(lcore) {
2340 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2344 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
2345 * they can no longer access the device removed from the linked lists and that the device
2346 * is no longer in use.
2348 RTE_LCORE_FOREACH_SLAVE(lcore) {
2349 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2354 /* Add the entries back to the lcore and main free ll.*/
2355 put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2356 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2358 /* Decrement the number of devices on the lcore. */
2359 lcore_info[vdev->coreid].lcore_ll->device_num--;
2361 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2364 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2366 /* Stop the RX queue. */
2367 if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2368 LOG_DEBUG(VHOST_CONFIG,
2369 "(%"PRIu64") In destroy_device: Failed to stop "
2375 LOG_DEBUG(VHOST_CONFIG,
2376 "(%"PRIu64") in destroy_device: Start put mbuf in "
2377 "mempool back to ring for RX queue: %d\n",
2378 dev->device_fh, vdev->vmdq_rx_q);
2380 mbuf_destroy_zcp(vpool);
2382 /* Stop the TX queue. */
2383 if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
2384 LOG_DEBUG(VHOST_CONFIG,
2385 "(%"PRIu64") In destroy_device: Failed to "
2386 "stop tx queue:%d\n",
2387 dev->device_fh, vdev->vmdq_rx_q);
2390 vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];
2392 LOG_DEBUG(VHOST_CONFIG,
2393 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2394 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2395 dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
2398 mbuf_destroy_zcp(vpool);
2399 rte_free(vdev->regions_hpa);
2406 * Calculate the number of physically contiguous sub-regions within one
2407 * particular region whose vhost virtual address is contiguous. The region
2408 * starts at vva_start, with a size of 'size' bytes.
2411 check_hpa_regions(uint64_t vva_start, uint64_t size)
2413 uint32_t i, nregions = 0, page_size = getpagesize();
2414 uint64_t cur_phys_addr = 0, next_phys_addr = 0;
2415 if (vva_start % page_size) {
2416 LOG_DEBUG(VHOST_CONFIG,
2417 "in check_countinous: vva start(%p) mod page_size(%d) "
2419 (void *)(uintptr_t)vva_start, page_size);
2422 if (size % page_size) {
2423 LOG_DEBUG(VHOST_CONFIG,
2424 "in check_countinous: "
2425 "size((%"PRIu64")) mod page_size(%d) has remainder\n",
2429 for (i = 0; i < size - page_size; i = i + page_size) {
2431 = rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
2432 next_phys_addr = rte_mem_virt2phy(
2433 (void *)(uintptr_t)(vva_start + i + page_size));
2434 if ((cur_phys_addr + page_size) != next_phys_addr) {
2436 LOG_DEBUG(VHOST_CONFIG,
2437 "in check_continuous: hva addr:(%p) is not "
2438 "continuous with hva addr:(%p), diff:%d\n",
2439 (void *)(uintptr_t)(vva_start + (uint64_t)i),
2440 (void *)(uintptr_t)(vva_start + (uint64_t)i
2441 + page_size), page_size);
2442 LOG_DEBUG(VHOST_CONFIG,
2443 "in check_continuous: hpa addr:(%p) is not "
2444 "continuous with hpa addr:(%p), "
2445 "diff:(%"PRIu64")\n",
2446 (void *)(uintptr_t)cur_phys_addr,
2447 (void *)(uintptr_t)next_phys_addr,
2448 (next_phys_addr-cur_phys_addr));
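/*
 * Example: with 4 KB pages, a virtually contiguous 2 MB region whose first
 * and second megabytes land in two separate physical chunks hits exactly one
 * physical discontinuity in the page-by-page walk above, so one extra
 * sub-region is counted for it.
 */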
2455 * Divide each region whose vhost virtual address is contiguous into
2456 * sub-regions, making sure the physical addresses within each sub-region
2457 * are contiguous, and fill the offset (to GPA), size and other information
2458 * of each sub-region into regions_hpa.
2461 fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa, struct virtio_memory *virtio_memory)
2463 uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
2464 uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;
2466 if (mem_region_hpa == NULL)
2469 for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
2470 vva_start = virtio_memory->regions[regionidx].guest_phys_address +
2471 virtio_memory->regions[regionidx].address_offset;
2472 mem_region_hpa[regionidx_hpa].guest_phys_address
2473 = virtio_memory->regions[regionidx].guest_phys_address;
2474 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2475 rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
2476 mem_region_hpa[regionidx_hpa].guest_phys_address;
2477 LOG_DEBUG(VHOST_CONFIG,
2478 "in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
2481 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2482 LOG_DEBUG(VHOST_CONFIG,
2483 "in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
2486 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2488 i < virtio_memory->regions[regionidx].memory_size -
2491 cur_phys_addr = rte_mem_virt2phy(
2492 (void *)(uintptr_t)(vva_start + i));
2493 next_phys_addr = rte_mem_virt2phy(
2494 (void *)(uintptr_t)(vva_start +
2496 if ((cur_phys_addr + page_size) != next_phys_addr) {
2497 mem_region_hpa[regionidx_hpa].guest_phys_address_end =
2498 mem_region_hpa[regionidx_hpa].guest_phys_address +
2500 mem_region_hpa[regionidx_hpa].memory_size
2502 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
2503 "phys addr end [%d]:(%p)\n",
2506 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2507 LOG_DEBUG(VHOST_CONFIG,
2508 "in fill_hpa_regions: guest phys addr "
2512 (mem_region_hpa[regionidx_hpa].memory_size));
2513 mem_region_hpa[regionidx_hpa + 1].guest_phys_address
2514 = mem_region_hpa[regionidx_hpa].guest_phys_address_end;
2516 mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
2518 mem_region_hpa[regionidx_hpa].guest_phys_address;
2519 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
2520 " phys addr start[%d]:(%p)\n",
2523 (mem_region_hpa[regionidx_hpa].guest_phys_address));
2524 LOG_DEBUG(VHOST_CONFIG,
2525 "in fill_hpa_regions: host phys addr "
2529 (mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
2535 mem_region_hpa[regionidx_hpa].guest_phys_address_end
2536 = mem_region_hpa[regionidx_hpa].guest_phys_address
2538 mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
2539 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
2540 "[%d]:(%p)\n", regionidx_hpa,
2542 (mem_region_hpa[regionidx_hpa].guest_phys_address_end));
2543 LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
2544 "[%d]:(%p)\n", regionidx_hpa,
2546 (mem_region_hpa[regionidx_hpa].memory_size));
2549 return regionidx_hpa;
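/*
 * Illustrative sketch (hypothetical helper): how a guest physical address
 * (GPA) can be translated to a host physical address (HPA) with the
 * regions_hpa table built above. The per-sub-region offset is simply added
 * to any GPA that falls inside that sub-region's [start, end) range.
 */
static inline uint64_t
gpa_to_hpa_sketch(struct vhost_dev *vdev, uint64_t guest_pa)
{
	uint32_t i;

	for (i = 0; i < vdev->nregions_hpa; i++) {
		if (guest_pa >= vdev->regions_hpa[i].guest_phys_address &&
			guest_pa < vdev->regions_hpa[i].guest_phys_address_end)
			return guest_pa
				+ vdev->regions_hpa[i].host_phys_addr_offset;
	}

	return 0; /* No matching sub-region found. */
}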
2553 * A new device is added to a data core. First the device is added to the main linked list
2554 * and then allocated to a specific data core.
2557 new_device (struct virtio_net *dev)
2559 struct virtio_net_data_ll *ll_dev;
2560 int lcore, core_add = 0;
2561 uint32_t device_num_min = num_devices;
2562 struct vhost_dev *vdev;
2565 vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
2567 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
2575 vdev->nregions_hpa = dev->mem->nregions;
2576 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
2578 += check_hpa_regions(
2579 dev->mem->regions[regionidx].guest_phys_address
2580 + dev->mem->regions[regionidx].address_offset,
2581 dev->mem->regions[regionidx].memory_size);
2585 vdev->regions_hpa = (struct virtio_memory_regions_hpa *) rte_zmalloc("vhost hpa region",
2586 sizeof(struct virtio_memory_regions_hpa) * vdev->nregions_hpa,
2588 if (vdev->regions_hpa == NULL) {
2589 RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
2595 if (fill_hpa_memory_regions(
2596 vdev->regions_hpa, dev->mem
2597 ) != vdev->nregions_hpa) {
2599 RTE_LOG(ERR, VHOST_CONFIG,
2600 "hpa memory regions number mismatch: "
2601 "[%d]\n", vdev->nregions_hpa);
2602 rte_free(vdev->regions_hpa);
2609 /* Add device to main ll */
2610 ll_dev = get_data_ll_free_entry(&ll_root_free);
2611 if (ll_dev == NULL) {
2612 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2613 "of %d devices per core has been reached\n",
2614 dev->device_fh, num_devices);
2615 if (vdev->regions_hpa)
2616 rte_free(vdev->regions_hpa);
2620 ll_dev->vdev = vdev;
2621 add_data_ll_entry(&ll_root_used, ll_dev);
2623 = dev->device_fh * (num_queues / num_devices);
2626 uint32_t index = vdev->vmdq_rx_q;
2627 uint32_t count_in_ring, i;
2628 struct mbuf_table *tx_q;
2630 count_in_ring = rte_ring_count(vpool_array[index].ring);
2632 LOG_DEBUG(VHOST_CONFIG,
2633 "(%"PRIu64") in new_device: mbuf count in mempool "
2634 "before attach is: %d\n",
2636 rte_mempool_count(vpool_array[index].pool));
2637 LOG_DEBUG(VHOST_CONFIG,
2638 "(%"PRIu64") in new_device: mbuf count in ring "
2639 "before attach is : %d\n",
2640 dev->device_fh, count_in_ring);
2643 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2645 for (i = 0; i < count_in_ring; i++)
2646 attach_rxmbuf_zcp(dev);
2648 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2649 "mempool after attach is: %d\n",
2651 rte_mempool_count(vpool_array[index].pool));
2652 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2653 "ring after attach is : %d\n",
2655 rte_ring_count(vpool_array[index].ring));
2657 tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
2658 tx_q->txq_id = vdev->vmdq_rx_q;
2660 if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2661 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2663 LOG_DEBUG(VHOST_CONFIG,
2664 "(%"PRIu64") In new_device: Failed to start "
2666 dev->device_fh, vdev->vmdq_rx_q);
2668 mbuf_destroy_zcp(vpool);
2669 rte_free(vdev->regions_hpa);
2674 if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
2675 struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];
2677 LOG_DEBUG(VHOST_CONFIG,
2678 "(%"PRIu64") In new_device: Failed to start "
2680 dev->device_fh, vdev->vmdq_rx_q);
2682 /* Stop the TX queue. */
2683 if (rte_eth_dev_tx_queue_stop(ports[0],
2684 vdev->vmdq_rx_q) != 0) {
2685 LOG_DEBUG(VHOST_CONFIG,
2686 "(%"PRIu64") In new_device: Failed to "
2687 "stop tx queue:%d\n",
2688 dev->device_fh, vdev->vmdq_rx_q);
2691 mbuf_destroy_zcp(vpool);
2692 rte_free(vdev->regions_hpa);
2699 /* Reset the ready flag. */
2700 vdev->ready = DEVICE_MAC_LEARNING;
2703 /* Find a suitable lcore to add the device. */
2704 RTE_LCORE_FOREACH_SLAVE(lcore) {
2705 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2706 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2710 /* Add device to lcore ll */
2711 ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
2712 if (ll_dev == NULL) {
2713 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2714 vdev->ready = DEVICE_SAFE_REMOVE;
2715 destroy_device(dev);
2716 if (vdev->regions_hpa)
2717 rte_free(vdev->regions_hpa);
2721 ll_dev->vdev = vdev;
2722 vdev->coreid = core_add;
2724 add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);
2726 /* Initialize device stats */
2727 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2729 /* Disable notifications. */
2730 rte_vhost_enable_guest_notification(dev, VIRTIO_RXQ, 0);
2731 rte_vhost_enable_guest_notification(dev, VIRTIO_TXQ, 0);
2732 lcore_info[vdev->coreid].lcore_ll->device_num++;
2733 dev->flags |= VIRTIO_DEV_RUNNING;
2735 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);
2741 * These callbacks allow devices to be added to the data core when configuration
2742 * has been fully completed.
2744 static const struct virtio_net_device_ops virtio_net_device_ops =
2746 .new_device = new_device,
2747 .destroy_device = destroy_device,
2751 * This is a thread that will wake up after a period to print stats if the user has
2757 struct virtio_net_data_ll *dev_ll;
2758 uint64_t tx_dropped, rx_dropped;
2759 uint64_t tx, tx_total, rx, rx_total;
2761 const char clr[] = { 27, '[', '2', 'J', '\0' };
2762 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2765 sleep(enable_stats);
2767 /* Clear screen and move to top left */
2768 printf("%s%s", clr, top_left);
2770 printf("\nDevice statistics ====================================");
2772 dev_ll = ll_root_used;
2773 while (dev_ll != NULL) {
2774 device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
2775 tx_total = dev_statistics[device_fh].tx_total;
2776 tx = dev_statistics[device_fh].tx;
2777 tx_dropped = tx_total - tx;
2778 if (zero_copy == 0) {
2779 rx_total = rte_atomic64_read(
2780 &dev_statistics[device_fh].rx_total_atomic);
2781 rx = rte_atomic64_read(
2782 &dev_statistics[device_fh].rx_atomic);
2784 rx_total = dev_statistics[device_fh].rx_total;
2785 rx = dev_statistics[device_fh].rx;
2787 rx_dropped = rx_total - rx;
2789 printf("\nStatistics for device %"PRIu32" ------------------------------"
2790 "\nTX total: %"PRIu64""
2791 "\nTX dropped: %"PRIu64""
2792 "\nTX successful: %"PRIu64""
2793 "\nRX total: %"PRIu64""
2794 "\nRX dropped: %"PRIu64""
2795 "\nRX successful: %"PRIu64"",
2804 dev_ll = dev_ll->next;
2806 printf("\n======================================================\n");
2811 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2812 char *ring_name, uint32_t nb_mbuf)
2814 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2815 vpool_array[index].pool
2816 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2817 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2818 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2819 rte_pktmbuf_init, NULL, socket, 0);
2820 if (vpool_array[index].pool != NULL) {
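/*
 * rte_ring sizes must be a power of two and one ring slot always stays
 * unused, so nb_mbuf + 1 is rounded up to guarantee room for every mbuf
 * in the pool.
 */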
2821 vpool_array[index].ring
2822 = rte_ring_create(ring_name,
2823 rte_align32pow2(nb_mbuf + 1),
2824 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2825 if (likely(vpool_array[index].ring != NULL)) {
2826 LOG_DEBUG(VHOST_CONFIG,
2827 "in setup_mempool_tbl: mbuf count in "
2829 rte_mempool_count(vpool_array[index].pool));
2830 LOG_DEBUG(VHOST_CONFIG,
2831 "in setup_mempool_tbl: mbuf count in "
2833 rte_ring_count(vpool_array[index].ring));
2835 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2839 /* Need to consider headroom. */
2840 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2842 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
2848 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2849 * device is also registered here to handle the IOCTLs.
2852 MAIN(int argc, char *argv[])
2854 struct rte_mempool *mbuf_pool = NULL;
2855 unsigned lcore_id, core_id = 0;
2856 unsigned nb_ports, valid_num_ports;
2858 uint8_t portid, queue_id = 0;
2859 static pthread_t tid;
2862 ret = rte_eal_init(argc, argv);
2864 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2868 /* parse app arguments */
2869 ret = us_vhost_parse_args(argc, argv);
2871 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2873 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2874 if (rte_lcore_is_enabled(lcore_id))
2875 lcore_ids[core_id++] = lcore_id;
2877 if (rte_lcore_count() > RTE_MAX_LCORE)
2878 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2880 /* Set the number of switching cores available. */
2881 num_switching_cores = rte_lcore_count()-1;
2883 /* Get the number of physical ports. */
2884 nb_ports = rte_eth_dev_count();
2885 if (nb_ports > RTE_MAX_ETHPORTS)
2886 nb_ports = RTE_MAX_ETHPORTS;
2889 * Update the global variable num_ports and the global array ports[],
2890 * and get the value of valid_num_ports according to the number of ports in the system.
2892 valid_num_ports = check_ports_num(nb_ports);
2894 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2895 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2896 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2900 if (zero_copy == 0) {
2901 /* Create the mbuf pool. */
2902 mbuf_pool = rte_mempool_create(
2906 MBUF_SIZE, MBUF_CACHE_SIZE,
2907 sizeof(struct rte_pktmbuf_pool_private),
2908 rte_pktmbuf_pool_init, NULL,
2909 rte_pktmbuf_init, NULL,
2910 rte_socket_id(), 0);
2911 if (mbuf_pool == NULL)
2912 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2914 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2915 vpool_array[queue_id].pool = mbuf_pool;
2917 if (vm2vm_mode == VM2VM_HARDWARE) {
2918 /* Enable VT loopback to let the L2 switch handle VM-to-VM traffic. */
2919 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2920 LOG_DEBUG(VHOST_CONFIG,
2921 "Enable loop back for L2 switch in vmdq.\n");
2925 char pool_name[RTE_MEMPOOL_NAMESIZE];
2926 char ring_name[RTE_MEMPOOL_NAMESIZE];
2929 * Zero copy defers queue RX/TX start to the time when guest
2930 * finishes its startup and packet buffers from that guest are
2933 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2934 rx_conf_default.rx_drop_en = 0;
2935 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2936 nb_mbuf = num_rx_descriptor
2937 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2938 + num_switching_cores * MAX_PKT_BURST;
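/*
 * Worked example of the sizing above with illustrative numbers: 128 RX
 * descriptors, 3 switching cores and an assumed MAX_PKT_BURST of 32, with
 * MBUF_CACHE_SIZE_ZCP defined as 0, give nb_mbuf = 128 + 3*0 + 3*32 = 224
 * mbufs per RX-queue pool.
 */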
2940 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2941 snprintf(pool_name, sizeof(pool_name),
2942 "rxmbuf_pool_%u", queue_id);
2943 snprintf(ring_name, sizeof(ring_name),
2944 "rxmbuf_ring_%u", queue_id);
2945 setup_mempool_tbl(rte_socket_id(), queue_id,
2946 pool_name, ring_name, nb_mbuf);
2949 nb_mbuf = num_tx_descriptor
2950 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2951 + num_switching_cores * MAX_PKT_BURST;
2953 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2954 snprintf(pool_name, sizeof(pool_name),
2955 "txmbuf_pool_%u", queue_id);
2956 snprintf(ring_name, sizeof(ring_name),
2957 "txmbuf_ring_%u", queue_id);
2958 setup_mempool_tbl(rte_socket_id(),
2959 (queue_id + MAX_QUEUES),
2960 pool_name, ring_name, nb_mbuf);
2963 if (vm2vm_mode == VM2VM_HARDWARE) {
2964 /* Enable VT loopback to let the L2 switch handle VM-to-VM traffic. */
2965 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2966 LOG_DEBUG(VHOST_CONFIG,
2967 "Enable loop back for L2 switch in vmdq.\n");
2970 /* Set log level. */
2971 rte_set_log_level(LOG_LEVEL);
2973 /* initialize all ports */
2974 for (portid = 0; portid < nb_ports; portid++) {
2975 /* skip ports that are not enabled */
2976 if ((enabled_port_mask & (1 << portid)) == 0) {
2977 RTE_LOG(INFO, VHOST_PORT,
2978 "Skipping disabled port %d\n", portid);
2981 if (port_init(portid) != 0)
2982 rte_exit(EXIT_FAILURE,
2983 "Cannot initialize network ports\n");
2986 /* Initialise all linked lists. */
2987 if (init_data_ll() == -1)
2988 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2990 /* Initialize device stats */
2991 memset(&dev_statistics, 0, sizeof(dev_statistics));
2993 /* Enable stats if the user option is set. */
2995 pthread_create(&tid, NULL, (void *)print_stats, NULL);
2997 /* Launch all data cores. */
2998 if (zero_copy == 0) {
2999 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3000 rte_eal_remote_launch(switch_worker,
3001 mbuf_pool, lcore_id);
3004 uint32_t count_in_mempool, index, i;
3005 for (index = 0; index < 2*MAX_QUEUES; index++) {
3006 /* For all RX and TX queues. */
3008 = rte_mempool_count(vpool_array[index].pool);
3011 * Transfer all un-attached mbufs from vpool.pool
3014 for (i = 0; i < count_in_mempool; i++) {
3015 struct rte_mbuf *mbuf
3016 = __rte_mbuf_raw_alloc(
3017 vpool_array[index].pool);
3018 rte_ring_sp_enqueue(vpool_array[index].ring,
3022 LOG_DEBUG(VHOST_CONFIG,
3023 "in MAIN: mbuf count in mempool at initial "
3024 "is: %d\n", count_in_mempool);
3025 LOG_DEBUG(VHOST_CONFIG,
3026 "in MAIN: mbuf count in ring at initial is :"
3028 rte_ring_count(vpool_array[index].ring));
3031 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3032 rte_eal_remote_launch(switch_worker_zcp, NULL,
3037 rte_vhost_feature_disable(1ULL << VIRTIO_NET_F_MRG_RXBUF);
3039 /* Register CUSE device to handle IOCTLs. */
3040 ret = rte_vhost_driver_register((char *)&dev_basename);
3042 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
3044 rte_vhost_driver_callback_register(&virtio_net_device_ops);
3046 /* Start CUSE session. */
3047 rte_vhost_driver_session_start();