/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <arpa/inet.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <signal.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>

#include "main.h"
#include "virtio-net.h"
#include "vhost-net-cdev.h"
#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
        (num_switching_cores*MAX_PKT_BURST) + \
        (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
        (num_switching_cores*MBUF_CACHE_SIZE))
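/*
 * Illustrative arithmetic only: with the defaults below
 * (RTE_TEST_RX_DESC_DEFAULT = 1024, MAX_PKT_BURST = 32,
 * RTE_TEST_TX_DESC_DEFAULT = 512, MBUF_CACHE_SIZE = 128) and, say,
 * num_switching_cores = 2, this works out to
 * 128*1024 + 2*32 + 2*512 + 2*128 = 132416 mbufs per port.
 */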
#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers, and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
        + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0
/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0  /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0  /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16 /* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4 /* Number of retries on RX. */

#define JUMBO_FRAME_MAX_SIZE 0x2600
/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512
/*
 * These two macros need refining for the legacy and DPDK based front ends:
 * the maximum number of available vring descriptors/entries from the guest
 * minus MAX_PKT_BURST, then rounded down to a power of 2.
 *
 * For the legacy front end, 128 descriptors:
 * half for the virtio headers, the other half for the mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
        + sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
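/*
 * Note: 0 also passes the POWEROF2() test even though it is not a power of
 * two, so callers must reject 0 separately if it is not acceptable.
 */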
#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
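/*
 * For example, with a 64-byte cache line and the 16-byte struct vring_desc
 * (8-byte addr, 4-byte len, 2-byte flags, 2-byte next) this evaluates to 4
 * descriptors per cache line.
 */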
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMAd directly to/from the guest's
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* max ring descriptor, ixgbe, i40e, e1000 all are 4096. */
#define MAX_RING_DESC 4096

static struct vpool {
    struct rte_mempool *pool;
    struct rte_ring *ring;
    uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
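/*
 * Layout assumed throughout this file: the first MAX_QUEUES entries are the
 * RX pools (indexed by vmdq_rx_q in attach_rxmbuf_zcp()), the second
 * MAX_QUEUES entries are the TX pools (indexed by MAX_QUEUES + vmdq_rx_q in
 * virtio_tx_route_zcp()).
 */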
/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
    VM2VM_DISABLED = 0,
    VM2VM_SOFTWARE = 1,
    VM2VM_HARDWARE = 2,
    VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
    PHYS_ADDR_CONTINUOUS = 0,
    PHYS_ADDR_CROSS_SUBREG = 1,
    PHYS_ADDR_INVALID = 2,
    PHYS_ADDR_LAST
} hpa_type;
/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in useconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Character device index. Can be set by user. */
static uint32_t dev_index = 0;

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;
/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
    .rx_thresh = {
        .pthresh = RX_PTHRESH,
        .hthresh = RX_HTHRESH,
        .wthresh = RX_WTHRESH,
    },
    .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
    .tx_thresh = {
        .pthresh = TX_PTHRESH,
        .hthresh = TX_HTHRESH,
        .wthresh = TX_WTHRESH,
    },
    .tx_free_thresh = 0, /* Use PMD default values */
    .tx_rs_thresh = 0, /* Use PMD default values */
};
/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
    .rxmode = {
        .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
        .split_hdr_size = 0,
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        /*
         * It is necessary for a 1G NIC such as the I350;
         * this fixes a bug where IPv4 forwarding in the guest could not
         * forward packets from one virtio dev to another virtio dev.
         */
        .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
    },

    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
    .rx_adv_conf = {
        /*
         * should be overridden separately in code with
         * appropriate values
         */
        .vmdq_rx_conf = {
            .nb_queue_pools = ETH_8_POOLS,
            .enable_default_pool = 0,
            .default_pool = 0,
            .nb_pool_maps = 0,
            .pool_map = {{0, 0},},
        },
    },
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
    1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
    1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
    1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
    1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
    1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
    1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
    1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
    1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
    unsigned len;
    unsigned txq_id;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
    unsigned char h_dest[ETH_ALEN];
    unsigned char h_source[ETH_ALEN];
    __be16 h_vlan_proto;
    __be16 h_vlan_TCI;
    __be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
    uint8_t  version_ihl;     /**< version and header length */
    uint8_t  type_of_service; /**< type of service */
    uint16_t total_length;    /**< length of packet */
    uint16_t packet_id;       /**< packet ID */
    uint16_t fragment_offset; /**< fragmentation offset */
    uint8_t  time_to_live;    /**< time to live */
    uint8_t  next_proto_id;   /**< protocol ID */
    uint16_t hdr_checksum;    /**< header checksum */
    uint32_t src_addr;        /**< source address */
    uint32_t dst_addr;        /**< destination address */
} __attribute__((__packed__));
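/*
 * In this file the IPv4 header definition is only used via sizeof() when
 * setting mbuf->l3_len in virtio_tx_route_zcp().
 */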
/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18

/* Per-device statistics struct */
struct device_statistics {
    uint64_t tx_total;
    rte_atomic64_t rx_total_atomic;
    uint64_t rx_total;
    uint64_t tx;
    rte_atomic64_t rx_atomic;
    uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
    struct rte_eth_vmdq_rx_conf conf;
    unsigned i;

    memset(&conf, 0, sizeof(conf));
    conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
    conf.nb_pool_maps = num_devices;
    conf.enable_loop_back =
        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

    for (i = 0; i < conf.nb_pool_maps; i++) {
        conf.pool_map[i].vlan_id = vlan_tags[i];
        conf.pool_map[i].pools = (1UL << i);
    }

    (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
    (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
        sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));

    return 0;
}
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give the error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
    if (num_devices > max_nb_devices) {
        RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
        return -1;
    }

    return 0;
}
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
    struct rte_eth_dev_info dev_info;
    struct rte_eth_conf port_conf;
    uint16_t rx_rings, tx_rings;
    uint16_t rx_ring_size, tx_ring_size;
    int retval;
    uint16_t q;

    /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
    rte_eth_dev_info_get(port, &dev_info);

    /* Configure the number of supported virtio devices based on VMDQ limits */
    num_devices = dev_info.max_vmdq_pools;
    num_queues = dev_info.max_rx_queues;

    if (zero_copy) {
        rx_ring_size = num_rx_descriptor;
        tx_ring_size = num_tx_descriptor;
        tx_rings = dev_info.max_tx_queues;
    } else {
        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();
    }

    retval = validate_num_devices(MAX_DEVICES);
    if (retval < 0)
        return retval;

    /* Get port configuration. */
    retval = get_eth_conf(&port_conf, num_devices);
    if (retval < 0)
        return retval;

    if (port >= rte_eth_dev_count())
        return -1;

    rx_rings = (uint16_t)num_queues;
    /* Configure ethernet device. */
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0)
        return retval;

    /* Setup the queues. */
    for (q = 0; q < rx_rings; q++) {
        retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
            rte_eth_dev_socket_id(port), &rx_conf_default,
            vpool_array[q].pool);
        if (retval < 0)
            return retval;
    }
    for (q = 0; q < tx_rings; q++) {
        retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
            rte_eth_dev_socket_id(port), &tx_conf_default);
        if (retval < 0)
            return retval;
    }

    /* Start the device. */
    retval = rte_eth_dev_start(port);
    if (retval < 0) {
        RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
        return retval;
    }

    rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
    RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
    RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
            " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
            (unsigned)port,
            vmdq_ports_eth_addr[port].addr_bytes[0],
            vmdq_ports_eth_addr[port].addr_bytes[1],
            vmdq_ports_eth_addr[port].addr_bytes[2],
            vmdq_ports_eth_addr[port].addr_bytes[3],
            vmdq_ports_eth_addr[port].addr_bytes[4],
            vmdq_ports_eth_addr[port].addr_bytes[5]);

    return 0;
}
/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
    /* parse the basename string */
    if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
        return -1;
    else
        snprintf(dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

    return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
    char *end = NULL;
    unsigned long pm;

    errno = 0;

    /* parse hexadecimal string */
    pm = strtoul(portmask, &end, 16);
    if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (pm == 0)
        return -1;

    return pm;
}
/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
    char *end = NULL;
    unsigned long num;

    errno = 0;

    /* parse unsigned int string */
    num = strtoul(q_arg, &end, 10);
    if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (num > max_valid_value)
        return -1;

    return num;
}
/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
    RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
    "    --vm2vm [0|1|2]\n"
    "    --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
    "    --dev-basename <name> --dev-index [0-N]\n"
    "    --zero-copy [0|1] --rx-desc-num [0-N] --tx-desc-num [0-N]\n"
    " -p PORTMASK: Set mask for ports to be used by application\n"
    " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
    " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
    " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This takes effect only if retries on rx are enabled\n"
    " --rx-retry-num [0-N]: the number of retries on rx. This takes effect only if retries on rx are enabled\n"
    " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
    " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
    " --dev-basename: The basename to be used for the character device.\n"
    " --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
    " --zero-copy [0|1]: disable(default)/enable rx/tx "
    "zero copy\n"
    " --rx-desc-num [0-N]: the number of descriptors on rx, "
    "used only when zero copy is enabled.\n"
    " --tx-desc-num [0-N]: the number of descriptors on tx, "
    "used only when zero copy is enabled.\n",
           prgname);
}
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
    int opt, ret;
    int option_index;
    unsigned i;
    const char *prgname = argv[0];
    static struct option long_option[] = {
        {"vm2vm", required_argument, NULL, 0},
        {"rx-retry", required_argument, NULL, 0},
        {"rx-retry-delay", required_argument, NULL, 0},
        {"rx-retry-num", required_argument, NULL, 0},
        {"mergeable", required_argument, NULL, 0},
        {"stats", required_argument, NULL, 0},
        {"dev-basename", required_argument, NULL, 0},
        {"dev-index", required_argument, NULL, 0},
        {"zero-copy", required_argument, NULL, 0},
        {"rx-desc-num", required_argument, NULL, 0},
        {"tx-desc-num", required_argument, NULL, 0},
        {NULL, 0, 0, 0},
    };

    /* Parse command line */
    while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
        switch (opt) {
        /* Portmask */
        case 'p':
            enabled_port_mask = parse_portmask(optarg);
            if (enabled_port_mask == 0) {
                RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
                us_vhost_usage(prgname);
                return -1;
            }
            break;

        case 0:
            /* Enable/disable vm2vm comms. */
            if (!strncmp(long_option[option_index].name, "vm2vm",
                MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for "
                        "vm2vm [0|1|2]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    vm2vm_mode = (vm2vm_type)ret;
                }
            }
            /* Enable/disable retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_retry = ret;
                }
            }
            /* Specify the retries delay time (in useconds) on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_delay_time = ret;
                }
            }
            /* Specify the retries number on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_retry_num = ret;
                }
            }
            /* Enable/disable RX mergeable buffers. */
            if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    if (ret) {
                        vmdq_conf_default.rxmode.jumbo_frame = 1;
                        vmdq_conf_default.rxmode.max_rx_pkt_len
                            = JUMBO_FRAME_MAX_SIZE;
                        VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
                    }
                }
            }
            /* Enable/disable stats. */
            if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_stats = ret;
                }
            }
            /* Set character device basename. */
            if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                if (us_vhost_parse_basename(optarg) == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                    us_vhost_usage(prgname);
                    return -1;
                }
            }
            /* Set character device index. */
            if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    dev_index = ret;
                }
            }
            /* Enable/disable rx/tx zero copy. */
            if (!strncmp(long_option[option_index].name,
                "zero-copy", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument"
                        " for zero-copy [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else
                    zero_copy = ret;

                if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
                    RTE_LOG(ERR, VHOST_CONFIG, "Before running "
                        "zero copy vhost APP, please "
                        "disable RTE_MBUF_REFCNT\n"
                        "in config file and then rebuild DPDK "
                        "core lib!\n"
                        "Otherwise please disable zero copy "
                        "flag in command line!\n");
                    return -1;
#endif
                }
            }
            /* Specify the descriptor number on RX. */
            if (!strncmp(long_option[option_index].name,
                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for rx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_rx_descriptor = ret;
                }
            }
            /* Specify the descriptor number on TX. */
            if (!strncmp(long_option[option_index].name,
                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, VHOST_CONFIG,
                        "Invalid argument for tx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_tx_descriptor = ret;
                }
            }
            break;

        /* Invalid option - print options. */
        default:
            us_vhost_usage(prgname);
            return -1;
        }
    }

    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
        if (enabled_port_mask & (1 << i))
            ports[num_ports++] = (uint8_t)i;
    }

    if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
        RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
            "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
        return -1;
    }

    if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
        RTE_LOG(INFO, VHOST_PORT,
            "Vhost zero copy doesn't support software vm2vm,"
            "please specify 'vm2vm 2' to use hardware vm2vm.\n");
        return -1;
    }

    if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
        RTE_LOG(INFO, VHOST_PORT,
            "Vhost zero copy doesn't support jumbo frame,"
            "please specify '--mergeable 0' to disable the "
            "mergeable feature.\n");
        return -1;
    }

    return 0;
}
/*
 * Update the global var num_ports and the array ports according to the
 * number of system ports, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
    unsigned valid_num_ports = num_ports;
    unsigned portid;

    if (num_ports > nb_ports) {
        RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
            num_ports, nb_ports);
        num_ports = nb_ports;
    }

    for (portid = 0; portid < num_ports; portid++) {
        if (ports[portid] >= nb_ports) {
            RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                ports[portid], (nb_ports - 1));
            ports[portid] = INVALID_PORT_ID;
            valid_num_ports--;
        }
    }
    return valid_num_ports;
}
/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
    char *pkt_addr = (char*)(addr); \
    unsigned int index; \
    char packet[MAX_PRINT_BUFF]; \
    \
    if ((header)) \
        snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
    else \
        snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
    for (index = 0; index < (size); index++) { \
        snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
            "%02hhx ", pkt_addr[index]); \
    } \
    snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
    \
    LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while(0)
#else
#define PRINT_PACKET(device, addr, size, header) do{} while(0)
#endif
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct vhost_dev *vdev, uint64_t guest_pa,
    uint32_t buf_len, hpa_type *addr_type)
{
    struct virtio_memory_regions_hpa *region;
    uint32_t regionidx;
    uint64_t vhost_pa = 0;

    *addr_type = PHYS_ADDR_INVALID;

    for (regionidx = 0; regionidx < vdev->nregions_hpa; regionidx++) {
        region = &vdev->regions_hpa[regionidx];
        if ((guest_pa >= region->guest_phys_address) &&
            (guest_pa <= region->guest_phys_address_end)) {
            vhost_pa = region->host_phys_addr_offset + guest_pa;
            if (likely((guest_pa + buf_len - 1)
                <= region->guest_phys_address_end))
                *addr_type = PHYS_ADDR_CONTINUOUS;
            else
                *addr_type = PHYS_ADDR_CROSS_SUBREG;
            break;
        }
    }

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
        vdev->dev->device_fh, (void *)(uintptr_t)guest_pa,
        (void *)(uintptr_t)vhost_pa);

    return vhost_pa;
}
/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
    return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}
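/*
 * The comparison above XORs two 8-byte loads and masks the result with
 * MAC_ADDR_CMP so that, on a little-endian host, only the low 48 bits (the
 * six MAC address bytes) take part in the comparison; the two bytes that
 * follow each address in memory are ignored.
 */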
/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct vhost_dev *vdev, struct rte_mbuf *m)
{
    struct ether_hdr *pkt_hdr;
    struct virtio_net_data_ll *dev_ll;
    struct virtio_net *dev = vdev->dev;
    int i, ret;

    /* Learn MAC address of guest device from packet */
    pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->vdev->mac_address)) {
            RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
            return -1;
        }
        dev_ll = dev_ll->next;
    }

    for (i = 0; i < ETHER_ADDR_LEN; i++)
        vdev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

    /* vlan_tag currently uses the device_id. */
    vdev->vlan_tag = vlan_tags[dev->device_fh];

    /* Print out VMDQ registration info. */
    RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
        dev->device_fh,
        vdev->mac_address.addr_bytes[0], vdev->mac_address.addr_bytes[1],
        vdev->mac_address.addr_bytes[2], vdev->mac_address.addr_bytes[3],
        vdev->mac_address.addr_bytes[4], vdev->mac_address.addr_bytes[5],
        vdev->vlan_tag);

    /* Register the MAC address. */
    ret = rte_eth_dev_mac_addr_add(ports[0], &vdev->mac_address, (uint32_t)dev->device_fh);
    if (ret)
        RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
            dev->device_fh);

    /* Enable stripping of the vlan tag as we handle routing. */
    rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)vdev->vmdq_rx_q, 1);

    /* Set device as ready for RX. */
    vdev->ready = DEVICE_RX;

    return 0;
}
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct vhost_dev *vdev)
{
    unsigned i = 0;
    unsigned rx_count;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

    if (vdev->ready == DEVICE_RX) {
        /* clear MAC and VLAN settings */
        rte_eth_dev_mac_addr_remove(ports[0], &vdev->mac_address);
        for (i = 0; i < ETHER_ADDR_LEN; i++)
            vdev->mac_address.addr_bytes[i] = 0;

        vdev->vlan_tag = 0;

        /* Clear out the receive buffers */
        rx_count = rte_eth_rx_burst(ports[0],
            (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

        while (rx_count) {
            for (i = 0; i < rx_count; i++)
                rte_pktmbuf_free(pkts_burst[i]);

            rx_count = rte_eth_rx_burst(ports[0],
                (uint16_t)vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
        }

        vdev->ready = DEVICE_MAC_LEARNING;
    }
}
/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline unsigned __attribute__((always_inline))
virtio_tx_local(struct vhost_dev *vdev, struct rte_mbuf *m)
{
    struct virtio_net_data_ll *dev_ll;
    struct ether_hdr *pkt_hdr;
    uint64_t ret = 0;
    struct virtio_net *dev = vdev->dev;
    struct virtio_net *tdev; /* destination virtio device */

    pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

    /* get the used devices list */
    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if ((dev_ll->vdev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
            &dev_ll->vdev->mac_address)) {

            /* Drop the packet if the TX packet is destined for the TX device. */
            if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                    dev->device_fh);
                return 0;
            }
            tdev = dev_ll->vdev->dev;

            LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", tdev->device_fh);

            if (dev_ll->vdev->remove) {
                /* drop the packet if the device is marked for removal */
                LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", tdev->device_fh);
            } else {
                uint32_t mergeable =
                    tdev->features &
                    (1 << VIRTIO_NET_F_MRG_RXBUF);

                /* send the packet to the local virtio device */
                if (likely(mergeable == 0))
                    ret = virtio_dev_rx(tdev, &m, 1);
                else
                    ret = virtio_dev_merge_rx(tdev,
                        &m, 1);

                if (enable_stats) {
                    rte_atomic64_add(
                    &dev_statistics[tdev->device_fh].rx_total_atomic,
                    1);
                    rte_atomic64_add(
                    &dev_statistics[tdev->device_fh].rx_atomic,
                    ret);
                    dev_statistics[tdev->device_fh].tx_total++;
                    dev_statistics[tdev->device_fh].tx += ret;
                }
            }

            return 0;
        }
        dev_ll = dev_ll->next;
    }

    return -1;
}
/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct vhost_dev *vdev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
{
    struct mbuf_table *tx_q;
    struct vlan_ethhdr *vlan_hdr;
    struct rte_mbuf **m_table;
    struct rte_mbuf *mbuf, *prev;
    unsigned len, ret, offset = 0;
    const uint16_t lcore_id = rte_lcore_id();
    struct virtio_net_data_ll *dev_ll = ll_root_used;
    struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
    struct virtio_net *dev = vdev->dev;

    /* check if destination is a local VM */
    if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(vdev, m) == 0))
        return;

    if (vm2vm_mode == VM2VM_HARDWARE) {
        while (dev_ll != NULL) {
            if ((dev_ll->vdev->ready == DEVICE_RX)
                && ether_addr_cmp(&(pkt_hdr->d_addr),
                &dev_ll->vdev->mac_address)) {
                /*
                 * Drop the packet if the TX packet is
                 * destined for the TX device.
                 */
                if (dev_ll->vdev->dev->device_fh == dev->device_fh) {
                    LOG_DEBUG(VHOST_DATA,
                    "(%"PRIu64") TX: Source and destination"
                    " MAC addresses are the same. Dropping "
                    "packet.\n",
                    dev_ll->vdev->dev->device_fh);
                    return;
                }

                offset = 4;
                vlan_tag =
                vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                LOG_DEBUG(VHOST_DATA,
                "(%"PRIu64") TX: pkt to local VM device id:"
                "(%"PRIu64") vlan tag: %d.\n",
                dev->device_fh, dev_ll->vdev->dev->device_fh,
                vlan_tag);

                break;
            }
            dev_ll = dev_ll->next;
        }
    }
    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

    /* Add packet to the port tx queue */
    tx_q = &lcore_tx_queue[lcore_id];
    len = tx_q->len;

    /* Allocate an mbuf and populate the structure. */
    mbuf = rte_pktmbuf_alloc(mbuf_pool);
    if (unlikely(mbuf == NULL)) {
        RTE_LOG(ERR, VHOST_DATA,
            "Failed to allocate memory for mbuf.\n");
        return;
    }

    mbuf->data_len = m->data_len + VLAN_HLEN + offset;
    mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
    mbuf->nb_segs = m->nb_segs;

    /* Copy ethernet header to mbuf. */
    rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
        rte_pktmbuf_mtod(m, const void *),
        ETH_HLEN);

    /* Set up the vlan header. Multi-byte fields are converted to network byte order with htons(). */
    vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
    vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
    vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
    vlan_hdr->h_vlan_TCI = htons(vlan_tag);

    /* Copy the remaining packet contents to the mbuf. */
    rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
        (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
        (m->data_len - ETH_HLEN));

    /* Copy the remaining segments for the whole packet. */
    prev = mbuf;
    while (m->next) {
        /* Allocate an mbuf and populate the structure. */
        struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
        if (unlikely(next_mbuf == NULL)) {
            rte_pktmbuf_free(mbuf);
            RTE_LOG(ERR, VHOST_DATA,
                "Failed to allocate memory for mbuf.\n");
            return;
        }

        m = m->next;
        prev->next = next_mbuf;
        prev = next_mbuf;
        next_mbuf->data_len = m->data_len;

        /* Copy data to next mbuf. */
        rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
            rte_pktmbuf_mtod(m, const void *), m->data_len);
    }

    tx_q->m_table[len] = mbuf;
    len++;
    if (enable_stats) {
        dev_statistics[dev->device_fh].tx_total++;
        dev_statistics[dev->device_fh].tx++;
    }

    if (unlikely(len == MAX_PKT_BURST)) {
        m_table = (struct rte_mbuf **)tx_q->m_table;
        ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
        /* Free any buffers not handled by TX and update the port stats. */
        if (unlikely(ret < len)) {
            do {
                rte_pktmbuf_free(m_table[ret]);
            } while (++ret < len);
        }

        len = 0;
    }

    tx_q->len = len;
    return;
}
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
    struct rte_mempool *mbuf_pool = arg;
    struct virtio_net *dev = NULL;
    struct vhost_dev *vdev = NULL;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    struct virtio_net_data_ll *dev_ll;
    struct mbuf_table *tx_q;
    volatile struct lcore_ll_info *lcore_ll;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
    uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
    unsigned ret, i;
    const uint16_t lcore_id = rte_lcore_id();
    const uint16_t num_cores = (uint16_t)rte_lcore_count();
    uint16_t rx_count = 0;
    uint32_t mergeable = 0;

    RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
    lcore_ll = lcore_info[lcore_id].lcore_ll;
    prev_tsc = 0;

    tx_q = &lcore_tx_queue[lcore_id];
    for (i = 0; i < num_cores; i++) {
        if (lcore_ids[i] == lcore_id) {
            tx_q->txq_id = i;
            break;
        }
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {

            if (tx_q->len) {
                LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

                /* Tx any packets in the queue */
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                    (struct rte_mbuf **)tx_q->m_table,
                    (uint16_t)tx_q->len);
                if (unlikely(ret < tx_q->len)) {
                    do {
                        rte_pktmbuf_free(tx_q->m_table[ret]);
                    } while (++ret < tx_q->len);
                }

                tx_q->len = 0;
            }

            prev_tsc = cur_tsc;
        }

        rte_prefetch0(lcore_ll->ll_root_used);
        /*
         * Inform the configuration core that we have exited the linked list and that no devices are
         * in use if requested.
         */
        if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
            lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

        /*
         * Process devices
         */
        dev_ll = lcore_ll->ll_root_used;

        while (dev_ll != NULL) {
            /* get virtio device ID */
            vdev = dev_ll->vdev;
            dev = vdev->dev;
            mergeable =
                dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);

            if (unlikely(vdev->remove)) {
                dev_ll = dev_ll->next;
                unlink_vmdq(vdev);
                vdev->ready = DEVICE_SAFE_REMOVE;
                continue;
            }
            if (likely(vdev->ready == DEVICE_RX)) {
                /* Handle guest RX */
                rx_count = rte_eth_rx_burst(ports[0],
                    vdev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                if (rx_count) {
                    if (likely(mergeable == 0))
                        ret_count = virtio_dev_rx(dev,
                            pkts_burst, rx_count);
                    else
                        ret_count = virtio_dev_merge_rx(dev,
                            pkts_burst, rx_count);

                    if (enable_stats) {
                        rte_atomic64_add(
                        &dev_statistics[dev_ll->vdev->dev->device_fh].rx_total_atomic,
                        rx_count);
                        rte_atomic64_add(
                        &dev_statistics[dev_ll->vdev->dev->device_fh].rx_atomic, ret_count);
                    }
                    while (likely(rx_count)) {
                        rx_count--;
                        rte_pktmbuf_free(pkts_burst[rx_count]);
                    }
                }
            }

            if (!vdev->remove) {
                /* Handle guest TX */
                if (likely(mergeable == 0))
                    virtio_dev_tx(dev, mbuf_pool);
                else
                    virtio_dev_merge_tx(dev, mbuf_pool);
            }

            /* move to the next device in the list */
            dev_ll = dev_ll->next;
        }
    }

    return 0;
}
/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;

    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
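/*
 * avail->idx and last_used_idx_res are free-running 16-bit counters, so the
 * unsigned subtraction above gives the number of outstanding entries even
 * after the indices wrap around.
 */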
/*
 * This function gets the available ring index for zero copy rx;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring index.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
    uint16_t *res_base_idx, uint32_t count)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;
    uint32_t retry = 0;
    uint16_t free_entries;

    *res_base_idx = vq->last_used_idx_res;
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    free_entries = (avail_idx - *res_base_idx);

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
        "avail idx: %d, "
        "res base idx:%d, free entries:%d\n",
        dev->device_fh, avail_idx, *res_base_idx,
        free_entries);

    /*
     * If retry is enabled and the queue is full then we wait
     * and retry to avoid packet loss.
     */
    if (enable_retry && unlikely(count > free_entries)) {
        for (retry = 0; retry < burst_rx_retry_num; retry++) {
            rte_delay_us(burst_rx_delay_time);
            avail_idx = *((volatile uint16_t *)&vq->avail->idx);
            free_entries = (avail_idx - *res_base_idx);
            if (count <= free_entries)
                break;
        }
    }

    /* check that we have enough buffers */
    if (unlikely(count > free_entries))
        count = free_entries;

    if (unlikely(count == 0)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") Fail in get_available_ring_index_zcp: "
            "avail idx: %d, res base idx:%d, free entries:%d\n",
            dev->device_fh, avail_idx,
            *res_base_idx, free_entries);
        return 0;
    }

    vq->last_used_idx_res = *res_base_idx + count;

    return count;
}
/*
 * This function puts a descriptor back onto the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
    uint16_t res_cur_idx = vq->last_used_idx;

    vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;

    rte_compiler_barrier();
    *(volatile uint16_t *)&vq->used->idx += 1;
    vq->last_used_idx += 1;

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);
}
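/*
 * The compiler barrier in put_desc_to_used_list_zcp() orders the writes: the
 * used-ring entry must be visible before used->idx is incremented, because
 * the guest polls used->idx to decide when a new entry can be consumed.
 */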
/*
 * This function gets an available descriptor from the virtio vring and an
 * unattached mbuf from vpool->ring, and then attaches them together. It needs
 * to adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
    uint16_t res_base_idx, desc_idx;
    uint64_t buff_addr, phys_addr;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *mbuf = NULL;
    struct vpool *vpool;
    hpa_type addr_type;
    struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

    vpool = &vpool_array[vdev->vmdq_rx_q];
    vq = dev->virtqueue[VIRTIO_RXQ];

    do {
        if (unlikely(get_available_ring_index_zcp(vdev->dev, &res_base_idx,
            1) != 1))
            return;
        desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

        desc = &vq->desc[desc_idx];
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc = &vq->desc[desc->next];
            buff_addr = gpa_to_vva(dev, desc->addr);
            phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len,
                &addr_type);
        } else {
            buff_addr = gpa_to_vva(dev,
                desc->addr + vq->vhost_hlen);
            phys_addr = gpa_to_hpa(vdev,
                desc->addr + vq->vhost_hlen,
                desc->len, &addr_type);
        }

        if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
            RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
                " address found when attaching RX frame buffer"
                " address!\n", dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }

        /*
         * Check if the frame buffer address from guest crosses
         * sub-region or not.
         */
        if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Frame buffer address cross "
                "sub-region found when attaching RX frame "
                "buffer address!\n",
                dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }
    } while (unlikely(phys_addr == 0));

    rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
    if (unlikely(mbuf == NULL)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: "
            "ring_sc_dequeue fail.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (unlikely(vpool->buf_size > desc->len)) {
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
            "length(%d) of descriptor idx: %d less than room "
            "size required: %d\n",
            dev->device_fh, desc->len, desc_idx, vpool->buf_size);
        put_desc_to_used_list_zcp(vq, desc_idx);
        rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
        return;
    }

    mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
    mbuf->data_off = RTE_PKTMBUF_HEADROOM;
    mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
    mbuf->data_len = desc->len;
    MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
        "descriptor idx:%d\n",
        dev->device_fh, res_base_idx, desc_idx);

    __rte_mbuf_raw_free(mbuf);

    return;
}
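/*
 * Note on attach_rxmbuf_zcp(): the final __rte_mbuf_raw_free() returns the
 * now guest-backed mbuf to vpool->pool, the same mempool handed to
 * rte_eth_rx_queue_setup() in port_init(), so the PMD allocates it for RX
 * and DMAs the frame directly into the guest buffer.
 */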
/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
    const struct rte_mempool *mp = m->pool;
    void *buf = RTE_MBUF_TO_BADDR(m);
    uint32_t buf_ofs;
    uint32_t buf_len = mp->elt_size - sizeof(*m);
    m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

    m->buf_addr = buf;
    m->buf_len = (uint16_t)buf_len;

    buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
        RTE_PKTMBUF_HEADROOM : m->buf_len;
    m->data_off = buf_ofs;

    m->data_len = 0;
}
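/*
 * This is a trimmed-down variant of rte_pktmbuf_detach(): reference counting
 * is deliberately absent, which is also why zero copy mode requires building
 * DPDK without RTE_MBUF_REFCNT (see the --zero-copy handling in
 * us_vhost_parse_args()).
 */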
/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them, and puts them back into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
    struct rte_mbuf *mbuf;
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
    uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
    uint32_t index = 0;
    uint32_t mbuf_count = rte_mempool_count(vpool->pool);

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
        "clean is: %d\n",
        dev->device_fh, mbuf_count);
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
        "clean is : %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));

    for (index = 0; index < mbuf_count; index++) {
        mbuf = __rte_mbuf_raw_alloc(vpool->pool);
        if (likely(RTE_MBUF_INDIRECT(mbuf)))
            pktmbuf_detach_zcp(mbuf);
        rte_ring_sp_enqueue(vpool->ring, mbuf);

        /* Update used index buffer information. */
        vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
        vq->used->ring[used_idx].len = 0;

        used_idx = (used_idx + 1) & (vq->size - 1);
    }

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
        "clean is: %d\n",
        dev->device_fh, rte_mempool_count(vpool->pool));
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
        "clean is : %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));
    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: before updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    vq->last_used_idx += mbuf_count;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: after updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    rte_compiler_barrier();

    *(volatile uint16_t *)&vq->used->idx += mbuf_count;

    /* Kick guest if required. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);

    return 0;
}
/*
 * This function is called when a virtio device is destroyed. It fetches
 * mbufs from vpool->pool, detaches them, and puts them back into vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
    struct rte_mbuf *mbuf = NULL;
    uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in mempool before "
        "mbuf_destroy_zcp is: %d\n",
        mbuf_count);
    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in ring before "
        "mbuf_destroy_zcp is : %d\n",
        rte_ring_count(vpool->ring));

    for (index = 0; index < mbuf_count; index++) {
        mbuf = __rte_mbuf_raw_alloc(vpool->pool);
        if (likely(mbuf != NULL)) {
            if (likely(RTE_MBUF_INDIRECT(mbuf)))
                pktmbuf_detach_zcp(mbuf);
            rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
        }
    }

    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in mempool after "
        "mbuf_destroy_zcp is: %d\n",
        rte_mempool_count(vpool->pool));
    LOG_DEBUG(VHOST_CONFIG,
        "in mbuf_destroy_zcp: mbuf count in ring after "
        "mbuf_destroy_zcp is : %d\n",
        rte_ring_count(vpool->ring));
}
/*
 * This function updates the use flag and counter.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
    uint32_t count)
{
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *buff;
    /* The virtio_hdr is initialised to 0. */
    struct virtio_net_hdr_mrg_rxbuf virtio_hdr
        = {{0, 0, 0, 0, 0, 0}, 0};
    uint64_t buff_hdr_addr = 0;
    uint32_t head[MAX_PKT_BURST], packet_len = 0;
    uint32_t head_idx, packet_success = 0;
    uint16_t res_cur_idx;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

    if (count == 0)
        return 0;

    vq = dev->virtqueue[VIRTIO_RXQ];
    count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

    res_cur_idx = vq->last_used_idx;
    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
        dev->device_fh, res_cur_idx, res_cur_idx + count);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (head_idx = 0; head_idx < count; head_idx++)
        head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);

    while (packet_success != count) {
        /* Get descriptor from available ring */
        desc = &vq->desc[head[packet_success]];

        buff = pkts[packet_success];
        LOG_DEBUG(VHOST_DATA,
            "(%"PRIu64") in dev_rx_zcp: update the used idx for "
            "pkt[%d] descriptor idx: %d\n",
            dev->device_fh, packet_success,
            MBUF_HEADROOM_UINT32(buff));

        PRINT_PACKET(dev,
            (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
            + RTE_PKTMBUF_HEADROOM),
            rte_pktmbuf_data_len(buff), 0);

        /* Buffer address translation for virtio header. */
        buff_hdr_addr = gpa_to_vva(dev, desc->addr);
        packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

        /*
         * If the descriptors are chained the header and data are
         * placed in separate buffers.
         */
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc->len = vq->vhost_hlen;
            desc = &vq->desc[desc->next];
            desc->len = rte_pktmbuf_data_len(buff);
        } else {
            desc->len = packet_len;
        }

        /* Update used ring with desc information */
        vq->used->ring[res_cur_idx & (vq->size - 1)].id
            = head[packet_success];
        vq->used->ring[res_cur_idx & (vq->size - 1)].len
            = packet_len;
        res_cur_idx++;
        packet_success++;

        /* A header is required per buffer. */
        rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
            (const void *)&virtio_hdr, vq->vhost_hlen);

        PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

        if (likely(packet_success < count)) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success]]);
        }
    }

    rte_compiler_barrier();

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in dev_rx_zcp: before update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    *(volatile uint16_t *)&vq->used->idx += count;
    vq->last_used_idx += count;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in dev_rx_zcp: after update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);

    return count;
}
/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
    uint32_t desc_idx, uint8_t need_copy)
{
    struct mbuf_table *tx_q;
    struct rte_mbuf **m_table;
    struct rte_mbuf *mbuf = NULL;
    unsigned len, ret, offset = 0;
    struct vpool *vpool;
    struct virtio_net_data_ll *dev_ll = ll_root_used;
    struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
    uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
    uint16_t vmdq_rx_q = ((struct vhost_dev *)dev->priv)->vmdq_rx_q;

    /* Add packet to the port tx queue */
    tx_q = &tx_queue_zcp[vmdq_rx_q];
    len = tx_q->len;

    /* Allocate an mbuf and populate the structure. */
    vpool = &vpool_array[MAX_QUEUES + vmdq_rx_q];
    rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
    if (unlikely(mbuf == NULL)) {
        struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
        RTE_LOG(ERR, VHOST_DATA,
            "(%"PRIu64") Failed to allocate memory for mbuf.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (vm2vm_mode == VM2VM_HARDWARE) {
        /* Avoid using a vlan tag from any vm for an external pkt, such
         * as vlan_tags[dev->device_fh]; otherwise it conflicts with
         * pool selection: the MAC address identifies it as an external
         * pkt that should go out to the network, while the vlan tag
         * identifies it as a vm2vm pkt that should be forwarded to
         * another vm. The hardware cannot resolve such an ambiguous
         * situation, so the pkt would be lost.
         */
        vlan_tag = external_pkt_default_vlan_tag;
        while (dev_ll != NULL) {
            if (likely(dev_ll->vdev->ready == DEVICE_RX) &&
                ether_addr_cmp(&(pkt_hdr->d_addr),
                &dev_ll->vdev->mac_address)) {

                /*
                 * Drop the packet if the TX packet is destined
                 * for the TX device.
                 */
                if (unlikely(dev_ll->vdev->dev->device_fh
                    == dev->device_fh)) {
                    LOG_DEBUG(VHOST_DATA,
                    "(%"PRIu64") TX: Source and destination"
                    " MAC addresses are the same. Dropping "
                    "packet.\n",
                    dev_ll->vdev->dev->device_fh);
                    MBUF_HEADROOM_UINT32(mbuf)
                        = (uint32_t)desc_idx;
                    __rte_mbuf_raw_free(mbuf);
                    return;
                }

                /*
                 * Packet length offset 4 bytes for HW vlan
                 * strip when L2 switch back.
                 */
                offset = 4;
                vlan_tag =
                vlan_tags[(uint16_t)dev_ll->vdev->dev->device_fh];

                LOG_DEBUG(VHOST_DATA,
                "(%"PRIu64") TX: pkt to local VM device id:"
                "(%"PRIu64") vlan tag: %d.\n",
                dev->device_fh, dev_ll->vdev->dev->device_fh,
                vlan_tag);

                break;
            }
            dev_ll = dev_ll->next;
        }
    }

    mbuf->nb_segs = m->nb_segs;
    mbuf->next = m->next;
    mbuf->data_len = m->data_len + offset;
    mbuf->pkt_len = mbuf->data_len;
    if (unlikely(need_copy)) {
        /* Copy the packet contents to the mbuf. */
        rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
            rte_pktmbuf_mtod(m, void *),
            m->data_len);
    } else {
        mbuf->data_off = m->data_off;
        mbuf->buf_physaddr = m->buf_physaddr;
        mbuf->buf_addr = m->buf_addr;
    }
    mbuf->ol_flags = PKT_TX_VLAN_PKT;
    mbuf->vlan_tci = vlan_tag;
    mbuf->l2_len = sizeof(struct ether_hdr);
    mbuf->l3_len = sizeof(struct ipv4_hdr);
    MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

    tx_q->m_table[len] = mbuf;
    len++;

    LOG_DEBUG(VHOST_DATA,
        "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
        dev->device_fh,
        mbuf->nb_segs,
        (mbuf->next == NULL) ? "null" : "non-null");

    if (enable_stats) {
        dev_statistics[dev->device_fh].tx_total++;
        dev_statistics[dev->device_fh].tx++;
    }

    if (unlikely(len == MAX_PKT_BURST)) {
        m_table = (struct rte_mbuf **)tx_q->m_table;
        ret = rte_eth_tx_burst(ports[0],
            (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);

        /*
         * Free any buffers not handled by TX and update
         * the port stats.
         */
        if (unlikely(ret < len)) {
            do {
                rte_pktmbuf_free(m_table[ret]);
            } while (++ret < len);
        }

        len = 0;
        txmbuf_clean_zcp(dev, vpool);
    }

    tx_q->len = len;

    return;
}
/*
 * This function transmits all available packets in the virtio TX queue for
 * one virtio-net device. If it is the first packet, it learns the MAC address
 * and sets up the VMDQ.
 */
static inline void __attribute__((always_inline))
virtio_dev_tx_zcp(struct virtio_net *dev)
{
    struct rte_mbuf m;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    uint64_t buff_addr = 0, phys_addr;
    uint32_t head[MAX_PKT_BURST];
    uint32_t i;
    uint16_t free_entries, packet_success = 0;
    uint16_t avail_idx;
    uint8_t need_copy = 0;
    hpa_type addr_type;
    struct vhost_dev *vdev = (struct vhost_dev *)dev->priv;

    vq = dev->virtqueue[VIRTIO_TXQ];
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);

    /* If there are no available buffers then return. */
    if (vq->last_used_idx_res == avail_idx)
        return;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);

    /* Prefetch available ring to retrieve head indexes. */
    rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);

    /* Get the number of free entries in the ring */
    free_entries = (avail_idx - vq->last_used_idx_res);

    /* Limit to MAX_PKT_BURST. */
    free_entries
        = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;

    LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
        dev->device_fh, free_entries);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (i = 0; i < free_entries; i++)
        head[i]
            = vq->avail->ring[(vq->last_used_idx_res + i)
            & (vq->size - 1)];

    vq->last_used_idx_res += free_entries;

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);
    rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

    while (packet_success < free_entries) {
        desc = &vq->desc[head[packet_success]];

        /* Discard first buffer as it is the virtio header */
        desc = &vq->desc[desc->next];

        /* Buffer address translation. */
        buff_addr = gpa_to_vva(dev, desc->addr);
        phys_addr = gpa_to_hpa(vdev, desc->addr, desc->len, &addr_type);

        if (likely(packet_success < (free_entries - 1)))
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success + 1]]);

        if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Invalid frame buffer address found"
                " when TX packets!\n",
                dev->device_fh);
            packet_success++;
            continue;
        }

        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)buff_addr);

        /*
         * Setup dummy mbuf. This is copied to a real mbuf if
         * transmitted out the physical port.
         */
        m.data_len = desc->len;
        m.nb_segs = 1;
        m.next = NULL;
        m.data_off = 0;
        m.buf_addr = (void *)(uintptr_t)buff_addr;
        m.buf_physaddr = phys_addr;

        /*
         * Check if the frame buffer address from guest crosses
         * sub-region or not.
         */
        if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
            RTE_LOG(ERR, VHOST_DATA,
                "(%"PRIu64") Frame buffer address cross "
                "sub-region found when attaching TX frame "
                "buffer address!\n",
                dev->device_fh);
            need_copy = 1;
        } else {
            need_copy = 0;
        }

        PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

        /*
         * If this is the first received packet we need to learn
         * the MAC and setup VMDQ
         */
        if (unlikely(vdev->ready == DEVICE_MAC_LEARNING)) {
            if (vdev->remove || (link_vmdq(vdev, &m) == -1)) {
                /*
                 * Discard frame if device is scheduled for
                 * removal or a duplicate MAC address is found.
                 */
                packet_success += free_entries;
                vq->last_used_idx += packet_success;
                break;
            }
        }

        virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
        packet_success++;
    }
}
2046 * This function is called by each data core. It handles all RX/TX registered
2047 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2048 * addresses are compared with all devices in the main linked list.
2051 switch_worker_zcp(__attribute__((unused)) void *arg)
2053 struct virtio_net *dev = NULL;
2054 struct vhost_dev *vdev = NULL;
2055 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2056 struct virtio_net_data_ll *dev_ll;
2057 struct mbuf_table *tx_q;
2058 volatile struct lcore_ll_info *lcore_ll;
2059 const uint64_t drain_tsc
2060 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2061 * BURST_TX_DRAIN_US;
	uint64_t prev_tsc = 0, diff_tsc, cur_tsc, ret_count = 0;
	int ret;
	const uint16_t lcore_id = rte_lcore_id();
	uint16_t count_in_ring, rx_count = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);

	lcore_ll = lcore_info[lcore_id].lcore_ll;

	while (1) {
		cur_tsc = rte_rdtsc();

		/* TX burst queue drain */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			/*
			 * Get mbuf from vpool.pool and detach mbuf and
			 * put back into vpool.ring.
			 */
			dev_ll = lcore_ll->ll_root_used;
			while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
				/* Get virtio device ID */
				vdev = dev_ll->vdev;
				dev = vdev->dev;

				if (likely(!vdev->remove)) {
					tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
					if (tx_q->len) {
						LOG_DEBUG(VHOST_DATA,
						"TX queue drained after timeout"
						" with burst size %u\n",
						tx_q->len);

						/*
						 * Tx any packets in the queue
						 */
						ret = rte_eth_tx_burst(
							ports[0],
							(uint16_t)tx_q->txq_id,
							(struct rte_mbuf **)
							tx_q->m_table,
							(uint16_t)tx_q->len);
						if (unlikely(ret < tx_q->len)) {
							do {
								rte_pktmbuf_free(
								tx_q->m_table[ret]);
							} while (++ret < tx_q->len);
						}
						tx_q->len = 0;

						txmbuf_clean_zcp(dev,
						&vpool_array[MAX_QUEUES+vdev->vmdq_rx_q]);
					}
				}
				dev_ll = dev_ll->next;
			}
			prev_tsc = cur_tsc;
		}

		rte_prefetch0(lcore_ll->ll_root_used);

		/*
		 * Inform the configuration core that we have exited the linked
		 * list and that no devices are in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
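		/*
		 * This flag is one half of a simple request/ack handshake
		 * with the configuration core: destroy_device() sets
		 * REQUEST_DEV_REMOVAL on every lcore and spins until each
		 * worker, at this point in its loop (i.e. outside any
		 * linked-list traversal), answers with ACK_DEV_REMOVAL.
		 * Seeing the ack therefore proves the worker no longer holds
		 * a pointer into the list being modified.
		 */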
		/* Process devices */
		dev_ll = lcore_ll->ll_root_used;

		while ((dev_ll != NULL) && (dev_ll->vdev != NULL)) {
			vdev = dev_ll->vdev;
			dev = vdev->dev;

			if (unlikely(vdev->remove)) {
				dev_ll = dev_ll->next;
				vdev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(vdev->ready == DEVICE_RX)) {
				uint32_t index = vdev->vmdq_rx_q;
				uint16_t i;
				count_in_ring
					= rte_ring_count(vpool_array[index].ring);
				uint16_t free_entries
					= (uint16_t)get_available_ring_num_zcp(dev);

				/*
				 * Attach all mbufs in vpool.ring and put back
				 * into vpool.pool.
				 */
				for (i = 0;
					i < RTE_MIN(free_entries,
					RTE_MIN(count_in_ring, MAX_PKT_BURST));
					i++)
					attach_rxmbuf_zcp(dev);

				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					vdev->vmdq_rx_q, pkts_burst,
					MAX_PKT_BURST);

				if (rx_count) {
					ret_count = virtio_dev_rx_zcp(dev,
							pkts_burst, rx_count);
					if (enable_stats) {
						dev_statistics[dev->device_fh].rx_total
							+= rx_count;
						dev_statistics[dev->device_fh].rx
							+= ret_count;
					}
					while (likely(rx_count)) {
						rx_count--;
						pktmbuf_detach_zcp(
							pkts_burst[rx_count]);
						rte_ring_sp_enqueue(
							vpool_array[index].ring,
							(void *)pkts_burst[rx_count]);
					}
				}
			}

			if (likely(!vdev->remove))
				/* Handle guest TX */
				virtio_dev_tx_zcp(dev);

			/* Move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}
/*
 * Add an entry to a used linked list. A free entry must first be found
 * in the free linked list using get_data_ll_free_entry().
 */
static void
add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	/* Set next as NULL and use a compiler barrier to avoid reordering. */
	ll_dev->next = NULL;
	rte_compiler_barrier();

	/* If ll == NULL then this is the first device. */
	if (ll) {
		/* Increment to the tail of the linked list. */
		while (ll->next != NULL)
			ll = ll->next;

		ll->next = ll_dev;
	} else
		*ll_root_addr = ll_dev;
}

/*
 * Remove an entry from a used linked list. The entry must then be added to
 * the free linked list using put_data_ll_free_entry().
 */
static void
rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev,
	struct virtio_net_data_ll *ll_dev_last)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	if (unlikely((ll == NULL) || (ll_dev == NULL)))
		return;

	if (ll_dev == ll)
		*ll_root_addr = ll_dev->next;
	else if (likely(ll_dev_last != NULL))
		ll_dev_last->next = ll_dev->next;
	else
		RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
}

/*
 * Find and return an entry from the free linked list.
 */
static struct virtio_net_data_ll *
get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;
	struct virtio_net_data_ll *ll_dev;

	if (ll_free == NULL)
		return NULL;

	ll_dev = ll_free;
	*ll_root_addr = ll_free->next;

	return ll_dev;
}

/*
 * Place an entry back on to the free linked list.
 */
static void
put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;

	if (ll_dev == NULL)
		return;

	ll_dev->next = ll_free;
	*ll_root_addr = ll_dev;
}
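/*
 * A minimal usage sketch of the free/used list pair above. The helper
 * example_ll_usage() is illustrative only; it is not called anywhere in
 * this application.
 */
static inline int
example_ll_usage(struct virtio_net_data_ll **free_root,
	struct virtio_net_data_ll **used_root, struct vhost_dev *vdev)
{
	/* 1. Take an entry off the free list. */
	struct virtio_net_data_ll *entry = get_data_ll_free_entry(free_root);

	if (entry == NULL)
		return -1;	/* Free list exhausted: device limit reached. */

	/* 2. Fill it and publish it on the used list. */
	entry->vdev = vdev;
	add_data_ll_entry(used_root, entry);

	/*
	 * 3. On teardown the reverse order applies:
	 *    rm_data_ll_entry(used_root, entry, prev_entry);
	 *    put_data_ll_free_entry(free_root, entry);
	 */
	return 0;
}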
/*
 * Creates a linked list of a given size.
 */
static struct virtio_net_data_ll *
alloc_data_ll(uint32_t size)
{
	struct virtio_net_data_ll *ll_new;
	uint32_t i;

	/* Malloc and then chain the linked list. */
	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
	if (ll_new == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
		return NULL;
	}

	for (i = 0; i < size - 1; i++) {
		ll_new[i].vdev = NULL;
		ll_new[i].next = &ll_new[i+1];
	}
	ll_new[i].vdev = NULL;
	ll_new[i].next = NULL;

	return ll_new;
}

/*
 * Create the main linked list along with each individual core's linked list.
 * A used and a free list are created to manage entries.
 */
static int
init_data_ll(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
		if (lcore_info[lcore].lcore_ll == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
			return -1;
		}

		lcore_info[lcore].lcore_ll->device_num = 0;
		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
		if (num_devices % num_switching_cores)
			lcore_info[lcore].lcore_ll->ll_root_free =
				alloc_data_ll((num_devices / num_switching_cores)
					+ 1);
		else
			lcore_info[lcore].lcore_ll->ll_root_free =
				alloc_data_ll(num_devices / num_switching_cores);
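		/*
		 * Worked example of the sizing above: with num_devices = 10
		 * and num_switching_cores = 3, 10 % 3 != 0, so each core's
		 * free list gets 10 / 3 + 1 = 4 entries; enough for any
		 * distribution of the 10 devices over the 3 cores.
		 */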
	}

	/* Allocate devices up to a maximum of MAX_DEVICES. */
	ll_root_free = alloc_data_ll(MIN(num_devices, MAX_DEVICES));

	return 0;
}

/*
 * Set virtqueue flags so that we do not receive interrupts.
 */
static void
set_irq_status(struct virtio_net *dev)
{
	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
}
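/*
 * Note on the flag above: VRING_USED_F_NO_NOTIFY in the used ring tells the
 * guest that it need not notify (kick) the host after adding buffers. The
 * data cores poll the avail ring anyway, so suppressing the kick avoids a
 * vmexit per burst.
 */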
/*
 * Remove a device from the specific data core linked list and from the main
 * linked list. Synchronization occurs through the use of the lcore
 * dev_removal_flag. The device is made volatile here to avoid re-ordering of
 * dev->remove = 1, which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_lcore_dev_cur;
	struct virtio_net_data_ll *ll_main_dev_cur;
	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
	struct virtio_net_data_ll *ll_main_dev_last = NULL;
	struct vhost_dev *vdev;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	vdev = (struct vhost_dev *)dev->priv;

	/* Set the remove flag. */
	vdev->remove = 1;
	while (vdev->ready != DEVICE_SAFE_REMOVE) {
		rte_pause();
	}

	/* Search for entry to be removed from lcore ll */
	ll_lcore_dev_cur = lcore_info[vdev->coreid].lcore_ll->ll_root_used;
	while (ll_lcore_dev_cur != NULL) {
		if (ll_lcore_dev_cur->vdev == vdev) {
			break;
		} else {
			ll_lcore_dev_last = ll_lcore_dev_cur;
			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
		}
	}

	if (ll_lcore_dev_cur == NULL) {
		RTE_LOG(ERR, VHOST_CONFIG,
			"(%"PRIu64") Failed to find the device to be destroyed.\n",
			dev->device_fh);
		return;
	}

	/* Search for entry to be removed from main ll */
	ll_main_dev_cur = ll_root_used;
	ll_main_dev_last = NULL;
	while (ll_main_dev_cur != NULL) {
		if (ll_main_dev_cur->vdev == vdev) {
			break;
		} else {
			ll_main_dev_last = ll_main_dev_cur;
			ll_main_dev_cur = ll_main_dev_cur->next;
		}
	}

	/* Remove entries from the lcore and main ll. */
	rm_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used,
		ll_lcore_dev_cur, ll_lcore_dev_last);
	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	}

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we
	 * can be sure that they can no longer access the device removed from
	 * the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_free,
		ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement the number of devices on the lcore. */
	lcore_info[vdev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
	if (zero_copy) {
		struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

		/* Stop the RX queue. */
		if (rte_eth_dev_rx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to stop "
				"rx queue:%d\n",
				dev->device_fh,
				vdev->vmdq_rx_q);
		}

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") In destroy_device: Start putting mbufs "
			"from mempool back to ring for RX queue: %d\n",
			dev->device_fh, vdev->vmdq_rx_q);

		mbuf_destroy_zcp(vpool);

		/* Stop the TX queue. */
		if (rte_eth_dev_tx_queue_stop(ports[0], vdev->vmdq_rx_q) != 0) {
			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In destroy_device: Failed to "
				"stop tx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);
		}

		vpool = &vpool_array[vdev->vmdq_rx_q + MAX_QUEUES];

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") In destroy_device: Start putting mbufs "
			"from mempool back to ring for TX queue: %d, "
			"dev:(%"PRIu64")\n",
			dev->device_fh, (vdev->vmdq_rx_q + MAX_QUEUES),
			dev->device_fh);

		mbuf_destroy_zcp(vpool);
	}

	rte_free(vdev->regions_hpa);
	rte_free(vdev);
}
/*
 * Count the number of host-physical-address discontinuities (i.e. the number
 * of additional sub-regions needed) within one particular region whose vhost
 * virtual address range is contiguous. The region starts at vva_start, with
 * a size of 'size' in the argument.
 */
static uint32_t
check_hpa_regions(uint64_t vva_start, uint64_t size)
{
	uint32_t i, nregions = 0, page_size = getpagesize();
	uint64_t cur_phys_addr = 0, next_phys_addr = 0;

	if (vva_start % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_hpa_regions: vva start(%p) mod page_size(%d) "
			"has remainder\n",
			(void *)(uintptr_t)vva_start, page_size);
		return 0;
	}
	if (size % page_size) {
		LOG_DEBUG(VHOST_CONFIG,
			"in check_hpa_regions: "
			"size((%"PRIu64")) mod page_size(%d) has remainder\n",
			size, page_size);
		return 0;
	}

	for (i = 0; i < size - page_size; i = i + page_size) {
		cur_phys_addr
			= rte_mem_virt2phy((void *)(uintptr_t)(vva_start + i));
		next_phys_addr = rte_mem_virt2phy(
			(void *)(uintptr_t)(vva_start + i + page_size));
		if ((cur_phys_addr + page_size) != next_phys_addr) {
			++nregions;
			LOG_DEBUG(VHOST_CONFIG,
				"in check_hpa_regions: hva addr:(%p) is not "
				"contiguous with hva addr:(%p), diff:%d\n",
				(void *)(uintptr_t)(vva_start + (uint64_t)i),
				(void *)(uintptr_t)(vva_start + (uint64_t)i
				+ page_size), page_size);
			LOG_DEBUG(VHOST_CONFIG,
				"in check_hpa_regions: hpa addr:(%p) is not "
				"contiguous with hpa addr:(%p), "
				"diff:(%"PRIu64")\n",
				(void *)(uintptr_t)cur_phys_addr,
				(void *)(uintptr_t)next_phys_addr,
				(next_phys_addr - cur_phys_addr));
		}
	}
	return nregions;
}
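/*
 * Worked example: for a 4-page region whose pages map to host physical pages
 * P, P+1, Q, Q+1 (one discontinuity after the second page), the loop above
 * sees exactly one pair where cur + page_size != next, so it returns 1; the
 * caller adds this to the base region count to size the regions_hpa table.
 */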
/*
 * Divide each region whose vhost virtual address is contiguous into a few
 * sub-regions, making sure the physical addresses within each sub-region are
 * contiguous, and fill the offset (to GPA), size etc. information of each
 * sub-region into regions_hpa.
 */
static uint32_t
fill_hpa_memory_regions(struct virtio_memory_regions_hpa *mem_region_hpa,
	struct virtio_memory *virtio_memory)
{
	uint32_t regionidx, regionidx_hpa = 0, i, k, page_size = getpagesize();
	uint64_t cur_phys_addr = 0, next_phys_addr = 0, vva_start;

	if (mem_region_hpa == NULL)
		return 0;

	for (regionidx = 0; regionidx < virtio_memory->nregions; regionidx++) {
		vva_start = virtio_memory->regions[regionidx].guest_phys_address +
			virtio_memory->regions[regionidx].address_offset;
		mem_region_hpa[regionidx_hpa].guest_phys_address
			= virtio_memory->regions[regionidx].guest_phys_address;
		mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
			rte_mem_virt2phy((void *)(uintptr_t)(vva_start)) -
			mem_region_hpa[regionidx_hpa].guest_phys_address;
		LOG_DEBUG(VHOST_CONFIG,
			"in fill_hpa_regions: guest phys addr start[%d]:(%p)\n",
			regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].guest_phys_address));
		LOG_DEBUG(VHOST_CONFIG,
			"in fill_hpa_regions: host phys addr start[%d]:(%p)\n",
			regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));

		for (i = 0, k = 0;
			i < virtio_memory->regions[regionidx].memory_size -
				page_size;
			i += page_size) {
			cur_phys_addr = rte_mem_virt2phy(
				(void *)(uintptr_t)(vva_start + i));
			next_phys_addr = rte_mem_virt2phy(
				(void *)(uintptr_t)(vva_start +
				i + page_size));
			if ((cur_phys_addr + page_size) != next_phys_addr) {
				mem_region_hpa[regionidx_hpa].guest_phys_address_end =
					mem_region_hpa[regionidx_hpa].guest_phys_address +
					k + page_size;
				mem_region_hpa[regionidx_hpa].memory_size
					= k + page_size;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest "
					"phys addr end [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: guest phys addr "
					"size [%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].memory_size));
				mem_region_hpa[regionidx_hpa + 1].guest_phys_address
					= mem_region_hpa[regionidx_hpa].guest_phys_address_end;
				regionidx_hpa++;
				mem_region_hpa[regionidx_hpa].host_phys_addr_offset =
					next_phys_addr -
					mem_region_hpa[regionidx_hpa].guest_phys_address;
				LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest"
					" phys addr start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].guest_phys_address));
				LOG_DEBUG(VHOST_CONFIG,
					"in fill_hpa_regions: host phys addr "
					"start[%d]:(%p)\n",
					regionidx_hpa,
					(void *)(uintptr_t)
					(mem_region_hpa[regionidx_hpa].host_phys_addr_offset));
				k = 0;
			} else {
				k += page_size;
			}
		}

		mem_region_hpa[regionidx_hpa].guest_phys_address_end
			= mem_region_hpa[regionidx_hpa].guest_phys_address
			+ k + page_size;
		mem_region_hpa[regionidx_hpa].memory_size = k + page_size;
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr end "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].guest_phys_address_end));
		LOG_DEBUG(VHOST_CONFIG, "in fill_hpa_regions: guest phys addr size "
			"[%d]:(%p)\n", regionidx_hpa,
			(void *)(uintptr_t)
			(mem_region_hpa[regionidx_hpa].memory_size));
		regionidx_hpa++;
	}
	return regionidx_hpa;
}
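/*
 * Worked example, continuing the 4-page case sketched above check_hpa_regions:
 * the split produces two sub-regions of 2 pages each. The first keeps the
 * original guest_phys_address with host_phys_addr_offset = P - GPA; the
 * second starts at the guest_phys_address_end of the first, with its offset
 * computed from the next host physical page, so gpa_to_hpa() can translate
 * any address with a single addition once the right sub-region is found.
 */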
/*
 * A new device is added to a data core. First the device is added to the main
 * linked list and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_dev;
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;
	struct vhost_dev *vdev;
	uint32_t regionidx;

	vdev = rte_zmalloc("vhost device", sizeof(*vdev), CACHE_LINE_SIZE);
	if (vdev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Couldn't allocate memory for vhost dev\n",
			dev->device_fh);
		return -1;
	}
	vdev->dev = dev;
	dev->priv = vdev;

	if (zero_copy) {
		vdev->nregions_hpa = dev->mem->nregions;
		for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
			vdev->nregions_hpa
				+= check_hpa_regions(
					dev->mem->regions[regionidx].guest_phys_address
					+ dev->mem->regions[regionidx].address_offset,
					dev->mem->regions[regionidx].memory_size);
		}

		vdev->regions_hpa = (struct virtio_memory_regions_hpa *)
			rte_zmalloc("vhost hpa region",
				sizeof(struct virtio_memory_regions_hpa)
				* vdev->nregions_hpa,
				CACHE_LINE_SIZE);
		if (vdev->regions_hpa == NULL) {
			RTE_LOG(ERR, VHOST_CONFIG, "Cannot allocate memory for hpa region\n");
			rte_free(vdev);
			return -1;
		}

		if (fill_hpa_memory_regions(
			vdev->regions_hpa, dev->mem) != vdev->nregions_hpa) {
			RTE_LOG(ERR, VHOST_CONFIG,
				"hpa memory regions number mismatch: "
				"[%d]\n", vdev->nregions_hpa);
			rte_free(vdev->regions_hpa);
			rte_free(vdev);
			return -1;
		}
	}

	/* Add device to main ll */
	ll_dev = get_data_ll_free_entry(&ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
			"of %d devices per core has been reached\n",
			dev->device_fh, num_devices);
		if (vdev->regions_hpa)
			rte_free(vdev->regions_hpa);
		rte_free(vdev);
		return -1;
	}
	ll_dev->vdev = vdev;
	add_data_ll_entry(&ll_root_used, ll_dev);
	vdev->vmdq_rx_q
		= dev->device_fh * (num_queues / num_devices);
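	/*
	 * Worked example of the mapping above: with num_queues = 8 and
	 * num_devices = 4, each device owns num_queues / num_devices = 2
	 * queues, and device_fh 2 is mapped to VMDQ RX queue 2 * 2 = 4.
	 */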
	if (zero_copy) {
		uint32_t index = vdev->vmdq_rx_q;
		uint32_t count_in_ring, i;
		struct mbuf_table *tx_q;

		count_in_ring = rte_ring_count(vpool_array[index].ring);

		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in new_device: mbuf count in mempool "
			"before attach is: %d\n",
			dev->device_fh,
			rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(VHOST_CONFIG,
			"(%"PRIu64") in new_device: mbuf count in ring "
			"before attach is : %d\n",
			dev->device_fh, count_in_ring);

		/*
		 * Attach all mbufs in vpool.ring and put back into vpool.pool.
		 */
		for (i = 0; i < count_in_ring; i++)
			attach_rxmbuf_zcp(dev);

		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"mempool after attach is: %d\n",
			dev->device_fh,
			rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"ring after attach is : %d\n",
			dev->device_fh,
			rte_ring_count(vpool_array[index].ring));

		tx_q = &tx_queue_zcp[(uint16_t)vdev->vmdq_rx_q];
		tx_q->txq_id = vdev->vmdq_rx_q;

		if (rte_eth_dev_tx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In new_device: Failed to start "
				"tx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);

			mbuf_destroy_zcp(vpool);
			rte_free(vdev->regions_hpa);
			rte_free(vdev);
			return -1;
		}

		if (rte_eth_dev_rx_queue_start(ports[0], vdev->vmdq_rx_q) != 0) {
			struct vpool *vpool = &vpool_array[vdev->vmdq_rx_q];

			LOG_DEBUG(VHOST_CONFIG,
				"(%"PRIu64") In new_device: Failed to start "
				"rx queue:%d\n",
				dev->device_fh, vdev->vmdq_rx_q);

			/* Stop the TX queue. */
			if (rte_eth_dev_tx_queue_stop(ports[0],
				vdev->vmdq_rx_q) != 0) {
				LOG_DEBUG(VHOST_CONFIG,
					"(%"PRIu64") In new_device: Failed to "
					"stop tx queue:%d\n",
					dev->device_fh, vdev->vmdq_rx_q);
			}

			mbuf_destroy_zcp(vpool);
			rte_free(vdev->regions_hpa);
			rte_free(vdev);
			return -1;
		}
	}

	/* Reset the ready flag. */
	vdev->ready = DEVICE_MAC_LEARNING;
	vdev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
			device_num_min = lcore_info[lcore].lcore_ll->device_num;
			core_add = lcore;
		}
	}

	/* Add device to lcore ll */
	ll_dev = get_data_ll_free_entry(&lcore_info[core_add].lcore_ll->ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
		vdev->ready = DEVICE_SAFE_REMOVE;
		destroy_device(dev);
		if (vdev->regions_hpa)
			rte_free(vdev->regions_hpa);
		rte_free(vdev);
		return -1;
	}
	ll_dev->vdev = vdev;
	vdev->coreid = core_add;

	add_data_ll_entry(&lcore_info[vdev->coreid].lcore_ll->ll_root_used, ll_dev);

	/* Initialize device stats */
	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

	/* Disable notifications. */
	set_irq_status(dev);
	lcore_info[vdev->coreid].lcore_ll->device_num++;
	dev->flags |= VIRTIO_DEV_RUNNING;

	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, vdev->coreid);

	return 0;
}
/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
};
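/*
 * These two callbacks are handed to the vhost library through
 * init_virtio_net() in MAIN below, which invokes them once a guest's
 * virtio-net configuration (memory regions and virtqueues) is complete.
 */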
/*
 * This is a thread that will wake up after a period to print stats if the
 * user has enabled them.
 */
static void
print_stats(void)
{
	struct virtio_net_data_ll *dev_ll;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	uint32_t device_fh;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s", clr, top_left);

		printf("\nDevice statistics ====================================");

		dev_ll = ll_root_used;
		while (dev_ll != NULL) {
			device_fh = (uint32_t)dev_ll->vdev->dev->device_fh;
			tx_total = dev_statistics[device_fh].tx_total;
			tx = dev_statistics[device_fh].tx;
			tx_dropped = tx_total - tx;
			if (zero_copy == 0) {
				rx_total = rte_atomic64_read(
					&dev_statistics[device_fh].rx_total_atomic);
				rx = rte_atomic64_read(
					&dev_statistics[device_fh].rx_atomic);
			} else {
				rx_total = dev_statistics[device_fh].rx_total;
				rx = dev_statistics[device_fh].rx;
			}
			rx_dropped = rx_total - rx;

			printf("\nStatistics for device %"PRIu32" ------------------------------"
				"\nTX total: %"PRIu64""
				"\nTX dropped: %"PRIu64""
				"\nTX successful: %"PRIu64""
				"\nRX total: %"PRIu64""
				"\nRX dropped: %"PRIu64""
				"\nRX successful: %"PRIu64"",
				device_fh,
				tx_total,
				tx_dropped,
				tx,
				rx_total,
				rx_dropped,
				rx);

			dev_ll = dev_ll->next;
		}
		printf("\n======================================================\n");
	}
}
static void
setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
	char *ring_name, uint32_t nb_mbuf)
{
	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;

	vpool_array[index].pool
		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
		rte_pktmbuf_init, NULL, socket, 0);
	if (vpool_array[index].pool != NULL) {
		vpool_array[index].ring
			= rte_ring_create(ring_name,
				rte_align32pow2(nb_mbuf + 1),
				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
		if (likely(vpool_array[index].ring != NULL)) {
			LOG_DEBUG(VHOST_CONFIG,
				"in setup_mempool_tbl: mbuf count in "
				"mempool is: %d\n",
				rte_mempool_count(vpool_array[index].pool));
			LOG_DEBUG(VHOST_CONFIG,
				"in setup_mempool_tbl: mbuf count in "
				"ring is: %d\n",
				rte_ring_count(vpool_array[index].ring));
		} else {
			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
				ring_name);
		}

		/* Need to consider headroom. */
		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
	} else {
		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
	}
}
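/*
 * Sizing note on the ring created above: rte_ring_create() requires a
 * power-of-two count, and a default ring holds at most count - 1 entries,
 * hence rte_align32pow2(nb_mbuf + 1); the +1 keeps the usable capacity at
 * nb_mbuf or more after rounding.
 */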
/*
 * Main function, does initialisation and calls the per-lcore functions. The
 * CUSE device is also registered here to handle the IOCTLs.
 */
int
MAIN(int argc, char *argv[])
{
	struct rte_mempool *mbuf_pool = NULL;
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret;
	uint8_t portid, queue_id = 0;
	static pthread_t tid;

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Set the number of switching cores available. */
	num_switching_cores = rte_lcore_count() - 1;

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();
	if (nb_ports > RTE_MAX_ETHPORTS)
		nb_ports = RTE_MAX_ETHPORTS;

	/*
	 * Update the global variable num_ports and the global array ports[],
	 * and get the value of valid_num_ports according to the system's
	 * port count.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports,
			MAX_SUP_PORTS);
		return -1;
	}

	if (zero_copy == 0) {
		/* Create the mbuf pool. */
		mbuf_pool = rte_mempool_create(
				"MBUF_POOL",
				NUM_MBUFS_PER_PORT * valid_num_ports,
				MBUF_SIZE, MBUF_CACHE_SIZE,
				sizeof(struct rte_pktmbuf_pool_private),
				rte_pktmbuf_pool_init, NULL,
				rte_pktmbuf_init, NULL,
				rte_socket_id(), 0);
		if (mbuf_pool == NULL)
			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
			vpool_array[queue_id].pool = mbuf_pool;
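		/*
		 * In copy mode a single shared mempool backs every vpool
		 * slot, since mbufs are only staging buffers; the zero-copy
		 * branch below instead builds a dedicated pool and ring per
		 * RX/TX queue so guest-attached buffers can be tracked and
		 * recycled per queue.
		 */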
		if (vm2vm_mode == VM2VM_HARDWARE) {
			/* Enable VT loopback to let the L2 switch do it. */
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
			LOG_DEBUG(VHOST_CONFIG,
				"Enable loop back for L2 switch in vmdq.\n");
		}
	} else {
		uint32_t nb_mbuf;
		char pool_name[RTE_MEMPOOL_NAMESIZE];
		char ring_name[RTE_MEMPOOL_NAMESIZE];

		/*
		 * Zero copy defers queue RX/TX start to the time when the
		 * guest finishes its startup and packet buffers from that
		 * guest are attached.
		 */
		rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
		rx_conf_default.rx_drop_en = 0;
		tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
		nb_mbuf = num_rx_descriptor
			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
			+ num_switching_cores * MAX_PKT_BURST;

		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
			snprintf(pool_name, sizeof(pool_name),
				"rxmbuf_pool_%u", queue_id);
			snprintf(ring_name, sizeof(ring_name),
				"rxmbuf_ring_%u", queue_id);
			setup_mempool_tbl(rte_socket_id(), queue_id,
				pool_name, ring_name, nb_mbuf);
		}

		nb_mbuf = num_tx_descriptor
			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
			+ num_switching_cores * MAX_PKT_BURST;

		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
			snprintf(pool_name, sizeof(pool_name),
				"txmbuf_pool_%u", queue_id);
			snprintf(ring_name, sizeof(ring_name),
				"txmbuf_ring_%u", queue_id);
			setup_mempool_tbl(rte_socket_id(),
				(queue_id + MAX_QUEUES),
				pool_name, ring_name, nb_mbuf);
		}

		if (vm2vm_mode == VM2VM_HARDWARE) {
			/* Enable VT loopback to let the L2 switch do it. */
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
			LOG_DEBUG(VHOST_CONFIG,
				"Enable loop back for L2 switch in vmdq.\n");
		}
	}

	/* Set log level. */
	rte_set_log_level(LOG_LEVEL);

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, VHOST_PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Initialise all linked lists. */
	if (init_data_ll() == -1)
		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");

	/* Initialize device stats */
	memset(&dev_statistics, 0, sizeof(dev_statistics));

	/* Enable stats if the user option is set. */
	if (enable_stats)
		pthread_create(&tid, NULL, (void *)print_stats, NULL);

	/* Launch all data cores. */
	if (zero_copy == 0) {
		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
			rte_eal_remote_launch(switch_worker,
				mbuf_pool, lcore_id);
		}
	} else {
		uint32_t count_in_mempool, index, i;

		for (index = 0; index < 2 * MAX_QUEUES; index++) {
			/* For all RX and TX queues. */
			count_in_mempool
				= rte_mempool_count(vpool_array[index].pool);

			/*
			 * Transfer all un-attached mbufs from vpool.pool
			 * to vpool.ring.
			 */
			for (i = 0; i < count_in_mempool; i++) {
				struct rte_mbuf *mbuf
					= __rte_mbuf_raw_alloc(
						vpool_array[index].pool);
				rte_ring_sp_enqueue(vpool_array[index].ring,
					(void *)mbuf);
			}

			LOG_DEBUG(VHOST_CONFIG,
				"in MAIN: mbuf count in mempool at initial "
				"is: %d\n", count_in_mempool);
			LOG_DEBUG(VHOST_CONFIG,
				"in MAIN: mbuf count in ring at initial is :"
				" %d\n",
				rte_ring_count(vpool_array[index].ring));
		}
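		/*
		 * Note on the transfer loop above: __rte_mbuf_raw_alloc() is
		 * used rather than rte_pktmbuf_alloc() so the mbufs reach
		 * the ring still "un-attached", without packet-level
		 * initialisation; attach_rxmbuf_zcp() initialises them later
		 * when binding each one to a guest buffer, so the usual
		 * reset work here would be wasted.
		 */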
		RTE_LCORE_FOREACH_SLAVE(lcore_id)
			rte_eal_remote_launch(switch_worker_zcp, NULL,
				lcore_id);
	}

	/* Register CUSE device to handle IOCTLs. */
	ret = register_cuse_device((char *)&dev_basename, dev_index,
		get_virtio_net_callbacks());
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");

	init_virtio_net(&virtio_net_device_ops);

	/* Start CUSE session. */
	start_cuse_session_loop();

	return 0;
}