/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <arpa/inet.h>
#include <errno.h>
#include <getopt.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/param.h>
#include <unistd.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>

#include "main.h"
#include "virtio-net.h"
#include "vhost-net-cdev.h"
#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
                            (num_switching_cores*MAX_PKT_BURST) + \
                            (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) + \
                            (num_switching_cores*MBUF_CACHE_SIZE))
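/*
 * Illustrative sizing (worked example, not from the original source): with
 * the defaults below, MAX_QUEUES (128) * RTE_TEST_RX_DESC_DEFAULT (1024)
 * alone reserves 131072 mbufs per port; each switching core then adds
 * MAX_PKT_BURST (32) + RTE_TEST_TX_DESC_DEFAULT (512) +
 * MBUF_CACHE_SIZE (128) = 672 more, e.g. 1344 extra mbufs for two cores.
 */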
#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * For the zero copy implementation no frame data buffers are allocated
 * from the host: the guest allocates the frame data buffer and vhost
 * uses it directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
    + RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0
/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0 /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16 /* Max burst for merge buffers. Historically set to 1 due to a performance issue. */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4 /* Number of retries on RX. */
/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512
/*
 * These two macros need refining for the legacy and DPDK based front ends:
 * take the max vring avail descriptors/entries from the guest minus
 * MAX_PKT_BURST, then round to a power of 2.
 * For the legacy front end there are 128 descriptors:
 * half for the virtio header, the other half for mbuf data.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */
/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
    + sizeof(struct rte_mbuf)))
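/*
 * Usage sketch (illustrative, drawn from how this file uses the macro): the
 * zero copy path stashes the guest descriptor index in the first four bytes
 * of the mbuf headroom so it can be recovered when the mbuf completes, e.g.
 *   MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;   // on attach
 *   used->ring[i].id = MBUF_HEADROOM_UINT32(mbuf);     // on completion
 */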
/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for rte_snprintf. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
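/*
 * Illustrative arithmetic (assumption, not from the original source): with
 * a 64-byte cache line and the 16-byte struct vring_desc defined by the
 * virtio spec, DESC_PER_CACHELINE evaluates to 4 descriptors per line.
 */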
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the hardware
 * descriptors. Disabled by default.
 */
static uint32_t zero_copy;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
/* max ring descriptor; ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
    struct rte_mempool *pool;
    struct rte_ring *ring;
    uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
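/*
 * Layout note (inferred from usage later in this file, not an original
 * comment): the first MAX_QUEUES entries of vpool_array back the RX queues,
 * while entries [MAX_QUEUES, 2*MAX_QUEUES) are used by the zero copy TX
 * path, e.g.
 *   vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
 */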
/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
    VM2VM_DISABLED = 0,
    VM2VM_SOFTWARE = 1,
    VM2VM_HARDWARE = 2,
    VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
    PHYS_ADDR_CONTINUOUS = 0,
    PHYS_ADDR_CROSS_SUBREG = 1,
    PHYS_ADDR_INVALID = 2,
    PHYS_ADDR_LAST
} hpa_type;
/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Character device index. Can be set by user. */
static uint32_t dev_index = 0;

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;
/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
    .rx_thresh = {
        .pthresh = RX_PTHRESH,
        .hthresh = RX_HTHRESH,
        .wthresh = RX_WTHRESH,
    },
    .rx_drop_en = 1,
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
    .tx_thresh = {
        .pthresh = TX_PTHRESH,
        .hthresh = TX_HTHRESH,
        .wthresh = TX_WTHRESH,
    },
    .tx_free_thresh = 0, /* Use PMD default values */
    .tx_rs_thresh = 0, /* Use PMD default values */
};
/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
    .rxmode = {
        .mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
        .split_hdr_size = 0,
        .header_split   = 0, /**< Header Split disabled */
        .hw_ip_checksum = 0, /**< IP checksum offload disabled */
        .hw_vlan_filter = 0, /**< VLAN filtering disabled */
        /*
         * Required for 1 GbE NICs such as the I350: fixes a bug where
         * IPv4 forwarding in the guest could not forward packets from
         * one virtio device to another.
         */
        .hw_vlan_strip  = 1, /**< VLAN strip enabled. */
        .jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
        .hw_strip_crc   = 0, /**< CRC stripped by hardware */
    },

    .txmode = {
        .mq_mode = ETH_MQ_TX_NONE,
    },
    .rx_adv_conf = {
        /*
         * should be overridden separately in code with
         * appropriate values
         */
        .vmdq_rx_conf = {
            .nb_queue_pools = ETH_8_POOLS,
            .enable_default_pool = 0,
            .default_pool = 0,
            .nb_pool_maps = 0,
            .pool_map = {{0, 0},},
        },
    },
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
    1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
    1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
    1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
    1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
    1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
    1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
    1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
    1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};
/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];

/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
    unsigned len;
    unsigned txq_id;
    struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
    unsigned char h_dest[ETH_ALEN];
    unsigned char h_source[ETH_ALEN];
    __be16 h_vlan_proto;
    __be16 h_vlan_TCI;
    __be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
    uint8_t version_ihl; /**< version and header length */
    uint8_t type_of_service; /**< type of service */
    uint16_t total_length; /**< length of packet */
    uint16_t packet_id; /**< packet ID */
    uint16_t fragment_offset; /**< fragmentation offset */
    uint8_t time_to_live; /**< time to live */
    uint8_t next_proto_id; /**< protocol ID */
    uint16_t hdr_checksum; /**< header checksum */
    uint32_t src_addr; /**< source address */
    uint32_t dst_addr; /**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN 4
#define VLAN_ETH_HLEN 18
/* Per-device statistics struct */
struct device_statistics {
    uint64_t tx_total;
    rte_atomic64_t rx_total_atomic;
    uint64_t rx_total;
    uint64_t tx;
    rte_atomic64_t rx_atomic;
    uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
    struct rte_eth_vmdq_rx_conf conf;
    unsigned i;

    memset(&conf, 0, sizeof(conf));
    conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
    conf.nb_pool_maps = num_devices;
    conf.enable_loop_back =
        vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

    for (i = 0; i < conf.nb_pool_maps; i++) {
        conf.pool_map[i].vlan_id = vlan_tags[i];
        conf.pool_map[i].pools = (1UL << i);
    }

    (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
    (void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
        sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
    return 0;
}
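/*
 * Illustrative result (worked example, not from the original source): for
 * num_devices = 8 the loop above maps VLAN 1000 to pool 0, 1001 to pool 1,
 * ... 1007 to pool 7, i.e. pool_map[i] = { .vlan_id = 1000 + i,
 * .pools = 1UL << i }.
 */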
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
    if (num_devices > max_nb_devices) {
        RTE_LOG(ERR, PORT, "invalid number of devices\n");
        return -1;
    }
    return 0;
}
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
    struct rte_eth_dev_info dev_info;
    struct rte_eth_conf port_conf;
    uint16_t rx_rings, tx_rings;
    uint16_t rx_ring_size, tx_ring_size;
    int retval;
    uint16_t q;

    /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
    rte_eth_dev_info_get(port, &dev_info);

    /* Configure the number of supported virtio devices based on VMDQ limits */
    num_devices = dev_info.max_vmdq_pools;
    num_queues = dev_info.max_rx_queues;

    if (zero_copy) {
        rx_ring_size = num_rx_descriptor;
        tx_ring_size = num_tx_descriptor;
        tx_rings = dev_info.max_tx_queues;
    } else {
        rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
        tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
        tx_rings = (uint16_t)rte_lcore_count();
    }

    retval = validate_num_devices(MAX_DEVICES);
    if (retval < 0)
        return retval;

    /* Get port configuration. */
    retval = get_eth_conf(&port_conf, num_devices);
    if (retval < 0)
        return retval;

    if (port >= rte_eth_dev_count())
        return -1;

    rx_rings = (uint16_t)num_queues;
    /* Configure ethernet device. */
    retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
    if (retval != 0)
        return retval;

    /* Setup the queues. */
    for (q = 0; q < rx_rings; q++) {
        retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
            rte_eth_dev_socket_id(port), &rx_conf_default,
            vpool_array[q].pool);
        if (retval < 0)
            return retval;
    }
    for (q = 0; q < tx_rings; q++) {
        retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
            rte_eth_dev_socket_id(port), &tx_conf_default);
        if (retval < 0)
            return retval;
    }

    /* Start the device. */
    retval = rte_eth_dev_start(port);
    if (retval < 0) {
        RTE_LOG(ERR, DATA, "Failed to start the device.\n");
        return retval;
    }

    rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
    RTE_LOG(INFO, PORT, "Max virtio devices supported: %u\n", num_devices);
    RTE_LOG(INFO, PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
        " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
        (unsigned)port,
        vmdq_ports_eth_addr[port].addr_bytes[0],
        vmdq_ports_eth_addr[port].addr_bytes[1],
        vmdq_ports_eth_addr[port].addr_bytes[2],
        vmdq_ports_eth_addr[port].addr_bytes[3],
        vmdq_ports_eth_addr[port].addr_bytes[4],
        vmdq_ports_eth_addr[port].addr_bytes[5]);

    return 0;
}
/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
    /* Reject names that would not fit (including the NUL terminator). */
    if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
        return -1;

    rte_snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

    return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
    char *end = NULL;
    unsigned long pm;

    errno = 0;

    /* parse hexadecimal string */
    pm = strtoul(portmask, &end, 16);
    if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (pm == 0)
        return -1;

    return pm;
}

/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
    char *end = NULL;
    unsigned long num;

    errno = 0;

    /* parse unsigned int string */
    num = strtoul(q_arg, &end, 10);
    if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
        return -1;

    if (num > max_valid_value)
        return -1;

    return num;
}
/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
    RTE_LOG(INFO, CONFIG, "%s [EAL options] -- -p PORTMASK\n"
    "       --vm2vm [0|1|2]\n"
    "       --rx-retry [0|1] --mergeable [0|1] --stats [0-N]\n"
    "       --dev-basename <name> --dev-index [0-N]\n"
    "       --zero-copy [0|1] --rx-desc-num [0-N] --tx-desc-num [0-N]\n"
    " -p PORTMASK: Set mask for ports to be used by application\n"
    " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
    " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
    " --rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only takes effect if rx retries are enabled\n"
    " --rx-retry-num [0-N]: the number of retries on rx. Only takes effect if rx retries are enabled\n"
    " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
    " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
    " --dev-basename: The basename to be used for the character device.\n"
    " --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
    " --zero-copy [0|1]: disable(default)/enable rx/tx "
    "zero copy\n"
    " --rx-desc-num [0-N]: the number of descriptors on rx, "
    "used only when zero copy is enabled.\n"
    " --tx-desc-num [0-N]: the number of descriptors on tx, "
    "used only when zero copy is enabled.\n",
        prgname);
}
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
    int opt, ret;
    int option_index;
    unsigned i;
    const char *prgname = argv[0];
    static struct option long_option[] = {
        {"vm2vm", required_argument, NULL, 0},
        {"rx-retry", required_argument, NULL, 0},
        {"rx-retry-delay", required_argument, NULL, 0},
        {"rx-retry-num", required_argument, NULL, 0},
        {"mergeable", required_argument, NULL, 0},
        {"stats", required_argument, NULL, 0},
        {"dev-basename", required_argument, NULL, 0},
        {"dev-index", required_argument, NULL, 0},
        {"zero-copy", required_argument, NULL, 0},
        {"rx-desc-num", required_argument, NULL, 0},
        {"tx-desc-num", required_argument, NULL, 0},
        {NULL, 0, 0, 0},
    };
    /* Parse command line */
    while ((opt = getopt_long(argc, argv, "p:", long_option,
            &option_index)) != EOF) {
        switch (opt) {
        /* Portmask */
        case 'p':
            enabled_port_mask = parse_portmask(optarg);
            if (enabled_port_mask == 0) {
                RTE_LOG(INFO, CONFIG, "Invalid portmask\n");
                us_vhost_usage(prgname);
                return -1;
            }
            break;
        case 0:
            /* Enable/disable vm2vm comms. */
            if (!strncmp(long_option[option_index].name, "vm2vm",
                MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG,
                        "Invalid argument for "
                        "vm2vm [0|1|2]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    vm2vm_mode = (vm2vm_type)ret;
                }
            }

            /* Enable/disable retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG, "Invalid argument for rx-retry [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_retry = ret;
                }
            }
            /* Specify the retry delay time (in microseconds) on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_delay_time = ret;
                }
            }

            /* Specify the number of retries on RX. */
            if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    burst_rx_retry_num = ret;
                }
            }
            /* Enable/disable RX mergeable buffers. */
            if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG, "Invalid argument for mergeable [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    if (ret)
                        VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
                }
            }

            /* Enable/disable stats. */
            if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG, "Invalid argument for stats [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    enable_stats = ret;
                }
            }
            /* Set character device basename. */
            if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
                if (us_vhost_parse_basename(optarg) == -1) {
                    RTE_LOG(INFO, CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
                    us_vhost_usage(prgname);
                    return -1;
                }
            }

            /* Set character device index. */
            if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, INT32_MAX);
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG, "Invalid argument for character device index [0..N]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    dev_index = ret;
                }
            }
            /* Enable/disable rx/tx zero copy. */
            if (!strncmp(long_option[option_index].name,
                "zero-copy", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, 1);
                if (ret == -1) {
                    RTE_LOG(INFO, CONFIG,
                        "Invalid argument"
                        " for zero-copy [0|1]\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    zero_copy = ret;
                }

                if (zero_copy) {
#ifdef RTE_MBUF_SCATTER_GATHER
                    RTE_LOG(ERR, CONFIG, "Before running "
                        "zero copy vhost APP, please "
                        "disable RTE_MBUF_SCATTER_GATHER\n"
                        "in config file and then rebuild DPDK "
                        "core lib!\n"
                        "Otherwise please disable zero copy "
                        "flag in command line!\n");
                    return -1;
#endif
                }
            }
            /* Specify the descriptor number on RX. */
            if (!strncmp(long_option[option_index].name,
                "rx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, CONFIG,
                        "Invalid argument for rx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_rx_descriptor = ret;
                }
            }

            /* Specify the descriptor number on TX. */
            if (!strncmp(long_option[option_index].name,
                "tx-desc-num", MAX_LONG_OPT_SZ)) {
                ret = parse_num_opt(optarg, MAX_RING_DESC);
                if ((ret == -1) || (!POWEROF2(ret))) {
                    RTE_LOG(INFO, CONFIG,
                        "Invalid argument for tx-desc-num [0-N], "
                        "power of 2 required.\n");
                    us_vhost_usage(prgname);
                    return -1;
                } else {
                    num_tx_descriptor = ret;
                }
            }

            break;
        /* Invalid option - print options. */
        default:
            us_vhost_usage(prgname);
            return -1;
        }
    }

    for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
        if (enabled_port_mask & (1 << i))
            ports[num_ports++] = (uint8_t)i;
    }

    if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
        RTE_LOG(INFO, PORT, "Current enabled port number is %u, "
            "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
        return -1;
    }

    if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
        RTE_LOG(INFO, PORT,
            "Vhost zero copy doesn't support software vm2vm; "
            "please specify 'vm2vm 2' to use hardware vm2vm.\n");
        return -1;
    }

    return 0;
}
/*
 * Update the global var NUM_PORTS and array PORTS according to system ports number
 * and return valid ports number
 */
static unsigned check_ports_num(unsigned nb_ports)
{
    unsigned valid_num_ports = num_ports;
    unsigned portid;

    if (num_ports > nb_ports) {
        RTE_LOG(INFO, PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
            num_ports, nb_ports);
        num_ports = nb_ports;
    }

    for (portid = 0; portid < num_ports; portid++) {
        if (ports[portid] >= nb_ports) {
            RTE_LOG(INFO, PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
                ports[portid], (nb_ports - 1));
            ports[portid] = INVALID_PORT_ID;
            valid_num_ports--;
        }
    }
    return valid_num_ports;
}
/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
    char *pkt_addr = (char *)(addr); \
    unsigned int index; \
    char packet[MAX_PRINT_BUFF]; \
    \
    if ((header)) \
        rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
    else \
        rte_snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
    for (index = 0; index < (size); index++) { \
        rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
            "%02hhx ", pkt_addr[index]); \
    } \
    rte_snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
    \
    LOG_DEBUG(DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
/*
 * Function to convert guest physical addresses to vhost virtual addresses. This
 * is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
{
    struct virtio_memory_regions *region;
    uint32_t regionidx;
    uint64_t vhost_va = 0;

    for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
        region = &dev->mem->regions[regionidx];
        if ((guest_pa >= region->guest_phys_address) &&
            (guest_pa <= region->guest_phys_address_end)) {
            vhost_va = region->address_offset + guest_pa;
            break;
        }
    }
    LOG_DEBUG(DATA, "(%"PRIu64") GPA %p| VVA %p\n",
        dev->device_fh, (void *)(uintptr_t)guest_pa, (void *)(uintptr_t)vhost_va);

    return vhost_va;
}
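/*
 * Worked example (illustrative, not from the original source): if a region
 * covers guest physical addresses [0x0, 0x3fffffff] and is mapped into the
 * vhost process such that address_offset = 0x7f0000000000, then a guest
 * buffer at GPA 0x1000 translates to VVA 0x7f0000001000.
 */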
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
    uint32_t buf_len, hpa_type *addr_type)
{
    struct virtio_memory_regions_hpa *region;
    uint32_t regionidx;
    uint64_t vhost_pa = 0;

    *addr_type = PHYS_ADDR_INVALID;

    for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
        region = &dev->mem->regions_hpa[regionidx];
        if ((guest_pa >= region->guest_phys_address) &&
            (guest_pa <= region->guest_phys_address_end)) {
            vhost_pa = region->host_phys_addr_offset + guest_pa;
            if (likely((guest_pa + buf_len - 1)
                <= region->guest_phys_address_end))
                *addr_type = PHYS_ADDR_CONTINUOUS;
            else
                *addr_type = PHYS_ADDR_CROSS_SUBREG;
            break;
        }
    }

    LOG_DEBUG(DATA, "(%"PRIu64") GPA %p| HPA %p\n",
        dev->device_fh, (void *)(uintptr_t)guest_pa,
        (void *)(uintptr_t)vhost_pa);

    return vhost_pa;
}
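/*
 * Illustrative outcome (not from the original source): a 1518-byte buffer
 * that starts 100 bytes before the end of a sub-region is flagged
 * PHYS_ADDR_CROSS_SUBREG because it is not physically contiguous, and the
 * zero copy path rejects it; a buffer that fits entirely inside one
 * sub-region is PHYS_ADDR_CONTINUOUS and can be DMA'd directly.
 */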
/*
 * This function adds buffers to the virtio devices RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
{
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *buff;
    /* The virtio_hdr is initialised to 0. */
    struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0, 0, 0, 0, 0, 0}, 0};
    uint64_t buff_addr = 0;
    uint64_t buff_hdr_addr = 0;
    uint32_t head[MAX_PKT_BURST], packet_len = 0;
    uint32_t head_idx, packet_success = 0;
    uint32_t mergeable, mrg_count = 0;
    uint32_t retry = 0;
    uint16_t avail_idx, res_cur_idx;
    uint16_t res_base_idx, res_end_idx;
    uint16_t free_entries;
    uint8_t success = 0;

    LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
    vq = dev->virtqueue[VIRTIO_RXQ];
    count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
    /* As many data cores may want access to available buffers, they need to be reserved. */
    do {
        res_base_idx = vq->last_used_idx_res;
        avail_idx = *((volatile uint16_t *)&vq->avail->idx);

        free_entries = (avail_idx - res_base_idx);
        /* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
        if (enable_retry && unlikely(count > free_entries)) {
            for (retry = 0; retry < burst_rx_retry_num; retry++) {
                rte_delay_us(burst_rx_delay_time);
                avail_idx =
                    *((volatile uint16_t *)&vq->avail->idx);
                free_entries = (avail_idx - res_base_idx);
                if (count <= free_entries)
                    break;
            }
        }

        /* check that we have enough buffers */
        if (unlikely(count > free_entries))
            count = free_entries;

        if (count == 0)
            return 0;

        res_end_idx = res_base_idx + count;
        /* vq->last_used_idx_res is atomically updated. */
        success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
            res_end_idx);
    } while (unlikely(success == 0));
    res_cur_idx = res_base_idx;
    LOG_DEBUG(DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
    /* Prefetch available ring to retrieve indexes. */
    rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);

    /* Check if the VIRTIO_NET_F_MRG_RXBUF feature is enabled. */
    mergeable = dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (head_idx = 0; head_idx < count; head_idx++)
        head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);
    while (res_cur_idx != res_end_idx) {
        /* Get descriptor from available ring */
        desc = &vq->desc[head[packet_success]];

        buff = pkts[packet_success];

        /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
        buff_addr = gpa_to_vva(dev, desc->addr);
        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)buff_addr);

        if (mergeable && (mrg_count != 0)) {
            desc->len = packet_len = rte_pktmbuf_data_len(buff);
        } else {
            /* Copy virtio_hdr to packet and increment buffer address */
            buff_hdr_addr = buff_addr;
            packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

            /*
             * If the descriptors are chained the header and data are placed in
             * separate buffers.
             */
            if (desc->flags & VRING_DESC_F_NEXT) {
                desc->len = vq->vhost_hlen;
                desc = &vq->desc[desc->next];
                /* Buffer address translation. */
                buff_addr = gpa_to_vva(dev, desc->addr);
                desc->len = rte_pktmbuf_data_len(buff);
            } else {
                buff_addr += vq->vhost_hlen;
                desc->len = packet_len;
            }
        }

        PRINT_PACKET(dev, (uintptr_t)buff_addr, rte_pktmbuf_data_len(buff), 0);

        /* Update used ring with desc information */
        vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
        vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;

        /* Copy mbuf data to buffer */
        rte_memcpy((void *)(uintptr_t)buff_addr, (const void *)buff->pkt.data, rte_pktmbuf_data_len(buff));

        res_cur_idx++;
        packet_success++;

        /* If mergeable is disabled then a header is required per buffer. */
        if (!mergeable) {
            rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
            PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
        } else {
            mrg_count++;
            /* Merge buffer can only handle so many buffers at a time. Tell the guest if this limit is reached. */
            if ((mrg_count == MAX_MRG_PKT_BURST) || (res_cur_idx == res_end_idx)) {
                virtio_hdr.num_buffers = mrg_count;
                LOG_DEBUG(DATA, "(%"PRIu64") RX: Num merge buffers %d\n", dev->device_fh, virtio_hdr.num_buffers);
                rte_memcpy((void *)(uintptr_t)buff_hdr_addr, (const void *)&virtio_hdr, vq->vhost_hlen);
                PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
                mrg_count = 0;
            }
        }
        if (res_cur_idx < res_end_idx) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success]]);
        }
    }
    rte_compiler_barrier();

    /* Wait until it's our turn to add our buffer to the used ring. */
    while (unlikely(vq->last_used_idx != res_base_idx))
        rte_pause();

    *(volatile uint16_t *)&vq->used->idx += count;
    vq->last_used_idx = res_end_idx;

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);

    return count;
}
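/*
 * Note on the reservation scheme above (explanatory, not from the original
 * source): several data cores may enqueue to the same virtqueue, so each
 * first claims a private window [res_base_idx, res_end_idx) with
 * rte_atomic16_cmpset() on last_used_idx_res, fills it, and then spins
 * until last_used_idx reaches its base so used->idx is published in order.
 */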
/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
    return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}
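/*
 * Illustrative note (not from the original source): both addresses are read
 * as 64-bit words and XORed, so any differing bit survives; MAC_ADDR_CMP
 * (0xFFFFFFFFFFFF) then masks the comparison down to the 48 bits of a MAC
 * address, discarding the two bytes that follow it in memory.
 */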
/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
{
    struct ether_hdr *pkt_hdr;
    struct virtio_net_data_ll *dev_ll;
    int i, ret;

    /* Learn MAC address of guest device from packet */
    pkt_hdr = (struct ether_hdr *)m->pkt.data;

    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
            RTE_LOG(INFO, DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
            return -1;
        }
        dev_ll = dev_ll->next;
    }

    for (i = 0; i < ETHER_ADDR_LEN; i++)
        dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

    /* vlan_tag currently uses the device_id. */
    dev->vlan_tag = vlan_tags[dev->device_fh];

    /* Print out VMDQ registration info. */
    RTE_LOG(INFO, DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
        dev->device_fh,
        dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
        dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
        dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
        dev->vlan_tag);

    /* Register the MAC address. */
    ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
    if (ret)
        RTE_LOG(ERR, DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
            dev->device_fh);

    /* Enable stripping of the vlan tag as we handle routing. */
    rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);

    /* Set device as ready for RX. */
    dev->ready = DEVICE_RX;

    return 0;
}
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct virtio_net *dev)
{
    unsigned i = 0;
    unsigned rx_count;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

    if (dev->ready == DEVICE_RX) {
        /* clear MAC and VLAN settings */
        rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
        for (i = 0; i < 6; i++)
            dev->mac_address.addr_bytes[i] = 0;

        dev->vlan_tag = 0;

        /* Clear out the receive buffers */
        rx_count = rte_eth_rx_burst(ports[0],
            (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

        while (rx_count) {
            for (i = 0; i < rx_count; i++)
                rte_pktmbuf_free(pkts_burst[i]);

            rx_count = rte_eth_rx_burst(ports[0],
                (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
        }

        dev->ready = DEVICE_MAC_LEARNING;
    }
}
/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline unsigned __attribute__((always_inline))
virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
{
    struct virtio_net_data_ll *dev_ll;
    struct ether_hdr *pkt_hdr;
    uint64_t ret = 0;

    pkt_hdr = (struct ether_hdr *)m->pkt.data;

    /* get the used devices list */
    dev_ll = ll_root_used;

    while (dev_ll != NULL) {
        if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
            &dev_ll->dev->mac_address)) {

            /* Drop the packet if the TX packet is destined for the TX device. */
            if (dev_ll->dev->device_fh == dev->device_fh) {
                LOG_DEBUG(DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
                    dev_ll->dev->device_fh);
                return 0;
            }

            LOG_DEBUG(DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);

            if (dev_ll->dev->remove) {
                /* drop the packet if the device is marked for removal */
                LOG_DEBUG(DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
            } else {
                /* send the packet to the local virtio device */
                ret = virtio_dev_rx(dev_ll->dev, &m, 1);
                if (enable_stats) {
                    rte_atomic64_add(
                        &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
                        1);
                    rte_atomic64_add(
                        &dev_statistics[dev_ll->dev->device_fh].rx_atomic,
                        ret);
                    dev_statistics[dev->device_fh].tx_total++;
                    dev_statistics[dev->device_fh].tx += ret;
                }
            }

            return 0;
        }
        dev_ll = dev_ll->next;
    }

    return -1;
}
/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct virtio_net *dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
{
    struct mbuf_table *tx_q;
    struct vlan_ethhdr *vlan_hdr;
    struct rte_mbuf **m_table;
    struct rte_mbuf *mbuf;
    unsigned len, ret, offset = 0;
    const uint16_t lcore_id = rte_lcore_id();
    struct virtio_net_data_ll *dev_ll = ll_root_used;
    struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;

    /* check if destination is local VM */
    if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
        return;

    if (vm2vm_mode == VM2VM_HARDWARE) {
        while (dev_ll != NULL) {
            if ((dev_ll->dev->ready == DEVICE_RX)
                && ether_addr_cmp(&(pkt_hdr->d_addr),
                &dev_ll->dev->mac_address)) {
                /*
                 * Drop the packet if the TX packet is
                 * destined for the TX device.
                 */
                if (dev_ll->dev->device_fh == dev->device_fh) {
                    LOG_DEBUG(DATA,
                        "(%"PRIu64") TX: Source and destination"
                        " MAC addresses are the same. Dropping "
                        "packet.\n",
                        dev_ll->dev->device_fh);
                    return;
                }

                offset = 4;
                vlan_tag = (uint16_t)
                    vlan_tags[(uint16_t)dev_ll->dev->device_fh];

                LOG_DEBUG(DATA,
                    "(%"PRIu64") TX: pkt to local VM device id:"
                    "(%"PRIu64") vlan tag: %d.\n",
                    dev->device_fh, dev_ll->dev->device_fh,
                    vlan_tag);

                break;
            }
            dev_ll = dev_ll->next;
        }
    }
    LOG_DEBUG(DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

    /* Add packet to the port tx queue */
    tx_q = &lcore_tx_queue[lcore_id];
    len = tx_q->len;

    /* Allocate an mbuf and populate the structure. */
    mbuf = rte_pktmbuf_alloc(mbuf_pool);
    if (unlikely(mbuf == NULL)) {
        RTE_LOG(ERR, DATA, "Failed to allocate memory for mbuf.\n");
        return;
    }

    mbuf->pkt.data_len = m->pkt.data_len + VLAN_HLEN + offset;
    mbuf->pkt.pkt_len = mbuf->pkt.data_len;

    /* Copy ethernet header to mbuf. */
    rte_memcpy((void *)mbuf->pkt.data, (const void *)m->pkt.data, ETH_HLEN);

    /* Setup vlan header. Bytes need to be re-ordered for network with htons() */
    vlan_hdr = (struct vlan_ethhdr *)mbuf->pkt.data;
    vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
    vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
    vlan_hdr->h_vlan_TCI = htons(vlan_tag);

    /* Copy the remaining packet contents to the mbuf. */
    rte_memcpy((void *)((uint8_t *)mbuf->pkt.data + VLAN_ETH_HLEN),
        (const void *)((uint8_t *)m->pkt.data + ETH_HLEN), (m->pkt.data_len - ETH_HLEN));
    tx_q->m_table[len] = mbuf;
    len++;
    if (enable_stats) {
        dev_statistics[dev->device_fh].tx_total++;
        dev_statistics[dev->device_fh].tx++;
    }

    if (unlikely(len == MAX_PKT_BURST)) {
        m_table = (struct rte_mbuf **)tx_q->m_table;
        ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t)len);
        /* Free any buffers not handled by TX and update the port stats. */
        if (unlikely(ret < len)) {
            do {
                rte_pktmbuf_free(m_table[ret]);
            } while (++ret < len);
        }

        len = 0;
    }

    tx_q->len = len;
    return;
}
static inline void __attribute__((always_inline))
virtio_dev_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
{
    struct rte_mbuf m;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    uint64_t buff_addr = 0;
    uint32_t head[MAX_PKT_BURST];
    uint32_t used_idx;
    uint32_t i;
    uint16_t free_entries, packet_success = 0;
    uint16_t avail_idx;

    vq = dev->virtqueue[VIRTIO_TXQ];
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);

    /* If there are no available buffers then return. */
    if (vq->last_used_idx == avail_idx)
        return;

    LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);

    /* Prefetch available ring to retrieve head indexes. */
    rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);

    /* get the number of free entries in the ring */
    free_entries = (avail_idx - vq->last_used_idx);

    /* Limit to MAX_PKT_BURST. */
    if (free_entries > MAX_PKT_BURST)
        free_entries = MAX_PKT_BURST;

    LOG_DEBUG(DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (i = 0; i < free_entries; i++)
        head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);
    rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

    while (packet_success < free_entries) {
        desc = &vq->desc[head[packet_success]];

        /* Discard first buffer as it is the virtio header */
        desc = &vq->desc[desc->next];

        /* Buffer address translation. */
        buff_addr = gpa_to_vva(dev, desc->addr);
        /* Prefetch buffer address. */
        rte_prefetch0((void *)(uintptr_t)buff_addr);

        used_idx = vq->last_used_idx & (vq->size - 1);

        if (packet_success < (free_entries - 1)) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success+1]]);
            rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
        }

        /* Update used index buffer information. */
        vq->used->ring[used_idx].id = head[packet_success];
        vq->used->ring[used_idx].len = 0;

        /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
        m.pkt.data_len = desc->len;
        m.pkt.data = (void *)(uintptr_t)buff_addr;

        PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

        /* If this is the first received packet we need to learn the MAC and setup VMDQ */
        if (dev->ready == DEVICE_MAC_LEARNING) {
            if (dev->remove || (link_vmdq(dev, &m) == -1)) {
                /* Discard frame if device is scheduled for removal or a duplicate MAC address is found. */
                packet_success += free_entries;
                vq->last_used_idx += packet_success;
                break;
            }
        }
        virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);

        vq->last_used_idx++;
        packet_success++;
    }

    rte_compiler_barrier();
    vq->used->idx += packet_success;
    /* Kick guest if required. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);
}
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
    struct rte_mempool *mbuf_pool = arg;
    struct virtio_net *dev = NULL;
    struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
    struct virtio_net_data_ll *dev_ll;
    struct mbuf_table *tx_q;
    volatile struct lcore_ll_info *lcore_ll;
    const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
    uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
    unsigned ret, i;
    const uint16_t lcore_id = rte_lcore_id();
    const uint16_t num_cores = (uint16_t)rte_lcore_count();
    uint16_t rx_count = 0;

    RTE_LOG(INFO, DATA, "Processing on Core %u started\n", lcore_id);
    lcore_ll = lcore_info[lcore_id].lcore_ll;
    prev_tsc = 0;

    tx_q = &lcore_tx_queue[lcore_id];
    for (i = 0; i < num_cores; i++) {
        if (lcore_ids[i] == lcore_id) {
            tx_q->txq_id = i;
            break;
        }
    }

    while (1) {
        cur_tsc = rte_rdtsc();
        /*
         * TX burst queue drain
         */
        diff_tsc = cur_tsc - prev_tsc;
        if (unlikely(diff_tsc > drain_tsc)) {

            if (tx_q->len) {
                LOG_DEBUG(DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

                /* Tx any packets in the queue */
                ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
                    (struct rte_mbuf **)tx_q->m_table,
                    (uint16_t)tx_q->len);
                if (unlikely(ret < tx_q->len)) {
                    do {
                        rte_pktmbuf_free(tx_q->m_table[ret]);
                    } while (++ret < tx_q->len);
                }

                tx_q->len = 0;
            }

            prev_tsc = cur_tsc;
        }

        rte_prefetch0(lcore_ll->ll_root_used);
        /*
         * Inform the configuration core that we have exited the linked list and that no devices are
         * in use if requested.
         */
        if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
            lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

        /*
         * Process devices
         */
        dev_ll = lcore_ll->ll_root_used;

        while (dev_ll != NULL) {
            /* get virtio device ID */
            dev = dev_ll->dev;

            if (dev->remove) {
                dev_ll = dev_ll->next;
                unlink_vmdq(dev);
                dev->ready = DEVICE_SAFE_REMOVE;
                continue;
            }
            if (likely(dev->ready == DEVICE_RX)) {
                /* Handle guest RX */
                rx_count = rte_eth_rx_burst(ports[0],
                    (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

                if (rx_count) {
                    ret_count = virtio_dev_rx(dev, pkts_burst, rx_count);
                    if (enable_stats) {
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
                            rx_count);
                        rte_atomic64_add(
                            &dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
                    }
                    while (likely(rx_count)) {
                        rx_count--;
                        rte_pktmbuf_free_seg(pkts_burst[rx_count]);
                    }
                }
            }

            if (!dev->remove)
                /* Handle guest TX */
                virtio_dev_tx(dev, mbuf_pool);

            /* move to the next device in the list */
            dev_ll = dev_ll->next;
        }
    }

    return 0;
}
/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;

    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
/*
 * This function gets an available ring index for zero copy rx;
 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
    uint16_t *res_base_idx, uint32_t count)
{
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
    uint16_t avail_idx;
    uint32_t retry = 0;
    uint16_t free_entries;

    *res_base_idx = vq->last_used_idx_res;
    avail_idx = *((volatile uint16_t *)&vq->avail->idx);
    free_entries = (avail_idx - *res_base_idx);

    LOG_DEBUG(DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
        "avail idx: %d, "
        "res base idx:%d, free entries:%d\n",
        dev->device_fh, avail_idx, *res_base_idx,
        free_entries);

    /*
     * If retry is enabled and the queue is full then we wait
     * and retry to avoid packet loss.
     */
    if (enable_retry && unlikely(count > free_entries)) {
        for (retry = 0; retry < burst_rx_retry_num; retry++) {
            rte_delay_us(burst_rx_delay_time);
            avail_idx = *((volatile uint16_t *)&vq->avail->idx);
            free_entries = (avail_idx - *res_base_idx);
            if (count <= free_entries)
                break;
        }
    }

    /* check that we have enough buffers */
    if (unlikely(count > free_entries))
        count = free_entries;

    if (unlikely(count == 0)) {
        LOG_DEBUG(DATA,
            "(%"PRIu64") Fail in get_available_ring_index_zcp: "
            "avail idx: %d, res base idx:%d, free entries:%d\n",
            dev->device_fh, avail_idx,
            *res_base_idx, free_entries);
        return 0;
    }

    vq->last_used_idx_res = *res_base_idx + count;

    return count;
}
/*
 * This function puts the descriptor back onto the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
    uint16_t res_cur_idx = vq->last_used_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
    vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
    rte_compiler_barrier();
    *(volatile uint16_t *)&vq->used->idx += 1;
    vq->last_used_idx += 1;

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);
}
/*
 * This function gets an available descriptor from the virtio vring and an
 * un-attached mbuf from vpool->ring, then attaches them together. It needs
 * to adjust the offset for buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
    uint16_t res_base_idx, desc_idx;
    uint64_t buff_addr, phys_addr;
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *mbuf = NULL;
    struct vpool *vpool;
    hpa_type addr_type;

    vpool = &vpool_array[dev->vmdq_rx_q];
    vq = dev->virtqueue[VIRTIO_RXQ];

    do {
        if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
            1) != 1))
            return;
        desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

        desc = &vq->desc[desc_idx];
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc = &vq->desc[desc->next];
            buff_addr = gpa_to_vva(dev, desc->addr);
            phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
                &addr_type);
        } else {
            buff_addr = gpa_to_vva(dev,
                desc->addr + vq->vhost_hlen);
            phys_addr = gpa_to_hpa(dev,
                desc->addr + vq->vhost_hlen,
                desc->len, &addr_type);
        }

        if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
            RTE_LOG(ERR, DATA, "(%"PRIu64") Invalid frame buffer"
                " address found when attaching RX frame buffer"
                " address!\n", dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }

        /*
         * Check if the frame buffer address from guest crosses
         * sub-region or not.
         */
        if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
            RTE_LOG(ERR, DATA,
                "(%"PRIu64") Frame buffer address crossing a "
                "sub-region found when attaching RX frame "
                "buffer address!\n",
                dev->device_fh);
            put_desc_to_used_list_zcp(vq, desc_idx);
            continue;
        }
    } while (unlikely(phys_addr == 0));
    rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
    if (unlikely(mbuf == NULL)) {
        LOG_DEBUG(DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: "
            "ring_sc_dequeue fail.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (unlikely(vpool->buf_size > desc->len)) {
        LOG_DEBUG(DATA,
            "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
            "length(%d) of descriptor idx: %d less than room "
            "size required: %d\n",
            dev->device_fh, desc->len, desc_idx, vpool->buf_size);
        put_desc_to_used_list_zcp(vq, desc_idx);
        rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
        return;
    }

    mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
    mbuf->pkt.data = (void *)(uintptr_t)(buff_addr);
    mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
    mbuf->pkt.data_len = desc->len;
    /* Stash the guest descriptor index in the mbuf headroom. */
    MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

    LOG_DEBUG(DATA,
        "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
        "descriptor idx:%d\n",
        dev->device_fh, res_base_idx, desc_idx);

    /*
     * Return the now guest-backed mbuf to its mempool so the PMD RX path
     * can allocate it and DMA directly into the guest buffer.
     */
    __rte_mbuf_raw_free(mbuf);

    return;
}
/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 * All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
    const struct rte_mempool *mp = m->pool;
    void *buf = RTE_MBUF_TO_BADDR(m);
    uint32_t buf_ofs;
    uint32_t buf_len = mp->elt_size - sizeof(*m);
    m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

    m->buf_addr = buf;
    m->buf_len = (uint16_t)buf_len;

    buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
        RTE_PKTMBUF_HEADROOM : m->buf_len;
    m->pkt.data = (char *)m->buf_addr + buf_ofs;

    m->pkt.data_len = 0;
}
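/*
 * Lifecycle note (explanatory, not from the original source): on the zero
 * copy path an mbuf alternates between "attached" (its buf_addr points into
 * guest memory, set up by attach_rxmbuf_zcp) and "detached" (buf_addr
 * restored to its own mempool element, as done here) before being recycled
 * through vpool->ring for reuse.
 */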
/*
 * This function is called after packets have been transmitted. It fetches
 * mbufs from vpool->pool, detaches them and puts them into vpool->ring. It
 * also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
    struct rte_mbuf *mbuf;
    struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
    uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
    uint32_t index = 0;
    uint32_t mbuf_count = rte_mempool_count(vpool->pool);

    LOG_DEBUG(DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
        "clean is: %d\n",
        dev->device_fh, mbuf_count);
    LOG_DEBUG(DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
        "clean is: %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));

    for (index = 0; index < mbuf_count; index++) {
        mbuf = __rte_mbuf_raw_alloc(vpool->pool);
        if (likely(RTE_MBUF_INDIRECT(mbuf)))
            pktmbuf_detach_zcp(mbuf);
        rte_ring_sp_enqueue(vpool->ring, mbuf);

        /* Update used index buffer information. */
        vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
        vq->used->ring[used_idx].len = 0;

        used_idx = (used_idx + 1) & (vq->size - 1);
    }

    LOG_DEBUG(DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
        "clean is: %d\n",
        dev->device_fh, rte_mempool_count(vpool->pool));
    LOG_DEBUG(DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
        "clean is: %d\n",
        dev->device_fh, rte_ring_count(vpool->ring));
    LOG_DEBUG(DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: before updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    vq->last_used_idx += mbuf_count;

    LOG_DEBUG(DATA,
        "(%"PRIu64") in txmbuf_clean_zcp: after updated "
        "vq->last_used_idx:%d\n",
        dev->device_fh, vq->last_used_idx);

    rte_compiler_barrier();

    *(volatile uint16_t *)&vq->used->idx += mbuf_count;

    /* Kick guest if required. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);

    return 0;
}
/*
 * This function is called when a virtio device is destroyed.
 * It fetches mbufs from vpool->pool, detaches them, and puts them into
 * vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
    struct rte_mbuf *mbuf = NULL;
    uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

    LOG_DEBUG(CONFIG,
        "in mbuf_destroy_zcp: mbuf count in mempool before "
        "mbuf_destroy_zcp is: %d\n",
        mbuf_count);
    LOG_DEBUG(CONFIG,
        "in mbuf_destroy_zcp: mbuf count in ring before "
        "mbuf_destroy_zcp is : %d\n",
        rte_ring_count(vpool->ring));

    for (index = 0; index < mbuf_count; index++) {
        mbuf = __rte_mbuf_raw_alloc(vpool->pool);
        if (likely(mbuf != NULL)) {
            if (likely(RTE_MBUF_INDIRECT(mbuf)))
                pktmbuf_detach_zcp(mbuf);
            rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
        }
    }

    LOG_DEBUG(CONFIG,
        "in mbuf_destroy_zcp: mbuf count in mempool after "
        "mbuf_destroy_zcp is: %d\n",
        rte_mempool_count(vpool->pool));
    LOG_DEBUG(CONFIG,
        "in mbuf_destroy_zcp: mbuf count in ring after "
        "mbuf_destroy_zcp is : %d\n",
        rte_ring_count(vpool->ring));
}
/*
 * This function adds buffers to the virtio device's RX virtqueue on the
 * zero copy path, updating the used ring and counters.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
    uint32_t count)
{
    struct vhost_virtqueue *vq;
    struct vring_desc *desc;
    struct rte_mbuf *buff;
    /* The virtio_hdr is initialised to 0. */
    struct virtio_net_hdr_mrg_rxbuf virtio_hdr
        = {{0, 0, 0, 0, 0, 0}, 0};
    uint64_t buff_hdr_addr = 0;
    uint32_t head[MAX_PKT_BURST], packet_len = 0;
    uint32_t head_idx, packet_success = 0;
    uint16_t res_cur_idx;

    LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_rx_zcp()\n", dev->device_fh);

    if (count == 0)
        return 0;

    vq = dev->virtqueue[VIRTIO_RXQ];
    count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

    res_cur_idx = vq->last_used_idx;
    LOG_DEBUG(DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
        dev->device_fh, res_cur_idx, res_cur_idx + count);

    /* Retrieve all of the head indexes first to avoid caching issues. */
    for (head_idx = 0; head_idx < count; head_idx++)
        head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

    /* Prefetch descriptor index. */
    rte_prefetch0(&vq->desc[head[packet_success]]);
    while (packet_success != count) {
        /* Get descriptor from available ring */
        desc = &vq->desc[head[packet_success]];

        buff = pkts[packet_success];
        LOG_DEBUG(DATA,
            "(%"PRIu64") in dev_rx_zcp: update the used idx for "
            "pkt[%d] descriptor idx: %d\n",
            dev->device_fh, packet_success,
            MBUF_HEADROOM_UINT32(buff));

        PRINT_PACKET(dev,
            (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
            + RTE_PKTMBUF_HEADROOM),
            rte_pktmbuf_data_len(buff), 0);

        /* Buffer address translation for virtio header. */
        buff_hdr_addr = gpa_to_vva(dev, desc->addr);
        packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

        /*
         * If the descriptors are chained the header and data are
         * placed in separate buffers.
         */
        if (desc->flags & VRING_DESC_F_NEXT) {
            desc->len = vq->vhost_hlen;
            desc = &vq->desc[desc->next];
            desc->len = rte_pktmbuf_data_len(buff);
        } else {
            desc->len = packet_len;
        }

        /* Update used ring with desc information */
        vq->used->ring[res_cur_idx & (vq->size - 1)].id
            = head[packet_success];
        vq->used->ring[res_cur_idx & (vq->size - 1)].len
            = packet_len;
        res_cur_idx++;
        packet_success++;

        /* A header is required per buffer. */
        rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
            (const void *)&virtio_hdr, vq->vhost_hlen);

        PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

        if (likely(packet_success < count)) {
            /* Prefetch descriptor index. */
            rte_prefetch0(&vq->desc[head[packet_success]]);
        }
    }

    rte_compiler_barrier();

    LOG_DEBUG(DATA,
        "(%"PRIu64") in dev_rx_zcp: before update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    *(volatile uint16_t *)&vq->used->idx += count;
    vq->last_used_idx += count;

    LOG_DEBUG(DATA,
        "(%"PRIu64") in dev_rx_zcp: after update used idx: "
        "vq.last_used_idx: %d, vq->used->idx: %d\n",
        dev->device_fh, vq->last_used_idx, vq->used->idx);

    /* Kick the guest if necessary. */
    if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
        eventfd_write((int)vq->kickfd, 1);

    return count;
}
/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
    uint32_t desc_idx, uint8_t need_copy)
{
    struct mbuf_table *tx_q;
    struct rte_mbuf **m_table;
    struct rte_mbuf *mbuf = NULL;
    unsigned len, ret, offset = 0;
    struct vpool *vpool;
    struct virtio_net_data_ll *dev_ll = ll_root_used;
    struct ether_hdr *pkt_hdr = (struct ether_hdr *)m->pkt.data;
    uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];

    /* Add packet to the port tx queue */
    tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
    len = tx_q->len;

    /* Allocate an mbuf and populate the structure. */
    vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
    rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
    if (unlikely(mbuf == NULL)) {
        struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
        RTE_LOG(ERR, DATA,
            "(%"PRIu64") Failed to allocate memory for mbuf.\n",
            dev->device_fh);
        put_desc_to_used_list_zcp(vq, desc_idx);
        return;
    }

    if (vm2vm_mode == VM2VM_HARDWARE) {
        /*
         * Avoid using a vlan tag from any vm for external packets,
         * such as vlan_tags[dev->device_fh]; otherwise pool selection
         * conflicts: the MAC address identifies it as an external
         * packet that should go out to the network, while the vlan tag
         * identifies it as a vm2vm packet that should be forwarded to
         * another vm. The hardware cannot resolve such an ambiguous
         * situation, so the packet would be lost.
         */
        vlan_tag = external_pkt_default_vlan_tag;
        while (dev_ll != NULL) {
            if (likely(dev_ll->dev->ready == DEVICE_RX) &&
                ether_addr_cmp(&(pkt_hdr->d_addr),
                &dev_ll->dev->mac_address)) {

                /*
                 * Drop the packet if the TX packet is destined
                 * for the TX device.
                 */
                if (unlikely(dev_ll->dev->device_fh
                    == dev->device_fh)) {
                    LOG_DEBUG(DATA,
                        "(%"PRIu64") TX: Source and destination "
                        "MAC addresses are the same. Dropping "
                        "packet.\n",
                        dev_ll->dev->device_fh);
                    MBUF_HEADROOM_UINT32(mbuf)
                        = (uint32_t)desc_idx;
                    __rte_mbuf_raw_free(mbuf);
                    return;
                }

                /*
                 * Packet length offset 4 bytes for HW vlan
                 * strip when L2 switch back.
                 */
                offset = 4;
                vlan_tag =
                    vlan_tags[(uint16_t)dev_ll->dev->device_fh];

                LOG_DEBUG(DATA,
                    "(%"PRIu64") TX: pkt to local VM device id:"
                    "(%"PRIu64") vlan tag: %d.\n",
                    dev->device_fh, dev_ll->dev->device_fh,
                    vlan_tag);

                break;
            }
            dev_ll = dev_ll->next;
        }
    }
2058 mbuf->pkt.nb_segs = m->pkt.nb_segs;
2059 mbuf->pkt.next = m->pkt.next;
2060 mbuf->pkt.data_len = m->pkt.data_len + offset;
2061 mbuf->pkt.pkt_len = mbuf->pkt.data_len;
2062 if (unlikely(need_copy)) {
2063 /* Copy the packet contents to the mbuf. */
2064 rte_memcpy((void *)((uint8_t *)mbuf->pkt.data),
2065 (const void *) ((uint8_t *)m->pkt.data),
2068 mbuf->pkt.data = m->pkt.data;
2069 mbuf->buf_physaddr = m->buf_physaddr;
2070 mbuf->buf_addr = m->buf_addr;
2072 mbuf->ol_flags = PKT_TX_VLAN_PKT;
2073 mbuf->pkt.vlan_macip.f.vlan_tci = vlan_tag;
2074 mbuf->pkt.vlan_macip.f.l2_len = sizeof(struct ether_hdr);
2075 mbuf->pkt.vlan_macip.f.l3_len = sizeof(struct ipv4_hdr);
2076 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2078 tx_q->m_table[len] = mbuf;
2082 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2085 (mbuf->pkt.next == NULL) ? "null" : "non-null");
2088 dev_statistics[dev->device_fh].tx_total++;
2089 dev_statistics[dev->device_fh].tx++;
2092 if (unlikely(len == MAX_PKT_BURST)) {
2093 m_table = (struct rte_mbuf **)tx_q->m_table;
2094 ret = rte_eth_tx_burst(ports[0],
2095 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2098 * Free any buffers not handled by TX and update
2101 if (unlikely(ret < len)) {
2103 rte_pktmbuf_free(m_table[ret]);
2104 } while (++ret < len);
2108 txmbuf_clean_zcp(dev, vpool);
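
/*
 * Illustrative sketch (hypothetical helper and parameter names, not part of
 * the original code): rte_eth_tx_burst() may accept fewer packets than it
 * was given, and ownership of the unsent mbufs stays with the caller, so
 * they must be freed as done above. The recovery pattern, condensed:
 */
static inline void
free_unsent_example(struct rte_mbuf **table, uint16_t nb_sent,
	uint16_t nb_queued)
{
	/* Free every mbuf the PMD did not take. */
	while (nb_sent < nb_queued)
		rte_pktmbuf_free(table[nb_sent++]);
}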
/*
 * This function transmits all available packets in the virtio TX queue of
 * one virtio-net device. If it is the first packet, it learns the MAC
 * address and sets up VMDQ.
 */
static inline void __attribute__((always_inline))
virtio_dev_tx_zcp(struct virtio_net *dev)
{
	struct rte_mbuf m;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t buff_addr = 0, phys_addr;
	uint32_t head[MAX_PKT_BURST];
	uint32_t i;
	uint16_t free_entries, packet_success = 0;
	uint16_t avail_idx;
	uint8_t need_copy = 0;
	hpa_type addr_type;

	vq = dev->virtqueue[VIRTIO_TXQ];
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx_res == avail_idx)
		return;

	LOG_DEBUG(DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);

	/* Get the number of free entries in the ring */
	free_entries = (avail_idx - vq->last_used_idx_res);

	/* Limit to MAX_PKT_BURST. */
	free_entries
		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;

	LOG_DEBUG(DATA, "(%"PRIu64") Buffers available %d\n",
		dev->device_fh, free_entries);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i]
			= vq->avail->ring[(vq->last_used_idx_res + i)
			& (vq->size - 1)];

	vq->last_used_idx_res += free_entries;

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

	while (packet_success < free_entries) {
		desc = &vq->desc[head[packet_success]];

		/* Discard first buffer as it is the virtio header */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		buff_addr = gpa_to_vva(dev, desc->addr);
		phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);

		if (likely(packet_success < (free_entries - 1)))
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success + 1]]);

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, DATA,
				"(%"PRIu64") Invalid frame buffer address found "
				"when TX packets!\n",
				dev->device_fh);
			packet_success++;
			continue;
		}

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/*
		 * Setup dummy mbuf. This is copied to a real mbuf if
		 * transmitted out the physical port.
		 */
		m.pkt.data_len = desc->len;
		m.pkt.nb_segs = 1;
		m.pkt.next = NULL;
		m.pkt.data = (void *)(uintptr_t)buff_addr;
		m.buf_addr = m.pkt.data;
		m.buf_physaddr = phys_addr;

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, DATA,
				"(%"PRIu64") Frame buffer address crosses a "
				"sub-region; copying the TX frame buffer "
				"instead of attaching it!\n",
				dev->device_fh);
			need_copy = 1;
		} else
			need_copy = 0;

		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

		/*
		 * If this is the first received packet we need to learn
		 * the MAC and setup VMDQ.
		 */
		if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
				/*
				 * Discard frame if device is scheduled for
				 * removal or a duplicate MAC address is found.
				 */
				packet_success += free_entries;
				vq->last_used_idx += packet_success;
				break;
			}
		}

		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
		packet_success++;
	}
}
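
/*
 * Illustrative sketch (not in the original source): avail_idx and
 * last_used_idx_res above are free-running 16-bit counters, so the plain
 * subtraction in virtio_dev_tx_zcp() yields the number of new entries even
 * after the counters wrap. A minimal model of that arithmetic:
 */
static inline uint16_t
avail_entries_example(uint16_t avail_idx, uint16_t last_res_idx)
{
	/* Unsigned subtraction is modulo 2^16, which matches the ring wrap. */
	return (uint16_t)(avail_idx - last_res_idx);
}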
/*
 * This function is called by each data core. It handles all RX/TX registered
 * with the core. For TX the specific lcore linked list is used. For RX, MAC
 * addresses are compared with all devices in the main linked list.
 */
static int
switch_worker_zcp(__attribute__((unused)) void *arg)
{
	struct virtio_net *dev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc
		= (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
		* BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret;
	const uint16_t lcore_id = rte_lcore_id();
	uint16_t count_in_ring, rx_count = 0;

	RTE_LOG(INFO, DATA, "Processing on Core %u started\n", lcore_id);

	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	while (1) {
		cur_tsc = rte_rdtsc();

		/* TX burst queue drain */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {
			/*
			 * Get mbuf from vpool.pool and detach mbuf and
			 * put back into vpool.ring.
			 */
			dev_ll = lcore_ll->ll_root_used;
			while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
				/* Get virtio device ID */
				dev = dev_ll->dev;

				if (likely(!dev->remove)) {
					tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
					if (tx_q->len) {
						LOG_DEBUG(DATA,
						"TX queue drained after timeout"
						" with burst size %u\n",
						tx_q->len);

						/*
						 * Tx any packets in the queue
						 */
						ret = rte_eth_tx_burst(
							ports[0],
							(uint16_t)tx_q->txq_id,
							(struct rte_mbuf **)
							tx_q->m_table,
							(uint16_t)tx_q->len);
						if (unlikely(ret < tx_q->len)) {
							do {
								rte_pktmbuf_free(
									tx_q->m_table[ret]);
							} while (++ret < tx_q->len);
						}
						tx_q->len = 0;

						txmbuf_clean_zcp(dev,
							&vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
					}
				}
				dev_ll = dev_ll->next;
			}
			prev_tsc = cur_tsc;
		}

		rte_prefetch0(lcore_ll->ll_root_used);

		/*
		 * Inform the configuration core that we have exited the linked
		 * list and that no devices are in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/* Process devices */
		dev_ll = lcore_ll->ll_root_used;

		while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
			dev = dev_ll->dev;
			if (unlikely(dev->remove)) {
				dev_ll = dev_ll->next;
				unlink_vmdq(dev);
				dev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}

			if (likely(dev->ready == DEVICE_RX)) {
				uint32_t index = dev->vmdq_rx_q;
				uint16_t i;
				count_in_ring
					= rte_ring_count(vpool_array[index].ring);
				uint16_t free_entries
					= (uint16_t)get_available_ring_num_zcp(dev);

				/*
				 * Attach all mbufs in vpool.ring and put back
				 * into vpool.pool.
				 */
				for (i = 0;
					i < RTE_MIN(free_entries,
					RTE_MIN(count_in_ring, MAX_PKT_BURST));
					i++)
					attach_rxmbuf_zcp(dev);

				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)dev->vmdq_rx_q, pkts_burst,
					MAX_PKT_BURST);

				if (rx_count) {
					ret_count = virtio_dev_rx_zcp(dev,
							pkts_burst, rx_count);
					if (enable_stats) {
						dev_statistics[dev->device_fh].rx_total
							+= rx_count;
						dev_statistics[dev->device_fh].rx
							+= ret_count;
					}
					while (likely(rx_count)) {
						rx_count--;
						pktmbuf_detach_zcp(
							pkts_burst[rx_count]);
						rte_ring_sp_enqueue(
							vpool_array[index].ring,
							(void *)pkts_burst[rx_count]);
					}
				}
			}

			if (likely(!dev->remove))
				/* Handle guest TX */
				virtio_dev_tx_zcp(dev);

			/* Move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}
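
/*
 * Illustrative sketch (not part of the original application): the drain_tsc
 * constant above converts the BURST_TX_DRAIN_US microsecond budget into TSC
 * cycles, rounding the cycles-per-microsecond term up. The same computation
 * as a standalone helper:
 */
static inline uint64_t
us_to_tsc_example(uint64_t us)
{
	/* Round rte_get_tsc_hz()/US_PER_S up before scaling by us. */
	return (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * us;
}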
/*
 * Add an entry to a used linked list. A free entry must first be found
 * in the free linked list using get_data_ll_free_entry().
 */
static void
add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	/* Set next as NULL and use a compiler barrier to avoid reordering. */
	ll_dev->next = NULL;
	rte_compiler_barrier();

	/* If ll == NULL then this is the first device. */
	if (ll) {
		/* Increment to the tail of the linked list. */
		while (ll->next != NULL)
			ll = ll->next;

		ll->next = ll_dev;
	} else
		*ll_root_addr = ll_dev;
}
/*
 * Remove an entry from a used linked list. The entry must then be added to
 * the free linked list using put_data_ll_free_entry().
 */
static void
rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev,
	struct virtio_net_data_ll *ll_dev_last)
{
	struct virtio_net_data_ll *ll = *ll_root_addr;

	if (unlikely((ll == NULL) || (ll_dev == NULL)))
		return;

	if (ll_dev == ll)
		*ll_root_addr = ll_dev->next;
	else if (likely(ll_dev_last != NULL))
		ll_dev_last->next = ll_dev->next;
	else
		RTE_LOG(ERR, CONFIG, "Remove entry from ll failed.\n");
}
/*
 * Find and return an entry from the free linked list.
 */
static struct virtio_net_data_ll *
get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;
	struct virtio_net_data_ll *ll_dev;

	if (ll_free == NULL)
		return NULL;

	ll_dev = ll_free;
	*ll_root_addr = ll_free->next;

	return ll_dev;
}

/*
 * Place an entry back on to the free linked list.
 */
static void
put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
	struct virtio_net_data_ll *ll_dev)
{
	struct virtio_net_data_ll *ll_free = *ll_root_addr;

	if (ll_dev == NULL)
		return;

	ll_dev->next = ll_free;
	*ll_root_addr = ll_dev;
}
/*
 * Creates a linked list of a given size.
 */
static struct virtio_net_data_ll *
alloc_data_ll(uint32_t size)
{
	struct virtio_net_data_ll *ll_new;
	uint32_t i;

	/* Malloc and then chain the linked list. */
	ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
	if (ll_new == NULL) {
		RTE_LOG(ERR, CONFIG, "Failed to allocate memory for ll_new.\n");
		return NULL;
	}

	for (i = 0; i < size - 1; i++) {
		ll_new[i].dev = NULL;
		ll_new[i].next = &ll_new[i+1];
	}
	ll_new[i].next = NULL;

	return ll_new;
}
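
/*
 * Illustrative usage sketch for the list helpers above (hypothetical
 * snippet, not part of the original application). An entry always moves
 * from a free list to a used list and back, never being allocated or
 * freed individually:
 */
static inline void
ll_usage_example(struct virtio_net *dev)
{
	struct virtio_net_data_ll *entry = get_data_ll_free_entry(&ll_root_free);

	if (entry == NULL)
		return;

	entry->dev = dev;
	/* Publish the entry; data cores may now traverse to it. */
	add_data_ll_entry(&ll_root_used, entry);
}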
/*
 * Create the main linked list along with each individual core's linked list.
 * A used list and a free list are created to manage entries.
 */
static int
init_data_ll(void)
{
	int lcore;

	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
		if (lcore_info[lcore].lcore_ll == NULL) {
			RTE_LOG(ERR, CONFIG, "Failed to allocate memory for lcore_ll.\n");
			return -1;
		}

		lcore_info[lcore].lcore_ll->device_num = 0;
		lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
		lcore_info[lcore].lcore_ll->ll_root_used = NULL;
		if (num_devices % num_switching_cores)
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
		else
			lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
	}

	/* Allocate devices up to a maximum of MAX_DEVICES. */
	ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));

	return 0;
}
/*
 * Set virtqueue flags so that we do not receive interrupts.
 */
static void
set_irq_status(struct virtio_net *dev)
{
	dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
	dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
}
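
/*
 * Illustrative sketch (not in the original source): set_irq_status()
 * suppresses guest-to-host kicks; the complementary host-to-guest direction
 * is checked in the RX/TX paths above via VRING_AVAIL_F_NO_INTERRUPT. A
 * minimal model of that check:
 */
static inline int
guest_wants_kick_example(struct vhost_virtqueue *vq)
{
	/* The guest clears this flag when it wants an interrupt. */
	return !(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT);
}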
/*
 * Remove a device from the specific data core linked list and from the main
 * linked list. Synchronization occurs through the use of the lcore
 * dev_removal_flag. The device is made volatile here to avoid reordering of
 * dev->remove=1, which can cause an infinite loop in the rte_pause loop.
 */
static void
destroy_device(volatile struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_lcore_dev_cur;
	struct virtio_net_data_ll *ll_main_dev_cur;
	struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
	struct virtio_net_data_ll *ll_main_dev_last = NULL;
	int lcore;

	dev->flags &= ~VIRTIO_DEV_RUNNING;

	/* Set the remove flag. */
	dev->remove = 1;
	while (dev->ready != DEVICE_SAFE_REMOVE)
		rte_pause();

	/* Search for entry to be removed from lcore ll */
	ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
	while (ll_lcore_dev_cur != NULL) {
		if (ll_lcore_dev_cur->dev == dev) {
			break;
		} else {
			ll_lcore_dev_last = ll_lcore_dev_cur;
			ll_lcore_dev_cur = ll_lcore_dev_cur->next;
		}
	}

	if (ll_lcore_dev_cur == NULL) {
		RTE_LOG(ERR, CONFIG,
			"(%"PRIu64") Failed to find the dev to be destroyed.\n",
			dev->device_fh);
		return;
	}

	/* Search for entry to be removed from main ll */
	ll_main_dev_cur = ll_root_used;
	ll_main_dev_last = NULL;
	while (ll_main_dev_cur != NULL) {
		if (ll_main_dev_cur->dev == dev) {
			break;
		} else {
			ll_main_dev_last = ll_main_dev_cur;
			ll_main_dev_cur = ll_main_dev_cur->next;
		}
	}

	/* Remove entries from the lcore and main ll. */
	rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
	rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);

	/* Set the dev_removal_flag on each lcore. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
	}

	/*
	 * Once each core has set the dev_removal_flag to ACK_DEV_REMOVAL we
	 * can be sure that they can no longer access the device removed from
	 * the linked lists and that the devices are no longer in use.
	 */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL)
			rte_pause();
	}

	/* Add the entries back to the lcore and main free ll. */
	put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
	put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);

	/* Decrement the number of devices on the lcore. */
	lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;

	RTE_LOG(INFO, DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);

	if (zero_copy) {
		struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];

		/* Stop the RX queue. */
		if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
			LOG_DEBUG(CONFIG,
				"(%"PRIu64") In destroy_device: Failed to stop "
				"rx queue:%d\n",
				dev->device_fh, dev->vmdq_rx_q);
		}

		LOG_DEBUG(CONFIG,
			"(%"PRIu64") in destroy_device: Start put mbuf in "
			"mempool back to ring for RX queue: %d\n",
			dev->device_fh, dev->vmdq_rx_q);

		mbuf_destroy_zcp(vpool);

		/* Stop the TX queue. */
		if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
			LOG_DEBUG(CONFIG,
				"(%"PRIu64") In destroy_device: Failed to "
				"stop tx queue:%d\n",
				dev->device_fh, dev->vmdq_rx_q);
		}

		vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];

		LOG_DEBUG(CONFIG,
			"(%"PRIu64") destroy_device: Start put mbuf in mempool "
			"back to ring for TX queue: %d, dev:(%"PRIu64")\n",
			dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
			dev->device_fh);

		mbuf_destroy_zcp(vpool);
	}

	return;
}
/*
 * A new device is added to a data core. First the device is added to the
 * main linked list and then allocated to a specific data core.
 */
static int
new_device(struct virtio_net *dev)
{
	struct virtio_net_data_ll *ll_dev;
	int lcore, core_add = 0;
	uint32_t device_num_min = num_devices;

	/* Add device to main ll */
	ll_dev = get_data_ll_free_entry(&ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
			"of %d devices per core has been reached\n",
			dev->device_fh, num_devices);
		return -1;
	}
	ll_dev->dev = dev;
	add_data_ll_entry(&ll_root_used, ll_dev);
	ll_dev->dev->vmdq_rx_q
		= ll_dev->dev->device_fh * (num_queues / num_devices);

	if (zero_copy) {
		uint32_t index = ll_dev->dev->vmdq_rx_q;
		uint32_t count_in_ring, i;
		struct mbuf_table *tx_q;

		count_in_ring = rte_ring_count(vpool_array[index].ring);

		LOG_DEBUG(CONFIG,
			"(%"PRIu64") in new_device: mbuf count in mempool "
			"before attach is: %d\n",
			dev->device_fh,
			rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(CONFIG,
			"(%"PRIu64") in new_device: mbuf count in ring "
			"before attach is : %d\n",
			dev->device_fh, count_in_ring);

		/*
		 * Attach all mbufs in vpool.ring and put back into vpool.pool.
		 */
		for (i = 0; i < count_in_ring; i++)
			attach_rxmbuf_zcp(dev);

		LOG_DEBUG(CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"mempool after attach is: %d\n",
			dev->device_fh,
			rte_mempool_count(vpool_array[index].pool));
		LOG_DEBUG(CONFIG, "(%"PRIu64") in new_device: mbuf count in "
			"ring after attach is : %d\n",
			dev->device_fh,
			rte_ring_count(vpool_array[index].ring));

		tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
		tx_q->txq_id = dev->vmdq_rx_q;

		if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];

			LOG_DEBUG(CONFIG,
				"(%"PRIu64") In new_device: Failed to start "
				"tx queue:%d\n",
				dev->device_fh, dev->vmdq_rx_q);

			mbuf_destroy_zcp(vpool);
			return -1;
		}

		if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
			struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];

			LOG_DEBUG(CONFIG,
				"(%"PRIu64") In new_device: Failed to start "
				"rx queue:%d\n",
				dev->device_fh, dev->vmdq_rx_q);

			/* Stop the TX queue. */
			if (rte_eth_dev_tx_queue_stop(ports[0],
				dev->vmdq_rx_q) != 0) {
				LOG_DEBUG(CONFIG,
					"(%"PRIu64") In new_device: Failed to "
					"stop tx queue:%d\n",
					dev->device_fh, dev->vmdq_rx_q);
			}

			mbuf_destroy_zcp(vpool);
			return -1;
		}
	}

	/* Reset the ready flag. */
	dev->ready = DEVICE_MAC_LEARNING;
	dev->remove = 0;

	/* Find a suitable lcore to add the device. */
	RTE_LCORE_FOREACH_SLAVE(lcore) {
		if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
			device_num_min = lcore_info[lcore].lcore_ll->device_num;
			core_add = lcore;
		}
	}
	/* Add device to lcore ll */
	ll_dev->dev->coreid = core_add;
	ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
	if (ll_dev == NULL) {
		RTE_LOG(INFO, DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
		dev->ready = DEVICE_SAFE_REMOVE;
		destroy_device(dev);
		return -1;
	}
	ll_dev->dev = dev;
	add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);

	/* Initialize device stats */
	memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));

	/* Disable notifications. */
	set_irq_status(dev);
	lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
	dev->flags |= VIRTIO_DEV_RUNNING;

	RTE_LOG(INFO, DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);

	return 0;
}
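
/*
 * Illustrative sketch (hypothetical helper, not in the original source):
 * the VMDQ queue assignment in new_device() spreads devices evenly over the
 * pool queues, giving device N the queue N * (num_queues / num_devices).
 */
static inline uint32_t
vmdq_queue_for_device_example(uint64_t device_fh)
{
	/* E.g. 4 devices over 8 queues map to queues 0, 2, 4 and 6. */
	return (uint32_t)(device_fh * (num_queues / num_devices));
}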
/*
 * These callbacks allow devices to be added to the data core when
 * configuration has been fully completed.
 */
static const struct virtio_net_device_ops virtio_net_device_ops = {
	.new_device = new_device,
	.destroy_device = destroy_device,
};
/*
 * This is a thread that wakes up periodically to print statistics if the
 * user has enabled them.
 */
static void
print_stats(void)
{
	struct virtio_net_data_ll *dev_ll;
	uint64_t tx_dropped, rx_dropped;
	uint64_t tx, tx_total, rx, rx_total;
	uint32_t device_fh;
	const char clr[] = { 27, '[', '2', 'J', '\0' };
	const char top_left[] = { 27, '[', '1', ';', '1', 'H', '\0' };

	while (1) {
		sleep(enable_stats);

		/* Clear screen and move to top left */
		printf("%s%s", clr, top_left);

		printf("\nDevice statistics ====================================");

		dev_ll = ll_root_used;
		while (dev_ll != NULL) {
			device_fh = (uint32_t)dev_ll->dev->device_fh;
			tx_total = dev_statistics[device_fh].tx_total;
			tx = dev_statistics[device_fh].tx;
			tx_dropped = tx_total - tx;
			if (zero_copy == 0) {
				rx_total = rte_atomic64_read(
					&dev_statistics[device_fh].rx_total_atomic);
				rx = rte_atomic64_read(
					&dev_statistics[device_fh].rx_atomic);
			} else {
				rx_total = dev_statistics[device_fh].rx_total;
				rx = dev_statistics[device_fh].rx;
			}
			rx_dropped = rx_total - rx;

			printf("\nStatistics for device %"PRIu32" ------------------------------"
					"\nTX total: %"PRIu64""
					"\nTX dropped: %"PRIu64""
					"\nTX successful: %"PRIu64""
					"\nRX total: %"PRIu64""
					"\nRX dropped: %"PRIu64""
					"\nRX successful: %"PRIu64"",
					device_fh,
					tx_total,
					tx_dropped,
					tx,
					rx_total,
					rx_dropped,
					rx);

			dev_ll = dev_ll->next;
		}
		printf("\n======================================================\n");
	}
}
static void
setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
	char *ring_name, uint32_t nb_mbuf)
{
	uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
	vpool_array[index].pool
		= rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
		MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
		rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
		rte_pktmbuf_init, NULL, socket, 0);
	if (vpool_array[index].pool != NULL) {
		vpool_array[index].ring
			= rte_ring_create(ring_name,
				rte_align32pow2(nb_mbuf + 1),
				socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
		if (likely(vpool_array[index].ring != NULL)) {
			LOG_DEBUG(CONFIG,
				"in setup_mempool_tbl: mbuf count in "
				"mempool is: %d\n",
				rte_mempool_count(vpool_array[index].pool));
			LOG_DEBUG(CONFIG,
				"in setup_mempool_tbl: mbuf count in "
				"ring is: %d\n",
				rte_ring_count(vpool_array[index].ring));
		} else {
			rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
				ring_name);
		}

		/* The head room must be taken into account. */
		vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
	} else {
		rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
	}
}
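
/*
 * Illustrative sketch (not part of the original code): rte_ring_create()
 * requires a power-of-two size and can hold at most size - 1 objects, which
 * is why setup_mempool_tbl() sizes the ring as rte_align32pow2(nb_mbuf + 1).
 * The same computation as a standalone helper:
 */
static inline uint32_t
ring_size_for_mbufs_example(uint32_t nb_mbuf)
{
	/* The +1 compensates for the one slot a rte_ring keeps empty. */
	return rte_align32pow2(nb_mbuf + 1);
}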
/*
 * Main function, does initialisation and calls the per-lcore functions. The
 * CUSE device is also registered here to handle the IOCTLs.
 */
int
MAIN(int argc, char *argv[])
{
	struct rte_mempool *mbuf_pool = NULL;
	unsigned lcore_id, core_id = 0;
	unsigned nb_ports, valid_num_ports;
	int ret;
	uint8_t portid, queue_id = 0;
	static pthread_t tid;

	/* init EAL */
	ret = rte_eal_init(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
	argc -= ret;
	argv += ret;

	/* parse app arguments */
	ret = us_vhost_parse_args(argc, argv);
	if (ret < 0)
		rte_exit(EXIT_FAILURE, "Invalid argument\n");

	if (rte_eal_pci_probe() != 0)
		rte_exit(EXIT_FAILURE, "Error with NIC driver initialization\n");

	for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
		if (rte_lcore_is_enabled(lcore_id))
			lcore_ids[core_id++] = lcore_id;

	if (rte_lcore_count() > RTE_MAX_LCORE)
		rte_exit(EXIT_FAILURE, "Not enough cores\n");

	/* Set the number of switching cores available. */
	num_switching_cores = rte_lcore_count() - 1;

	/* Get the number of physical ports. */
	nb_ports = rte_eth_dev_count();
	if (nb_ports > RTE_MAX_ETHPORTS)
		nb_ports = RTE_MAX_ETHPORTS;

	/*
	 * Update the global var NUM_PORTS and global array PORTS
	 * and get the value of var VALID_NUM_PORTS according to the number of
	 * system ports.
	 */
	valid_num_ports = check_ports_num(nb_ports);

	if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, PORT, "Current enabled port number is %u, "
			"but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if (zero_copy == 0) {
		/* Create the mbuf pool. */
		mbuf_pool = rte_mempool_create(
				"MBUF_POOL",
				NUM_MBUFS_PER_PORT * valid_num_ports,
				MBUF_SIZE, MBUF_CACHE_SIZE,
				sizeof(struct rte_pktmbuf_pool_private),
				rte_pktmbuf_pool_init, NULL,
				rte_pktmbuf_init, NULL,
				rte_socket_id(), 0);
		if (mbuf_pool == NULL)
			rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");

		for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
			vpool_array[queue_id].pool = mbuf_pool;

		if (vm2vm_mode == VM2VM_HARDWARE) {
			/* Enable VT loop back to let L2 switch to do it. */
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
			LOG_DEBUG(CONFIG,
				"Enable loop back for L2 switch in vmdq.\n");
		}
	} else {
		uint32_t nb_mbuf;
		char pool_name[RTE_MEMPOOL_NAMESIZE];
		char ring_name[RTE_MEMPOOL_NAMESIZE];

		rx_conf_default.start_rx_per_q = (uint8_t)zero_copy;
		rx_conf_default.rx_drop_en = 0;
		tx_conf_default.start_tx_per_q = (uint8_t)zero_copy;
		nb_mbuf = num_rx_descriptor
			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
			+ num_switching_cores * MAX_PKT_BURST;

		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
			rte_snprintf(pool_name, sizeof(pool_name),
				"rxmbuf_pool_%u", queue_id);
			rte_snprintf(ring_name, sizeof(ring_name),
				"rxmbuf_ring_%u", queue_id);
			setup_mempool_tbl(rte_socket_id(), queue_id,
				pool_name, ring_name, nb_mbuf);
		}

		nb_mbuf = num_tx_descriptor
			+ num_switching_cores * MBUF_CACHE_SIZE_ZCP
			+ num_switching_cores * MAX_PKT_BURST;

		for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
			rte_snprintf(pool_name, sizeof(pool_name),
				"txmbuf_pool_%u", queue_id);
			rte_snprintf(ring_name, sizeof(ring_name),
				"txmbuf_ring_%u", queue_id);
			setup_mempool_tbl(rte_socket_id(),
				(queue_id + MAX_QUEUES),
				pool_name, ring_name, nb_mbuf);
		}

		if (vm2vm_mode == VM2VM_HARDWARE) {
			/* Enable VT loop back to let L2 switch to do it. */
			vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
			LOG_DEBUG(CONFIG,
				"Enable loop back for L2 switch in vmdq.\n");
		}
	}
	/* Set log level. */
	rte_set_log_level(LOG_LEVEL);

	/* initialize all ports */
	for (portid = 0; portid < nb_ports; portid++) {
		/* skip ports that are not enabled */
		if ((enabled_port_mask & (1 << portid)) == 0) {
			RTE_LOG(INFO, PORT,
				"Skipping disabled port %d\n", portid);
			continue;
		}
		if (port_init(portid) != 0)
			rte_exit(EXIT_FAILURE,
				"Cannot initialize network ports\n");
	}

	/* Initialise all linked lists. */
	if (init_data_ll() == -1)
		rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");

	/* Initialize device stats */
	memset(&dev_statistics, 0, sizeof(dev_statistics));

	/* Enable stats if the user option is set. */
	if (enable_stats)
		pthread_create(&tid, NULL, (void *)print_stats, NULL);

	/* Launch all data cores. */
	if (zero_copy == 0) {
		RTE_LCORE_FOREACH_SLAVE(lcore_id) {
			rte_eal_remote_launch(switch_worker,
				mbuf_pool, lcore_id);
		}
	} else {
		uint32_t count_in_mempool, index, i;
		for (index = 0; index < 2 * MAX_QUEUES; index++) {
			/* For all RX and TX queues. */
			count_in_mempool
				= rte_mempool_count(vpool_array[index].pool);

			/*
			 * Transfer all un-attached mbufs from vpool.pool
			 * to vpool.ring.
			 */
			for (i = 0; i < count_in_mempool; i++) {
				struct rte_mbuf *mbuf
					= __rte_mbuf_raw_alloc(
						vpool_array[index].pool);
				rte_ring_sp_enqueue(vpool_array[index].ring,
					(void *)mbuf);
			}

			LOG_DEBUG(CONFIG,
				"in MAIN: mbuf count in mempool at initial "
				"is: %d\n", count_in_mempool);
			LOG_DEBUG(CONFIG,
				"in MAIN: mbuf count in ring at initial is :"
				" %d\n",
				rte_ring_count(vpool_array[index].ring));
		}

		RTE_LCORE_FOREACH_SLAVE(lcore_id)
			rte_eal_remote_launch(switch_worker_zcp, NULL,
				lcore_id);
	}

	/* Register CUSE device to handle IOCTLs. */
	ret = register_cuse_device((char *)&dev_basename, dev_index,
		get_virtio_net_callbacks());
	if (ret != 0)
		rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");

	init_virtio_net(&virtio_net_device_ops);

	/* Start CUSE session. */
	start_cuse_session_loop();

	return 0;
}