/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
#include <arpa/inet.h>
#include <errno.h>
#include <getopt.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <linux/virtio_net.h>
#include <linux/virtio_ring.h>
#include <sys/eventfd.h>
#include <sys/param.h>

#include <rte_atomic.h>
#include <rte_cycles.h>
#include <rte_ethdev.h>
#include <rte_log.h>
#include <rte_string_fns.h>
#include <rte_malloc.h>

#include "main.h"
#include "virtio-net.h"
#include "vhost-net-cdev.h"
#define MAX_QUEUES 128

/* the maximum number of external ports supported */
#define MAX_SUP_PORTS 1

/*
 * Calculate the number of buffers needed per port
 */
#define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) +	\
				(num_switching_cores*MAX_PKT_BURST) +	\
				(num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
				(num_switching_cores*MBUF_CACHE_SIZE))
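
/*
 * Illustrative arithmetic (editor's sketch, not used by the code): with the
 * defaults defined below (RTE_TEST_RX_DESC_DEFAULT = 1024, MAX_PKT_BURST = 32,
 * RTE_TEST_TX_DESC_DEFAULT = 512, MBUF_CACHE_SIZE = 128) and, say,
 * num_switching_cores = 2, the budget works out to
 *
 *	128*1024 + 2*32 + 2*512 + 2*128 = 131072 + 64 + 1024 + 256 = 132416
 *
 * mbufs per port: enough to fill every RX ring plus each switching core's
 * in-flight burst, TX ring and per-core mempool cache without starving
 * the pool.
 */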
#define MBUF_CACHE_SIZE 128
#define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)

/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation: the guest allocates the frame data buffers, and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
#define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
	+ RTE_PKTMBUF_HEADROOM)
#define MBUF_CACHE_SIZE_ZCP 0
/*
 * RX and TX Prefetch, Host, and Write-back threshold values should be
 * carefully set for optimal performance. Consult the network
 * controller's datasheet and supporting DPDK documentation for guidance
 * on how these parameters should be set.
 */
#define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
#define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
#define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
#define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
#define TX_HTHRESH 0 /* Default values of TX host threshold reg. */
#define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */

#define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
#define MAX_MRG_PKT_BURST 16 /* Max burst for merge buffers. Set to 1 due to performance issue. */
#define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */

#define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
#define BURST_RX_RETRIES 4 /* Number of retries on RX. */
#define JUMBO_FRAME_MAX_SIZE 0x2600

/* State of virtio device. */
#define DEVICE_MAC_LEARNING 0
#define DEVICE_RX 1
#define DEVICE_SAFE_REMOVE 2

/* Config_core_flag status definitions. */
#define REQUEST_DEV_REMOVAL 1
#define ACK_DEV_REMOVAL 0

/* Configurable number of RX/TX ring descriptors */
#define RTE_TEST_RX_DESC_DEFAULT 1024
#define RTE_TEST_TX_DESC_DEFAULT 512
/*
 * These two macros need refining for the legacy and DPDK-based front ends:
 * take the max vring avail descriptors/entries from the guest, subtract
 * MAX_PKT_BURST, then adjust to a power of 2.
 *
 * For the legacy front end, 128 descriptors:
 * half for virtio headers, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32 /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64 /* legacy: 64, DPDK virt FE: 64. */

/* Get first 4 bytes in mbuf headroom. */
#define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
		+ sizeof(struct rte_mbuf)))

/* true if x is a power of 2 */
#define POWEROF2(x) ((((x)-1) & (x)) == 0)
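
/*
 * Worked example (editor's note): POWEROF2 relies on a power of two having
 * exactly one bit set, so x & (x-1) clears the lowest set bit and leaves 0.
 * E.g. POWEROF2(64): 64 & 63 == 0b1000000 & 0b0111111 == 0 -> true;
 * POWEROF2(48): 48 & 47 == 32 != 0 -> false. Note that x == 0 also passes
 * this test, so callers validating descriptor counts get 0 through as well.
 */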
#define INVALID_PORT_ID 0xFF

/* Max number of devices. Limited by vmdq. */
#define MAX_DEVICES 64

/* Size of buffers used for snprintfs. */
#define MAX_PRINT_BUFF 6072

/* Maximum character device basename size. */
#define MAX_BASENAME_SZ 10

/* Maximum long option length for option parsing. */
#define MAX_LONG_OPT_SZ 64

/* Used to compare MAC addresses. */
#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL

/* Number of descriptors per cacheline. */
#define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
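
/*
 * Illustrative arithmetic (editor's note): a vring descriptor is 16 bytes
 * (64-bit addr, 32-bit len, two 16-bit fields), so with a 64-byte cacheline
 * DESC_PER_CACHELINE evaluates to 64 / 16 = 4 descriptors per cacheline.
 */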
/* mask of enabled ports */
static uint32_t enabled_port_mask = 0;

/* Number of switching cores enabled. */
static uint32_t num_switching_cores = 0;

/* number of devices/queues to support */
static uint32_t num_queues = 0;
uint32_t num_devices = 0;

/*
 * Enable zero copy: packet buffers are DMA'd directly to/from the
 * guest-allocated frame buffers; disabled by default.
 */
static uint32_t zero_copy;

/* number of descriptors to apply */
static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;

/* Max ring descriptor count: ixgbe, i40e and e1000 all support 4096. */
#define MAX_RING_DESC 4096

struct vpool {
	struct rte_mempool *pool;
	struct rte_ring *ring;
	uint32_t buf_size;
} vpool_array[MAX_QUEUES+MAX_QUEUES];
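
/*
 * Layout note with a small example (editor's sketch): vpool_array holds two
 * banks of MAX_QUEUES entries. Index q (0..MAX_QUEUES-1) is the RX pool for
 * VMDQ queue q, while index MAX_QUEUES + q is the TX pool for the same
 * queue. E.g. a device with vmdq_rx_q == 3 attaches RX mbufs from
 * vpool_array[3] in attach_rxmbuf_zcp() and cleans TX mbufs via
 * vpool_array[MAX_QUEUES + 3], as virtio_tx_route_zcp() and
 * switch_worker_zcp() below assume.
 */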
/* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
typedef enum {
	VM2VM_DISABLED = 0,
	VM2VM_SOFTWARE = 1,
	VM2VM_HARDWARE = 2,
	VM2VM_LAST
} vm2vm_type;
static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;

/* The type of host physical address translated from guest physical address. */
typedef enum {
	PHYS_ADDR_CONTINUOUS = 0,
	PHYS_ADDR_CROSS_SUBREG = 1,
	PHYS_ADDR_INVALID = 2,
	PHYS_ADDR_LAST
} hpa_type;
/* Enable stats. */
static uint32_t enable_stats = 0;
/* Enable retries on RX. */
static uint32_t enable_retry = 1;
/* Specify timeout (in microseconds) between retries on RX. */
static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
/* Specify the number of retries on RX. */
static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;

/* Character device basename. Can be set by user. */
static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";

/* Character device index. Can be set by user. */
static uint32_t dev_index = 0;

/* This can be set by the user so it is made available here. */
extern uint64_t VHOST_FEATURES;
/* Default configuration for rx and tx thresholds etc. */
static struct rte_eth_rxconf rx_conf_default = {
	.rx_thresh = {
		.pthresh = RX_PTHRESH,
		.hthresh = RX_HTHRESH,
		.wthresh = RX_WTHRESH,
	},
};

/*
 * These default values are optimized for use with the Intel(R) 82599 10 GbE
 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
 * network controllers and/or network drivers.
 */
static struct rte_eth_txconf tx_conf_default = {
	.tx_thresh = {
		.pthresh = TX_PTHRESH,
		.hthresh = TX_HTHRESH,
		.wthresh = TX_WTHRESH,
	},
	.tx_free_thresh = 0, /* Use PMD default values */
	.tx_rs_thresh = 0, /* Use PMD default values */
};
/* empty vmdq configuration structure. Filled in programmatically */
static struct rte_eth_conf vmdq_conf_default = {
	.rxmode = {
		.mq_mode        = ETH_MQ_RX_VMDQ_ONLY,
		.split_hdr_size = 0,
		.header_split   = 0, /**< Header Split disabled */
		.hw_ip_checksum = 0, /**< IP checksum offload disabled */
		.hw_vlan_filter = 0, /**< VLAN filtering disabled */
		/*
		 * VLAN strip is necessary for 1 GbE NICs such as the I350;
		 * it fixes a bug where IPv4 forwarding in the guest could not
		 * forward packets from one virtio dev to another virtio dev.
		 */
		.hw_vlan_strip  = 1, /**< VLAN strip enabled. */
		.jumbo_frame    = 0, /**< Jumbo Frame Support disabled */
		.hw_strip_crc   = 0, /**< CRC stripped by hardware */
	},

	.txmode = {
		.mq_mode = ETH_MQ_TX_NONE,
	},
	.rx_adv_conf = {
		/*
		 * should be overridden separately in code with
		 * appropriate values
		 */
		.vmdq_rx_conf = {
			.nb_queue_pools = ETH_8_POOLS,
			.enable_default_pool = 0,
			.default_pool = 0,
			.nb_pool_maps = 0,
			.pool_map = {{0, 0},},
		},
	},
};
static unsigned lcore_ids[RTE_MAX_LCORE];
static uint8_t ports[RTE_MAX_ETHPORTS];
static unsigned num_ports = 0; /**< The number of ports specified in command line */

static const uint16_t external_pkt_default_vlan_tag = 2000;
const uint16_t vlan_tags[] = {
	1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
	1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
	1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
	1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
	1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
	1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
	1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
	1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
};

/* ethernet addresses of ports */
static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
/* heads for the main used and free linked lists for the data path. */
static struct virtio_net_data_ll *ll_root_used = NULL;
static struct virtio_net_data_ll *ll_root_free = NULL;

/* Array of data core structures containing information on individual core linked lists. */
static struct lcore_info lcore_info[RTE_MAX_LCORE];

/* Used for queueing bursts of TX packets. */
struct mbuf_table {
	unsigned len;
	unsigned txq_id;
	struct rte_mbuf *m_table[MAX_PKT_BURST];
};

/* TX queue for each data core. */
struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];

/* TX queue for each virtio device for zero copy. */
struct mbuf_table tx_queue_zcp[MAX_QUEUES];
/* Vlan header struct used to insert vlan tags on TX. */
struct vlan_ethhdr {
	unsigned char h_dest[ETH_ALEN];
	unsigned char h_source[ETH_ALEN];
	__be16 h_vlan_proto;
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* IPv4 Header */
struct ipv4_hdr {
	uint8_t  version_ihl;		/**< version and header length */
	uint8_t  type_of_service;	/**< type of service */
	uint16_t total_length;		/**< length of packet */
	uint16_t packet_id;		/**< packet ID */
	uint16_t fragment_offset;	/**< fragmentation offset */
	uint8_t  time_to_live;		/**< time to live */
	uint8_t  next_proto_id;		/**< protocol ID */
	uint16_t hdr_checksum;		/**< header checksum */
	uint32_t src_addr;		/**< source address */
	uint32_t dst_addr;		/**< destination address */
} __attribute__((__packed__));

/* Header lengths. */
#define VLAN_HLEN	4
#define VLAN_ETH_HLEN	18
/* Per-device statistics struct */
struct device_statistics {
	uint64_t tx_total;
	rte_atomic64_t rx_total_atomic;
	uint64_t rx_total;
	uint64_t tx;
	rte_atomic64_t rx_atomic;
	uint64_t rx;
} __rte_cache_aligned;
struct device_statistics dev_statistics[MAX_DEVICES];
/*
 * Builds up the correct configuration for VMDQ VLAN pool map
 * according to the pool & queue limits.
 */
static inline int
get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
{
	struct rte_eth_vmdq_rx_conf conf;
	unsigned i;

	memset(&conf, 0, sizeof(conf));
	conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
	conf.nb_pool_maps = num_devices;
	conf.enable_loop_back =
		vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;

	for (i = 0; i < conf.nb_pool_maps; i++) {
		conf.pool_map[i].vlan_id = vlan_tags[i];
		conf.pool_map[i].pools = (1UL << i);
	}

	(void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
	(void)(rte_memcpy(&eth_conf->rx_adv_conf.vmdq_rx_conf, &conf,
		sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
	return 0;
}
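
/*
 * Example of the resulting mapping (editor's sketch): with num_devices == 8
 * the loop above yields pool_map[0] = {vlan_id 1000, pools 1 << 0} through
 * pool_map[7] = {vlan_id 1007, pools 1 << 7}, i.e. each VMDQ pool accepts
 * exactly one VLAN tag, so each guest's traffic is isolated by its tag.
 */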
/*
 * Validate the device number according to the max pool number obtained from
 * dev_info. If the device number is invalid, give an error message and
 * return -1. Each device must have its own pool.
 */
static inline int
validate_num_devices(uint32_t max_nb_devices)
{
	if (num_devices > max_nb_devices) {
		RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
		return -1;
	}
	return 0;
}
/*
 * Initialises a given port using global settings and with the rx buffers
 * coming from the mbuf_pool passed as parameter
 */
static inline int
port_init(uint8_t port)
{
	struct rte_eth_dev_info dev_info;
	struct rte_eth_conf port_conf;
	uint16_t rx_rings, tx_rings;
	uint16_t rx_ring_size, tx_ring_size;
	int retval;
	uint16_t q;

	/* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
	rte_eth_dev_info_get(port, &dev_info);

	/* Configure the number of supported virtio devices based on VMDQ limits */
	num_devices = dev_info.max_vmdq_pools;
	num_queues = dev_info.max_rx_queues;

	if (zero_copy) {
		rx_ring_size = num_rx_descriptor;
		tx_ring_size = num_tx_descriptor;
		tx_rings = dev_info.max_tx_queues;
	} else {
		rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
		tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
		tx_rings = (uint16_t)rte_lcore_count();
	}

	retval = validate_num_devices(MAX_DEVICES);
	if (retval < 0)
		return retval;

	/* Get port configuration. */
	retval = get_eth_conf(&port_conf, num_devices);
	if (retval < 0)
		return retval;

	if (port >= rte_eth_dev_count())
		return -1;

	rx_rings = (uint16_t)num_queues;

	/* Configure ethernet device. */
	retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
	if (retval != 0)
		return retval;

	/* Setup the queues. */
	for (q = 0; q < rx_rings; q++) {
		retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
				rte_eth_dev_socket_id(port), &rx_conf_default,
				vpool_array[q].pool);
		if (retval < 0)
			return retval;
	}
	for (q = 0; q < tx_rings; q++) {
		retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
				rte_eth_dev_socket_id(port), &tx_conf_default);
		if (retval < 0)
			return retval;
	}

	/* Start the device. */
	retval = rte_eth_dev_start(port);
	if (retval < 0) {
		RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
		return retval;
	}

	rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
	RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
	RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
			" %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
			(unsigned)port,
			vmdq_ports_eth_addr[port].addr_bytes[0],
			vmdq_ports_eth_addr[port].addr_bytes[1],
			vmdq_ports_eth_addr[port].addr_bytes[2],
			vmdq_ports_eth_addr[port].addr_bytes[3],
			vmdq_ports_eth_addr[port].addr_bytes[4],
			vmdq_ports_eth_addr[port].addr_bytes[5]);

	return 0;
}
/*
 * Set character device basename.
 */
static int
us_vhost_parse_basename(const char *q_arg)
{
	/* The basename must fit, including the terminating NUL. */
	if (strnlen(q_arg, MAX_BASENAME_SZ) >= MAX_BASENAME_SZ)
		return -1;

	snprintf((char *)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);

	return 0;
}
/*
 * Parse the portmask provided at run time.
 */
static int
parse_portmask(const char *portmask)
{
	char *end = NULL;
	unsigned long pm;

	/* strtoul() only sets errno on failure, so clear it first. */
	errno = 0;

	/* parse hexadecimal string */
	pm = strtoul(portmask, &end, 16);
	if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (pm == 0)
		return -1;

	return pm;
}
/*
 * Parse num options at run time.
 */
static int
parse_num_opt(const char *q_arg, uint32_t max_valid_value)
{
	char *end = NULL;
	unsigned long num;

	/* strtoul() only sets errno on failure, so clear it first. */
	errno = 0;

	/* parse unsigned int string */
	num = strtoul(q_arg, &end, 10);
	if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
		return -1;

	if (num > max_valid_value)
		return -1;

	return num;
}
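
/*
 * Usage sketch (editor's illustration): parse_num_opt() returns the parsed
 * value on success and -1 on any error, so call sites look like
 *
 *	ret = parse_num_opt(optarg, 1);		// accepts only "0" or "1"
 *	if (ret == -1)
 *		return -1;			// reject the option
 *
 * The int return narrows the unsigned long, so max_valid_value must stay
 * below INT_MAX for the -1 sentinel to be unambiguous; that holds for every
 * call site below (the largest bound used is INT32_MAX).
 */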
/*
 * Display usage
 */
static void
us_vhost_usage(const char *prgname)
{
	RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
	"		--vm2vm [0|1|2]\n"
	"		--rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
	"		--dev-basename <name> --dev-index [0-N]\n"
	"		-p PORTMASK: Set mask for ports to be used by application\n"
	"		--vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
	"		--rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destination queue is full\n"
	"		--rx-retry-delay [0-N]: timeout (in microseconds) between retries on RX. Only effective if rx retries are enabled\n"
	"		--rx-retry-num [0-N]: the number of retries on rx. Only effective if rx retries are enabled\n"
	"		--mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
	"		--stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
	"		--dev-basename: The basename to be used for the character device.\n"
	"		--dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
	"		--zero-copy [0|1]: disable(default)/enable rx/tx "
	"zero copy\n"
	"		--rx-desc-num [0-N]: the number of descriptors on rx, "
	"used only when zero copy is enabled.\n"
	"		--tx-desc-num [0-N]: the number of descriptors on tx, "
	"used only when zero copy is enabled.\n",
	       prgname);
}
/*
 * Parse the arguments given in the command line of the application.
 */
static int
us_vhost_parse_args(int argc, char **argv)
{
	int opt, ret;
	int option_index;
	unsigned i;
	const char *prgname = argv[0];
	static struct option long_option[] = {
		{"vm2vm", required_argument, NULL, 0},
		{"rx-retry", required_argument, NULL, 0},
		{"rx-retry-delay", required_argument, NULL, 0},
		{"rx-retry-num", required_argument, NULL, 0},
		{"mergeable", required_argument, NULL, 0},
		{"stats", required_argument, NULL, 0},
		{"dev-basename", required_argument, NULL, 0},
		{"dev-index", required_argument, NULL, 0},
		{"zero-copy", required_argument, NULL, 0},
		{"rx-desc-num", required_argument, NULL, 0},
		{"tx-desc-num", required_argument, NULL, 0},
		{NULL, 0, 0, 0},
	};

	/* Parse command line */
	while ((opt = getopt_long(argc, argv, "p:", long_option, &option_index)) != EOF) {
		switch (opt) {
		/* Portmask */
		case 'p':
			enabled_port_mask = parse_portmask(optarg);
			if (enabled_port_mask == 0) {
				RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
				us_vhost_usage(prgname);
				return -1;
			}
			break;

		case 0:
			/* Enable/disable vm2vm comms. */
			if (!strncmp(long_option[option_index].name, "vm2vm",
				MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for "
						"vm2vm [0|1|2]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				vm2vm_mode = (vm2vm_type)ret;
			}

			/* Enable/disable retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				enable_retry = ret;
			}

			/* Specify the retry delay time (in microseconds) on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				burst_rx_delay_time = ret;
			}

			/* Specify the number of retries on RX. */
			if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				burst_rx_retry_num = ret;
			}

			/* Enable/disable RX mergeable buffers. */
			if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				if (ret) {
					vmdq_conf_default.rxmode.jumbo_frame = 1;
					vmdq_conf_default.rxmode.max_rx_pkt_len
						= JUMBO_FRAME_MAX_SIZE;
					VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
				}
			}

			/* Enable/disable stats. */
			if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				enable_stats = ret;
			}

			/* Set character device basename. */
			if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
				if (us_vhost_parse_basename(optarg) == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
					us_vhost_usage(prgname);
					return -1;
				}
			}

			/* Set character device index. */
			if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, INT32_MAX);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				dev_index = ret;
			}

			/* Enable/disable rx/tx zero copy. */
			if (!strncmp(long_option[option_index].name,
				"zero-copy", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, 1);
				if (ret == -1) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument"
						" for zero-copy [0|1]\n");
					us_vhost_usage(prgname);
					return -1;
				}
				zero_copy = ret;

				if (zero_copy) {
#ifdef RTE_MBUF_REFCNT
					RTE_LOG(ERR, VHOST_CONFIG, "Before running "
					"zero copy vhost APP, please "
					"disable RTE_MBUF_REFCNT\n"
					"in config file and then rebuild DPDK "
					"core lib!\n"
					"Otherwise please disable zero copy "
					"flag in command line!\n");
					return -1;
#endif
				}
			}

			/* Specify the descriptor number on RX. */
			if (!strncmp(long_option[option_index].name,
				"rx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for rx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				}
				num_rx_descriptor = ret;
			}

			/* Specify the descriptor number on TX. */
			if (!strncmp(long_option[option_index].name,
				"tx-desc-num", MAX_LONG_OPT_SZ)) {
				ret = parse_num_opt(optarg, MAX_RING_DESC);
				if ((ret == -1) || (!POWEROF2(ret))) {
					RTE_LOG(INFO, VHOST_CONFIG,
						"Invalid argument for tx-desc-num [0-N], "
						"power of 2 required.\n");
					us_vhost_usage(prgname);
					return -1;
				}
				num_tx_descriptor = ret;
			}

			break;

		/* Invalid option - print options. */
		default:
			us_vhost_usage(prgname);
			return -1;
		}
	}

	for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
		if (enabled_port_mask & (1 << i))
			ports[num_ports++] = (uint8_t)i;
	}

	if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
		RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
			"but only %u port(s) can be enabled\n", num_ports, MAX_SUP_PORTS);
		return -1;
	}

	if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support software vm2vm, "
			"please specify 'vm2vm 2' to use hardware vm2vm.\n");
		return -1;
	}

	if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
		RTE_LOG(INFO, VHOST_PORT,
			"Vhost zero copy doesn't support jumbo frame, "
			"please specify '--mergeable 0' to disable the "
			"mergeable feature.\n");
		return -1;
	}

	return 0;
}
/*
 * Update the global vars NUM_PORTS and PORTS according to the system port
 * count, and return the number of valid ports.
 */
static unsigned check_ports_num(unsigned nb_ports)
{
	unsigned valid_num_ports = num_ports;
	unsigned portid;

	if (num_ports > nb_ports) {
		RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
			num_ports, nb_ports);
		num_ports = nb_ports;
	}

	for (portid = 0; portid < num_ports; portid++) {
		if (ports[portid] >= nb_ports) {
			RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
				ports[portid], (nb_ports - 1));
			ports[portid] = INVALID_PORT_ID;
			valid_num_ports--;
		}
	}
	return valid_num_ports;
}
/*
 * Macro to print out packet contents. Wrapped in debug define so that the
 * data path is not affected when debug is disabled.
 */
#ifdef DEBUG
#define PRINT_PACKET(device, addr, size, header) do { \
	char *pkt_addr = (char *)(addr); \
	unsigned int index; \
	char packet[MAX_PRINT_BUFF]; \
	\
	if ((header)) \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
	else \
		snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
	for (index = 0; index < (size); index++) { \
		snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
			"%02hhx ", pkt_addr[index]); \
	} \
	snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
	\
	LOG_DEBUG(VHOST_DATA, "%s", packet); \
} while (0)
#else
#define PRINT_PACKET(device, addr, size, header) do {} while (0)
#endif
/*
 * Function to convert guest physical addresses to vhost physical addresses.
 * This is used to convert virtio buffer addresses.
 */
static inline uint64_t __attribute__((always_inline))
gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
	uint32_t buf_len, hpa_type *addr_type)
{
	struct virtio_memory_regions_hpa *region;
	uint32_t regionidx;
	uint64_t vhost_pa = 0;

	*addr_type = PHYS_ADDR_INVALID;

	for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
		region = &dev->mem->regions_hpa[regionidx];
		if ((guest_pa >= region->guest_phys_address) &&
			(guest_pa <= region->guest_phys_address_end)) {
			vhost_pa = region->host_phys_addr_offset + guest_pa;
			if (likely((guest_pa + buf_len - 1)
				<= region->guest_phys_address_end))
				*addr_type = PHYS_ADDR_CONTINUOUS;
			else
				*addr_type = PHYS_ADDR_CROSS_SUBREG;
			break;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
		dev->device_fh, (void *)(uintptr_t)guest_pa,
		(void *)(uintptr_t)vhost_pa);

	return vhost_pa;
}
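
/*
 * Worked example (editor's sketch): suppose a region maps guest physical
 * 0x1000..0x1fff with host_phys_addr_offset 0x7f0000. A buffer at
 * guest_pa 0x1f00 with buf_len 0x80 translates to 0x7f1f00 and is marked
 * PHYS_ADDR_CONTINUOUS (its last byte, 0x1f7f, is still <= 0x1fff). The
 * same buffer with buf_len 0x200 would end at 0x20ff, past the region, so
 * it is flagged PHYS_ADDR_CROSS_SUBREG and the zero copy paths below either
 * reject it or fall back to copying.
 */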
/*
 * Compares a packet destination MAC address to a device MAC address.
 */
static inline int __attribute__((always_inline))
ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
{
	return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
}
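
/*
 * Note with an example (editor's sketch): the XOR compares 8 bytes at once
 * and MAC_ADDR_CMP (0xFFFFFFFFFFFF) masks the result down to the 48 MAC
 * bits, so on a little-endian machine the two bytes that follow each 6-byte
 * address in memory are ignored. For equal addresses the masked XOR is zero
 * and the function returns 1 regardless of what trails the structs. This
 * does read 2 bytes past each ether_addr; it works here because both
 * operands always sit inside larger packet/device structures.
 */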
/*
 * This function learns the MAC address of the device and registers this along with a
 * vlan tag to a VMDQ.
 */
static int
link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
{
	struct ether_hdr *pkt_hdr;
	struct virtio_net_data_ll *dev_ll;
	int i, ret;

	/* Learn MAC address of guest device from packet */
	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
			RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
			return -1;
		}
		dev_ll = dev_ll->next;
	}

	for (i = 0; i < ETHER_ADDR_LEN; i++)
		dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];

	/* vlan_tag currently uses the device_id. */
	dev->vlan_tag = vlan_tags[dev->device_fh];

	/* Print out VMDQ registration info. */
	RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
		dev->device_fh,
		dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
		dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
		dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
		dev->vlan_tag);

	/* Register the MAC address. */
	ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
	if (ret)
		RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
			dev->device_fh);

	/* Enable stripping of the vlan tag as we handle routing. */
	rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);

	/* Set device as ready for RX. */
	dev->ready = DEVICE_RX;

	return 0;
}
/*
 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
 * queue before disabling RX on the device.
 */
static inline void
unlink_vmdq(struct virtio_net *dev)
{
	unsigned i = 0;
	unsigned rx_count;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];

	if (dev->ready == DEVICE_RX) {
		/* clear MAC and VLAN settings */
		rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
		for (i = 0; i < 6; i++)
			dev->mac_address.addr_bytes[i] = 0;

		dev->vlan_tag = 0;

		/* Clear out the receive buffers */
		rx_count = rte_eth_rx_burst(ports[0],
			(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

		while (rx_count) {
			for (i = 0; i < rx_count; i++)
				rte_pktmbuf_free(pkts_burst[i]);

			rx_count = rte_eth_rx_burst(ports[0],
				(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
		}

		dev->ready = DEVICE_MAC_LEARNING;
	}
}
/*
 * Check if the packet destination MAC address is for a local device. If so then put
 * the packet on that device's RX queue. If not then return.
 */
static inline unsigned __attribute__((always_inline))
virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
{
	struct virtio_net_data_ll *dev_ll;
	struct ether_hdr *pkt_hdr;
	uint64_t ret = 0;

	pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* get the used devices list */
	dev_ll = ll_root_used;

	while (dev_ll != NULL) {
		if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
			&dev_ll->dev->mac_address)) {

			/* Drop the packet if the TX packet is destined for the TX device. */
			if (dev_ll->dev->device_fh == dev->device_fh) {
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
					dev_ll->dev->device_fh);
				return 0;
			}

			LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);

			if (dev_ll->dev->remove) {
				/* drop the packet if the device is marked for removal */
				LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
			} else {
				uint32_t mergeable =
					dev_ll->dev->features &
					(1 << VIRTIO_NET_F_MRG_RXBUF);

				/* send the packet to the local virtio device */
				if (likely(mergeable == 0))
					ret = virtio_dev_rx(dev_ll->dev, &m, 1);
				else
					ret = virtio_dev_merge_rx(dev_ll->dev,
						&m, 1);

				if (enable_stats) {
					rte_atomic64_add(
						&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
						1);
					rte_atomic64_add(
						&dev_statistics[dev_ll->dev->device_fh].rx_atomic,
						ret);
					dev_statistics[dev->device_fh].tx_total++;
					dev_statistics[dev->device_fh].tx += ret;
				}
			}

			return 0;
		}
		dev_ll = dev_ll->next;
	}

	return -1;
}
/*
 * This function routes the TX packet to the correct interface. This may be a local device
 * or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route(struct virtio_net *dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
{
	struct mbuf_table *tx_q;
	struct vlan_ethhdr *vlan_hdr;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf, *prev;
	unsigned len, ret, offset = 0;
	const uint16_t lcore_id = rte_lcore_id();
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);

	/* check if destination is local VM */
	if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
		return;

	if (vm2vm_mode == VM2VM_HARDWARE) {
		while (dev_ll != NULL) {
			if ((dev_ll->dev->ready == DEVICE_RX)
				&& ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->dev->mac_address)) {
				/*
				 * Drop the packet if the TX packet is
				 * destined for the TX device.
				 */
				if (dev_ll->dev->device_fh == dev->device_fh) {
					LOG_DEBUG(VHOST_DATA,
						"(%"PRIu64") TX: Source and destination"
						" MAC addresses are the same. Dropping "
						"packet.\n",
						dev_ll->dev->device_fh);
					return;
				}

				offset = 4;
				vlan_tag =
					vlan_tags[(uint16_t)dev_ll->dev->device_fh];

				LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: pkt to local VM device id:"
					"(%"PRIu64") vlan tag: %d.\n",
					dev->device_fh, dev_ll->dev->device_fh,
					vlan_tag);

				break;
			}
			dev_ll = dev_ll->next;
		}
	}

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);

	/* Add packet to the port tx queue */
	tx_q = &lcore_tx_queue[lcore_id];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	mbuf = rte_pktmbuf_alloc(mbuf_pool);
	if (unlikely(mbuf == NULL)) {
		RTE_LOG(ERR, VHOST_DATA,
			"Failed to allocate memory for mbuf.\n");
		return;
	}

	mbuf->data_len = m->data_len + VLAN_HLEN + offset;
	mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
	mbuf->nb_segs = m->nb_segs;

	/* Copy ethernet header to mbuf. */
	rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
		rte_pktmbuf_mtod(m, const void *),
		ETH_HLEN);

	/* Setup vlan header. Bytes need to be re-ordered for network with htons() */
	vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
	vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
	vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
	vlan_hdr->h_vlan_TCI = htons(vlan_tag);

	/* Copy the remaining packet contents to the mbuf. */
	rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
		(const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
		(m->data_len - ETH_HLEN));

	/* Copy the remaining segments for the whole packet. */
	prev = mbuf;
	while (m->next) {
		/* Allocate an mbuf and populate the structure. */
		struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
		if (unlikely(next_mbuf == NULL)) {
			rte_pktmbuf_free(mbuf);
			RTE_LOG(ERR, VHOST_DATA,
				"Failed to allocate memory for mbuf.\n");
			return;
		}

		m = m->next;
		prev->next = next_mbuf;
		prev = next_mbuf;
		next_mbuf->data_len = m->data_len;

		/* Copy data to next mbuf. */
		rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
			rte_pktmbuf_mtod(m, const void *), m->data_len);
	}

	tx_q->m_table[len] = mbuf;
	len++;
	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
		/* Free any buffers not handled by TX and update the port stats. */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
	}

	tx_q->len = len;
	return;
}
/*
 * This function is called by each data core. It handles all RX/TX registered with the
 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
 * with all devices in the main linked list.
 */
static int
switch_worker(__attribute__((unused)) void *arg)
{
	struct rte_mempool *mbuf_pool = arg;
	struct virtio_net *dev = NULL;
	struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
	struct virtio_net_data_ll *dev_ll;
	struct mbuf_table *tx_q;
	volatile struct lcore_ll_info *lcore_ll;
	const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
	uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
	unsigned ret, i;
	const uint16_t lcore_id = rte_lcore_id();
	const uint16_t num_cores = (uint16_t)rte_lcore_count();
	uint16_t rx_count = 0;
	uint32_t mergeable = 0;

	RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
	lcore_ll = lcore_info[lcore_id].lcore_ll;
	prev_tsc = 0;

	tx_q = &lcore_tx_queue[lcore_id];
	for (i = 0; i < num_cores; i++) {
		if (lcore_ids[i] == lcore_id) {
			tx_q->txq_id = i;
			break;
		}
	}

	while (1) {
		cur_tsc = rte_rdtsc();
		/*
		 * TX burst queue drain
		 */
		diff_tsc = cur_tsc - prev_tsc;
		if (unlikely(diff_tsc > drain_tsc)) {

			if (tx_q->len) {
				LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u\n", tx_q->len);

				/* Tx any packets in the queue */
				ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
					(struct rte_mbuf **)tx_q->m_table,
					(uint16_t)tx_q->len);
				if (unlikely(ret < tx_q->len)) {
					do {
						rte_pktmbuf_free(tx_q->m_table[ret]);
					} while (++ret < tx_q->len);
				}

				tx_q->len = 0;
			}

			prev_tsc = cur_tsc;
		}

		rte_prefetch0(lcore_ll->ll_root_used);
		/*
		 * Inform the configuration core that we have exited the linked list and that no devices are
		 * in use if requested.
		 */
		if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
			lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;

		/*
		 * Process devices
		 */
		dev_ll = lcore_ll->ll_root_used;

		while (dev_ll != NULL) {
			/* get virtio device ID */
			dev = dev_ll->dev;
			mergeable =
				dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);

			if (dev->remove) {
				dev_ll = dev_ll->next;
				unlink_vmdq(dev);
				dev->ready = DEVICE_SAFE_REMOVE;
				continue;
			}
			if (likely(dev->ready == DEVICE_RX)) {
				/* Handle guest RX */
				rx_count = rte_eth_rx_burst(ports[0],
					(uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);

				if (rx_count) {
					if (likely(mergeable == 0))
						ret_count = virtio_dev_rx(dev,
							pkts_burst, rx_count);
					else
						ret_count =
							virtio_dev_merge_rx(dev,
							pkts_burst, rx_count);

					if (enable_stats) {
						rte_atomic64_add(
							&dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
							rx_count);
						rte_atomic64_add(
							&dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
					}
					while (likely(rx_count)) {
						rx_count--;
						rte_pktmbuf_free(pkts_burst[rx_count]);
					}
				}
			}

			if (likely(!dev->remove)) {
				/* Handle guest TX */
				if (likely(mergeable == 0))
					virtio_dev_tx(dev, mbuf_pool);
				else
					virtio_dev_merge_tx(dev, mbuf_pool);
			}

			/* move to the next device in the list */
			dev_ll = dev_ll->next;
		}
	}

	return 0;
}
/*
 * This function gets the available ring number for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_num_zcp(struct virtio_net *dev)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;

	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	return (uint32_t)(avail_idx - vq->last_used_idx_res);
}
/*
 * This function gets the available ring index for zero copy rx;
 * it will retry 'burst_rx_retry_num' times until it has enough ring entries.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
static inline uint32_t __attribute__((always_inline))
get_available_ring_index_zcp(struct virtio_net *dev,
	uint16_t *res_base_idx, uint32_t count)
{
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
	uint16_t avail_idx;
	uint32_t retry = 0;
	uint16_t free_entries;

	*res_base_idx = vq->last_used_idx_res;
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);
	free_entries = (avail_idx - *res_base_idx);

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
			"avail idx: %d, "
			"res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx, *res_base_idx,
			free_entries);

	/*
	 * If retry is enabled and the queue is full then we wait
	 * and retry to avoid packet loss.
	 */
	if (enable_retry && unlikely(count > free_entries)) {
		for (retry = 0; retry < burst_rx_retry_num; retry++) {
			rte_delay_us(burst_rx_delay_time);
			avail_idx = *((volatile uint16_t *)&vq->avail->idx);
			free_entries = (avail_idx - *res_base_idx);
			if (count <= free_entries)
				break;
		}
	}

	/* check that we have enough buffers */
	if (unlikely(count > free_entries))
		count = free_entries;

	if (unlikely(count == 0)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") Fail in get_available_ring_index_zcp: "
			"avail idx: %d, res base idx:%d, free entries:%d\n",
			dev->device_fh, avail_idx,
			*res_base_idx, free_entries);
		return 0;
	}

	vq->last_used_idx_res = *res_base_idx + count;

	return count;
}
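
/*
 * Index arithmetic note (editor's sketch): avail->idx and last_used_idx_res
 * are free-running uint16_t counters, so "avail_idx - *res_base_idx" stays
 * correct across wrap-around. E.g. avail_idx == 0x0003 after wrapping and
 * *res_base_idx == 0xfffe gives (uint16_t)(0x0003 - 0xfffe) == 5 free
 * entries, which is why these values are never masked by the ring size
 * until they actually index a ring.
 */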
/*
 * This function puts a descriptor back on the used list.
 */
static inline void __attribute__((always_inline))
put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
{
	uint16_t res_cur_idx = vq->last_used_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
	vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
	rte_compiler_barrier();
	*(volatile uint16_t *)&vq->used->idx += 1;
	vq->last_used_idx += 1;

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);
}
/*
 * This function gets an available descriptor from the virtio vring and an
 * un-attached mbuf from vpool->ring, then attaches them together. It needs
 * to adjust the offset of buff_addr and phys_addr according to the PMD
 * implementation, otherwise the frame data may be put at the wrong location
 * in the mbuf.
 */
static inline void __attribute__((always_inline))
attach_rxmbuf_zcp(struct virtio_net *dev)
{
	uint16_t res_base_idx, desc_idx;
	uint64_t buff_addr, phys_addr;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *mbuf = NULL;
	struct vpool *vpool;
	hpa_type addr_type;

	vpool = &vpool_array[dev->vmdq_rx_q];
	vq = dev->virtqueue[VIRTIO_RXQ];

	do {
		if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
				1) != 1))
			return;
		desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];

		desc = &vq->desc[desc_idx];
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc = &vq->desc[desc->next];
			buff_addr = gpa_to_vva(dev, desc->addr);
			phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
					&addr_type);
		} else {
			buff_addr = gpa_to_vva(dev,
					desc->addr + vq->vhost_hlen);
			phys_addr = gpa_to_hpa(dev,
					desc->addr + vq->vhost_hlen,
					desc->len, &addr_type);
		}

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
				" address found when attaching RX frame buffer"
				" address!\n", dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching RX frame "
				"buffer address!\n",
				dev->device_fh);
			put_desc_to_used_list_zcp(vq, desc_idx);
			continue;
		}
	} while (unlikely(phys_addr == 0));

	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: "
			"ring_sc_dequeue fail.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (unlikely(vpool->buf_size > desc->len)) {
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
			"length(%d) of descriptor idx: %d less than room "
			"size required: %d\n",
			dev->device_fh, desc->len, desc_idx, vpool->buf_size);
		put_desc_to_used_list_zcp(vq, desc_idx);
		rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		return;
	}

	mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
	mbuf->data_off = RTE_PKTMBUF_HEADROOM;
	mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
	mbuf->data_len = desc->len;
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
		"descriptor idx:%d\n",
		dev->device_fh, res_base_idx, desc_idx);

	__rte_mbuf_raw_free(mbuf);

	return;
}
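
/*
 * Design note (editor's sketch): the __rte_mbuf_raw_free() above is not a
 * leak fix; it is the core of the zero copy RX path. The mbuf's buf_addr /
 * buf_physaddr have just been re-pointed at the guest's frame buffer
 * (minus RTE_PKTMBUF_HEADROOM, so the payload lands exactly on the guest
 * buffer), and raw-freeing it returns it to vpool->pool, the mempool that
 * backs this VMDQ queue's rte_eth_rx_queue_setup() in port_init(). When the
 * PMD re-allocates this mbuf for its RX ring, the NIC DMAs the next
 * received frame straight into guest memory.
 */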
/*
 * Detach an attached packet mbuf -
 *  - restore original mbuf address and length values.
 *  - reset pktmbuf data and data_len to their default values.
 *  All other fields of the given packet mbuf will be left intact.
 *
 * @param m
 *   The attached packet mbuf.
 */
static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
{
	const struct rte_mempool *mp = m->pool;
	void *buf = RTE_MBUF_TO_BADDR(m);
	uint32_t buf_ofs;
	uint32_t buf_len = mp->elt_size - sizeof(*m);
	m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);

	m->buf_addr = buf;
	m->buf_len = (uint16_t)buf_len;

	buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
			RTE_PKTMBUF_HEADROOM : m->buf_len;
	m->data_off = buf_ofs;

	m->data_len = 0;
}
/*
 * This function is called after packets have been transmitted. It fetches
 * each mbuf from vpool->pool, detaches it and puts it back into vpool->ring.
 * It also updates the used index and kicks the guest if necessary.
 */
static inline uint32_t __attribute__((always_inline))
txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
{
	struct rte_mbuf *mbuf;
	struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
	uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
	uint32_t index = 0;
	uint32_t mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
		"clean is: %d\n",
		dev->device_fh, mbuf_count);
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
		"clean is: %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(RTE_MBUF_INDIRECT(mbuf)))
			pktmbuf_detach_zcp(mbuf);
		rte_ring_sp_enqueue(vpool->ring, mbuf);

		/* Update used index buffer information. */
		vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
		vq->used->ring[used_idx].len = 0;

		used_idx = (used_idx + 1) & (vq->size - 1);
	}

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
		"clean is: %d\n",
		dev->device_fh, rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
		"clean is: %d\n",
		dev->device_fh, rte_ring_count(vpool->ring));
	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: before updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	vq->last_used_idx += mbuf_count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in txmbuf_clean_zcp: after updated "
		"vq->last_used_idx:%d\n",
		dev->device_fh, vq->last_used_idx);

	rte_compiler_barrier();

	*(volatile uint16_t *)&vq->used->idx += mbuf_count;

	/* Kick guest if required. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return 0;
}
/*
 * This function is called when a virtio device is destroyed.
 * It fetches each mbuf from vpool->pool, detaches it and puts it back into
 * vpool->ring.
 */
static void mbuf_destroy_zcp(struct vpool *vpool)
{
	struct rte_mbuf *mbuf = NULL;
	uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool before "
		"mbuf_destroy_zcp is: %d\n",
		mbuf_count);
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring before "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));

	for (index = 0; index < mbuf_count; index++) {
		mbuf = __rte_mbuf_raw_alloc(vpool->pool);
		if (likely(mbuf != NULL)) {
			if (likely(RTE_MBUF_INDIRECT(mbuf)))
				pktmbuf_detach_zcp(mbuf);
			rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
		}
	}

	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in mempool after "
		"mbuf_destroy_zcp is: %d\n",
		rte_mempool_count(vpool->pool));
	LOG_DEBUG(VHOST_CONFIG,
		"in mbuf_destroy_zcp: mbuf count in ring after "
		"mbuf_destroy_zcp is : %d\n",
		rte_ring_count(vpool->ring));
}
/*
 * This function updates the used flags and counters for the zero copy
 * RX path.
 */
static inline uint32_t __attribute__((always_inline))
virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
	uint32_t count)
{
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	struct rte_mbuf *buff;
	/* The virtio_hdr is initialised to 0. */
	struct virtio_net_hdr_mrg_rxbuf virtio_hdr
		= {{0, 0, 0, 0, 0, 0}, 0};
	uint64_t buff_hdr_addr = 0;
	uint32_t head[MAX_PKT_BURST], packet_len = 0;
	uint32_t head_idx, packet_success = 0;
	uint16_t res_cur_idx;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);

	if (count == 0)
		return 0;

	vq = dev->virtqueue[VIRTIO_RXQ];
	count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;

	res_cur_idx = vq->last_used_idx;
	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
		dev->device_fh, res_cur_idx, res_cur_idx + count);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (head_idx = 0; head_idx < count; head_idx++)
		head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);

	while (packet_success != count) {
		/* Get descriptor from available ring */
		desc = &vq->desc[head[packet_success]];

		buff = pkts[packet_success];
		LOG_DEBUG(VHOST_DATA,
			"(%"PRIu64") in dev_rx_zcp: update the used idx for "
			"pkt[%d] descriptor idx: %d\n",
			dev->device_fh, packet_success,
			MBUF_HEADROOM_UINT32(buff));

		PRINT_PACKET(dev,
			(uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
			+ RTE_PKTMBUF_HEADROOM),
			rte_pktmbuf_data_len(buff), 0);

		/* Buffer address translation for virtio header. */
		buff_hdr_addr = gpa_to_vva(dev, desc->addr);
		packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;

		/*
		 * If the descriptors are chained the header and data are
		 * placed in separate buffers.
		 */
		if (desc->flags & VRING_DESC_F_NEXT) {
			desc->len = vq->vhost_hlen;
			desc = &vq->desc[desc->next];
			desc->len = rte_pktmbuf_data_len(buff);
		} else {
			desc->len = packet_len;
		}

		/* Update used ring with desc information */
		vq->used->ring[res_cur_idx & (vq->size - 1)].id
			= head[packet_success];
		vq->used->ring[res_cur_idx & (vq->size - 1)].len
			= packet_len;
		res_cur_idx++;
		packet_success++;

		/* A header is required per buffer. */
		rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
			(const void *)&virtio_hdr, vq->vhost_hlen);

		PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);

		if (likely(packet_success < count)) {
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success]]);
		}
	}

	rte_compiler_barrier();

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: before update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	*(volatile uint16_t *)&vq->used->idx += count;
	vq->last_used_idx += count;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in dev_rx_zcp: after update used idx: "
		"vq.last_used_idx: %d, vq->used->idx: %d\n",
		dev->device_fh, vq->last_used_idx, vq->used->idx);

	/* Kick the guest if necessary. */
	if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
		eventfd_write((int)vq->kickfd, 1);

	return count;
}
/*
 * This function routes the TX packet to the correct interface.
 * This may be a local device or the physical port.
 */
static inline void __attribute__((always_inline))
virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
	uint32_t desc_idx, uint8_t need_copy)
{
	struct mbuf_table *tx_q;
	struct rte_mbuf **m_table;
	struct rte_mbuf *mbuf = NULL;
	unsigned len, ret, offset = 0;
	struct vpool *vpool;
	struct virtio_net_data_ll *dev_ll = ll_root_used;
	struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
	uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];

	/* Add packet to the port tx queue */
	tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
	len = tx_q->len;

	/* Allocate an mbuf and populate the structure. */
	vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
	rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
	if (unlikely(mbuf == NULL)) {
		struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
		RTE_LOG(ERR, VHOST_DATA,
			"(%"PRIu64") Failed to allocate memory for mbuf.\n",
			dev->device_fh);
		put_desc_to_used_list_zcp(vq, desc_idx);
		return;
	}

	if (vm2vm_mode == VM2VM_HARDWARE) {
		/*
		 * Avoid using a vlan tag from any vm for external packets,
		 * such as vlan_tags[dev->device_fh]; otherwise it conflicts
		 * with pool selection: the MAC address identifies it as an
		 * external packet that should go to the network, while the
		 * vlan tag identifies it as a vm2vm packet that should be
		 * forwarded to another vm. The hardware cannot resolve such
		 * an ambiguous situation, so the packet would be lost.
		 */
		vlan_tag = external_pkt_default_vlan_tag;
		while (dev_ll != NULL) {
			if (likely(dev_ll->dev->ready == DEVICE_RX) &&
				ether_addr_cmp(&(pkt_hdr->d_addr),
				&dev_ll->dev->mac_address)) {

				/*
				 * Drop the packet if the TX packet is destined
				 * for the TX device.
				 */
				if (unlikely(dev_ll->dev->device_fh
					== dev->device_fh)) {
					LOG_DEBUG(VHOST_DATA,
						"(%"PRIu64") TX: Source and destination "
						"MAC addresses are the same. Dropping "
						"packet.\n",
						dev_ll->dev->device_fh);
					MBUF_HEADROOM_UINT32(mbuf)
						= (uint32_t)desc_idx;
					__rte_mbuf_raw_free(mbuf);
					return;
				}

				/*
				 * Packet length offset 4 bytes for HW vlan
				 * strip when L2 switch back.
				 */
				offset = 4;
				vlan_tag =
					vlan_tags[(uint16_t)dev_ll->dev->device_fh];

				LOG_DEBUG(VHOST_DATA,
					"(%"PRIu64") TX: pkt to local VM device id:"
					"(%"PRIu64") vlan tag: %d.\n",
					dev->device_fh, dev_ll->dev->device_fh,
					vlan_tag);

				break;
			}
			dev_ll = dev_ll->next;
		}
	}

	mbuf->nb_segs = m->nb_segs;
	mbuf->next = m->next;
	mbuf->data_len = m->data_len + offset;
	mbuf->pkt_len = mbuf->data_len;
	if (unlikely(need_copy)) {
		/* Copy the packet contents to the mbuf. */
		rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
			rte_pktmbuf_mtod(m, void *),
			m->data_len);
	} else {
		mbuf->data_off = m->data_off;
		mbuf->buf_physaddr = m->buf_physaddr;
		mbuf->buf_addr = m->buf_addr;
	}
	mbuf->ol_flags = PKT_TX_VLAN_PKT;
	mbuf->vlan_tci = vlan_tag;
	mbuf->l2_len = sizeof(struct ether_hdr);
	mbuf->l3_len = sizeof(struct ipv4_hdr);
	MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;

	tx_q->m_table[len] = mbuf;
	len++;

	LOG_DEBUG(VHOST_DATA,
		"(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
		dev->device_fh,
		mbuf->nb_segs,
		(mbuf->next == NULL) ? "null" : "non-null");

	if (enable_stats) {
		dev_statistics[dev->device_fh].tx_total++;
		dev_statistics[dev->device_fh].tx++;
	}

	if (unlikely(len == MAX_PKT_BURST)) {
		m_table = (struct rte_mbuf **)tx_q->m_table;
		ret = rte_eth_tx_burst(ports[0],
			(uint16_t)tx_q->txq_id, m_table, (uint16_t) len);

		/*
		 * Free any buffers not handled by TX and update
		 * the port stats.
		 */
		if (unlikely(ret < len)) {
			do {
				rte_pktmbuf_free(m_table[ret]);
			} while (++ret < len);
		}

		len = 0;
		txmbuf_clean_zcp(dev, vpool);
	}

	tx_q->len = len;

	return;
}
/*
 * This function TXes all available packets in the virtio TX queue of one
 * virtio-net device. If it is the first packet, it learns the MAC address
 * and sets up the VMDQ.
 */
static inline void __attribute__((always_inline))
virtio_dev_tx_zcp(struct virtio_net *dev)
{
	struct rte_mbuf m;
	struct vhost_virtqueue *vq;
	struct vring_desc *desc;
	uint64_t buff_addr = 0, phys_addr;
	uint32_t head[MAX_PKT_BURST];
	uint32_t i;
	uint16_t free_entries, packet_success = 0;
	uint16_t avail_idx;
	uint8_t need_copy = 0;
	hpa_type addr_type;

	vq = dev->virtqueue[VIRTIO_TXQ];
	avail_idx = *((volatile uint16_t *)&vq->avail->idx);

	/* If there are no available buffers then return. */
	if (vq->last_used_idx_res == avail_idx)
		return;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);

	/* Prefetch available ring to retrieve head indexes. */
	rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);

	/* Get the number of free entries in the ring */
	free_entries = (avail_idx - vq->last_used_idx_res);

	/* Limit to MAX_PKT_BURST. */
	free_entries
		= (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;

	LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
		dev->device_fh, free_entries);

	/* Retrieve all of the head indexes first to avoid caching issues. */
	for (i = 0; i < free_entries; i++)
		head[i]
			= vq->avail->ring[(vq->last_used_idx_res + i)
			& (vq->size - 1)];

	vq->last_used_idx_res += free_entries;

	/* Prefetch descriptor index. */
	rte_prefetch0(&vq->desc[head[packet_success]]);
	rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);

	while (packet_success < free_entries) {
		desc = &vq->desc[head[packet_success]];

		/* Discard first buffer as it is the virtio header */
		desc = &vq->desc[desc->next];

		/* Buffer address translation. */
		buff_addr = gpa_to_vva(dev, desc->addr);
		phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);

		if (likely(packet_success < (free_entries - 1)))
			/* Prefetch descriptor index. */
			rte_prefetch0(&vq->desc[head[packet_success + 1]]);

		if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Invalid frame buffer address found "
				"when TX packets!\n",
				dev->device_fh);
			packet_success++;
			continue;
		}

		/* Prefetch buffer address. */
		rte_prefetch0((void *)(uintptr_t)buff_addr);

		/*
		 * Setup dummy mbuf. This is copied to a real mbuf if
		 * transmitted out the physical port.
		 */
		m.data_len = desc->len;
		m.pkt_len = desc->len;
		m.data_off = 0;
		m.nb_segs = 1;
		m.next = NULL;
		m.buf_addr = (void *)(uintptr_t)buff_addr;
		m.buf_physaddr = phys_addr;

		/*
		 * Check if the frame buffer address from guest crosses
		 * sub-region or not.
		 */
		if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
			RTE_LOG(ERR, VHOST_DATA,
				"(%"PRIu64") Frame buffer address cross "
				"sub-region found when attaching TX frame "
				"buffer address!\n",
				dev->device_fh);
			need_copy = 1;
		} else {
			need_copy = 0;
		}

		PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);

		/*
		 * If this is the first received packet we need to learn
		 * the MAC and setup VMDQ
		 */
		if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
			if (dev->remove || (link_vmdq(dev, &m) == -1)) {
				/*
				 * Discard frame if device is scheduled for
				 * removal or a duplicate MAC address is found.
				 */
				packet_success += free_entries;
				vq->last_used_idx += packet_success;
				break;
			}
		}

		virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
		packet_success++;
	}
}
2036 * This function is called by each data core. It handles all RX/TX registered
2037 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2038 * addresses are compared with all devices in the main linked list.
2041 switch_worker_zcp(__attribute__((unused)) void *arg)
2043 struct virtio_net *dev = NULL;
2044 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2045 struct virtio_net_data_ll *dev_ll;
2046 struct mbuf_table *tx_q;
2047 volatile struct lcore_ll_info *lcore_ll;
2048 const uint64_t drain_tsc
2049 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2050 * BURST_TX_DRAIN_US;
2051 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2053 const uint16_t lcore_id = rte_lcore_id();
2054 uint16_t count_in_ring, rx_count = 0;
2056 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2058 lcore_ll = lcore_info[lcore_id].lcore_ll;
2062 cur_tsc = rte_rdtsc();
2064 /* TX burst queue drain */
2065 diff_tsc = cur_tsc - prev_tsc;
2066 if (unlikely(diff_tsc > drain_tsc)) {
2068 * Get mbuf from vpool.pool and detach mbuf and
2069 * put back into vpool.ring.
2071 dev_ll = lcore_ll->ll_root_used;
2072 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2073 /* Get virtio device ID */
2076 if (likely(!dev->remove)) {
2077 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2079 LOG_DEBUG(VHOST_DATA,
2080 "TX queue drained after timeout"
2081 " with burst size %u\n",
2085 * Tx any packets in the queue
2087 ret = rte_eth_tx_burst(
2089 (uint16_t)tx_q->txq_id,
2090 (struct rte_mbuf **)
2092 (uint16_t)tx_q->len);
2093 if (unlikely(ret < tx_q->len)) {
2096 tx_q->m_table[ret]);
2097 } while (++ret < tx_q->len);
2101 txmbuf_clean_zcp(dev,
2102 &vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2105 dev_ll = dev_ll->next;
2110 rte_prefetch0(lcore_ll->ll_root_used);
2113 * If requested, inform the configuration core that we have exited
2114 * the linked list and that no devices are in use.
2116 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2117 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2119 /* Process devices */
2120 dev_ll = lcore_ll->ll_root_used;
2122 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2124 if (unlikely(dev->remove)) {
2125 dev_ll = dev_ll->next;
2127 dev->ready = DEVICE_SAFE_REMOVE;
2131 if (likely(dev->ready == DEVICE_RX)) {
2132 uint32_t index = dev->vmdq_rx_q;
2135 = rte_ring_count(vpool_array[index].ring);
2136 uint16_t free_entries
2137 = (uint16_t)get_available_ring_num_zcp(dev);
2140 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2144 i < RTE_MIN(free_entries,
2145 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2147 attach_rxmbuf_zcp(dev);
2149 /* Handle guest RX */
2150 rx_count = rte_eth_rx_burst(ports[0],
2151 (uint16_t)dev->vmdq_rx_q, pkts_burst,
2155 ret_count = virtio_dev_rx_zcp(dev,
2156 pkts_burst, rx_count);
2158 dev_statistics[dev->device_fh].rx_total
2160 dev_statistics[dev->device_fh].rx
2163 while (likely(rx_count)) {
2166 pkts_burst[rx_count]);
2167 rte_ring_sp_enqueue(
2168 vpool_array[index].ring,
2169 (void *)pkts_burst[rx_count]);
2174 if (likely(!dev->remove))
2175 /* Handle guest TX */
2176 virtio_dev_tx_zcp(dev);
2178 /* Move to the next device in the list */
2179 dev_ll = dev_ll->next;
2188 * Add an entry to a used linked list. A free entry must first be found
2189 * in the free linked list using get_data_ll_free_entry().
2192 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2193 struct virtio_net_data_ll *ll_dev)
2195 struct virtio_net_data_ll *ll = *ll_root_addr;
2197 /* Set next as NULL and use a compiler barrier to avoid reordering. */
2198 ll_dev->next = NULL;
2199 rte_compiler_barrier();
2201 /* If ll == NULL then this is the first device. */
2203 /* Increment to the tail of the linked list. */
2204 while (ll->next != NULL)
2209 *ll_root_addr = ll_dev;
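/*
 * Usage sketch (hypothetical caller): entries always migrate from a
 * free list to a used list, e.g. when binding a device to a core:
 *
 *   struct virtio_net_data_ll *entry;
 *
 *   entry = get_data_ll_free_entry(&lcore_ll->ll_root_free);
 *   if (entry != NULL) {
 *           entry->dev = dev;
 *           add_data_ll_entry(&lcore_ll->ll_root_used, entry);
 *   }
 */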
2214 * Remove an entry from a used linked list. The entry must then be added to
2215 * the free linked list using put_data_ll_free_entry().
2218 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
2219 struct virtio_net_data_ll *ll_dev,
2220 struct virtio_net_data_ll *ll_dev_last)
2222 struct virtio_net_data_ll *ll = *ll_root_addr;
2224 if (unlikely((ll == NULL) || (ll_dev == NULL)))
2228 *ll_root_addr = ll_dev->next;
2230 if (likely(ll_dev_last != NULL))
2231 ll_dev_last->next = ll_dev->next;
2233 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
2237 * Find and return an entry from the free linked list.
2239 static struct virtio_net_data_ll *
2240 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
2242 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2243 struct virtio_net_data_ll *ll_dev;
2245 if (ll_free == NULL)
2249 *ll_root_addr = ll_free->next;
2255 * Place an entry back on to the free linked list.
2258 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
2259 struct virtio_net_data_ll *ll_dev)
2261 struct virtio_net_data_ll *ll_free = *ll_root_addr;
2266 ll_dev->next = ll_free;
2267 *ll_root_addr = ll_dev;
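/*
 * Removal sketch (hypothetical caller): the inverse path unlinks the
 * entry from the used list, passing its predecessor (entry_prev, found
 * while walking the list), then returns it to the free list:
 *
 *   rm_data_ll_entry(&lcore_ll->ll_root_used, entry, entry_prev);
 *   put_data_ll_free_entry(&lcore_ll->ll_root_free, entry);
 */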
2271 * Creates a linked list of a given size.
2273 static struct virtio_net_data_ll *
2274 alloc_data_ll(uint32_t size)
2276 struct virtio_net_data_ll *ll_new;
2279 /* Malloc and then chain the linked list. */
2280 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
2281 if (ll_new == NULL) {
2282 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
2286 for (i = 0; i < size - 1; i++) {
2287 ll_new[i].dev = NULL;
2288 ll_new[i].next = &ll_new[i+1];
2290 ll_new[i].next = NULL;
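/*
 * Example (illustrative): alloc_data_ll(3) returns a single malloc'd
 * block of three entries chained in place:
 *
 *   ll_new[0].next == &ll_new[1]
 *   ll_new[1].next == &ll_new[2]
 *   ll_new[2].next == NULL
 */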
2296 * Create the main linked list along with each individual core's linked list.
2297 * A used and a free list are created to manage entries.
2304 RTE_LCORE_FOREACH_SLAVE(lcore) {
2305 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
2306 if (lcore_info[lcore].lcore_ll == NULL) {
2307 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
2311 lcore_info[lcore].lcore_ll->device_num = 0;
2312 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2313 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
2314 if (num_devices % num_switching_cores)
2315 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
2317 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
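/*
 * Sizing example (illustrative values): with num_devices = 8 and
 * num_switching_cores = 3, the remainder is non-zero, so each core's
 * free list gets 8 / 3 + 1 = 3 entries; the three cores together can
 * then hold all 8 devices.
 */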
2320 /* Allocate devices up to a maximum of MAX_DEVICES. */
2321 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
2327 * Set virtqueue flags so that we do not receive interrupts.
2330 set_irq_status (struct virtio_net *dev)
2332 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2333 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
2337 * Remove a device from its data core's linked list and from the main linked list. Synchronization
2338 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid
2339 * re-ordering of dev->remove=1, which could otherwise cause an infinite loop in the rte_pause loop.
2342 destroy_device (volatile struct virtio_net *dev)
2344 struct virtio_net_data_ll *ll_lcore_dev_cur;
2345 struct virtio_net_data_ll *ll_main_dev_cur;
2346 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
2347 struct virtio_net_data_ll *ll_main_dev_last = NULL;
2350 dev->flags &= ~VIRTIO_DEV_RUNNING;
2352 /* Set the remove flag. */
2355 while (dev->ready != DEVICE_SAFE_REMOVE) {
2359 /* Search for entry to be removed from lcore ll */
2360 ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
2361 while (ll_lcore_dev_cur != NULL) {
2362 if (ll_lcore_dev_cur->dev == dev) {
2365 ll_lcore_dev_last = ll_lcore_dev_cur;
2366 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
2370 if (ll_lcore_dev_cur == NULL) {
2371 RTE_LOG(ERR, VHOST_CONFIG,
2372 "(%"PRIu64") Failed to find the dev to be destroy.\n",
2377 /* Search for entry to be removed from main ll */
2378 ll_main_dev_cur = ll_root_used;
2379 ll_main_dev_last = NULL;
2380 while (ll_main_dev_cur != NULL) {
2381 if (ll_main_dev_cur->dev == dev) {
2384 ll_main_dev_last = ll_main_dev_cur;
2385 ll_main_dev_cur = ll_main_dev_cur->next;
2389 /* Remove entries from the lcore and main ll. */
2390 rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
2391 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
2393 /* Set the dev_removal_flag on each lcore. */
2394 RTE_LCORE_FOREACH_SLAVE(lcore) {
2395 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
2399 * Once each core has set its dev_removal_flag to ACK_DEV_REMOVAL we can be sure
2400 * that the cores can no longer access the device removed from the linked lists
2401 * and that it is no longer in use.
2403 RTE_LCORE_FOREACH_SLAVE(lcore) {
2404 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
2409 /* Add the entries back to the lcore and main free ll.*/
2410 put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
2411 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
2413 /* Decrement number of device on the lcore. */
2414 lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
2416 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
2419 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2421 /* Stop the RX queue. */
2422 if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2423 LOG_DEBUG(VHOST_CONFIG,
2424 "(%"PRIu64") In destroy_device: Failed to stop "
2430 LOG_DEBUG(VHOST_CONFIG,
2431 "(%"PRIu64") in destroy_device: Start put mbuf in "
2432 "mempool back to ring for RX queue: %d\n",
2433 dev->device_fh, dev->vmdq_rx_q);
2435 mbuf_destroy_zcp(vpool);
2437 /* Stop the TX queue. */
2438 if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
2439 LOG_DEBUG(VHOST_CONFIG,
2440 "(%"PRIu64") In destroy_device: Failed to "
2441 "stop tx queue:%d\n",
2442 dev->device_fh, dev->vmdq_rx_q);
2445 vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
2447 LOG_DEBUG(VHOST_CONFIG,
2448 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
2449 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
2450 dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
2453 mbuf_destroy_zcp(vpool);
2459 * A new device is added to a data core. First the device is added to the main linked list
2460 * and then allocated to a specific data core.
2463 new_device (struct virtio_net *dev)
2465 struct virtio_net_data_ll *ll_dev;
2466 int lcore, core_add = 0;
2467 uint32_t device_num_min = num_devices;
2469 /* Add device to main ll */
2470 ll_dev = get_data_ll_free_entry(&ll_root_free);
2471 if (ll_dev == NULL) {
2472 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
2473 "of %d devices per core has been reached\n",
2474 dev->device_fh, num_devices);
2478 add_data_ll_entry(&ll_root_used, ll_dev);
2479 ll_dev->dev->vmdq_rx_q
2480 = ll_dev->dev->device_fh * (num_queues / num_devices);
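/*
 * Queue mapping example (illustrative values): with num_queues = 128
 * and num_devices = 32, each device owns a stride of 128 / 32 = 4
 * queues, so device_fh 5 is assigned VMDQ RX queue 5 * 4 = 20.
 */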
2483 uint32_t index = ll_dev->dev->vmdq_rx_q;
2484 uint32_t count_in_ring, i;
2485 struct mbuf_table *tx_q;
2487 count_in_ring = rte_ring_count(vpool_array[index].ring);
2489 LOG_DEBUG(VHOST_CONFIG,
2490 "(%"PRIu64") in new_device: mbuf count in mempool "
2491 "before attach is: %d\n",
2493 rte_mempool_count(vpool_array[index].pool));
2494 LOG_DEBUG(VHOST_CONFIG,
2495 "(%"PRIu64") in new_device: mbuf count in ring "
2496 "before attach is : %d\n",
2497 dev->device_fh, count_in_ring);
2500 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
2502 for (i = 0; i < count_in_ring; i++)
2503 attach_rxmbuf_zcp(dev);
2505 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2506 "mempool after attach is: %d\n",
2508 rte_mempool_count(vpool_array[index].pool));
2509 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
2510 "ring after attach is : %d\n",
2512 rte_ring_count(vpool_array[index].ring));
2514 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2515 tx_q->txq_id = dev->vmdq_rx_q;
2517 if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2518 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2520 LOG_DEBUG(VHOST_CONFIG,
2521 "(%"PRIu64") In new_device: Failed to start "
2523 dev->device_fh, dev->vmdq_rx_q);
2525 mbuf_destroy_zcp(vpool);
2529 if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
2530 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
2532 LOG_DEBUG(VHOST_CONFIG,
2533 "(%"PRIu64") In new_device: Failed to start "
2535 dev->device_fh, dev->vmdq_rx_q);
2537 /* Stop the TX queue. */
2538 if (rte_eth_dev_tx_queue_stop(ports[0],
2539 dev->vmdq_rx_q) != 0) {
2540 LOG_DEBUG(VHOST_CONFIG,
2541 "(%"PRIu64") In new_device: Failed to "
2542 "stop tx queue:%d\n",
2543 dev->device_fh, dev->vmdq_rx_q);
2546 mbuf_destroy_zcp(vpool);
2552 /* Reset the ready flag. */
2553 dev->ready = DEVICE_MAC_LEARNING;
2556 /* Find a suitable lcore to add the device. */
2557 RTE_LCORE_FOREACH_SLAVE(lcore) {
2558 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
2559 device_num_min = lcore_info[lcore].lcore_ll->device_num;
2563 /* Add device to lcore ll */
2564 ll_dev->dev->coreid = core_add;
2565 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
2566 if (ll_dev == NULL) {
2567 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
2568 dev->ready = DEVICE_SAFE_REMOVE;
2569 destroy_device(dev);
2573 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
2575 /* Initialize device stats */
2576 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
2578 /* Disable notifications. */
2579 set_irq_status(dev);
2580 lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
2581 dev->flags |= VIRTIO_DEV_RUNNING;
2583 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
2589 * These callbacks allow devices to be added to a data core when configuration
2590 * has fully completed.
2592 static const struct virtio_net_device_ops virtio_net_device_ops =
2594 .new_device = new_device,
2595 .destroy_device = destroy_device,
2599 * This is a thread that will wake up periodically to print stats if the user has enabled them.
2605 struct virtio_net_data_ll *dev_ll;
2606 uint64_t tx_dropped, rx_dropped;
2607 uint64_t tx, tx_total, rx, rx_total;
2609 const char clr[] = { 27, '[', '2', 'J', '\0' };
2610 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
2613 sleep(enable_stats);
2615 /* Clear screen and move to top left */
2616 printf("%s%s", clr, top_left);
2618 printf("\nDevice statistics ====================================");
2620 dev_ll = ll_root_used;
2621 while (dev_ll != NULL) {
2622 device_fh = (uint32_t)dev_ll->dev->device_fh;
2623 tx_total = dev_statistics[device_fh].tx_total;
2624 tx = dev_statistics[device_fh].tx;
2625 tx_dropped = tx_total - tx;
2626 if (zero_copy == 0) {
2627 rx_total = rte_atomic64_read(
2628 &dev_statistics[device_fh].rx_total_atomic);
2629 rx = rte_atomic64_read(
2630 &dev_statistics[device_fh].rx_atomic);
2632 rx_total = dev_statistics[device_fh].rx_total;
2633 rx = dev_statistics[device_fh].rx;
2635 rx_dropped = rx_total - rx;
2637 printf("\nStatistics for device %"PRIu32" ------------------------------"
2638 "\nTX total: %"PRIu64""
2639 "\nTX dropped: %"PRIu64""
2640 "\nTX successful: %"PRIu64""
2641 "\nRX total: %"PRIu64""
2642 "\nRX dropped: %"PRIu64""
2643 "\nRX successful: %"PRIu64"",
2652 dev_ll = dev_ll->next;
2654 printf("\n======================================================\n");
2659 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
2660 char *ring_name, uint32_t nb_mbuf)
2662 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
2663 vpool_array[index].pool
2664 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
2665 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
2666 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
2667 rte_pktmbuf_init, NULL, socket, 0);
2668 if (vpool_array[index].pool != NULL) {
2669 vpool_array[index].ring
2670 = rte_ring_create(ring_name,
2671 rte_align32pow2(nb_mbuf + 1),
2672 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
2673 if (likely(vpool_array[index].ring != NULL)) {
2674 LOG_DEBUG(VHOST_CONFIG,
2675 "in setup_mempool_tbl: mbuf count in "
2677 rte_mempool_count(vpool_array[index].pool));
2678 LOG_DEBUG(VHOST_CONFIG,
2679 "in setup_mempool_tbl: mbuf count in "
2681 rte_ring_count(vpool_array[index].ring));
2683 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
2687 /* Need to account for headroom. */
2688 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
2690 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
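/*
 * Design note (sketch): rte_ring sizes must be a power of two and one
 * slot always stays unused, hence rte_align32pow2(nb_mbuf + 1) above.
 * For an assumed nb_mbuf of 4096 the ring is created with
 * rte_align32pow2(4097) = 8192 slots, enough to hold every mbuf of the
 * pool at once.
 */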
2696 * Main function, does initialisation and calls the per-lcore functions. The CUSE
2697 * device is also registered here to handle the IOCTLs.
2700 MAIN(int argc, char *argv[])
2702 struct rte_mempool *mbuf_pool = NULL;
2703 unsigned lcore_id, core_id = 0;
2704 unsigned nb_ports, valid_num_ports;
2706 uint8_t portid, queue_id = 0;
2707 static pthread_t tid;
2710 ret = rte_eal_init(argc, argv);
2712 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
2716 /* parse app arguments */
2717 ret = us_vhost_parse_args(argc, argv);
2719 rte_exit(EXIT_FAILURE, "Invalid argument\n");
2721 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id++)
2722 if (rte_lcore_is_enabled(lcore_id))
2723 lcore_ids[core_id++] = lcore_id;
2725 if (rte_lcore_count() > RTE_MAX_LCORE)
2726 rte_exit(EXIT_FAILURE, "Not enough cores\n");
2728 /* Set the number of switching cores available. */
2729 num_switching_cores = rte_lcore_count() - 1;
2731 /* Get the number of physical ports. */
2732 nb_ports = rte_eth_dev_count();
2733 if (nb_ports > RTE_MAX_ETHPORTS)
2734 nb_ports = RTE_MAX_ETHPORTS;
2737 * Update the global var NUM_PORTS and global array PORTS,
2738 * and get the value of var VALID_NUM_PORTS according to the number of ports in the system.
2740 valid_num_ports = check_ports_num(nb_ports);
2742 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
2743 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
2744 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
2748 if (zero_copy == 0) {
2749 /* Create the mbuf pool. */
2750 mbuf_pool = rte_mempool_create(
2754 MBUF_SIZE, MBUF_CACHE_SIZE,
2755 sizeof(struct rte_pktmbuf_pool_private),
2756 rte_pktmbuf_pool_init, NULL,
2757 rte_pktmbuf_init, NULL,
2758 rte_socket_id(), 0);
2759 if (mbuf_pool == NULL)
2760 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
2762 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
2763 vpool_array[queue_id].pool = mbuf_pool;
2765 if (vm2vm_mode == VM2VM_HARDWARE) {
2766 /* Enable VT loop back to let L2 switch to do it. */
2767 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2768 LOG_DEBUG(VHOST_CONFIG,
2769 "Enable loop back for L2 switch in vmdq.\n");
2773 char pool_name[RTE_MEMPOOL_NAMESIZE];
2774 char ring_name[RTE_MEMPOOL_NAMESIZE];
2777 * Zero copy defers queue RX/TX start to the time when guest
2778 * finishes its startup and packet buffers from that guest are available.
2781 rx_conf_default.rx_deferred_start = (uint8_t)zero_copy;
2782 rx_conf_default.rx_drop_en = 0;
2783 tx_conf_default.tx_deferred_start = (uint8_t)zero_copy;
2784 nb_mbuf = num_rx_descriptor
2785 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2786 + num_switching_cores * MAX_PKT_BURST;
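/*
 * Worked example (illustrative values): with num_rx_descriptor = 128,
 * num_switching_cores = 3, MBUF_CACHE_SIZE_ZCP = 0 and an assumed
 * MAX_PKT_BURST of 32, each RX vpool is sized as
 *
 *   nb_mbuf = 128 + 3 * 0 + 3 * 32 = 224 mbufs.
 */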
2788 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2789 snprintf(pool_name, sizeof(pool_name),
2790 "rxmbuf_pool_%u", queue_id);
2791 snprintf(ring_name, sizeof(ring_name),
2792 "rxmbuf_ring_%u", queue_id);
2793 setup_mempool_tbl(rte_socket_id(), queue_id,
2794 pool_name, ring_name, nb_mbuf);
2797 nb_mbuf = num_tx_descriptor
2798 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
2799 + num_switching_cores * MAX_PKT_BURST;
2801 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
2802 snprintf(pool_name, sizeof(pool_name),
2803 "txmbuf_pool_%u", queue_id);
2804 snprintf(ring_name, sizeof(ring_name),
2805 "txmbuf_ring_%u", queue_id);
2806 setup_mempool_tbl(rte_socket_id(),
2807 (queue_id + MAX_QUEUES),
2808 pool_name, ring_name, nb_mbuf);
2811 if (vm2vm_mode == VM2VM_HARDWARE) {
2812 /* Enable VT loop back to let L2 switch to do it. */
2813 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
2814 LOG_DEBUG(VHOST_CONFIG,
2815 "Enable loop back for L2 switch in vmdq.\n");
2818 /* Set log level. */
2819 rte_set_log_level(LOG_LEVEL);
2821 /* initialize all ports */
2822 for (portid = 0; portid < nb_ports; portid++) {
2823 /* skip ports that are not enabled */
2824 if ((enabled_port_mask & (1 << portid)) == 0) {
2825 RTE_LOG(INFO, VHOST_PORT,
2826 "Skipping disabled port %d\n", portid);
2829 if (port_init(portid) != 0)
2830 rte_exit(EXIT_FAILURE,
2831 "Cannot initialize network ports\n");
2834 /* Initialise all linked lists. */
2835 if (init_data_ll() == -1)
2836 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
2838 /* Initialize device stats */
2839 memset(&dev_statistics, 0, sizeof(dev_statistics));
2841 /* Enable stats if the user option is set. */
2843 pthread_create(&tid, NULL, (void *)print_stats, NULL);
2845 /* Launch all data cores. */
2846 if (zero_copy == 0) {
2847 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
2848 rte_eal_remote_launch(switch_worker,
2849 mbuf_pool, lcore_id);
2852 uint32_t count_in_mempool, index, i;
2853 for (index = 0; index < 2*MAX_QUEUES; index++) {
2854 /* For all RX and TX queues. */
2856 = rte_mempool_count(vpool_array[index].pool);
2859 * Transfer all un-attached mbufs from vpool.pool to vpool.ring.
2862 for (i = 0; i < count_in_mempool; i++) {
2863 struct rte_mbuf *mbuf
2864 = __rte_mbuf_raw_alloc(
2865 vpool_array[index].pool);
2866 rte_ring_sp_enqueue(vpool_array[index].ring,
2870 LOG_DEBUG(VHOST_CONFIG,
2871 "in MAIN: mbuf count in mempool at initial "
2872 "is: %d\n", count_in_mempool);
2873 LOG_DEBUG(VHOST_CONFIG,
2874 "in MAIN: mbuf count in ring at initial is :"
2876 rte_ring_count(vpool_array[index].ring));
2879 RTE_LCORE_FOREACH_SLAVE(lcore_id)
2880 rte_eal_remote_launch(switch_worker_zcp, NULL,
2884 /* Register CUSE device to handle IOCTLs. */
2885 ret = register_cuse_device((char *)&dev_basename, dev_index, get_virtio_net_callbacks());
2887 rte_exit(EXIT_FAILURE, "CUSE device setup failure.\n");
2889 init_virtio_net(&virtio_net_device_ops);
2891 /* Start CUSE session. */
2892 start_cuse_session_loop();