/*-
 *   BSD LICENSE
 *
 *   Copyright(c) 2010-2014 Intel Corporation. All rights reserved.
 *   All rights reserved.
 *
 *   Redistribution and use in source and binary forms, with or without
 *   modification, are permitted provided that the following conditions
 *   are met:
 *
 *     * Redistributions of source code must retain the above copyright
 *       notice, this list of conditions and the following disclaimer.
 *     * Redistributions in binary form must reproduce the above copyright
 *       notice, this list of conditions and the following disclaimer in
 *       the documentation and/or other materials provided with the
 *       distribution.
 *     * Neither the name of Intel Corporation nor the names of its
 *       contributors may be used to endorse or promote products derived
 *       from this software without specific prior written permission.
 *
 *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
34 #include <arpa/inet.h>
36 #include <linux/if_ether.h>
37 #include <linux/if_vlan.h>
38 #include <linux/virtio_net.h>
39 #include <linux/virtio_ring.h>
42 #include <sys/eventfd.h>
43 #include <sys/param.h>
46 #include <rte_atomic.h>
47 #include <rte_cycles.h>
48 #include <rte_ethdev.h>
50 #include <rte_string_fns.h>
51 #include <rte_malloc.h>
54 #include "virtio-net.h"
55 #include "vhost-net-cdev.h"
57 #define MAX_QUEUES 128
59 /* the maximum number of external ports supported */
60 #define MAX_SUP_PORTS 1
63 * Calculate the number of buffers needed per port
65 #define NUM_MBUFS_PER_PORT ((MAX_QUEUES*RTE_TEST_RX_DESC_DEFAULT) + \
66 (num_switching_cores*MAX_PKT_BURST) + \
67 (num_switching_cores*RTE_TEST_TX_DESC_DEFAULT) +\
68 (num_switching_cores*MBUF_CACHE_SIZE))
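/*
 * Worked example (hypothetical core count): with MAX_QUEUES = 128,
 * RTE_TEST_RX_DESC_DEFAULT = 1024 and, say, num_switching_cores = 4,
 * this evaluates to 128*1024 + 4*32 + 4*512 + 4*128 = 133,760 mbufs
 * per port.
 */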
70 #define MBUF_CACHE_SIZE 128
71 #define MBUF_SIZE (2048 + sizeof(struct rte_mbuf) + RTE_PKTMBUF_HEADROOM)
/*
 * No frame data buffers allocated from the host are required for the zero
 * copy implementation; the guest allocates the frame data buffers and vhost
 * uses them directly.
 */
#define VIRTIO_DESCRIPTOR_LEN_ZCP 1518
79 #define MBUF_SIZE_ZCP (VIRTIO_DESCRIPTOR_LEN_ZCP + sizeof(struct rte_mbuf) \
80 + RTE_PKTMBUF_HEADROOM)
81 #define MBUF_CACHE_SIZE_ZCP 0
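/*
 * Note: 1518 bytes is the maximum size of a standard (untagged) Ethernet
 * frame, so each zero-copy mbuf maps to exactly one guest frame buffer.
 * The mempool cache is disabled (size 0), presumably so that no mbufs sit
 * in per-lcore caches where the zero-copy attach/detach logic cannot see
 * them.
 */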
84 * RX and TX Prefetch, Host, and Write-back threshold values should be
85 * carefully set for optimal performance. Consult the network
86 * controller's datasheet and supporting DPDK documentation for guidance
87 * on how these parameters should be set.
89 #define RX_PTHRESH 8 /* Default values of RX prefetch threshold reg. */
90 #define RX_HTHRESH 8 /* Default values of RX host threshold reg. */
91 #define RX_WTHRESH 4 /* Default values of RX write-back threshold reg. */
94 * These default values are optimized for use with the Intel(R) 82599 10 GbE
95 * Controller and the DPDK ixgbe PMD. Consider using other values for other
96 * network controllers and/or network drivers.
98 #define TX_PTHRESH 36 /* Default values of TX prefetch threshold reg. */
99 #define TX_HTHRESH 0 /* Default values of TX host threshold reg. */
100 #define TX_WTHRESH 0 /* Default values of TX write-back threshold reg. */
102 #define MAX_PKT_BURST 32 /* Max burst size for RX/TX */
103 #define MAX_MRG_PKT_BURST 16 /* Max burst for merge buffers. Set to 1 due to performance issue. */
104 #define BURST_TX_DRAIN_US 100 /* TX drain every ~100us */
106 #define BURST_RX_WAIT_US 15 /* Defines how long we wait between retries on RX */
107 #define BURST_RX_RETRIES 4 /* Number of retries on RX. */
109 #define JUMBO_FRAME_MAX_SIZE 0x2600
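/* 0x2600 = 9728 bytes, comfortably above a typical 9000-byte jumbo MTU. */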
111 /* State of virtio device. */
112 #define DEVICE_MAC_LEARNING 0
114 #define DEVICE_SAFE_REMOVE 2
116 /* Config_core_flag status definitions. */
117 #define REQUEST_DEV_REMOVAL 1
118 #define ACK_DEV_REMOVAL 0
120 /* Configurable number of RX/TX ring descriptors */
121 #define RTE_TEST_RX_DESC_DEFAULT 1024
122 #define RTE_TEST_TX_DESC_DEFAULT 512
/*
 * These two macros need refining for the legacy and DPDK based front ends:
 * max vring avail descriptors/entries from guest - MAX_PKT_BURST,
 * then adjusted to a power of 2.
 *
 * For the legacy front end, 128 descriptors:
 * half for virtio headers, the other half for mbufs.
 */
#define RTE_TEST_RX_DESC_DEFAULT_ZCP 32   /* legacy: 32, DPDK virt FE: 128. */
#define RTE_TEST_TX_DESC_DEFAULT_ZCP 64   /* legacy: 64, DPDK virt FE: 64. */
136 /* Get first 4 bytes in mbuf headroom. */
137 #define MBUF_HEADROOM_UINT32(mbuf) (*(uint32_t *)((uint8_t *)(mbuf) \
138 + sizeof(struct rte_mbuf)))
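/*
 * The zero-copy path uses this to stash the vring descriptor index in the
 * first 4 bytes of the headroom, which sit immediately after the rte_mbuf
 * structure itself.
 */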
140 /* true if x is a power of 2 */
141 #define POWEROF2(x) ((((x)-1) & (x)) == 0)
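/*
 * For example, POWEROF2(64) is true and POWEROF2(48) is false. Note that 0
 * also satisfies the test, so callers should reject 0 separately if needed.
 */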
143 #define INVALID_PORT_ID 0xFF
145 /* Max number of devices. Limited by vmdq. */
146 #define MAX_DEVICES 64
148 /* Size of buffers used for snprintfs. */
149 #define MAX_PRINT_BUFF 6072
151 /* Maximum character device basename size. */
152 #define MAX_BASENAME_SZ 10
154 /* Maximum long option length for option parsing. */
155 #define MAX_LONG_OPT_SZ 64
157 /* Used to compare MAC addresses. */
158 #define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
160 /* Number of descriptors per cacheline. */
161 #define DESC_PER_CACHELINE (CACHE_LINE_SIZE / sizeof(struct vring_desc))
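/*
 * With a 64-byte cache line and the 16-byte struct vring_desc (8-byte addr,
 * 4-byte len, 2-byte flags, 2-byte next) this works out to 4 descriptors
 * per cache line.
 */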
163 /* mask of enabled ports */
164 static uint32_t enabled_port_mask = 0;
166 /*Number of switching cores enabled*/
167 static uint32_t num_switching_cores = 0;
169 /* number of devices/queues to support*/
170 static uint32_t num_queues = 0;
171 uint32_t num_devices = 0;
/*
 * Enable zero copy: packet buffers are DMA'd directly into the buffers
 * referenced by the HW descriptors; disabled by default.
 */
177 static uint32_t zero_copy;
179 /* number of descriptors to apply*/
180 static uint32_t num_rx_descriptor = RTE_TEST_RX_DESC_DEFAULT_ZCP;
181 static uint32_t num_tx_descriptor = RTE_TEST_TX_DESC_DEFAULT_ZCP;
/* Max ring descriptors; ixgbe, i40e and e1000 all support 4096. */
184 #define MAX_RING_DESC 4096
187 struct rte_mempool *pool;
188 struct rte_ring *ring;
190 } vpool_array[MAX_QUEUES+MAX_QUEUES];
192 /* Enable VM2VM communications. If this is disabled then the MAC address compare is skipped. */
199 static vm2vm_type vm2vm_mode = VM2VM_SOFTWARE;
201 /* The type of host physical address translated from guest physical address. */
203 PHYS_ADDR_CONTINUOUS = 0,
204 PHYS_ADDR_CROSS_SUBREG = 1,
205 PHYS_ADDR_INVALID = 2,
210 static uint32_t enable_stats = 0;
211 /* Enable retries on RX. */
212 static uint32_t enable_retry = 1;
/* Specify the timeout (in microseconds) between retries on RX. */
214 static uint32_t burst_rx_delay_time = BURST_RX_WAIT_US;
215 /* Specify the number of retries on RX. */
216 static uint32_t burst_rx_retry_num = BURST_RX_RETRIES;
218 /* Character device basename. Can be set by user. */
219 static char dev_basename[MAX_BASENAME_SZ] = "vhost-net";
/* Character device index. Can be set by user. */
222 static uint32_t dev_index = 0;
224 /* This can be set by the user so it is made available here. */
225 extern uint64_t VHOST_FEATURES;
227 /* Default configuration for rx and tx thresholds etc. */
228 static struct rte_eth_rxconf rx_conf_default = {
230 .pthresh = RX_PTHRESH,
231 .hthresh = RX_HTHRESH,
232 .wthresh = RX_WTHRESH,
238 * These default values are optimized for use with the Intel(R) 82599 10 GbE
239 * Controller and the DPDK ixgbe/igb PMD. Consider using other values for other
240 * network controllers and/or network drivers.
242 static struct rte_eth_txconf tx_conf_default = {
244 .pthresh = TX_PTHRESH,
245 .hthresh = TX_HTHRESH,
246 .wthresh = TX_WTHRESH,
248 .tx_free_thresh = 0, /* Use PMD default values */
249 .tx_rs_thresh = 0, /* Use PMD default values */
/* Empty vmdq configuration structure. Filled in programmatically. */
253 static struct rte_eth_conf vmdq_conf_default = {
255 .mq_mode = ETH_MQ_RX_VMDQ_ONLY,
257 .header_split = 0, /**< Header Split disabled */
258 .hw_ip_checksum = 0, /**< IP checksum offload disabled */
259 .hw_vlan_filter = 0, /**< VLAN filtering disabled */
/*
 * VLAN stripping is necessary for 1G NICs such as the I350;
 * it fixes a bug where IPv4 forwarding in the guest could not
 * forward packets from one virtio dev to another virtio dev.
 */
265 .hw_vlan_strip = 1, /**< VLAN strip enabled. */
266 .jumbo_frame = 0, /**< Jumbo Frame Support disabled */
267 .hw_strip_crc = 0, /**< CRC stripped by hardware */
271 .mq_mode = ETH_MQ_TX_NONE,
275 * should be overridden separately in code with
279 .nb_queue_pools = ETH_8_POOLS,
280 .enable_default_pool = 0,
283 .pool_map = {{0, 0},},
288 static unsigned lcore_ids[RTE_MAX_LCORE];
289 static uint8_t ports[RTE_MAX_ETHPORTS];
290 static unsigned num_ports = 0; /**< The number of ports specified in command line */
292 static const uint16_t external_pkt_default_vlan_tag = 2000;
293 const uint16_t vlan_tags[] = {
294 1000, 1001, 1002, 1003, 1004, 1005, 1006, 1007,
295 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015,
296 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
297 1024, 1025, 1026, 1027, 1028, 1029, 1030, 1031,
298 1032, 1033, 1034, 1035, 1036, 1037, 1038, 1039,
299 1040, 1041, 1042, 1043, 1044, 1045, 1046, 1047,
300 1048, 1049, 1050, 1051, 1052, 1053, 1054, 1055,
301 1056, 1057, 1058, 1059, 1060, 1061, 1062, 1063,
304 /* ethernet addresses of ports */
305 static struct ether_addr vmdq_ports_eth_addr[RTE_MAX_ETHPORTS];
307 /* heads for the main used and free linked lists for the data path. */
308 static struct virtio_net_data_ll *ll_root_used = NULL;
309 static struct virtio_net_data_ll *ll_root_free = NULL;
311 /* Array of data core structures containing information on individual core linked lists. */
312 static struct lcore_info lcore_info[RTE_MAX_LCORE];
314 /* Used for queueing bursts of TX packets. */
318 struct rte_mbuf *m_table[MAX_PKT_BURST];
321 /* TX queue for each data core. */
322 struct mbuf_table lcore_tx_queue[RTE_MAX_LCORE];
/* TX queue for each virtio device for zero copy. */
325 struct mbuf_table tx_queue_zcp[MAX_QUEUES];
327 /* Vlan header struct used to insert vlan tags on TX. */
329 unsigned char h_dest[ETH_ALEN];
330 unsigned char h_source[ETH_ALEN];
333 __be16 h_vlan_encapsulated_proto;
338 uint8_t version_ihl; /**< version and header length */
339 uint8_t type_of_service; /**< type of service */
340 uint16_t total_length; /**< length of packet */
341 uint16_t packet_id; /**< packet ID */
342 uint16_t fragment_offset; /**< fragmentation offset */
343 uint8_t time_to_live; /**< time to live */
344 uint8_t next_proto_id; /**< protocol ID */
345 uint16_t hdr_checksum; /**< header checksum */
346 uint32_t src_addr; /**< source address */
347 uint32_t dst_addr; /**< destination address */
348 } __attribute__((__packed__));
/* Header lengths. */
#define VLAN_HLEN       4
#define VLAN_ETH_HLEN   18
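/*
 * VLAN_ETH_HLEN is the 14-byte Ethernet header plus the 4-byte 802.1Q tag
 * (VLAN_HLEN) that virtio_tx_route() inserts on the way out.
 */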
354 /* Per-device statistics struct */
355 struct device_statistics {
357 rte_atomic64_t rx_total_atomic;
360 rte_atomic64_t rx_atomic;
362 } __rte_cache_aligned;
363 struct device_statistics dev_statistics[MAX_DEVICES];
366 * Builds up the correct configuration for VMDQ VLAN pool map
367 * according to the pool & queue limits.
370 get_eth_conf(struct rte_eth_conf *eth_conf, uint32_t num_devices)
372 struct rte_eth_vmdq_rx_conf conf;
375 memset(&conf, 0, sizeof(conf));
376 conf.nb_queue_pools = (enum rte_eth_nb_pools)num_devices;
377 conf.nb_pool_maps = num_devices;
378 conf.enable_loop_back =
379 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back;
381 for (i = 0; i < conf.nb_pool_maps; i++) {
382 conf.pool_map[i].vlan_id = vlan_tags[ i ];
383 conf.pool_map[i].pools = (1UL << i);
386 (void)(rte_memcpy(eth_conf, &vmdq_conf_default, sizeof(*eth_conf)));
387 (void)(rte_memcpy(ð_conf->rx_adv_conf.vmdq_rx_conf, &conf,
388 sizeof(eth_conf->rx_adv_conf.vmdq_rx_conf)));
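	/*
	 * The result: VMDQ pool i accepts packets tagged with vlan_tags[i],
	 * so each virtio device (pool) gets its own VLAN ID on the wire.
	 */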
/*
 * Validate the device number against the max pool number obtained from
 * dev_info. If the device number is invalid, print an error message and
 * return -1. Each device must have its own pool.
 */
398 validate_num_devices(uint32_t max_nb_devices)
400 if (num_devices > max_nb_devices) {
401 RTE_LOG(ERR, VHOST_PORT, "invalid number of devices\n");
408 * Initialises a given port using global settings and with the rx buffers
409 * coming from the mbuf_pool passed as parameter
412 port_init(uint8_t port)
414 struct rte_eth_dev_info dev_info;
415 struct rte_eth_conf port_conf;
416 uint16_t rx_rings, tx_rings;
417 uint16_t rx_ring_size, tx_ring_size;
421 /* The max pool number from dev_info will be used to validate the pool number specified in cmd line */
422 rte_eth_dev_info_get (port, &dev_info);
424 /*configure the number of supported virtio devices based on VMDQ limits */
425 num_devices = dev_info.max_vmdq_pools;
426 num_queues = dev_info.max_rx_queues;
429 rx_ring_size = num_rx_descriptor;
430 tx_ring_size = num_tx_descriptor;
431 tx_rings = dev_info.max_tx_queues;
433 rx_ring_size = RTE_TEST_RX_DESC_DEFAULT;
434 tx_ring_size = RTE_TEST_TX_DESC_DEFAULT;
435 tx_rings = (uint16_t)rte_lcore_count();
438 retval = validate_num_devices(MAX_DEVICES);
442 /* Get port configuration. */
443 retval = get_eth_conf(&port_conf, num_devices);
447 if (port >= rte_eth_dev_count()) return -1;
449 rx_rings = (uint16_t)num_queues,
450 /* Configure ethernet device. */
451 retval = rte_eth_dev_configure(port, rx_rings, tx_rings, &port_conf);
455 /* Setup the queues. */
456 for (q = 0; q < rx_rings; q ++) {
457 retval = rte_eth_rx_queue_setup(port, q, rx_ring_size,
458 rte_eth_dev_socket_id(port), &rx_conf_default,
459 vpool_array[q].pool);
463 for (q = 0; q < tx_rings; q ++) {
464 retval = rte_eth_tx_queue_setup(port, q, tx_ring_size,
465 rte_eth_dev_socket_id(port), &tx_conf_default);
470 /* Start the device. */
471 retval = rte_eth_dev_start(port);
473 RTE_LOG(ERR, VHOST_DATA, "Failed to start the device.\n");
477 rte_eth_macaddr_get(port, &vmdq_ports_eth_addr[port]);
478 RTE_LOG(INFO, VHOST_PORT, "Max virtio devices supported: %u\n", num_devices);
479 RTE_LOG(INFO, VHOST_PORT, "Port %u MAC: %02"PRIx8" %02"PRIx8" %02"PRIx8
480 " %02"PRIx8" %02"PRIx8" %02"PRIx8"\n",
482 vmdq_ports_eth_addr[port].addr_bytes[0],
483 vmdq_ports_eth_addr[port].addr_bytes[1],
484 vmdq_ports_eth_addr[port].addr_bytes[2],
485 vmdq_ports_eth_addr[port].addr_bytes[3],
486 vmdq_ports_eth_addr[port].addr_bytes[4],
487 vmdq_ports_eth_addr[port].addr_bytes[5]);
493 * Set character device basename.
496 us_vhost_parse_basename(const char *q_arg)
498 /* parse number string */
500 if (strnlen(q_arg, MAX_BASENAME_SZ) > MAX_BASENAME_SZ)
503 snprintf((char*)&dev_basename, MAX_BASENAME_SZ, "%s", q_arg);
509 * Parse the portmask provided at run time.
512 parse_portmask(const char *portmask)
519 /* parse hexadecimal string */
520 pm = strtoul(portmask, &end, 16);
521 if ((portmask[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
532 * Parse num options at run time.
535 parse_num_opt(const char *q_arg, uint32_t max_valid_value)
542 /* parse unsigned int string */
543 num = strtoul(q_arg, &end, 10);
544 if ((q_arg[0] == '\0') || (end == NULL) || (*end != '\0') || (errno != 0))
547 if (num > max_valid_value)
558 us_vhost_usage(const char *prgname)
560 RTE_LOG(INFO, VHOST_CONFIG, "%s [EAL options] -- -p PORTMASK\n"
562 " --rx_retry [0|1] --mergeable [0|1] --stats [0-N]\n"
563 " --dev-basename <name> --dev-index [0-N]\n"
565 " -p PORTMASK: Set mask for ports to be used by application\n"
566 " --vm2vm [0|1|2]: disable/software(default)/hardware vm2vm comms\n"
567 " --rx-retry [0|1]: disable/enable(default) retries on rx. Enable retry if destintation queue is full\n"
568 " --rx-retry-delay [0-N]: timeout(in usecond) between retries on RX. This makes effect only if retries on rx enabled\n"
569 " --rx-retry-num [0-N]: the number of retries on rx. This makes effect only if retries on rx enabled\n"
570 " --mergeable [0|1]: disable(default)/enable RX mergeable buffers\n"
571 " --stats [0-N]: 0: Disable stats, N: Time in seconds to print stats\n"
572 " --dev-basename: The basename to be used for the character device.\n"
573 " --dev-index [0-N]: Defaults to zero if not used. Index is appended to basename.\n"
574 " --zero-copy [0|1]: disable(default)/enable rx/tx "
576 " --rx-desc-num [0-N]: the number of descriptors on rx, "
577 "used only when zero copy is enabled.\n"
578 " --tx-desc-num [0-N]: the number of descriptors on tx, "
579 "used only when zero copy is enabled.\n",
584 * Parse the arguments given in the command line of the application.
587 us_vhost_parse_args(int argc, char **argv)
592 const char *prgname = argv[0];
593 static struct option long_option[] = {
594 {"vm2vm", required_argument, NULL, 0},
595 {"rx-retry", required_argument, NULL, 0},
596 {"rx-retry-delay", required_argument, NULL, 0},
597 {"rx-retry-num", required_argument, NULL, 0},
598 {"mergeable", required_argument, NULL, 0},
599 {"stats", required_argument, NULL, 0},
600 {"dev-basename", required_argument, NULL, 0},
601 {"dev-index", required_argument, NULL, 0},
602 {"zero-copy", required_argument, NULL, 0},
603 {"rx-desc-num", required_argument, NULL, 0},
604 {"tx-desc-num", required_argument, NULL, 0},
608 /* Parse command line */
609 while ((opt = getopt_long(argc, argv, "p:",long_option, &option_index)) != EOF) {
613 enabled_port_mask = parse_portmask(optarg);
614 if (enabled_port_mask == 0) {
615 RTE_LOG(INFO, VHOST_CONFIG, "Invalid portmask\n");
616 us_vhost_usage(prgname);
622 /* Enable/disable vm2vm comms. */
623 if (!strncmp(long_option[option_index].name, "vm2vm",
625 ret = parse_num_opt(optarg, (VM2VM_LAST - 1));
627 RTE_LOG(INFO, VHOST_CONFIG,
628 "Invalid argument for "
630 us_vhost_usage(prgname);
633 vm2vm_mode = (vm2vm_type)ret;
637 /* Enable/disable retries on RX. */
638 if (!strncmp(long_option[option_index].name, "rx-retry", MAX_LONG_OPT_SZ)) {
639 ret = parse_num_opt(optarg, 1);
641 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry [0|1]\n");
642 us_vhost_usage(prgname);
649 /* Specify the retries delay time (in useconds) on RX. */
650 if (!strncmp(long_option[option_index].name, "rx-retry-delay", MAX_LONG_OPT_SZ)) {
651 ret = parse_num_opt(optarg, INT32_MAX);
653 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-delay [0-N]\n");
654 us_vhost_usage(prgname);
657 burst_rx_delay_time = ret;
661 /* Specify the retries number on RX. */
662 if (!strncmp(long_option[option_index].name, "rx-retry-num", MAX_LONG_OPT_SZ)) {
663 ret = parse_num_opt(optarg, INT32_MAX);
665 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for rx-retry-num [0-N]\n");
666 us_vhost_usage(prgname);
669 burst_rx_retry_num = ret;
673 /* Enable/disable RX mergeable buffers. */
674 if (!strncmp(long_option[option_index].name, "mergeable", MAX_LONG_OPT_SZ)) {
675 ret = parse_num_opt(optarg, 1);
677 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for mergeable [0|1]\n");
678 us_vhost_usage(prgname);
682 vmdq_conf_default.rxmode.jumbo_frame = 1;
683 vmdq_conf_default.rxmode.max_rx_pkt_len
684 = JUMBO_FRAME_MAX_SIZE;
685 VHOST_FEATURES = (1ULL << VIRTIO_NET_F_MRG_RXBUF);
690 /* Enable/disable stats. */
691 if (!strncmp(long_option[option_index].name, "stats", MAX_LONG_OPT_SZ)) {
692 ret = parse_num_opt(optarg, INT32_MAX);
694 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for stats [0..N]\n");
695 us_vhost_usage(prgname);
702 /* Set character device basename. */
703 if (!strncmp(long_option[option_index].name, "dev-basename", MAX_LONG_OPT_SZ)) {
704 if (us_vhost_parse_basename(optarg) == -1) {
705 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device basename (Max %d characters)\n", MAX_BASENAME_SZ);
706 us_vhost_usage(prgname);
711 /* Set character device index. */
712 if (!strncmp(long_option[option_index].name, "dev-index", MAX_LONG_OPT_SZ)) {
713 ret = parse_num_opt(optarg, INT32_MAX);
715 RTE_LOG(INFO, VHOST_CONFIG, "Invalid argument for character device index [0..N]\n");
716 us_vhost_usage(prgname);
722 /* Enable/disable rx/tx zero copy. */
723 if (!strncmp(long_option[option_index].name,
724 "zero-copy", MAX_LONG_OPT_SZ)) {
725 ret = parse_num_opt(optarg, 1);
727 RTE_LOG(INFO, VHOST_CONFIG,
729 " for zero-copy [0|1]\n");
730 us_vhost_usage(prgname);
736 #ifdef RTE_MBUF_REFCNT
737 RTE_LOG(ERR, VHOST_CONFIG, "Before running "
738 "zero copy vhost APP, please "
739 "disable RTE_MBUF_REFCNT\n"
740 "in config file and then rebuild DPDK "
742 "Otherwise please disable zero copy "
743 "flag in command line!\n");
749 /* Specify the descriptor number on RX. */
750 if (!strncmp(long_option[option_index].name,
751 "rx-desc-num", MAX_LONG_OPT_SZ)) {
752 ret = parse_num_opt(optarg, MAX_RING_DESC);
753 if ((ret == -1) || (!POWEROF2(ret))) {
754 RTE_LOG(INFO, VHOST_CONFIG,
755 "Invalid argument for rx-desc-num[0-N],"
756 "power of 2 required.\n");
757 us_vhost_usage(prgname);
760 num_rx_descriptor = ret;
764 /* Specify the descriptor number on TX. */
765 if (!strncmp(long_option[option_index].name,
766 "tx-desc-num", MAX_LONG_OPT_SZ)) {
767 ret = parse_num_opt(optarg, MAX_RING_DESC);
768 if ((ret == -1) || (!POWEROF2(ret))) {
769 RTE_LOG(INFO, VHOST_CONFIG,
770 "Invalid argument for tx-desc-num [0-N],"
771 "power of 2 required.\n");
772 us_vhost_usage(prgname);
775 num_tx_descriptor = ret;
781 /* Invalid option - print options. */
783 us_vhost_usage(prgname);
788 for (i = 0; i < RTE_MAX_ETHPORTS; i++) {
789 if (enabled_port_mask & (1 << i))
790 ports[num_ports++] = (uint8_t)i;
793 if ((num_ports == 0) || (num_ports > MAX_SUP_PORTS)) {
794 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u,"
795 "but only %u port can be enabled\n",num_ports, MAX_SUP_PORTS);
799 if ((zero_copy == 1) && (vm2vm_mode == VM2VM_SOFTWARE)) {
800 RTE_LOG(INFO, VHOST_PORT,
801 "Vhost zero copy doesn't support software vm2vm,"
802 "please specify 'vm2vm 2' to use hardware vm2vm.\n");
806 if ((zero_copy == 1) && (vmdq_conf_default.rxmode.jumbo_frame == 1)) {
807 RTE_LOG(INFO, VHOST_PORT,
808 "Vhost zero copy doesn't support jumbo frame,"
809 "please specify '--mergeable 0' to disable the "
810 "mergeable feature.\n");
/*
 * Update the global var num_ports and array ports according to the number of
 * system ports and return the number of valid ports.
 */
821 static unsigned check_ports_num(unsigned nb_ports)
823 unsigned valid_num_ports = num_ports;
826 if (num_ports > nb_ports) {
827 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port number(%u) exceeds total system port number(%u)\n",
828 num_ports, nb_ports);
829 num_ports = nb_ports;
832 for (portid = 0; portid < num_ports; portid ++) {
833 if (ports[portid] >= nb_ports) {
834 RTE_LOG(INFO, VHOST_PORT, "\nSpecified port ID(%u) exceeds max system port ID(%u)\n",
835 ports[portid], (nb_ports - 1));
836 ports[portid] = INVALID_PORT_ID;
840 return valid_num_ports;
/*
 * Macro to print out packet contents. Wrapped in a debug define so that the
 * data path is not affected when debug is disabled.
 */
848 #define PRINT_PACKET(device, addr, size, header) do { \
849 char *pkt_addr = (char*)(addr); \
850 unsigned int index; \
851 char packet[MAX_PRINT_BUFF]; \
854 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Header size %d: ", (device->device_fh), (size)); \
856 snprintf(packet, MAX_PRINT_BUFF, "(%"PRIu64") Packet size %d: ", (device->device_fh), (size)); \
857 for (index = 0; index < (size); index++) { \
858 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), \
859 "%02hhx ", pkt_addr[index]); \
861 snprintf(packet + strnlen(packet, MAX_PRINT_BUFF), MAX_PRINT_BUFF - strnlen(packet, MAX_PRINT_BUFF), "\n"); \
863 LOG_DEBUG(VHOST_DATA, "%s", packet); \
866 #define PRINT_PACKET(device, addr, size, header) do{} while(0)
870 * Function to convert guest physical addresses to vhost virtual addresses. This
871 * is used to convert virtio buffer addresses.
873 static inline uint64_t __attribute__((always_inline))
874 gpa_to_vva(struct virtio_net *dev, uint64_t guest_pa)
876 struct virtio_memory_regions *region;
878 uint64_t vhost_va = 0;
880 for (regionidx = 0; regionidx < dev->mem->nregions; regionidx++) {
881 region = &dev->mem->regions[regionidx];
882 if ((guest_pa >= region->guest_phys_address) &&
883 (guest_pa <= region->guest_phys_address_end)) {
884 vhost_va = region->address_offset + guest_pa;
888 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| VVA %p\n",
889 dev->device_fh, (void*)(uintptr_t)guest_pa, (void*)(uintptr_t)vhost_va);
895 * Function to convert guest physical addresses to vhost physical addresses.
896 * This is used to convert virtio buffer addresses.
898 static inline uint64_t __attribute__((always_inline))
899 gpa_to_hpa(struct virtio_net *dev, uint64_t guest_pa,
900 uint32_t buf_len, hpa_type *addr_type)
902 struct virtio_memory_regions_hpa *region;
904 uint64_t vhost_pa = 0;
906 *addr_type = PHYS_ADDR_INVALID;
908 for (regionidx = 0; regionidx < dev->mem->nregions_hpa; regionidx++) {
909 region = &dev->mem->regions_hpa[regionidx];
910 if ((guest_pa >= region->guest_phys_address) &&
911 (guest_pa <= region->guest_phys_address_end)) {
912 vhost_pa = region->host_phys_addr_offset + guest_pa;
913 if (likely((guest_pa + buf_len - 1)
914 <= region->guest_phys_address_end))
915 *addr_type = PHYS_ADDR_CONTINUOUS;
917 *addr_type = PHYS_ADDR_CROSS_SUBREG;
922 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") GPA %p| HPA %p\n",
923 dev->device_fh, (void *)(uintptr_t)guest_pa,
924 (void *)(uintptr_t)vhost_pa);
/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. This function works when mergeable is disabled.
 */
935 static inline uint32_t __attribute__((always_inline))
936 virtio_dev_rx(struct virtio_net *dev, struct rte_mbuf **pkts, uint32_t count)
938 struct vhost_virtqueue *vq;
939 struct vring_desc *desc;
940 struct rte_mbuf *buff;
941 /* The virtio_hdr is initialised to 0. */
942 struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {{0,0,0,0,0,0},0};
943 uint64_t buff_addr = 0;
944 uint64_t buff_hdr_addr = 0;
945 uint32_t head[MAX_PKT_BURST], packet_len = 0;
946 uint32_t head_idx, packet_success = 0;
948 uint16_t avail_idx, res_cur_idx;
949 uint16_t res_base_idx, res_end_idx;
950 uint16_t free_entries;
953 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
954 vq = dev->virtqueue[VIRTIO_RXQ];
955 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
957 /* As many data cores may want access to available buffers, they need to be reserved. */
959 res_base_idx = vq->last_used_idx_res;
960 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
962 free_entries = (avail_idx - res_base_idx);
963 /* If retry is enabled and the queue is full then we wait and retry to avoid packet loss. */
964 if (enable_retry && unlikely(count > free_entries)) {
965 for (retry = 0; retry < burst_rx_retry_num; retry++) {
966 rte_delay_us(burst_rx_delay_time);
968 *((volatile uint16_t *)&vq->avail->idx);
969 free_entries = (avail_idx - res_base_idx);
970 if (count <= free_entries)
975 /*check that we have enough buffers*/
976 if (unlikely(count > free_entries))
977 count = free_entries;
982 res_end_idx = res_base_idx + count;
983 /* vq->last_used_idx_res is atomically updated. */
984 success = rte_atomic16_cmpset(&vq->last_used_idx_res, res_base_idx,
986 } while (unlikely(success == 0));
987 res_cur_idx = res_base_idx;
988 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n", dev->device_fh, res_cur_idx, res_end_idx);
990 /* Prefetch available ring to retrieve indexes. */
991 rte_prefetch0(&vq->avail->ring[res_cur_idx & (vq->size - 1)]);
993 /* Retrieve all of the head indexes first to avoid caching issues. */
994 for (head_idx = 0; head_idx < count; head_idx++)
995 head[head_idx] = vq->avail->ring[(res_cur_idx + head_idx) & (vq->size - 1)];
997 /*Prefetch descriptor index. */
998 rte_prefetch0(&vq->desc[head[packet_success]]);
1000 while (res_cur_idx != res_end_idx) {
1001 /* Get descriptor from available ring */
1002 desc = &vq->desc[head[packet_success]];
1004 buff = pkts[packet_success];
1006 /* Convert from gpa to vva (guest physical addr -> vhost virtual addr) */
1007 buff_addr = gpa_to_vva(dev, desc->addr);
1008 /* Prefetch buffer address. */
1009 rte_prefetch0((void*)(uintptr_t)buff_addr);
1011 /* Copy virtio_hdr to packet and increment buffer address */
1012 buff_hdr_addr = buff_addr;
1013 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
1016 * If the descriptors are chained the header and data are
1017 * placed in separate buffers.
1019 if (desc->flags & VRING_DESC_F_NEXT) {
1020 desc->len = vq->vhost_hlen;
1021 desc = &vq->desc[desc->next];
1022 /* Buffer address translation. */
1023 buff_addr = gpa_to_vva(dev, desc->addr);
1024 desc->len = rte_pktmbuf_data_len(buff);
1026 buff_addr += vq->vhost_hlen;
1027 desc->len = packet_len;
1030 /* Update used ring with desc information */
1031 vq->used->ring[res_cur_idx & (vq->size - 1)].id = head[packet_success];
1032 vq->used->ring[res_cur_idx & (vq->size - 1)].len = packet_len;
1034 /* Copy mbuf data to buffer */
1035 rte_memcpy((void *)(uintptr_t)buff_addr,
1036 rte_pktmbuf_mtod(buff, const void *),
1037 rte_pktmbuf_data_len(buff));
1038 PRINT_PACKET(dev, (uintptr_t)buff_addr,
1039 rte_pktmbuf_data_len(buff), 0);
1044 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
1045 (const void *)&virtio_hdr, vq->vhost_hlen);
1047 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
1049 if (res_cur_idx < res_end_idx) {
1050 /* Prefetch descriptor index. */
1051 rte_prefetch0(&vq->desc[head[packet_success]]);
1055 rte_compiler_barrier();
1057 /* Wait until it's our turn to add our buffer to the used ring. */
1058 while (unlikely(vq->last_used_idx != res_base_idx))
1061 *(volatile uint16_t *)&vq->used->idx += count;
1062 vq->last_used_idx = res_end_idx;
1064 /* Kick the guest if necessary. */
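	/*
	 * The guest sets VRING_AVAIL_F_NO_INTERRUPT when it does not need a
	 * notification, so the eventfd kick can be skipped in that case.
	 */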
1065 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1066 eventfd_write((int)vq->kickfd, 1);
1070 static inline uint32_t __attribute__((always_inline))
1071 copy_from_mbuf_to_vring(struct virtio_net *dev,
1072 uint16_t res_base_idx, uint16_t res_end_idx,
1073 struct rte_mbuf *pkt)
1075 uint32_t vec_idx = 0;
1076 uint32_t entry_success = 0;
1077 struct vhost_virtqueue *vq;
1078 /* The virtio_hdr is initialised to 0. */
1079 struct virtio_net_hdr_mrg_rxbuf virtio_hdr = {
1080 {0, 0, 0, 0, 0, 0}, 0};
1081 uint16_t cur_idx = res_base_idx;
1082 uint64_t vb_addr = 0;
1083 uint64_t vb_hdr_addr = 0;
1084 uint32_t seg_offset = 0;
1085 uint32_t vb_offset = 0;
1088 uint32_t cpy_len, entry_len;
1093 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| "
1095 dev->device_fh, cur_idx, res_end_idx);
1098 * Convert from gpa to vva
1099 * (guest physical addr -> vhost virtual addr)
1101 vq = dev->virtqueue[VIRTIO_RXQ];
1103 gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
1104 vb_hdr_addr = vb_addr;
1106 /* Prefetch buffer address. */
1107 rte_prefetch0((void *)(uintptr_t)vb_addr);
1109 virtio_hdr.num_buffers = res_end_idx - res_base_idx;
1111 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") RX: Num merge buffers %d\n",
1112 dev->device_fh, virtio_hdr.num_buffers);
1114 rte_memcpy((void *)(uintptr_t)vb_hdr_addr,
1115 (const void *)&virtio_hdr, vq->vhost_hlen);
1117 PRINT_PACKET(dev, (uintptr_t)vb_hdr_addr, vq->vhost_hlen, 1);
1119 seg_avail = rte_pktmbuf_data_len(pkt);
1120 vb_offset = vq->vhost_hlen;
1122 vq->buf_vec[vec_idx].buf_len - vq->vhost_hlen;
1124 entry_len = vq->vhost_hlen;
1126 if (vb_avail == 0) {
1128 vq->buf_vec[vec_idx].desc_idx;
1129 vq->desc[desc_idx].len = vq->vhost_hlen;
1131 if ((vq->desc[desc_idx].flags
1132 & VRING_DESC_F_NEXT) == 0) {
1133 /* Update used ring with desc information */
1134 vq->used->ring[cur_idx & (vq->size - 1)].id
1135 = vq->buf_vec[vec_idx].desc_idx;
1136 vq->used->ring[cur_idx & (vq->size - 1)].len
1146 gpa_to_vva(dev, vq->buf_vec[vec_idx].buf_addr);
1148 /* Prefetch buffer address. */
1149 rte_prefetch0((void *)(uintptr_t)vb_addr);
1151 vb_avail = vq->buf_vec[vec_idx].buf_len;
1154 cpy_len = RTE_MIN(vb_avail, seg_avail);
1156 while (cpy_len > 0) {
1157 /* Copy mbuf data to vring buffer */
1158 rte_memcpy((void *)(uintptr_t)(vb_addr + vb_offset),
1159 (const void *)(rte_pktmbuf_mtod(pkt, char*) + seg_offset),
1163 (uintptr_t)(vb_addr + vb_offset),
1166 seg_offset += cpy_len;
1167 vb_offset += cpy_len;
1168 seg_avail -= cpy_len;
1169 vb_avail -= cpy_len;
1170 entry_len += cpy_len;
1172 if (seg_avail != 0) {
/*
 * The virtio buffer in this vring entry reaches its end,
 * but the mbuf segment is not complete yet.
 */
1178 if ((vq->desc[vq->buf_vec[vec_idx].desc_idx].flags &
1179 VRING_DESC_F_NEXT) == 0) {
1180 /* Update used ring with desc information */
1181 vq->used->ring[cur_idx & (vq->size - 1)].id
1182 = vq->buf_vec[vec_idx].desc_idx;
1183 vq->used->ring[cur_idx & (vq->size - 1)].len
1191 vb_addr = gpa_to_vva(dev,
1192 vq->buf_vec[vec_idx].buf_addr);
1194 vb_avail = vq->buf_vec[vec_idx].buf_len;
1195 cpy_len = RTE_MIN(vb_avail, seg_avail);
/*
 * The current segment is complete; continue to
 * check whether the whole packet is complete.
 */
1204 * There are more segments.
1206 if (vb_avail == 0) {
/*
 * The current buffer from the vring is used up;
 * fetch the next buffer from buf_vec.
 */
1213 vq->buf_vec[vec_idx].desc_idx;
1214 vq->desc[desc_idx].len = vb_offset;
1216 if ((vq->desc[desc_idx].flags &
1217 VRING_DESC_F_NEXT) == 0) {
1218 uint16_t wrapped_idx =
1219 cur_idx & (vq->size - 1);
1221 * Update used ring with the
1222 * descriptor information
1224 vq->used->ring[wrapped_idx].id
1226 vq->used->ring[wrapped_idx].len
1233 /* Get next buffer from buf_vec. */
1235 vb_addr = gpa_to_vva(dev,
1236 vq->buf_vec[vec_idx].buf_addr);
1238 vq->buf_vec[vec_idx].buf_len;
1243 seg_avail = rte_pktmbuf_data_len(pkt);
1244 cpy_len = RTE_MIN(vb_avail, seg_avail);
1247 * This whole packet completes.
1250 vq->buf_vec[vec_idx].desc_idx;
1251 vq->desc[desc_idx].len = vb_offset;
1253 while (vq->desc[desc_idx].flags &
1254 VRING_DESC_F_NEXT) {
1255 desc_idx = vq->desc[desc_idx].next;
1256 vq->desc[desc_idx].len = 0;
1259 /* Update used ring with desc information */
1260 vq->used->ring[cur_idx & (vq->size - 1)].id
1261 = vq->buf_vec[vec_idx].desc_idx;
1262 vq->used->ring[cur_idx & (vq->size - 1)].len
1268 cpy_len = RTE_MIN(vb_avail, seg_avail);
1273 return entry_success;
/*
 * This function adds buffers to the virtio device's RX virtqueue. Buffers can
 * be received from the physical port or from another virtio device. A packet
 * count is returned to indicate the number of packets that were successfully
 * added to the RX queue. This function works for mergeable RX.
 */
1282 static inline uint32_t __attribute__((always_inline))
1283 virtio_dev_merge_rx(struct virtio_net *dev, struct rte_mbuf **pkts,
1286 struct vhost_virtqueue *vq;
1287 uint32_t pkt_idx = 0, entry_success = 0;
1289 uint16_t avail_idx, res_cur_idx;
1290 uint16_t res_base_idx, res_end_idx;
1291 uint8_t success = 0;
1293 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_rx()\n",
1295 vq = dev->virtqueue[VIRTIO_RXQ];
1296 count = RTE_MIN((uint32_t)MAX_PKT_BURST, count);
1301 for (pkt_idx = 0; pkt_idx < count; pkt_idx++) {
1302 uint32_t secure_len = 0;
1304 uint32_t vec_idx = 0;
1305 uint32_t pkt_len = pkts[pkt_idx]->pkt_len + vq->vhost_hlen;
1310 * As many data cores may want access to available
1311 * buffers, they need to be reserved.
1313 res_base_idx = vq->last_used_idx_res;
1314 res_cur_idx = res_base_idx;
1317 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1318 if (unlikely(res_cur_idx == avail_idx)) {
1320 * If retry is enabled and the queue is
1321 * full then we wait and retry to avoid
1326 for (retry = 0; retry < burst_rx_retry_num; retry++) {
1327 rte_delay_us(burst_rx_delay_time);
1329 *((volatile uint16_t *)&vq->avail->idx);
1330 if (likely(res_cur_idx != avail_idx)) {
1339 LOG_DEBUG(VHOST_DATA,
1340 "(%"PRIu64") Failed "
1341 "to get enough desc from "
1346 uint16_t wrapped_idx =
1347 (res_cur_idx) & (vq->size - 1);
1349 vq->avail->ring[wrapped_idx];
1354 secure_len += vq->desc[idx].len;
1355 if (vq->desc[idx].flags &
1356 VRING_DESC_F_NEXT) {
1357 idx = vq->desc[idx].next;
1360 } while (next_desc);
1364 } while (pkt_len > secure_len);
1366 /* vq->last_used_idx_res is atomically updated. */
1367 success = rte_atomic16_cmpset(&vq->last_used_idx_res,
1370 } while (success == 0);
1373 need_cnt = res_cur_idx - res_base_idx;
1375 for (i = 0; i < need_cnt; i++, id++) {
1376 uint16_t wrapped_idx = id & (vq->size - 1);
1377 uint32_t idx = vq->avail->ring[wrapped_idx];
1381 vq->buf_vec[vec_idx].buf_addr =
1383 vq->buf_vec[vec_idx].buf_len =
1385 vq->buf_vec[vec_idx].desc_idx = idx;
1388 if (vq->desc[idx].flags & VRING_DESC_F_NEXT) {
1389 idx = vq->desc[idx].next;
1392 } while (next_desc);
1395 res_end_idx = res_cur_idx;
1397 entry_success = copy_from_mbuf_to_vring(dev, res_base_idx,
1398 res_end_idx, pkts[pkt_idx]);
1400 rte_compiler_barrier();
1403 * Wait until it's our turn to add our buffer
1406 while (unlikely(vq->last_used_idx != res_base_idx))
1409 *(volatile uint16_t *)&vq->used->idx += entry_success;
1410 vq->last_used_idx = res_end_idx;
1412 /* Kick the guest if necessary. */
1413 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1414 eventfd_write((int)vq->kickfd, 1);
1421 * Compares a packet destination MAC address to a device MAC address.
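 * The comparison loads 8 bytes from each address and masks the XOR with
 * MAC_ADDR_CMP (0xFFFFFFFFFFFF) so that only the 6 MAC bytes are compared
 * (little-endian hosts assumed; the 2 bytes read past the address are
 * ignored).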
1423 static inline int __attribute__((always_inline))
1424 ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb)
1426 return (((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0);
1430 * This function learns the MAC address of the device and registers this along with a
1431 * vlan tag to a VMDQ.
1434 link_vmdq(struct virtio_net *dev, struct rte_mbuf *m)
1436 struct ether_hdr *pkt_hdr;
1437 struct virtio_net_data_ll *dev_ll;
1440 /* Learn MAC address of guest device from packet */
1441 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1443 dev_ll = ll_root_used;
1445 while (dev_ll != NULL) {
1446 if (ether_addr_cmp(&(pkt_hdr->s_addr), &dev_ll->dev->mac_address)) {
1447 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") WARNING: This device is using an existing MAC address and has not been registered.\n", dev->device_fh);
1450 dev_ll = dev_ll->next;
1453 for (i = 0; i < ETHER_ADDR_LEN; i++)
1454 dev->mac_address.addr_bytes[i] = pkt_hdr->s_addr.addr_bytes[i];
1456 /* vlan_tag currently uses the device_id. */
1457 dev->vlan_tag = vlan_tags[dev->device_fh];
1459 /* Print out VMDQ registration info. */
1460 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") MAC_ADDRESS %02x:%02x:%02x:%02x:%02x:%02x and VLAN_TAG %d registered\n",
1462 dev->mac_address.addr_bytes[0], dev->mac_address.addr_bytes[1],
1463 dev->mac_address.addr_bytes[2], dev->mac_address.addr_bytes[3],
1464 dev->mac_address.addr_bytes[4], dev->mac_address.addr_bytes[5],
1467 /* Register the MAC address. */
1468 ret = rte_eth_dev_mac_addr_add(ports[0], &dev->mac_address, (uint32_t)dev->device_fh);
1470 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Failed to add device MAC address to VMDQ\n",
1473 /* Enable stripping of the vlan tag as we handle routing. */
1474 rte_eth_dev_set_vlan_strip_on_queue(ports[0], (uint16_t)dev->vmdq_rx_q, 1);
1476 /* Set device as ready for RX. */
1477 dev->ready = DEVICE_RX;
1483 * Removes MAC address and vlan tag from VMDQ. Ensures that nothing is adding buffers to the RX
1484 * queue before disabling RX on the device.
1487 unlink_vmdq(struct virtio_net *dev)
1491 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
1493 if (dev->ready == DEVICE_RX) {
1494 /*clear MAC and VLAN settings*/
1495 rte_eth_dev_mac_addr_remove(ports[0], &dev->mac_address);
1496 for (i = 0; i < 6; i++)
1497 dev->mac_address.addr_bytes[i] = 0;
1501 /*Clear out the receive buffers*/
1502 rx_count = rte_eth_rx_burst(ports[0],
1503 (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1506 for (i = 0; i < rx_count; i++)
1507 rte_pktmbuf_free(pkts_burst[i]);
1509 rx_count = rte_eth_rx_burst(ports[0],
1510 (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
1513 dev->ready = DEVICE_MAC_LEARNING;
/*
 * Check if the packet destination MAC address is for a local device. If so,
 * put the packet on that device's RX queue. If not, return.
 */
1521 static inline unsigned __attribute__((always_inline))
1522 virtio_tx_local(struct virtio_net *dev, struct rte_mbuf *m)
1524 struct virtio_net_data_ll *dev_ll;
1525 struct ether_hdr *pkt_hdr;
1528 pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1530 /*get the used devices list*/
1531 dev_ll = ll_root_used;
1533 while (dev_ll != NULL) {
1534 if ((dev_ll->dev->ready == DEVICE_RX) && ether_addr_cmp(&(pkt_hdr->d_addr),
1535 &dev_ll->dev->mac_address)) {
1537 /* Drop the packet if the TX packet is destined for the TX device. */
1538 if (dev_ll->dev->device_fh == dev->device_fh) {
1539 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: Source and destination MAC addresses are the same. Dropping packet.\n",
1540 dev_ll->dev->device_fh);
1545 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is local\n", dev_ll->dev->device_fh);
1547 if (dev_ll->dev->remove) {
1548 /*drop the packet if the device is marked for removal*/
1549 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Device is marked for removal\n", dev_ll->dev->device_fh);
1551 uint32_t mergeable =
1552 dev_ll->dev->features &
1553 (1 << VIRTIO_NET_F_MRG_RXBUF);
1555 /*send the packet to the local virtio device*/
1556 if (likely(mergeable == 0))
1557 ret = virtio_dev_rx(dev_ll->dev, &m, 1);
1559 ret = virtio_dev_merge_rx(dev_ll->dev,
1564 &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
1567 &dev_statistics[dev_ll->dev->device_fh].rx_atomic,
1569 dev_statistics[dev->device_fh].tx_total++;
1570 dev_statistics[dev->device_fh].tx += ret;
1576 dev_ll = dev_ll->next;
1583 * This function routes the TX packet to the correct interface. This may be a local device
1584 * or the physical port.
1586 static inline void __attribute__((always_inline))
1587 virtio_tx_route(struct virtio_net* dev, struct rte_mbuf *m, struct rte_mempool *mbuf_pool, uint16_t vlan_tag)
1589 struct mbuf_table *tx_q;
1590 struct vlan_ethhdr *vlan_hdr;
1591 struct rte_mbuf **m_table;
1592 struct rte_mbuf *mbuf, *prev;
1593 unsigned len, ret, offset = 0;
1594 const uint16_t lcore_id = rte_lcore_id();
1595 struct virtio_net_data_ll *dev_ll = ll_root_used;
1596 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
1598 /*check if destination is local VM*/
1599 if ((vm2vm_mode == VM2VM_SOFTWARE) && (virtio_tx_local(dev, m) == 0))
1602 if (vm2vm_mode == VM2VM_HARDWARE) {
1603 while (dev_ll != NULL) {
1604 if ((dev_ll->dev->ready == DEVICE_RX)
1605 && ether_addr_cmp(&(pkt_hdr->d_addr),
1606 &dev_ll->dev->mac_address)) {
1608 * Drop the packet if the TX packet is
1609 * destined for the TX device.
1611 if (dev_ll->dev->device_fh == dev->device_fh) {
1612 LOG_DEBUG(VHOST_DATA,
1613 "(%"PRIu64") TX: Source and destination"
1614 " MAC addresses are the same. Dropping "
1616 dev_ll->dev->device_fh);
1622 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
1624 LOG_DEBUG(VHOST_DATA,
1625 "(%"PRIu64") TX: pkt to local VM device id:"
1626 "(%"PRIu64") vlan tag: %d.\n",
1627 dev->device_fh, dev_ll->dev->device_fh,
1632 dev_ll = dev_ll->next;
1636 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") TX: MAC address is external\n", dev->device_fh);
1638 /*Add packet to the port tx queue*/
1639 tx_q = &lcore_tx_queue[lcore_id];
1642 /* Allocate an mbuf and populate the structure. */
1643 mbuf = rte_pktmbuf_alloc(mbuf_pool);
1644 if (unlikely(mbuf == NULL)) {
1645 RTE_LOG(ERR, VHOST_DATA,
1646 "Failed to allocate memory for mbuf.\n");
1650 mbuf->data_len = m->data_len + VLAN_HLEN + offset;
1651 mbuf->pkt_len = m->pkt_len + VLAN_HLEN + offset;
1652 mbuf->nb_segs = m->nb_segs;
1654 /* Copy ethernet header to mbuf. */
1655 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
1656 rte_pktmbuf_mtod(m, const void *),
1660 /* Setup vlan header. Bytes need to be re-ordered for network with htons()*/
1661 vlan_hdr = rte_pktmbuf_mtod(mbuf, struct vlan_ethhdr *);
1662 vlan_hdr->h_vlan_encapsulated_proto = vlan_hdr->h_vlan_proto;
1663 vlan_hdr->h_vlan_proto = htons(ETH_P_8021Q);
1664 vlan_hdr->h_vlan_TCI = htons(vlan_tag);
1666 /* Copy the remaining packet contents to the mbuf. */
1667 rte_memcpy((void *)(rte_pktmbuf_mtod(mbuf, uint8_t *) + VLAN_ETH_HLEN),
1668 (const void *)(rte_pktmbuf_mtod(m, uint8_t *) + ETH_HLEN),
1669 (m->data_len - ETH_HLEN));
1671 /* Copy the remaining segments for the whole packet. */
1674 /* Allocate an mbuf and populate the structure. */
1675 struct rte_mbuf *next_mbuf = rte_pktmbuf_alloc(mbuf_pool);
1676 if (unlikely(next_mbuf == NULL)) {
1677 rte_pktmbuf_free(mbuf);
1678 RTE_LOG(ERR, VHOST_DATA,
1679 "Failed to allocate memory for mbuf.\n");
1684 prev->next = next_mbuf;
1686 next_mbuf->data_len = m->data_len;
1688 /* Copy data to next mbuf. */
1689 rte_memcpy(rte_pktmbuf_mtod(next_mbuf, void *),
1690 rte_pktmbuf_mtod(m, const void *), m->data_len);
1693 tx_q->m_table[len] = mbuf;
1696 dev_statistics[dev->device_fh].tx_total++;
1697 dev_statistics[dev->device_fh].tx++;
1700 if (unlikely(len == MAX_PKT_BURST)) {
1701 m_table = (struct rte_mbuf **)tx_q->m_table;
1702 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
1703 /* Free any buffers not handled by TX and update the port stats. */
1704 if (unlikely(ret < len)) {
1706 rte_pktmbuf_free(m_table[ret]);
1707 } while (++ret < len);
1717 static inline void __attribute__((always_inline))
1718 virtio_dev_tx(struct virtio_net* dev, struct rte_mempool *mbuf_pool)
1721 struct vhost_virtqueue *vq;
1722 struct vring_desc *desc;
1723 uint64_t buff_addr = 0;
1724 uint32_t head[MAX_PKT_BURST];
1727 uint16_t free_entries, packet_success = 0;
1730 vq = dev->virtqueue[VIRTIO_TXQ];
1731 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1733 /* If there are no available buffers then return. */
1734 if (vq->last_used_idx == avail_idx)
1737 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
1739 /* Prefetch available ring to retrieve head indexes. */
1740 rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1742 /*get the number of free entries in the ring*/
1743 free_entries = (avail_idx - vq->last_used_idx);
1745 /* Limit to MAX_PKT_BURST. */
1746 if (free_entries > MAX_PKT_BURST)
1747 free_entries = MAX_PKT_BURST;
1749 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n", dev->device_fh, free_entries);
1750 /* Retrieve all of the head indexes first to avoid caching issues. */
1751 for (i = 0; i < free_entries; i++)
1752 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1754 /* Prefetch descriptor index. */
1755 rte_prefetch0(&vq->desc[head[packet_success]]);
1756 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1758 while (packet_success < free_entries) {
1759 desc = &vq->desc[head[packet_success]];
1761 /* Discard first buffer as it is the virtio header */
1762 desc = &vq->desc[desc->next];
1764 /* Buffer address translation. */
1765 buff_addr = gpa_to_vva(dev, desc->addr);
1766 /* Prefetch buffer address. */
1767 rte_prefetch0((void*)(uintptr_t)buff_addr);
1769 used_idx = vq->last_used_idx & (vq->size - 1);
1771 if (packet_success < (free_entries - 1)) {
1772 /* Prefetch descriptor index. */
1773 rte_prefetch0(&vq->desc[head[packet_success+1]]);
1774 rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1777 /* Update used index buffer information. */
1778 vq->used->ring[used_idx].id = head[packet_success];
1779 vq->used->ring[used_idx].len = 0;
1781 /* Setup dummy mbuf. This is copied to a real mbuf if transmitted out the physical port. */
1782 m.data_len = desc->len;
1783 m.pkt_len = desc->len;
1786 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
1788 /* If this is the first received packet we need to learn the MAC and setup VMDQ */
1789 if (dev->ready == DEVICE_MAC_LEARNING) {
1790 if (dev->remove || (link_vmdq(dev, &m) == -1)) {
1791 /*discard frame if device is scheduled for removal or a duplicate MAC address is found. */
1792 packet_success += free_entries;
1793 vq->last_used_idx += packet_success;
1797 virtio_tx_route(dev, &m, mbuf_pool, (uint16_t)dev->device_fh);
1799 vq->last_used_idx++;
1803 rte_compiler_barrier();
1804 vq->used->idx += packet_success;
1805 /* Kick guest if required. */
1806 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
1807 eventfd_write((int)vq->kickfd, 1);
1810 /* This function works for TX packets with mergeable feature enabled. */
1811 static inline void __attribute__((always_inline))
1812 virtio_dev_merge_tx(struct virtio_net *dev, struct rte_mempool *mbuf_pool)
1814 struct rte_mbuf *m, *prev;
1815 struct vhost_virtqueue *vq;
1816 struct vring_desc *desc;
1817 uint64_t vb_addr = 0;
1818 uint32_t head[MAX_PKT_BURST];
1821 uint16_t free_entries, entry_success = 0;
1823 uint32_t buf_size = MBUF_SIZE - (sizeof(struct rte_mbuf)
1824 + RTE_PKTMBUF_HEADROOM);
1826 vq = dev->virtqueue[VIRTIO_TXQ];
1827 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
1829 /* If there are no available buffers then return. */
1830 if (vq->last_used_idx == avail_idx)
1833 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_merge_tx()\n",
1836 /* Prefetch available ring to retrieve head indexes. */
1837 rte_prefetch0(&vq->avail->ring[vq->last_used_idx & (vq->size - 1)]);
1839 /*get the number of free entries in the ring*/
1840 free_entries = (avail_idx - vq->last_used_idx);
1842 /* Limit to MAX_PKT_BURST. */
1843 free_entries = RTE_MIN(free_entries, MAX_PKT_BURST);
1845 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
1846 dev->device_fh, free_entries);
1847 /* Retrieve all of the head indexes first to avoid caching issues. */
1848 for (i = 0; i < free_entries; i++)
1849 head[i] = vq->avail->ring[(vq->last_used_idx + i) & (vq->size - 1)];
1851 /* Prefetch descriptor index. */
1852 rte_prefetch0(&vq->desc[head[entry_success]]);
1853 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
1855 while (entry_success < free_entries) {
1856 uint32_t vb_avail, vb_offset;
1857 uint32_t seg_avail, seg_offset;
1859 uint32_t seg_num = 0;
1860 struct rte_mbuf *cur;
1861 uint8_t alloc_err = 0;
1863 desc = &vq->desc[head[entry_success]];
1865 /* Discard first buffer as it is the virtio header */
1866 desc = &vq->desc[desc->next];
1868 /* Buffer address translation. */
1869 vb_addr = gpa_to_vva(dev, desc->addr);
1870 /* Prefetch buffer address. */
1871 rte_prefetch0((void *)(uintptr_t)vb_addr);
1873 used_idx = vq->last_used_idx & (vq->size - 1);
1875 if (entry_success < (free_entries - 1)) {
1876 /* Prefetch descriptor index. */
1877 rte_prefetch0(&vq->desc[head[entry_success+1]]);
1878 rte_prefetch0(&vq->used->ring[(used_idx + 1) & (vq->size - 1)]);
1881 /* Update used index buffer information. */
1882 vq->used->ring[used_idx].id = head[entry_success];
1883 vq->used->ring[used_idx].len = 0;
1886 vb_avail = desc->len;
1888 seg_avail = buf_size;
1889 cpy_len = RTE_MIN(vb_avail, seg_avail);
1891 PRINT_PACKET(dev, (uintptr_t)vb_addr, desc->len, 0);
1893 /* Allocate an mbuf and populate the structure. */
1894 m = rte_pktmbuf_alloc(mbuf_pool);
1895 if (unlikely(m == NULL)) {
1896 RTE_LOG(ERR, VHOST_DATA,
1897 "Failed to allocate memory for mbuf.\n");
1904 while (cpy_len != 0) {
1905 rte_memcpy((void *)(rte_pktmbuf_mtod(cur, char *) + seg_offset),
1906 (void *)((uintptr_t)(vb_addr + vb_offset)),
1909 seg_offset += cpy_len;
1910 vb_offset += cpy_len;
1911 vb_avail -= cpy_len;
1912 seg_avail -= cpy_len;
1914 if (vb_avail != 0) {
/*
 * The mbuf segment reaches its end, while the virtio
 * buffer in the TX vring has more data to be copied.
 */
1920 cur->data_len = seg_offset;
1921 m->pkt_len += seg_offset;
1922 /* Allocate mbuf and populate the structure. */
1923 cur = rte_pktmbuf_alloc(mbuf_pool);
1924 if (unlikely(cur == NULL)) {
1925 RTE_LOG(ERR, VHOST_DATA, "Failed to "
1926 "allocate memory for mbuf.\n");
1927 rte_pktmbuf_free(m);
1936 seg_avail = buf_size;
1938 if (desc->flags & VRING_DESC_F_NEXT) {
1940 * There are more virtio buffers in
1941 * same vring entry need to be copied.
1943 if (seg_avail == 0) {
/*
 * The current segment has no room
 * to accommodate more data.
 */
1949 cur->data_len = seg_offset;
1950 m->pkt_len += seg_offset;
1952 * Allocate an mbuf and
1953 * populate the structure.
1955 cur = rte_pktmbuf_alloc(mbuf_pool);
1956 if (unlikely(cur == NULL)) {
1962 rte_pktmbuf_free(m);
1970 seg_avail = buf_size;
1973 desc = &vq->desc[desc->next];
1975 /* Buffer address translation. */
1976 vb_addr = gpa_to_vva(dev, desc->addr);
1977 /* Prefetch buffer address. */
1978 rte_prefetch0((void *)(uintptr_t)vb_addr);
1980 vb_avail = desc->len;
1982 PRINT_PACKET(dev, (uintptr_t)vb_addr,
1985 /* The whole packet completes. */
1986 cur->data_len = seg_offset;
1987 m->pkt_len += seg_offset;
1992 cpy_len = RTE_MIN(vb_avail, seg_avail);
1995 if (unlikely(alloc_err == 1))
1998 m->nb_segs = seg_num;
2001 * If this is the first received packet we need to learn
2002 * the MAC and setup VMDQ
2004 if (dev->ready == DEVICE_MAC_LEARNING) {
2005 if (dev->remove || (link_vmdq(dev, m) == -1)) {
2007 * Discard frame if device is scheduled for
2008 * removal or a duplicate MAC address is found.
2010 entry_success = free_entries;
2011 vq->last_used_idx += entry_success;
2012 rte_pktmbuf_free(m);
2017 virtio_tx_route(dev, m, mbuf_pool, (uint16_t)dev->device_fh);
2018 vq->last_used_idx++;
2020 rte_pktmbuf_free(m);
2023 rte_compiler_barrier();
2024 vq->used->idx += entry_success;
2025 /* Kick guest if required. */
2026 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2027 eventfd_write((int)vq->kickfd, 1);
2032 * This function is called by each data core. It handles all RX/TX registered with the
2033 * core. For TX the specific lcore linked list is used. For RX, MAC addresses are compared
2034 * with all devices in the main linked list.
2037 switch_worker(__attribute__((unused)) void *arg)
2039 struct rte_mempool *mbuf_pool = arg;
2040 struct virtio_net *dev = NULL;
2041 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2042 struct virtio_net_data_ll *dev_ll;
2043 struct mbuf_table *tx_q;
2044 volatile struct lcore_ll_info *lcore_ll;
2045 const uint64_t drain_tsc = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S * BURST_TX_DRAIN_US;
2046 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2048 const uint16_t lcore_id = rte_lcore_id();
2049 const uint16_t num_cores = (uint16_t)rte_lcore_count();
2050 uint16_t rx_count = 0;
2051 uint32_t mergeable = 0;
2053 RTE_LOG(INFO, VHOST_DATA, "Procesing on Core %u started\n", lcore_id);
2054 lcore_ll = lcore_info[lcore_id].lcore_ll;
2057 tx_q = &lcore_tx_queue[lcore_id];
2058 for (i = 0; i < num_cores; i ++) {
2059 if (lcore_ids[i] == lcore_id) {
2066 cur_tsc = rte_rdtsc();
2068 * TX burst queue drain
2070 diff_tsc = cur_tsc - prev_tsc;
2071 if (unlikely(diff_tsc > drain_tsc)) {
2074 LOG_DEBUG(VHOST_DATA, "TX queue drained after timeout with burst size %u \n", tx_q->len);
2076 /*Tx any packets in the queue*/
2077 ret = rte_eth_tx_burst(ports[0], (uint16_t)tx_q->txq_id,
2078 (struct rte_mbuf **)tx_q->m_table,
2079 (uint16_t)tx_q->len);
2080 if (unlikely(ret < tx_q->len)) {
2082 rte_pktmbuf_free(tx_q->m_table[ret]);
2083 } while (++ret < tx_q->len);
2093 rte_prefetch0(lcore_ll->ll_root_used);
2095 * Inform the configuration core that we have exited the linked list and that no devices are
2096 * in use if requested.
2098 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2099 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2104 dev_ll = lcore_ll->ll_root_used;
2106 while (dev_ll != NULL) {
2107 /*get virtio device ID*/
2110 dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF);
2113 dev_ll = dev_ll->next;
2115 dev->ready = DEVICE_SAFE_REMOVE;
2118 if (likely(dev->ready == DEVICE_RX)) {
2120 rx_count = rte_eth_rx_burst(ports[0],
2121 (uint16_t)dev->vmdq_rx_q, pkts_burst, MAX_PKT_BURST);
2124 if (likely(mergeable == 0))
2127 pkts_burst, rx_count);
2130 virtio_dev_merge_rx(dev,
2131 pkts_burst, rx_count);
2135 &dev_statistics[dev_ll->dev->device_fh].rx_total_atomic,
2138 &dev_statistics[dev_ll->dev->device_fh].rx_atomic, ret_count);
2140 while (likely(rx_count)) {
2142 rte_pktmbuf_free(pkts_burst[rx_count]);
2150 if (likely(mergeable == 0))
2151 virtio_dev_tx(dev, mbuf_pool);
2153 virtio_dev_merge_tx(dev, mbuf_pool);
2156 /*move to the next device in the list*/
2157 dev_ll = dev_ll->next;
/*
 * This function gets the number of available ring entries for zero copy rx.
 * Only one thread will call this function for a particular virtio device,
 * so it is designed as a non-thread-safe function.
 */
2169 static inline uint32_t __attribute__((always_inline))
2170 get_available_ring_num_zcp(struct virtio_net *dev)
2172 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2175 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2176 return (uint32_t)(avail_idx - vq->last_used_idx_res);
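/*
 * Usage sketch (this mirrors the zero-copy worker loop later in this file):
 * the caller truncates the result back to 16 bits so that the free-running
 * uint16_t ring indexes wrap correctly at the call site.
 *
 *	uint16_t free_entries = (uint16_t)get_available_ring_num_zcp(dev);
 *	uint16_t burst = RTE_MIN(free_entries, MAX_PKT_BURST);
 */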
2180 * This function gets available ring index for zero copy rx,
2181 * it will retry 'burst_rx_retry_num' times until it gets enough ring indexes.
2182 * Only one thread will call this function for a particular virtio device,
2183 * so it is designed as a non-thread-safe function.
2185 static inline uint32_t __attribute__((always_inline))
2186 get_available_ring_index_zcp(struct virtio_net *dev,
2187 uint16_t *res_base_idx, uint32_t count)
2189 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_RXQ];
2192 uint16_t free_entries;
2194 *res_base_idx = vq->last_used_idx_res;
2195 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2196 free_entries = (avail_idx - *res_base_idx);
2198 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") in get_available_ring_index_zcp: "
2200 "res base idx:%d, free entries:%d\n",
2201 dev->device_fh, avail_idx, *res_base_idx,
2205 * If retry is enabled and the queue is full then we wait
2206 * and retry to avoid packet loss.
2208 if (enable_retry && unlikely(count > free_entries)) {
2209 for (retry = 0; retry < burst_rx_retry_num; retry++) {
2210 rte_delay_us(burst_rx_delay_time);
2211 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2212 free_entries = (avail_idx - *res_base_idx);
2213 if (count <= free_entries)
2218 /* Check that we have enough buffers. */
2219 if (unlikely(count > free_entries))
2220 count = free_entries;
2222 if (unlikely(count == 0)) {
2223 LOG_DEBUG(VHOST_DATA,
2224 "(%"PRIu64") Fail in get_available_ring_index_zcp: "
2225 "avail idx: %d, res base idx:%d, free entries:%d\n",
2226 dev->device_fh, avail_idx,
2227 *res_base_idx, free_entries);
2231 vq->last_used_idx_res = *res_base_idx + count;
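/*
 * Usage sketch (a condensed view of what attach_rxmbuf_zcp() below does with
 * the reservation): reserve one entry, then read the head descriptor index
 * from the available ring at the reserved position.
 *
 *	uint16_t res_base_idx, desc_idx;
 *
 *	if (get_available_ring_index_zcp(dev, &res_base_idx, 1) != 1)
 *		return;
 *	desc_idx = vq->avail->ring[res_base_idx & (vq->size - 1)];
 */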
2237 * This function puts a descriptor back on to the used list.
2239 static inline void __attribute__((always_inline))
2240 put_desc_to_used_list_zcp(struct vhost_virtqueue *vq, uint16_t desc_idx)
2242 uint16_t res_cur_idx = vq->last_used_idx;
2243 vq->used->ring[res_cur_idx & (vq->size - 1)].id = (uint32_t)desc_idx;
2244 vq->used->ring[res_cur_idx & (vq->size - 1)].len = 0;
2245 rte_compiler_barrier();
2246 *(volatile uint16_t *)&vq->used->idx += 1;
2247 vq->last_used_idx += 1;
2249 /* Kick the guest if necessary. */
2250 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2251 eventfd_write((int)vq->kickfd, 1);
2255 * This function gets an available descriptor from the virtio vring and an
2256 * un-attached mbuf from vpool->ring, then attaches them together. It needs to
2257 * adjust the offset for buff_addr and phys_addr according to the PMD
2258 * implementation, otherwise the frame data may land at the wrong mbuf offset.
2260 static inline void __attribute__((always_inline))
2261 attach_rxmbuf_zcp(struct virtio_net *dev)
2263 uint16_t res_base_idx, desc_idx;
2264 uint64_t buff_addr, phys_addr;
2265 struct vhost_virtqueue *vq;
2266 struct vring_desc *desc;
2267 struct rte_mbuf *mbuf = NULL;
2268 struct vpool *vpool;
2271 vpool = &vpool_array[dev->vmdq_rx_q];
2272 vq = dev->virtqueue[VIRTIO_RXQ];
2275 if (unlikely(get_available_ring_index_zcp(dev, &res_base_idx,
2278 desc_idx = vq->avail->ring[(res_base_idx) & (vq->size - 1)];
2280 desc = &vq->desc[desc_idx];
2281 if (desc->flags & VRING_DESC_F_NEXT) {
2282 desc = &vq->desc[desc->next];
2283 buff_addr = gpa_to_vva(dev, desc->addr);
2284 phys_addr = gpa_to_hpa(dev, desc->addr, desc->len,
2287 buff_addr = gpa_to_vva(dev,
2288 desc->addr + vq->vhost_hlen);
2289 phys_addr = gpa_to_hpa(dev,
2290 desc->addr + vq->vhost_hlen,
2291 desc->len, &addr_type);
2294 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2295 RTE_LOG(ERR, VHOST_DATA, "(%"PRIu64") Invalid frame buffer"
2296 " address found when attaching RX frame buffer"
2297 " address!\n", dev->device_fh);
2298 put_desc_to_used_list_zcp(vq, desc_idx);
2303 * Check if the frame buffer address from guest crosses
2304 * sub-region or not.
2306 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2307 RTE_LOG(ERR, VHOST_DATA,
2308 "(%"PRIu64") Frame buffer address cross "
2309 "sub-regioin found when attaching RX frame "
2310 "buffer address!\n",
2312 put_desc_to_used_list_zcp(vq, desc_idx);
2315 } while (unlikely(phys_addr == 0));
2317 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2318 if (unlikely(mbuf == NULL)) {
2319 LOG_DEBUG(VHOST_DATA,
2320 "(%"PRIu64") in attach_rxmbuf_zcp: "
2321 "ring_sc_dequeue fail.\n",
2323 put_desc_to_used_list_zcp(vq, desc_idx);
2327 if (unlikely(vpool->buf_size > desc->len)) {
2328 LOG_DEBUG(VHOST_DATA,
2329 "(%"PRIu64") in attach_rxmbuf_zcp: frame buffer "
2330 "length(%d) of descriptor idx: %d less than room "
2331 "size required: %d\n",
2332 dev->device_fh, desc->len, desc_idx, vpool->buf_size);
2333 put_desc_to_used_list_zcp(vq, desc_idx);
2334 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2338 mbuf->buf_addr = (void *)(uintptr_t)(buff_addr - RTE_PKTMBUF_HEADROOM);
2339 mbuf->data_off = RTE_PKTMBUF_HEADROOM;
2340 mbuf->buf_physaddr = phys_addr - RTE_PKTMBUF_HEADROOM;
2341 mbuf->data_len = desc->len;
2342 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2344 LOG_DEBUG(VHOST_DATA,
2345 "(%"PRIu64") in attach_rxmbuf_zcp: res base idx:%d, "
2346 "descriptor idx:%d\n",
2347 dev->device_fh, res_base_idx, desc_idx);
2349 __rte_mbuf_raw_free(mbuf);
2355 * Detach an attached packet mbuf -
2356 * - restore original mbuf address and length values.
2357 * - reset pktmbuf data and data_len to their default values.
2358 * All other fields of the given packet mbuf will be left intact.
2361 * The attached packet mbuf.
2363 static inline void pktmbuf_detach_zcp(struct rte_mbuf *m)
2365 const struct rte_mempool *mp = m->pool;
2366 void *buf = RTE_MBUF_TO_BADDR(m);
2368 uint32_t buf_len = mp->elt_size - sizeof(*m);
2369 m->buf_physaddr = rte_mempool_virt2phy(mp, m) + sizeof(*m);
2372 m->buf_len = (uint16_t)buf_len;
2374 buf_ofs = (RTE_PKTMBUF_HEADROOM <= m->buf_len) ?
2375 RTE_PKTMBUF_HEADROOM : m->buf_len;
2376 m->data_off = buf_ofs;
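/*
 * Zero-copy mbuf lifecycle sketch (assembled from the helpers in this file,
 * not a separate code path): an mbuf shuttles between vpool->ring (detached,
 * waiting for a guest buffer) and an attached state in which its buf_addr
 * points straight into guest memory.
 *
 *	attach_rxmbuf_zcp(dev);                  // ring -> attached to a guest buffer
 *	...                                      // NIC DMAs directly into guest memory
 *	pktmbuf_detach_zcp(mbuf);                // restore the mbuf's own buffer
 *	rte_ring_sp_enqueue(vpool->ring, mbuf);  // back on the ring for reuse
 */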
2382 * This function is called after packets have been transmitted. It fetches mbufs
2383 * from vpool->pool, detaches them and puts them back into vpool->ring. It also
2384 * updates the used index and kicks the guest if necessary.
2386 static inline uint32_t __attribute__((always_inline))
2387 txmbuf_clean_zcp(struct virtio_net *dev, struct vpool *vpool)
2389 struct rte_mbuf *mbuf;
2390 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2391 uint32_t used_idx = vq->last_used_idx & (vq->size - 1);
2393 uint32_t mbuf_count = rte_mempool_count(vpool->pool);
2395 LOG_DEBUG(VHOST_DATA,
2396 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool before "
2398 dev->device_fh, mbuf_count);
2399 LOG_DEBUG(VHOST_DATA,
2400 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring before "
2402 dev->device_fh, rte_ring_count(vpool->ring));
2404 for (index = 0; index < mbuf_count; index++) {
2405 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2406 if (likely(RTE_MBUF_INDIRECT(mbuf)))
2407 pktmbuf_detach_zcp(mbuf);
2408 rte_ring_sp_enqueue(vpool->ring, mbuf);
2410 /* Update used index buffer information. */
2411 vq->used->ring[used_idx].id = MBUF_HEADROOM_UINT32(mbuf);
2412 vq->used->ring[used_idx].len = 0;
2414 used_idx = (used_idx + 1) & (vq->size - 1);
2417 LOG_DEBUG(VHOST_DATA,
2418 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in mempool after "
2420 dev->device_fh, rte_mempool_count(vpool->pool));
2421 LOG_DEBUG(VHOST_DATA,
2422 "(%"PRIu64") in txmbuf_clean_zcp: mbuf count in ring after "
2424 dev->device_fh, rte_ring_count(vpool->ring));
2425 LOG_DEBUG(VHOST_DATA,
2426 "(%"PRIu64") in txmbuf_clean_zcp: before updated "
2427 "vq->last_used_idx:%d\n",
2428 dev->device_fh, vq->last_used_idx);
2430 vq->last_used_idx += mbuf_count;
2432 LOG_DEBUG(VHOST_DATA,
2433 "(%"PRIu64") in txmbuf_clean_zcp: after updated "
2434 "vq->last_used_idx:%d\n",
2435 dev->device_fh, vq->last_used_idx);
2437 rte_compiler_barrier();
2439 *(volatile uint16_t *)&vq->used->idx += mbuf_count;
2441 /* Kick guest if required. */
2442 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2443 eventfd_write((int)vq->kickfd, 1);
2449 * This function is called when a virtio device is destroyed.
2450 * It fetches mbufs from vpool->pool, detaches them and puts them back into vpool->ring.
2452 static void mbuf_destroy_zcp(struct vpool *vpool)
2454 struct rte_mbuf *mbuf = NULL;
2455 uint32_t index, mbuf_count = rte_mempool_count(vpool->pool);
2457 LOG_DEBUG(VHOST_CONFIG,
2458 "in mbuf_destroy_zcp: mbuf count in mempool before "
2459 "mbuf_destroy_zcp is: %d\n",
2461 LOG_DEBUG(VHOST_CONFIG,
2462 "in mbuf_destroy_zcp: mbuf count in ring before "
2463 "mbuf_destroy_zcp is : %d\n",
2464 rte_ring_count(vpool->ring));
2466 for (index = 0; index < mbuf_count; index++) {
2467 mbuf = __rte_mbuf_raw_alloc(vpool->pool);
2468 if (likely(mbuf != NULL)) {
2469 if (likely(RTE_MBUF_INDIRECT(mbuf)))
2470 pktmbuf_detach_zcp(mbuf);
2471 rte_ring_sp_enqueue(vpool->ring, (void *)mbuf);
2475 LOG_DEBUG(VHOST_CONFIG,
2476 "in mbuf_destroy_zcp: mbuf count in mempool after "
2477 "mbuf_destroy_zcp is: %d\n",
2478 rte_mempool_count(vpool->pool));
2479 LOG_DEBUG(VHOST_CONFIG,
2480 "in mbuf_destroy_zcp: mbuf count in ring after "
2481 "mbuf_destroy_zcp is : %d\n",
2482 rte_ring_count(vpool->ring));
2486 * This function updates the use flag and counter.
2488 static inline uint32_t __attribute__((always_inline))
2489 virtio_dev_rx_zcp(struct virtio_net *dev, struct rte_mbuf **pkts,
2492 struct vhost_virtqueue *vq;
2493 struct vring_desc *desc;
2494 struct rte_mbuf *buff;
2495 /* The virtio_hdr is initialised to 0. */
2496 struct virtio_net_hdr_mrg_rxbuf virtio_hdr
2497 = {{0, 0, 0, 0, 0, 0}, 0};
2498 uint64_t buff_hdr_addr = 0;
2499 uint32_t head[MAX_PKT_BURST], packet_len = 0;
2500 uint32_t head_idx, packet_success = 0;
2501 uint16_t res_cur_idx;
2503 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_rx()\n", dev->device_fh);
2508 vq = dev->virtqueue[VIRTIO_RXQ];
2509 count = (count > MAX_PKT_BURST) ? MAX_PKT_BURST : count;
2511 res_cur_idx = vq->last_used_idx;
2512 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Current Index %d| End Index %d\n",
2513 dev->device_fh, res_cur_idx, res_cur_idx + count);
2515 /* Retrieve all of the head indexes first to avoid caching issues. */
2516 for (head_idx = 0; head_idx < count; head_idx++)
2517 head[head_idx] = MBUF_HEADROOM_UINT32(pkts[head_idx]);
2519 /* Prefetch descriptor index. */
2520 rte_prefetch0(&vq->desc[head[packet_success]]);
2522 while (packet_success != count) {
2523 /* Get descriptor from available ring */
2524 desc = &vq->desc[head[packet_success]];
2526 buff = pkts[packet_success];
2527 LOG_DEBUG(VHOST_DATA,
2528 "(%"PRIu64") in dev_rx_zcp: update the used idx for "
2529 "pkt[%d] descriptor idx: %d\n",
2530 dev->device_fh, packet_success,
2531 MBUF_HEADROOM_UINT32(buff));
2534 (uintptr_t)(((uint64_t)(uintptr_t)buff->buf_addr)
2535 + RTE_PKTMBUF_HEADROOM),
2536 rte_pktmbuf_data_len(buff), 0);
2538 /* Buffer address translation for virtio header. */
2539 buff_hdr_addr = gpa_to_vva(dev, desc->addr);
2540 packet_len = rte_pktmbuf_data_len(buff) + vq->vhost_hlen;
2543 * If the descriptors are chained the header and data are
2544 * placed in separate buffers.
2546 if (desc->flags & VRING_DESC_F_NEXT) {
2547 desc->len = vq->vhost_hlen;
2548 desc = &vq->desc[desc->next];
2549 desc->len = rte_pktmbuf_data_len(buff);
2551 desc->len = packet_len;
2554 /* Update used ring with desc information */
2555 vq->used->ring[res_cur_idx & (vq->size - 1)].id
2556 = head[packet_success];
2557 vq->used->ring[res_cur_idx & (vq->size - 1)].len
2562 /* A header is required per buffer. */
2563 rte_memcpy((void *)(uintptr_t)buff_hdr_addr,
2564 (const void *)&virtio_hdr, vq->vhost_hlen);
2566 PRINT_PACKET(dev, (uintptr_t)buff_hdr_addr, vq->vhost_hlen, 1);
2568 if (likely(packet_success < count)) {
2569 /* Prefetch descriptor index. */
2570 rte_prefetch0(&vq->desc[head[packet_success]]);
2574 rte_compiler_barrier();
2576 LOG_DEBUG(VHOST_DATA,
2577 "(%"PRIu64") in dev_rx_zcp: before update used idx: "
2578 "vq.last_used_idx: %d, vq->used->idx: %d\n",
2579 dev->device_fh, vq->last_used_idx, vq->used->idx);
2581 *(volatile uint16_t *)&vq->used->idx += count;
2582 vq->last_used_idx += count;
2584 LOG_DEBUG(VHOST_DATA,
2585 "(%"PRIu64") in dev_rx_zcp: after update used idx: "
2586 "vq.last_used_idx: %d, vq->used->idx: %d\n",
2587 dev->device_fh, vq->last_used_idx, vq->used->idx);
2589 /* Kick the guest if necessary. */
2590 if (!(vq->avail->flags & VRING_AVAIL_F_NO_INTERRUPT))
2591 eventfd_write((int)vq->kickfd, 1);
2597 * This function routes the TX packet to the correct interface.
2598 * This may be a local device or the physical port.
2600 static inline void __attribute__((always_inline))
2601 virtio_tx_route_zcp(struct virtio_net *dev, struct rte_mbuf *m,
2602 uint32_t desc_idx, uint8_t need_copy)
2604 struct mbuf_table *tx_q;
2605 struct rte_mbuf **m_table;
2606 struct rte_mbuf *mbuf = NULL;
2607 unsigned len, ret, offset = 0;
2608 struct vpool *vpool;
2609 struct virtio_net_data_ll *dev_ll = ll_root_used;
2610 struct ether_hdr *pkt_hdr = rte_pktmbuf_mtod(m, struct ether_hdr *);
2611 uint16_t vlan_tag = (uint16_t)vlan_tags[(uint16_t)dev->device_fh];
2613 /* Add packet to the port TX queue. */
2614 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2617 /* Allocate an mbuf and populate the structure. */
2618 vpool = &vpool_array[MAX_QUEUES + (uint16_t)dev->vmdq_rx_q];
2619 rte_ring_sc_dequeue(vpool->ring, (void **)&mbuf);
2620 if (unlikely(mbuf == NULL)) {
2621 struct vhost_virtqueue *vq = dev->virtqueue[VIRTIO_TXQ];
2622 RTE_LOG(ERR, VHOST_DATA,
2623 "(%"PRIu64") Failed to allocate memory for mbuf.\n",
2625 put_desc_to_used_list_zcp(vq, desc_idx);
2629 if (vm2vm_mode == VM2VM_HARDWARE) {
2630 /* Avoid using a VLAN tag from any VM for an external packet, such as
2631 * vlan_tags[dev->device_fh]; otherwise it conflicts during pool
2632 * selection: the MAC address marks it as an external packet that
2633 * should go out to the network, while the VLAN tag marks it as a
2634 * VM-to-VM packet that should be forwarded to another VM. The hardware
2635 * cannot resolve such an ambiguous situation, so the packet would be lost.
2637 vlan_tag = external_pkt_default_vlan_tag;
2638 while (dev_ll != NULL) {
2639 if (likely(dev_ll->dev->ready == DEVICE_RX) &&
2640 ether_addr_cmp(&(pkt_hdr->d_addr),
2641 &dev_ll->dev->mac_address)) {
2644 * Drop the packet if the TX packet is destined
2645 * for the TX device.
2647 if (unlikely(dev_ll->dev->device_fh
2648 == dev->device_fh)) {
2649 LOG_DEBUG(VHOST_DATA,
2650 "(%"PRIu64") TX: Source and destination"
2651 "MAC addresses are the same. Dropping "
2653 dev_ll->dev->device_fh);
2654 MBUF_HEADROOM_UINT32(mbuf)
2655 = (uint32_t)desc_idx;
2656 __rte_mbuf_raw_free(mbuf);
2661 * The packet length is offset by 4 bytes to account for HW VLAN
2662 * stripping when the L2 switch loops the packet back.
2667 vlan_tags[(uint16_t)dev_ll->dev->device_fh];
2669 LOG_DEBUG(VHOST_DATA,
2670 "(%"PRIu64") TX: pkt to local VM device id:"
2671 "(%"PRIu64") vlan tag: %d.\n",
2672 dev->device_fh, dev_ll->dev->device_fh,
2677 dev_ll = dev_ll->next;
2681 mbuf->nb_segs = m->nb_segs;
2682 mbuf->next = m->next;
2683 mbuf->data_len = m->data_len + offset;
2684 mbuf->pkt_len = mbuf->data_len;
2685 if (unlikely(need_copy)) {
2686 /* Copy the packet contents to the mbuf. */
2687 rte_memcpy(rte_pktmbuf_mtod(mbuf, void *),
2688 rte_pktmbuf_mtod(m, void *),
2691 mbuf->data_off = m->data_off;
2692 mbuf->buf_physaddr = m->buf_physaddr;
2693 mbuf->buf_addr = m->buf_addr;
2695 mbuf->ol_flags = PKT_TX_VLAN_PKT;
2696 mbuf->vlan_tci = vlan_tag;
2697 mbuf->l2_len = sizeof(struct ether_hdr);
2698 mbuf->l3_len = sizeof(struct ipv4_hdr);
2699 MBUF_HEADROOM_UINT32(mbuf) = (uint32_t)desc_idx;
2701 tx_q->m_table[len] = mbuf;
2704 LOG_DEBUG(VHOST_DATA,
2705 "(%"PRIu64") in tx_route_zcp: pkt: nb_seg: %d, next:%s\n",
2708 (mbuf->next == NULL) ? "null" : "non-null");
2711 dev_statistics[dev->device_fh].tx_total++;
2712 dev_statistics[dev->device_fh].tx++;
2715 if (unlikely(len == MAX_PKT_BURST)) {
2716 m_table = (struct rte_mbuf **)tx_q->m_table;
2717 ret = rte_eth_tx_burst(ports[0],
2718 (uint16_t)tx_q->txq_id, m_table, (uint16_t) len);
2721 * Free any buffers not handled by TX and update
2724 if (unlikely(ret < len)) {
2726 rte_pktmbuf_free(m_table[ret]);
2727 } while (++ret < len);
2731 txmbuf_clean_zcp(dev, vpool);
2740 * This function transmits all available packets in the virtio TX queue for one
2741 * virtio-net device. If it is the first packet, it learns the MAC address and
2744 static inline void __attribute__((always_inline))
2745 virtio_dev_tx_zcp(struct virtio_net *dev)
2748 struct vhost_virtqueue *vq;
2749 struct vring_desc *desc;
2750 uint64_t buff_addr = 0, phys_addr;
2751 uint32_t head[MAX_PKT_BURST];
2753 uint16_t free_entries, packet_success = 0;
2755 uint8_t need_copy = 0;
2758 vq = dev->virtqueue[VIRTIO_TXQ];
2759 avail_idx = *((volatile uint16_t *)&vq->avail->idx);
2761 /* If there are no available buffers then return. */
2762 if (vq->last_used_idx_res == avail_idx)
2765 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") virtio_dev_tx()\n", dev->device_fh);
2767 /* Prefetch available ring to retrieve head indexes. */
2768 rte_prefetch0(&vq->avail->ring[vq->last_used_idx_res & (vq->size - 1)]);
2770 /* Get the number of free entries in the ring */
2771 free_entries = (avail_idx - vq->last_used_idx_res);
2773 /* Limit to MAX_PKT_BURST. */
2775 = (free_entries > MAX_PKT_BURST) ? MAX_PKT_BURST : free_entries;
2777 LOG_DEBUG(VHOST_DATA, "(%"PRIu64") Buffers available %d\n",
2778 dev->device_fh, free_entries);
2780 /* Retrieve all of the head indexes first to avoid caching issues. */
2781 for (i = 0; i < free_entries; i++)
2783 = vq->avail->ring[(vq->last_used_idx_res + i)
2786 vq->last_used_idx_res += free_entries;
2788 /* Prefetch descriptor index. */
2789 rte_prefetch0(&vq->desc[head[packet_success]]);
2790 rte_prefetch0(&vq->used->ring[vq->last_used_idx & (vq->size - 1)]);
2792 while (packet_success < free_entries) {
2793 desc = &vq->desc[head[packet_success]];
2795 /* Discard first buffer as it is the virtio header */
2796 desc = &vq->desc[desc->next];
2798 /* Buffer address translation. */
2799 buff_addr = gpa_to_vva(dev, desc->addr);
2800 phys_addr = gpa_to_hpa(dev, desc->addr, desc->len, &addr_type);
2802 if (likely(packet_success < (free_entries - 1)))
2803 /* Prefetch descriptor index. */
2804 rte_prefetch0(&vq->desc[head[packet_success + 1]]);
2806 if (unlikely(addr_type == PHYS_ADDR_INVALID)) {
2807 RTE_LOG(ERR, VHOST_DATA,
2808 "(%"PRIu64") Invalid frame buffer address found"
2809 "when TX packets!\n",
2815 /* Prefetch buffer address. */
2816 rte_prefetch0((void *)(uintptr_t)buff_addr);
2819 * Setup dummy mbuf. This is copied to a real mbuf if
2820 * transmitted out of the physical port.
2822 m.data_len = desc->len;
2826 m.buf_addr = (void *)(uintptr_t)buff_addr;
2827 m.buf_physaddr = phys_addr;
2830 * Check if the frame buffer address from guest crosses
2831 * sub-region or not.
2833 if (unlikely(addr_type == PHYS_ADDR_CROSS_SUBREG)) {
2834 RTE_LOG(ERR, VHOST_DATA,
2835 "(%"PRIu64") Frame buffer address cross "
2836 "sub-regioin found when attaching TX frame "
2837 "buffer address!\n",
2843 PRINT_PACKET(dev, (uintptr_t)buff_addr, desc->len, 0);
2846 * If this is the first received packet we need to learn
2847 * the MAC address and set up VMDq.
2849 if (unlikely(dev->ready == DEVICE_MAC_LEARNING)) {
2850 if (dev->remove || (link_vmdq(dev, &m) == -1)) {
2852 * Discard frame if device is scheduled for
2853 * removal or a duplicate MAC address is found.
2855 packet_success += free_entries;
2856 vq->last_used_idx += packet_success;
2861 virtio_tx_route_zcp(dev, &m, head[packet_success], need_copy);
2867 * This function is called by each data core. It handles all RX/TX registered
2868 * with the core. For TX the specific lcore linked list is used. For RX, MAC
2869 * addresses are compared with all devices in the main linked list.
2872 switch_worker_zcp(__attribute__((unused)) void *arg)
2874 struct virtio_net *dev = NULL;
2875 struct rte_mbuf *pkts_burst[MAX_PKT_BURST];
2876 struct virtio_net_data_ll *dev_ll;
2877 struct mbuf_table *tx_q;
2878 volatile struct lcore_ll_info *lcore_ll;
2879 const uint64_t drain_tsc
2880 = (rte_get_tsc_hz() + US_PER_S - 1) / US_PER_S
2881 * BURST_TX_DRAIN_US;
2882 uint64_t prev_tsc, diff_tsc, cur_tsc, ret_count = 0;
2884 const uint16_t lcore_id = rte_lcore_id();
2885 uint16_t count_in_ring, rx_count = 0;
2887 RTE_LOG(INFO, VHOST_DATA, "Processing on Core %u started\n", lcore_id);
2889 lcore_ll = lcore_info[lcore_id].lcore_ll;
2893 cur_tsc = rte_rdtsc();
2895 /* TX burst queue drain */
2896 diff_tsc = cur_tsc - prev_tsc;
2897 if (unlikely(diff_tsc > drain_tsc)) {
2899 * Get mbufs from vpool.pool, detach them and
2900 * put them back into vpool.ring.
2902 dev_ll = lcore_ll->ll_root_used;
2903 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2904 /* Get virtio device ID */
2907 if (likely(!dev->remove)) {
2908 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
2910 LOG_DEBUG(VHOST_DATA,
2911 "TX queue drained after timeout"
2912 " with burst size %u\n",
2916 * TX any packets in the queue.
2918 ret = rte_eth_tx_burst(
2920 (uint16_t)tx_q->txq_id,
2921 (struct rte_mbuf **)
2923 (uint16_t)tx_q->len);
2924 if (unlikely(ret < tx_q->len)) {
2927 tx_q->m_table[ret]);
2928 } while (++ret < tx_q->len);
2932 txmbuf_clean_zcp(dev,
2933 &vpool_array[MAX_QUEUES+dev->vmdq_rx_q]);
2936 dev_ll = dev_ll->next;
2941 rte_prefetch0(lcore_ll->ll_root_used);
2944 * Inform the configuration core that we have exited the linked
2945 * list and that no devices are in use if requested.
2947 if (lcore_ll->dev_removal_flag == REQUEST_DEV_REMOVAL)
2948 lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
2950 /* Process devices */
2951 dev_ll = lcore_ll->ll_root_used;
2953 while ((dev_ll != NULL) && (dev_ll->dev != NULL)) {
2955 if (unlikely(dev->remove)) {
2956 dev_ll = dev_ll->next;
2958 dev->ready = DEVICE_SAFE_REMOVE;
2962 if (likely(dev->ready == DEVICE_RX)) {
2963 uint32_t index = dev->vmdq_rx_q;
2966 = rte_ring_count(vpool_array[index].ring);
2967 uint16_t free_entries
2968 = (uint16_t)get_available_ring_num_zcp(dev);
2971 * Attach all mbufs in vpool.ring and put back
2975 i < RTE_MIN(free_entries,
2976 RTE_MIN(count_in_ring, MAX_PKT_BURST));
2978 attach_rxmbuf_zcp(dev);
2980 /* Handle guest RX */
2981 rx_count = rte_eth_rx_burst(ports[0],
2982 (uint16_t)dev->vmdq_rx_q, pkts_burst,
2986 ret_count = virtio_dev_rx_zcp(dev,
2987 pkts_burst, rx_count);
2989 dev_statistics[dev->device_fh].rx_total
2991 dev_statistics[dev->device_fh].rx
2994 while (likely(rx_count)) {
2997 pkts_burst[rx_count]);
2998 rte_ring_sp_enqueue(
2999 vpool_array[index].ring,
3000 (void *)pkts_burst[rx_count]);
3005 if (likely(!dev->remove))
3006 /* Handle guest TX */
3007 virtio_dev_tx_zcp(dev);
3009 /* Move to the next device in the list */
3010 dev_ll = dev_ll->next;
3019 * Add an entry to a used linked list. A free entry must first be found
3020 * in the free linked list using get_data_ll_free_entry();
3023 add_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3024 struct virtio_net_data_ll *ll_dev)
3026 struct virtio_net_data_ll *ll = *ll_root_addr;
3028 /* Set next as NULL and use a compiler barrier to avoid reordering. */
3029 ll_dev->next = NULL;
3030 rte_compiler_barrier();
3032 /* If ll == NULL then this is the first device. */
3034 /* Increment to the tail of the linked list. */
3035 while ((ll->next != NULL) )
3040 *ll_root_addr = ll_dev;
3045 * Remove an entry from a used linked list. The entry must then be added to
3046 * the free linked list using put_data_ll_free_entry().
3049 rm_data_ll_entry(struct virtio_net_data_ll **ll_root_addr,
3050 struct virtio_net_data_ll *ll_dev,
3051 struct virtio_net_data_ll *ll_dev_last)
3053 struct virtio_net_data_ll *ll = *ll_root_addr;
3055 if (unlikely((ll == NULL) || (ll_dev == NULL)))
3059 *ll_root_addr = ll_dev->next;
3061 if (likely(ll_dev_last != NULL))
3062 ll_dev_last->next = ll_dev->next;
3064 RTE_LOG(ERR, VHOST_CONFIG, "Remove entry from ll failed.\n");
3068 * Find and return an entry from the free linked list.
3070 static struct virtio_net_data_ll *
3071 get_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr)
3073 struct virtio_net_data_ll *ll_free = *ll_root_addr;
3074 struct virtio_net_data_ll *ll_dev;
3076 if (ll_free == NULL)
3080 *ll_root_addr = ll_free->next;
3086 * Place an entry back on to the free linked list.
3089 put_data_ll_free_entry(struct virtio_net_data_ll **ll_root_addr,
3090 struct virtio_net_data_ll *ll_dev)
3092 struct virtio_net_data_ll *ll_free = *ll_root_addr;
3097 ll_dev->next = ll_free;
3098 *ll_root_addr = ll_dev;
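/*
 * Usage sketch of the free/used list pattern implemented by the helpers
 * above (this mirrors what new_device() and destroy_device() do later in
 * this file; error handling trimmed):
 *
 *	struct virtio_net_data_ll *ll_dev;
 *
 *	ll_dev = get_data_ll_free_entry(&ll_root_free);   // take a free entry
 *	if (ll_dev == NULL)
 *		return -1;                                // free list exhausted
 *	ll_dev->dev = dev;
 *	add_data_ll_entry(&ll_root_used, ll_dev);         // publish on the used list
 *	...
 *	rm_data_ll_entry(&ll_root_used, ll_dev, ll_dev_last);
 *	put_data_ll_free_entry(&ll_root_free, ll_dev);    // recycle the entry
 */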
3102 * Creates a linked list of a given size.
3104 static struct virtio_net_data_ll *
3105 alloc_data_ll(uint32_t size)
3107 struct virtio_net_data_ll *ll_new;
3110 /* Malloc and then chain the linked list. */
3111 ll_new = malloc(size * sizeof(struct virtio_net_data_ll));
3112 if (ll_new == NULL) {
3113 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for ll_new.\n");
3117 for (i = 0; i < size - 1; i++) {
3118 ll_new[i].dev = NULL;
3119 ll_new[i].next = &ll_new[i+1];
3121 ll_new[i].next = NULL;
3127 * Create the main linked list along with each individual core's linked list. A used and a free list
3128 * are created to manage entries.
3135 RTE_LCORE_FOREACH_SLAVE(lcore) {
3136 lcore_info[lcore].lcore_ll = malloc(sizeof(struct lcore_ll_info));
3137 if (lcore_info[lcore].lcore_ll == NULL) {
3138 RTE_LOG(ERR, VHOST_CONFIG, "Failed to allocate memory for lcore_ll.\n");
3142 lcore_info[lcore].lcore_ll->device_num = 0;
3143 lcore_info[lcore].lcore_ll->dev_removal_flag = ACK_DEV_REMOVAL;
3144 lcore_info[lcore].lcore_ll->ll_root_used = NULL;
3145 if (num_devices % num_switching_cores)
3146 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll((num_devices / num_switching_cores) + 1);
3148 lcore_info[lcore].lcore_ll->ll_root_free = alloc_data_ll(num_devices / num_switching_cores);
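/*
 * Worked example (hypothetical figures): with num_devices = 7 and
 * num_switching_cores = 3 the remainder is non-zero, so each core's free
 * list is sized to 7 / 3 + 1 = 3 entries; with 6 devices and 3 cores it
 * would be exactly 2 entries per core.
 */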
3151 /* Allocate devices up to a maximum of MAX_DEVICES. */
3152 ll_root_free = alloc_data_ll(MIN((num_devices), MAX_DEVICES));
3158 * Set virtqueue flags so that we do not receive interrupts.
3161 set_irq_status (struct virtio_net *dev)
3163 dev->virtqueue[VIRTIO_RXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3164 dev->virtqueue[VIRTIO_TXQ]->used->flags = VRING_USED_F_NO_NOTIFY;
3168 * Remove a device from the specific data core linked list and from the main linked list. Synchronization
3169 * occurs through the use of the lcore dev_removal_flag. The device is made volatile here to avoid re-ordering
3170 * of dev->remove=1, which can cause an infinite loop in the rte_pause loop.
3173 destroy_device (volatile struct virtio_net *dev)
3175 struct virtio_net_data_ll *ll_lcore_dev_cur;
3176 struct virtio_net_data_ll *ll_main_dev_cur;
3177 struct virtio_net_data_ll *ll_lcore_dev_last = NULL;
3178 struct virtio_net_data_ll *ll_main_dev_last = NULL;
3181 dev->flags &= ~VIRTIO_DEV_RUNNING;
3183 /* Set the remove flag. */
3186 while(dev->ready != DEVICE_SAFE_REMOVE) {
3190 /* Search for entry to be removed from lcore ll */
3191 ll_lcore_dev_cur = lcore_info[dev->coreid].lcore_ll->ll_root_used;
3192 while (ll_lcore_dev_cur != NULL) {
3193 if (ll_lcore_dev_cur->dev == dev) {
3196 ll_lcore_dev_last = ll_lcore_dev_cur;
3197 ll_lcore_dev_cur = ll_lcore_dev_cur->next;
3201 if (ll_lcore_dev_cur == NULL) {
3202 RTE_LOG(ERR, VHOST_CONFIG,
3203 "(%"PRIu64") Failed to find the dev to be destroy.\n",
3208 /* Search for entry to be removed from main ll */
3209 ll_main_dev_cur = ll_root_used;
3210 ll_main_dev_last = NULL;
3211 while (ll_main_dev_cur != NULL) {
3212 if (ll_main_dev_cur->dev == dev) {
3215 ll_main_dev_last = ll_main_dev_cur;
3216 ll_main_dev_cur = ll_main_dev_cur->next;
3220 /* Remove entries from the lcore and main ll. */
3221 rm_data_ll_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_used, ll_lcore_dev_cur, ll_lcore_dev_last);
3222 rm_data_ll_entry(&ll_root_used, ll_main_dev_cur, ll_main_dev_last);
3224 /* Set the dev_removal_flag on each lcore. */
3225 RTE_LCORE_FOREACH_SLAVE(lcore) {
3226 lcore_info[lcore].lcore_ll->dev_removal_flag = REQUEST_DEV_REMOVAL;
3230 * Once each core has set its dev_removal_flag to ACK_DEV_REMOVAL we can be sure that
3231 * the cores can no longer access the device removed from the linked lists and that it
3232 * is no longer in use.
3234 RTE_LCORE_FOREACH_SLAVE(lcore) {
3235 while (lcore_info[lcore].lcore_ll->dev_removal_flag != ACK_DEV_REMOVAL) {
3240 /* Add the entries back to the lcore and main free ll.*/
3241 put_data_ll_free_entry(&lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->ll_root_free, ll_lcore_dev_cur);
3242 put_data_ll_free_entry(&ll_root_free, ll_main_dev_cur);
3244 /* Decrement the number of devices on the lcore. */
3245 lcore_info[ll_lcore_dev_cur->dev->coreid].lcore_ll->device_num--;
3247 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been removed from data core\n", dev->device_fh);
3250 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3252 /* Stop the RX queue. */
3253 if (rte_eth_dev_rx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3254 LOG_DEBUG(VHOST_CONFIG,
3255 "(%"PRIu64") In destroy_device: Failed to stop "
3261 LOG_DEBUG(VHOST_CONFIG,
3262 "(%"PRIu64") in destroy_device: Start put mbuf in "
3263 "mempool back to ring for RX queue: %d\n",
3264 dev->device_fh, dev->vmdq_rx_q);
3266 mbuf_destroy_zcp(vpool);
3268 /* Stop the TX queue. */
3269 if (rte_eth_dev_tx_queue_stop(ports[0], dev->vmdq_rx_q) != 0) {
3270 LOG_DEBUG(VHOST_CONFIG,
3271 "(%"PRIu64") In destroy_device: Failed to "
3272 "stop tx queue:%d\n",
3273 dev->device_fh, dev->vmdq_rx_q);
3276 vpool = &vpool_array[dev->vmdq_rx_q + MAX_QUEUES];
3278 LOG_DEBUG(VHOST_CONFIG,
3279 "(%"PRIu64") destroy_device: Start put mbuf in mempool "
3280 "back to ring for TX queue: %d, dev:(%"PRIu64")\n",
3281 dev->device_fh, (dev->vmdq_rx_q + MAX_QUEUES),
3284 mbuf_destroy_zcp(vpool);
3290 * A new device is added to a data core. First the device is added to the main linked list
3291 * and then allocated to a specific data core.
3294 new_device (struct virtio_net *dev)
3296 struct virtio_net_data_ll *ll_dev;
3297 int lcore, core_add = 0;
3298 uint32_t device_num_min = num_devices;
3300 /* Add device to main ll */
3301 ll_dev = get_data_ll_free_entry(&ll_root_free);
3302 if (ll_dev == NULL) {
3303 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") No free entry found in linked list. Device limit "
3304 "of %d devices per core has been reached\n",
3305 dev->device_fh, num_devices);
3309 add_data_ll_entry(&ll_root_used, ll_dev);
3310 ll_dev->dev->vmdq_rx_q
3311 = ll_dev->dev->device_fh * (num_queues / num_devices);
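/*
 * Worked example (hypothetical figures): with num_queues = 128 and
 * num_devices = 32, consecutive devices are spaced 128 / 32 = 4 VMDq
 * queues apart, so the device with device_fh 3 is assigned
 * vmdq_rx_q 3 * 4 = 12.
 */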
3314 uint32_t index = ll_dev->dev->vmdq_rx_q;
3315 uint32_t count_in_ring, i;
3316 struct mbuf_table *tx_q;
3318 count_in_ring = rte_ring_count(vpool_array[index].ring);
3320 LOG_DEBUG(VHOST_CONFIG,
3321 "(%"PRIu64") in new_device: mbuf count in mempool "
3322 "before attach is: %d\n",
3324 rte_mempool_count(vpool_array[index].pool));
3325 LOG_DEBUG(VHOST_CONFIG,
3326 "(%"PRIu64") in new_device: mbuf count in ring "
3327 "before attach is : %d\n",
3328 dev->device_fh, count_in_ring);
3331 * Attach all mbufs in vpool.ring and put them back into vpool.pool.
3333 for (i = 0; i < count_in_ring; i++)
3334 attach_rxmbuf_zcp(dev);
3336 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3337 "mempool after attach is: %d\n",
3339 rte_mempool_count(vpool_array[index].pool));
3340 LOG_DEBUG(VHOST_CONFIG, "(%"PRIu64") in new_device: mbuf count in "
3341 "ring after attach is : %d\n",
3343 rte_ring_count(vpool_array[index].ring));
3345 tx_q = &tx_queue_zcp[(uint16_t)dev->vmdq_rx_q];
3346 tx_q->txq_id = dev->vmdq_rx_q;
3348 if (rte_eth_dev_tx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3349 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3351 LOG_DEBUG(VHOST_CONFIG,
3352 "(%"PRIu64") In new_device: Failed to start "
3354 dev->device_fh, dev->vmdq_rx_q);
3356 mbuf_destroy_zcp(vpool);
3360 if (rte_eth_dev_rx_queue_start(ports[0], dev->vmdq_rx_q) != 0) {
3361 struct vpool *vpool = &vpool_array[dev->vmdq_rx_q];
3363 LOG_DEBUG(VHOST_CONFIG,
3364 "(%"PRIu64") In new_device: Failed to start "
3366 dev->device_fh, dev->vmdq_rx_q);
3368 /* Stop the TX queue. */
3369 if (rte_eth_dev_tx_queue_stop(ports[0],
3370 dev->vmdq_rx_q) != 0) {
3371 LOG_DEBUG(VHOST_CONFIG,
3372 "(%"PRIu64") In new_device: Failed to "
3373 "stop tx queue:%d\n",
3374 dev->device_fh, dev->vmdq_rx_q);
3377 mbuf_destroy_zcp(vpool);
3383 /*reset ready flag*/
3384 dev->ready = DEVICE_MAC_LEARNING;
3387 /* Find a suitable lcore to add the device. */
3388 RTE_LCORE_FOREACH_SLAVE(lcore) {
3389 if (lcore_info[lcore].lcore_ll->device_num < device_num_min) {
3390 device_num_min = lcore_info[lcore].lcore_ll->device_num;
3394 /* Add device to lcore ll */
3395 ll_dev->dev->coreid = core_add;
3396 ll_dev = get_data_ll_free_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_free);
3397 if (ll_dev == NULL) {
3398 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Failed to add device to data core\n", dev->device_fh);
3399 dev->ready = DEVICE_SAFE_REMOVE;
3400 destroy_device(dev);
3404 add_data_ll_entry(&lcore_info[ll_dev->dev->coreid].lcore_ll->ll_root_used, ll_dev);
3406 /* Initialize device stats */
3407 memset(&dev_statistics[dev->device_fh], 0, sizeof(struct device_statistics));
3409 /* Disable notifications. */
3410 set_irq_status(dev);
3411 lcore_info[ll_dev->dev->coreid].lcore_ll->device_num++;
3412 dev->flags |= VIRTIO_DEV_RUNNING;
3414 RTE_LOG(INFO, VHOST_DATA, "(%"PRIu64") Device has been added to data core %d\n", dev->device_fh, dev->coreid);
3420 * These callbacks allow devices to be added to the data core when configuration
3421 * has been fully completed.
3423 static const struct virtio_net_device_ops virtio_net_device_ops =
3425 .new_device = new_device,
3426 .destroy_device = destroy_device,
3430 * This is a thread that will wake up after a period to print stats if the user has
3436 struct virtio_net_data_ll *dev_ll;
3437 uint64_t tx_dropped, rx_dropped;
3438 uint64_t tx, tx_total, rx, rx_total;
3440 const char clr[] = { 27, '[', '2', 'J', '\0' };
3441 const char top_left[] = { 27, '[', '1', ';', '1', 'H','\0' };
3444 sleep(enable_stats);
3446 /* Clear screen and move to top left */
3447 printf("%s%s", clr, top_left);
3449 printf("\nDevice statistics ====================================");
3451 dev_ll = ll_root_used;
3452 while (dev_ll != NULL) {
3453 device_fh = (uint32_t)dev_ll->dev->device_fh;
3454 tx_total = dev_statistics[device_fh].tx_total;
3455 tx = dev_statistics[device_fh].tx;
3456 tx_dropped = tx_total - tx;
3457 if (zero_copy == 0) {
3458 rx_total = rte_atomic64_read(
3459 &dev_statistics[device_fh].rx_total_atomic);
3460 rx = rte_atomic64_read(
3461 &dev_statistics[device_fh].rx_atomic);
3463 rx_total = dev_statistics[device_fh].rx_total;
3464 rx = dev_statistics[device_fh].rx;
3466 rx_dropped = rx_total - rx;
3468 printf("\nStatistics for device %"PRIu32" ------------------------------"
3469 "\nTX total: %"PRIu64""
3470 "\nTX dropped: %"PRIu64""
3471 "\nTX successful: %"PRIu64""
3472 "\nRX total: %"PRIu64""
3473 "\nRX dropped: %"PRIu64""
3474 "\nRX successful: %"PRIu64"",
3483 dev_ll = dev_ll->next;
3485 printf("\n======================================================\n");
3490 setup_mempool_tbl(int socket, uint32_t index, char *pool_name,
3491 char *ring_name, uint32_t nb_mbuf)
3493 uint16_t roomsize = VIRTIO_DESCRIPTOR_LEN_ZCP + RTE_PKTMBUF_HEADROOM;
3494 vpool_array[index].pool
3495 = rte_mempool_create(pool_name, nb_mbuf, MBUF_SIZE_ZCP,
3496 MBUF_CACHE_SIZE_ZCP, sizeof(struct rte_pktmbuf_pool_private),
3497 rte_pktmbuf_pool_init, (void *)(uintptr_t)roomsize,
3498 rte_pktmbuf_init, NULL, socket, 0);
3499 if (vpool_array[index].pool != NULL) {
3500 vpool_array[index].ring
3501 = rte_ring_create(ring_name,
3502 rte_align32pow2(nb_mbuf + 1),
3503 socket, RING_F_SP_ENQ | RING_F_SC_DEQ);
3504 if (likely(vpool_array[index].ring != NULL)) {
3505 LOG_DEBUG(VHOST_CONFIG,
3506 "in setup_mempool_tbl: mbuf count in "
3508 rte_mempool_count(vpool_array[index].pool));
3509 LOG_DEBUG(VHOST_CONFIG,
3510 "in setup_mempool_tbl: mbuf count in "
3512 rte_ring_count(vpool_array[index].ring));
3514 rte_exit(EXIT_FAILURE, "ring_create(%s) failed",
3518 /* Need to consider the headroom. */
3519 vpool_array[index].buf_size = roomsize - RTE_PKTMBUF_HEADROOM;
3521 rte_exit(EXIT_FAILURE, "mempool_create(%s) failed", pool_name);
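/*
 * Usage sketch (this is how MAIN() below drives the helper for the
 * zero-copy RX pools): one mempool/ring pair is created per queue, and
 * buf_size ends up equal to VIRTIO_DESCRIPTOR_LEN_ZCP, the guest frame
 * buffer size that attach_rxmbuf_zcp() checks against desc->len.
 *
 *	char pool_name[RTE_MEMPOOL_NAMESIZE];
 *	char ring_name[RTE_MEMPOOL_NAMESIZE];
 *
 *	snprintf(pool_name, sizeof(pool_name), "rxmbuf_pool_%u", queue_id);
 *	snprintf(ring_name, sizeof(ring_name), "rxmbuf_ring_%u", queue_id);
 *	setup_mempool_tbl(rte_socket_id(), queue_id, pool_name, ring_name, nb_mbuf);
 */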
3527 * Main function, does initialisation and calls the per-lcore functions. The CUSE
3528 * device is also registered here to handle the IOCTLs.
3531 MAIN(int argc, char *argv[])
3533 struct rte_mempool *mbuf_pool = NULL;
3534 unsigned lcore_id, core_id = 0;
3535 unsigned nb_ports, valid_num_ports;
3537 uint8_t portid, queue_id = 0;
3538 static pthread_t tid;
3541 ret = rte_eal_init(argc, argv);
3543 rte_exit(EXIT_FAILURE, "Error with EAL initialization\n");
3547 /* parse app arguments */
3548 ret = us_vhost_parse_args(argc, argv);
3550 rte_exit(EXIT_FAILURE, "Invalid argument\n");
3552 if (rte_eal_pci_probe() != 0)
3553 rte_exit(EXIT_FAILURE, "Error with NIC driver initialization\n");
3555 for (lcore_id = 0; lcore_id < RTE_MAX_LCORE; lcore_id ++)
3556 if (rte_lcore_is_enabled(lcore_id))
3557 lcore_ids[core_id ++] = lcore_id;
3559 if (rte_lcore_count() > RTE_MAX_LCORE)
3560 rte_exit(EXIT_FAILURE,"Not enough cores\n");
3562 /* Set the number of switching cores available. */
3563 num_switching_cores = rte_lcore_count()-1;
3565 /* Get the number of physical ports. */
3566 nb_ports = rte_eth_dev_count();
3567 if (nb_ports > RTE_MAX_ETHPORTS)
3568 nb_ports = RTE_MAX_ETHPORTS;
3571 * Update the global var NUM_PORTS and global array PORTS
3572 * and get the value of var VALID_NUM_PORTS according to the system port count.
3574 valid_num_ports = check_ports_num(nb_ports);
3576 if ((valid_num_ports == 0) || (valid_num_ports > MAX_SUP_PORTS)) {
3577 RTE_LOG(INFO, VHOST_PORT, "Current enabled port number is %u, "
3578 "but only %u port can be enabled\n", num_ports, MAX_SUP_PORTS);
3582 if (zero_copy == 0) {
3583 /* Create the mbuf pool. */
3584 mbuf_pool = rte_mempool_create(
3588 MBUF_SIZE, MBUF_CACHE_SIZE,
3589 sizeof(struct rte_pktmbuf_pool_private),
3590 rte_pktmbuf_pool_init, NULL,
3591 rte_pktmbuf_init, NULL,
3592 rte_socket_id(), 0);
3593 if (mbuf_pool == NULL)
3594 rte_exit(EXIT_FAILURE, "Cannot create mbuf pool\n");
3596 for (queue_id = 0; queue_id < MAX_QUEUES + 1; queue_id++)
3597 vpool_array[queue_id].pool = mbuf_pool;
3599 if (vm2vm_mode == VM2VM_HARDWARE) {
3600 /* Enable VT loop back to let the L2 switch do the switching. */
3601 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3602 LOG_DEBUG(VHOST_CONFIG,
3603 "Enable loop back for L2 switch in vmdq.\n");
3607 char pool_name[RTE_MEMPOOL_NAMESIZE];
3608 char ring_name[RTE_MEMPOOL_NAMESIZE];
3610 rx_conf_default.start_rx_per_q = (uint8_t)zero_copy;
3611 rx_conf_default.rx_drop_en = 0;
3612 tx_conf_default.start_tx_per_q = (uint8_t)zero_copy;
3613 nb_mbuf = num_rx_descriptor
3614 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3615 + num_switching_cores * MAX_PKT_BURST;
3617 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3618 snprintf(pool_name, sizeof(pool_name),
3619 "rxmbuf_pool_%u", queue_id);
3620 snprintf(ring_name, sizeof(ring_name),
3621 "rxmbuf_ring_%u", queue_id);
3622 setup_mempool_tbl(rte_socket_id(), queue_id,
3623 pool_name, ring_name, nb_mbuf);
3626 nb_mbuf = num_tx_descriptor
3627 + num_switching_cores * MBUF_CACHE_SIZE_ZCP
3628 + num_switching_cores * MAX_PKT_BURST;
3630 for (queue_id = 0; queue_id < MAX_QUEUES; queue_id++) {
3631 snprintf(pool_name, sizeof(pool_name),
3632 "txmbuf_pool_%u", queue_id);
3633 snprintf(ring_name, sizeof(ring_name),
3634 "txmbuf_ring_%u", queue_id);
3635 setup_mempool_tbl(rte_socket_id(),
3636 (queue_id + MAX_QUEUES),
3637 pool_name, ring_name, nb_mbuf);
3640 if (vm2vm_mode == VM2VM_HARDWARE) {
3641 /* Enable VT loop back to let the L2 switch do the switching. */
3642 vmdq_conf_default.rx_adv_conf.vmdq_rx_conf.enable_loop_back = 1;
3643 LOG_DEBUG(VHOST_CONFIG,
3644 "Enable loop back for L2 switch in vmdq.\n");
3647 /* Set log level. */
3648 rte_set_log_level(LOG_LEVEL);
3650 /* initialize all ports */
3651 for (portid = 0; portid < nb_ports; portid++) {
3652 /* skip ports that are not enabled */
3653 if ((enabled_port_mask & (1 << portid)) == 0) {
3654 RTE_LOG(INFO, VHOST_PORT,
3655 "Skipping disabled port %d\n", portid);
3658 if (port_init(portid) != 0)
3659 rte_exit(EXIT_FAILURE,
3660 "Cannot initialize network ports\n");
3663 /* Initialise all linked lists. */
3664 if (init_data_ll() == -1)
3665 rte_exit(EXIT_FAILURE, "Failed to initialize linked list\n");
3667 /* Initialize device stats */
3668 memset(&dev_statistics, 0, sizeof(dev_statistics));
3670 /* Enable stats if the user option is set. */
3672 pthread_create(&tid, NULL, (void *)print_stats, NULL);
3674 /* Launch all data cores. */
3675 if (zero_copy == 0) {
3676 RTE_LCORE_FOREACH_SLAVE(lcore_id) {
3677 rte_eal_remote_launch(switch_worker,
3678 mbuf_pool, lcore_id);
3681 uint32_t count_in_mempool, index, i;
3682 for (index = 0; index < 2*MAX_QUEUES; index++) {
3683 /* For all RX and TX queues. */
3685 = rte_mempool_count(vpool_array[index].pool);
3688 * Transfer all un-attached mbufs from vpool.pool
3691 for (i = 0; i < count_in_mempool; i++) {
3692 struct rte_mbuf *mbuf
3693 = __rte_mbuf_raw_alloc(
3694 vpool_array[index].pool);
3695 rte_ring_sp_enqueue(vpool_array[index].ring,
3699 LOG_DEBUG(VHOST_CONFIG,
3700 "in MAIN: mbuf count in mempool at initial "
3701 "is: %d\n", count_in_mempool);
3702 LOG_DEBUG(VHOST_CONFIG,
3703 "in MAIN: mbuf count in ring at initial is :"
3705 rte_ring_count(vpool_array[index].ring));
3708 RTE_LCORE_FOREACH_SLAVE(lcore_id)
3709 rte_eal_remote_launch(switch_worker_zcp, NULL,
3713 /* Register CUSE device to handle IOCTLs. */
3714 ret = register_cuse_device((char*)&dev_basename, dev_index, get_virtio_net_callbacks());
3716 rte_exit(EXIT_FAILURE,"CUSE device setup failure.\n");
3718 init_virtio_net(&virtio_net_device_ops);
3720 /* Start CUSE session. */
3721 start_cuse_session_loop();